wukong 1.4.10 → 1.4.11

data/CHANGELOG.textile CHANGED
@@ -1,3 +1,14 @@
1
+ h2. Wukong v1.4.11 2010-07-30
2
+
3
+ * added the @max_(maps|reduces)_per_(node|cluster)@ jobconfs.
4
+ * added jobconfs for io_job_mb and friends.
5
+ * added a loadable module to convert output data to pig bags and tuples
6
+ * pulled in several methods from active_support, incl. Enumerable#sum
7
+ * Scripts to find percentile rank of elements in a dataset
8
+ * We are starting to move wukong to a model where streaming is from a generic
9
+ source into a generic sink. Several stores have been landed in the code, but
10
+ many are in a half- or un-baked state. Please ignore this for the moment.
11
+
1
12
  h2. Wukong v1.4.8 2010-06-05
2
13
 
3
14
  * made scripts inject a helpful job name using mapred.job.name
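The new jobconf settings above surface as ordinary wukong command-line flags, and the active_support pull-in is assumed to behave like Enumerable#sum; a minimal sketch (script name, paths and values are hypothetical):

    # on the command line (hypothetical script and paths):
    #   ./examples/word_count.rb --run --max_maps_per_node=2 --max_reduces_per_cluster=20 in_dir out_dir
    require 'wukong'      # assumed to load the pulled-in active_support helpers
    [1, 2, 3, 4].sum      # => 10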
data/bin/hdp-mkdirp ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
5
+ # use it, will fail if (it seems) ANY of its spawned subprocesses fails
6
+ #
7
+
8
+ hadoop fs -test -e "$@"
9
+ if [ "$?" != "0" ] ; then
10
+ # echo "File does not exist, making..."
11
+ exec hadoop fs -mkdir "$@"
12
+ fi
data/bin/hdp-rm CHANGED
@@ -1,11 +1,32 @@
1
1
  #!/usr/bin/env bash
2
2
 
3
+ #
4
+ # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
5
+ # is true then we need to ignore directories that don't exist and still return 0.
6
+ #
7
+
8
+ #
9
+ # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
10
+ #
3
11
  if [ "$1" == "-r" ] ; then
12
+ shift
13
+ if [ "$1" == "-skipTrash" ] ; then
4
14
  shift
5
- action=rmr
15
+ hadoop fs -test -e "$@"
16
+ if [ "$?" == "0" ] ; then
17
+ # echo "File exists, skipping trash, removing it..."
18
+ echo hadoop dfs -rmr "$@"
19
+ exec hadoop dfs -rmr "$@"
20
+ fi
21
+ else
22
+ hadoop fs -test -e "$@"
23
+ if [ "$?" == "0" ] ; then
24
+ # echo "File exists, removing it..."
25
+ echo hadoop dfs -rmr "$@"
26
+ exec hadoop dfs -rmr "$@"
27
+ fi
28
+ fi
6
29
  else
7
- action=rm
30
+ echo hadoop dfs -rm "$@"
31
+ exec hadoop dfs -rm "$@"
8
32
  fi
9
- echo hadoop dfs -$action "$@"
10
- # read -p "Hit ctrl-C to abort or enter to do this...."
11
- exec hadoop dfs -$action "$@"
data/bin/hdp-sort CHANGED
@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
13
13
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
 
15
15
  cmd="${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
16
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
17
17
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
18
18
  -jobconf num.key.fields.for.partition=\"$partfields\"
19
19
  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream CHANGED
@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
13
13
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
 
15
15
  cmd="${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
16
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
17
17
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
18
18
  -jobconf num.key.fields.for.partition=\"$partfields\"
19
19
  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream-flat CHANGED
@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
  # -jobconf mapred.reduce.tasks=3 \
15
15
 
16
16
  exec ${HADOOP_HOME}/bin/hadoop \
17
- jar ${HADOOP_HOME}/contrib/streaming/hadoop*streaming*.jar \
17
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
18
18
  "$@" \
19
19
  -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
20
20
  -mapper "$map_script" \
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong'
4
+ # require 'wukong/store'
5
+
6
+ require 'configliere'
7
+ Configliere.use :commandline, :define, :config_file
8
+ Settings.read('foo.yaml')
9
+
10
+ # store = ChunkedFlatFileStore.new(Settings)
11
+
12
+ 100.times do |iter|
13
+ # store.save [iter, Time.now.to_flat].join("\t")
14
+ $stdout.puts [iter, Time.now.to_flat].join("\t")
15
+ sleep 2
16
+ end
17
+
18
+
@@ -0,0 +1,23 @@
1
+ module Enumerable
2
+ #
3
+ # Convert an array of values to a string representing it as a pig tuple
4
+ #
5
+ def to_pig_tuple
6
+ map{|*vals| '(' + vals.join(',') + ')' }
7
+ end
8
+
9
+ #
10
+ # Convert an array of values to a string in pig format
11
+ # Delegates to to_pig_tuple -- see also to_pig_bag
12
+ #
13
+ def to_pig *args
14
+ to_pig_tuple *args
15
+ end
16
+
17
+ #
18
+ # Convert an array of values to a string representing it as a pig bag
19
+ #
20
+ def to_pig_bag
21
+ '{' + self.join(',') + '}'
22
+ end
23
+ end
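A rough usage sketch of the Enumerable helpers above (the require path follows the lib/wukong/and_pig.rb entry in the gemspec below; the result strings are what the methods as written should return):

    require 'wukong/and_pig'                            # assumed load path for this module

    [[1, 'hi'], [2, 'bye']].to_pig_tuple                # => ["(1,hi)", "(2,bye)"]
    [[1, 'hi'], [2, 'bye']].to_pig_tuple.to_pig_bag     # => "{(1,hi),(2,bye)}"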
@@ -0,0 +1,7 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+ autoload :PeriodicMonitor, 'monkeyshines/monitor/periodic_monitor'
4
+ autoload :PeriodicLogger, 'monkeyshines/monitor/periodic_logger'
5
+ end
6
+ end
7
+
@@ -0,0 +1,23 @@
1
+ require 'monkeyshines/monitor/periodic_monitor'
2
+ module Monkeyshines
3
+ module Monitor
4
+ module ChunkedStore
5
+ attr_accessor :file_pattern
6
+ def initialize file_pattern
7
+ self.file_pattern = file_pattern
8
+ super file_pattern.make
9
+ end
10
+
11
+ def close_and_reopen
12
+ close
13
+ self.filename = file_pattern.make
14
+ dump_file
15
+ end
16
+
17
+ def save *args
18
+ chunk_monitor.periodically{ close_and_reopen }
19
+ super *args
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,34 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+
4
+ #
5
+ # Emits a log line but only every +iter_interval+ calls or +time_interval+
6
+ # lapse.
7
+ #
8
+ # Since the contents of the block aren't called until the criteria are met,
9
+ # you can put relatively expensive operations in the log without killing
10
+ # your iteration time.
11
+ #
12
+ class PeriodicLogger < PeriodicMonitor
13
+ #
14
+ # Call with a block that returns a string or array to log.
15
+ # If the block returns nil, only the standard counter and timing fields are logged.
16
+ #
17
+ # Ex: log if it has been at least 5 minutes since last announcement:
18
+ #
19
+ # periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
20
+ # loop do
21
+ # # ... stuff ...
22
+ # periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
23
+ # end
24
+ #
25
+ def periodically &block
26
+ super do
27
+ now = Time.now.utc.to_f
28
+ result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
29
+ Log.info result.join("\t")
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,72 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+ #
4
+ # Accepts a lightweight call every iteration.
5
+ #
6
+ # Once either a time or an iteration criterion is met, executes the block
7
+ # and resets the timer until next execution.
8
+ #
9
+ # Note that the +time_interval+ is measured *execution to execution* and not
10
+ # in multiples of iter_interval. Say I set a time_interval of 300s, and
11
+ # happen to iterate at 297s and 310s after start. Then the monitor will
12
+ # execute at 310s, and the next execution will happen on or after 610s.
13
+ #
14
+ # Also note that when *either* criterion is met, *both* criteria are
15
+ # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
16
+ # and that at 250s I reach iteration 10_000. Then the monitor will execute
17
+ # on or after 20_000 iteration or 550s, whichever happens first.
18
+ #
19
+ class PeriodicMonitor
20
+ attr_accessor :time_interval, :iter_interval
21
+ attr_accessor :last_time, :current_iter, :iter, :started_at
22
+
23
+ def initialize options={}
24
+ self.started_at = Time.now.utc.to_f
25
+ self.last_time = started_at
26
+ self.iter = 0
27
+ self.current_iter = 0
28
+ self.time_interval = options[:time]
29
+ self.iter_interval = options[:iters]
30
+ end
31
+
32
+ # True if more than +iter_interval+ has elapsed since last execution.
33
+ def enough_iterations?
34
+ iter % iter_interval == 0 if iter_interval
35
+ end
36
+
37
+ # True if more than +time_interval+ has elapsed since last execution.
38
+ def enough_time? now
39
+ (now - last_time) > time_interval if time_interval
40
+ end
41
+
42
+ # Time since monitor was created
43
+ def since
44
+ Time.now.utc.to_f - started_at
45
+ end
46
+ # Overall iterations per second
47
+ def rate
48
+ iter.to_f / since.to_f
49
+ end
50
+ # "Instantaneous" iterations per second
51
+ def inst_rate now
52
+ current_iter.to_f / (now-last_time).to_f
53
+ end
54
+
55
+ #
56
+ # if the interval conditions are met, executes block; otherwise just does
57
+ # bookkeeping and returns.
58
+ #
59
+ def periodically &block
60
+ self.iter += 1
61
+ self.current_iter += 1
62
+ now = Time.now.utc.to_f
63
+ if enough_iterations? || enough_time?(now)
64
+ block.call(iter, (now-last_time))
65
+ self.last_time = now
66
+ self.current_iter = 0
67
+ end
68
+ end
69
+ end
70
+
71
+ end
72
+ end
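A minimal usage sketch for the monitor above; huge_collection, process and Log are placeholders, and the block receives the iteration count and the seconds since the last report, as periodically passes them:

    monitor = Monkeyshines::Monitor::PeriodicMonitor.new(:time => 300, :iters => 10_000)
    huge_collection.each do |item|
      process(item)                                   # placeholder for the real per-item work
      monitor.periodically do |iter, elapsed|
        Log.info "processed #{iter} items; #{'%7.1f' % elapsed}s since last report"
      end
    end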
@@ -32,6 +32,10 @@ module Wukong
32
32
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
33
33
  Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
34
34
  # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
35
+ Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
36
+ Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
37
+ Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
38
+ Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
35
39
 
36
40
  # emit a -jobconf hadoop option if the simplified command line arg is present
37
41
  # if not, the resulting nil will be elided later
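Each of the four definitions added above pairs a simplified flag with the Hadoop jobconf key named in its :description; per the comment that follows, the option is only emitted when the flag is given. A hedged sketch of the intended translation (values are made up):

    # --max_maps_per_node=2          =>  -jobconf mapred.max.maps.per.node=2
    # --max_reduces_per_cluster=20   =>  -jobconf mapred.max.reduces.per.cluster=20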
@@ -0,0 +1,14 @@
1
+ module Monkeyshines
2
+ module Store
3
+ extend FactoryModule
4
+ autoload :Base, 'monkeyshines/store/base'
5
+ autoload :FlatFileStore, 'monkeyshines/store/flat_file_store'
6
+ autoload :ConditionalStore, 'monkeyshines/store/conditional_store'
7
+ autoload :ChunkedFlatFileStore, 'monkeyshines/store/chunked_flat_file_store'
8
+ autoload :KeyStore, 'monkeyshines/store/key_store'
9
+ autoload :TokyoTdbKeyStore, 'monkeyshines/store/tokyo_tdb_key_store'
10
+ autoload :TyrantTdbKeyStore, 'monkeyshines/store/tyrant_tdb_key_store'
11
+ autoload :TyrantRdbKeyStore, 'monkeyshines/store/tyrant_rdb_key_store'
12
+ autoload :ReadThruStore, 'monkeyshines/store/read_thru_store'
13
+ end
14
+ end
@@ -0,0 +1,29 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class Base
4
+ attr_accessor :options
5
+ def initialize _options={}
6
+ self.options = _options
7
+ Log.info "Creating #{self.class}"
8
+ end
9
+
10
+ #
11
+ def each_as klass, &block
12
+ self.each do |*args|
13
+ begin
14
+ item = klass.new *args[1..-1]
15
+ rescue Exception => e
16
+ Log.info [args, e.to_s, self].join("\t")
17
+ raise e
18
+ end
19
+ yield item
20
+ end
21
+ end
22
+
23
+ def log_line
24
+ nil
25
+ end
26
+
27
+ end
28
+ end
29
+ end
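A sketch of how each_as is meant to be combined with a concrete store; FlatFileStore and the Tweet class are stand-ins, and note that each_as as written drops the first field of each yielded row before calling klass.new:

    store = Monkeyshines::Store::FlatFileStore.new(:filename => 'tweets.tsv')
    store.each_as(Tweet) do |tweet|        # hypothetical Tweet.new(*fields) built from the remaining fields
      puts tweet.inspect
    end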
@@ -0,0 +1,37 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ChunkedFlatFileStore < Monkeyshines::Store::FlatFileStore
4
+ attr_accessor :filename_pattern, :chunk_monitor, :handle
5
+
6
+ DEFAULT_OPTIONS = {
7
+ :chunktime => 4*60*60, # default 4 hours
8
+ :pattern => ":rootdir/:date/:handle+:timestamp-:pid.tsv",
9
+ :rootdir => nil,
10
+ :filemode => 'w',
11
+ }
12
+
13
+ def initialize _options
14
+ self.options = DEFAULT_OPTIONS.deep_merge(_options)
15
+ raise "You don't really want a chunk time this small: #{options[:chunktime]}" unless options[:chunktime] > 600
16
+ self.chunk_monitor = Monkeyshines::Monitor::PeriodicMonitor.new( :time => options[:chunktime] )
17
+ self.handle = options[:handle] || Monkeyshines::CONFIG[:handle]
18
+ self.filename_pattern = Monkeyshines::Utils::FilenamePattern.new(options[:pattern], :handle => handle, :rootdir => options[:rootdir])
19
+ super options.merge(:filename => filename_pattern.make())
20
+ self.mkdir!
21
+ end
22
+
23
+ def save *args
24
+ result = super *args
25
+ chunk_monitor.periodically do
26
+ new_filename = filename_pattern.make()
27
+ Log.info "Rotating chunked file #{filename} into #{new_filename}"
28
+ self.close
29
+ @filename = new_filename
30
+ self.mkdir!
31
+ end
32
+ result
33
+ end
34
+
35
+ end
36
+ end
37
+ end
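A rough sketch of the rotation behavior above, assuming Monkeyshines::CONFIG and Monkeyshines::Utils::FilenamePattern are set up as the class expects and that the saved object responds to #to_flat (as FlatFileStore#save requires):

    store = Monkeyshines::Store::ChunkedFlatFileStore.new(
      :rootdir   => '/data/ripd',          # hypothetical root directory
      :handle    => 'com.twitter',         # hypothetical handle
      :chunktime => 4*60*60)               # rotate to a fresh file roughly every 4 hours
    store.save scraped_request             # placeholder object responding to #to_flat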
@@ -0,0 +1,57 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ConditionalStore < Monkeyshines::Store::Base
4
+ attr_accessor :options, :cache, :store, :misses
5
+
6
+ DEFAULT_OPTIONS = {
7
+ :cache => { :type => :tyrant_rdb_key_store },
8
+ :store => { :type => :chunked_flat_file_store },
9
+ }
10
+
11
+ #
12
+ #
13
+ # +cache+ must behave like a hash (Hash and
14
+ # Monkeyshines::Store::TyrantRdbKeyStore are both cromulent
15
+ # choices).
16
+ #
17
+ #
18
+ #
19
+ def initialize _options
20
+ self.options = DEFAULT_OPTIONS.deep_merge(_options)
21
+ self.cache = Monkeyshines::Store.create(options[:cache])
22
+ self.store = Monkeyshines::Store.create(options[:store])
23
+ self.misses = 0
24
+ end
25
+
26
+ #
27
+ # If key is absent, save the result of calling the block.
28
+ # If key is present, block is never called.
29
+ #
30
+ # Ex:
31
+ # rt_store.set(url) do
32
+ # [url, fetcher.get(url)] # the block returns [cache_val, store_val]; only called if url isn't cached
33
+ # end
34
+ #
35
+ def set key, force=nil, &block
36
+ return if (!force) && cache.include?(key)
37
+ cache_val, store_val = block.call()
38
+ return unless cache_val
39
+ cache.set_nr key, cache_val # update cache
40
+ store << store_val # save value
41
+ self.misses += 1 # track the cache miss
42
+ store_val
43
+ end
44
+
45
+ def size() cache.size end
46
+
47
+ def log_line
48
+ [size, "%8d misses"%misses]
49
+ end
50
+
51
+ def close()
52
+ cache.close
53
+ store.close
54
+ end
55
+ end
56
+ end
57
+ end
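A sketch of the cache-plus-sink pattern above, using the default tyrant cache and chunked flat-file store; the host, rootdir, url and fetcher are placeholders, any option keys beyond :type are assumptions, and the block must return the [cache_val, store_val] pair that set destructures:

    store = Monkeyshines::Store::ConditionalStore.new(
      :cache => { :type => :tyrant_rdb_key_store,    :uri     => 'tyrant01:1978' },
      :store => { :type => :chunked_flat_file_store, :rootdir => '/data/ripd'    })
    store.set(url) do
      result = fetcher.get(url)            # only runs on a cache miss
      [url, result]                        # [cache_val, store_val]
    end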
@@ -0,0 +1,8 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class Factory
4
+ def self.generate type, opts
5
+ end
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,90 @@
1
+ require 'fileutils'; include FileUtils
2
+
3
+ module Monkeyshines
4
+ module Store
5
+ #
6
+ class FlatFileStore < Store::Base
7
+ attr_accessor :filename, :filemode
8
+
9
+ #
10
+ # +:filename+ : path of the flat file to read or write
11
+ #
12
+ def initialize options={}
13
+ Log.debug "New #{self.class} as #{options.inspect}"
14
+ self.filename = options[:filename] or raise "Missing filename in #{self.class}"
15
+ self.filemode = options[:filemode] || 'r'
16
+ skip!(options[:skip]) if options[:skip]
17
+ end
18
+
19
+ #
20
+ #
21
+ #
22
+ def each &block
23
+ file.each do |line|
24
+ next if line[0..0] == '#'
25
+ attrs = line.chomp.split("\t")
26
+ next if attrs.blank?
27
+ yield *attrs
28
+ end
29
+ end
30
+
31
+ #
32
+ # Read ahead n_lines lines in the file
33
+ #
34
+ def skip! n_lines
35
+ Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
36
+ n_lines.times do
37
+ file.readline
38
+ end
39
+ end
40
+
41
+ #
42
+ # Open the file, memoizing the handle
43
+ # (the directory is created by #mkdir!, not here)
44
+ #
45
+ def file
46
+ return @file if @file
47
+ Log.info "Opening file #{filename} with mode #{filemode}"
48
+ @file = File.open(filename, filemode)
49
+ end
50
+
51
+ # Close the dump file
52
+ def close
53
+ @file.close if @file
54
+ @file = nil
55
+ end
56
+
57
+ # Ensure the file's directory exists
58
+ def mkdir!
59
+ dir = File.dirname(filename)
60
+ return if File.directory?(dir)
61
+ Log.info "Making directory #{dir}"
62
+ FileUtils.mkdir_p dir
63
+ end
64
+
65
+ # write to the file
66
+ def save obj
67
+ file << obj.to_flat.join("\t")+"\n"
68
+ obj
69
+ end
70
+
71
+ # returns the size of the current file
72
+ def size
73
+ return 0 if !@file
74
+ File.size(filename)
75
+ end
76
+
77
+ def set key, *args, &block
78
+ tok, obj = block.call
79
+ save obj
80
+ end
81
+
82
+ # delegates to +#save+ -- writes the object to the file
83
+ def <<(obj)
84
+ save obj
85
+ end
86
+
87
+ end
88
+ end
89
+ end
90
+
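A small usage sketch for the flat file store above; the record object is a placeholder that responds to #to_flat, and reading back uses the default :filemode => 'r':

    store = Monkeyshines::Store::FlatFileStore.new(:filename => 'requests.tsv', :filemode => 'a')
    store << scraped_request               # placeholder object responding to #to_flat
    store.close

    reader = Monkeyshines::Store::FlatFileStore.new(:filename => 'requests.tsv')
    reader.each{|*fields| p fields }       # yields the tab-separated fields of each non-comment line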
@@ -0,0 +1,51 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class KeyStore < Monkeyshines::Store::Base
4
+ # The actual backing store; should respond to #set and #get methods
5
+ attr_accessor :db
6
+
7
+ #
8
+ # Executes block once for each element in the whole DB, in whatever order
9
+ # the DB thinks you should see it.
10
+ #
11
+ # Your block will see |key, val|
12
+ #
13
+ # key_store.each do |key, val|
14
+ # # ... stuff ...
15
+ # end
16
+ #
17
+ def each &block
18
+ db.iterinit
19
+ loop do
20
+ key = db.iternext or break
21
+ val = db[key]
22
+ yield key, val
23
+ end
24
+ end
25
+
26
+
27
+ # Save the value into the database
28
+ def set(key, val)
29
+ return unless val
30
+ db[key] = val
31
+ end
32
+
33
+ alias_method :save, :set
34
+ def get(key) db[key] end
35
+ def [](key) db[key] end
36
+ def close() db.close end
37
+ def size() db.size end
38
+
39
+ #
40
+ # Load from standard command-line options
41
+ #
42
+ # obvs only works when there's just one store
43
+ #
44
+ def self.new_from_command_line cmdline_opts, default_opts={}
45
+ options = default_opts.merge(cmdline_opts)
46
+ store = self.new(options[:store_db])
47
+ store
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,15 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class NullStore < Monkeyshines::Store::Base
4
+
5
+ def each *args, &block
6
+ end
7
+
8
+
9
+ # Does nothing!
10
+ def set *args
11
+ end
12
+
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,22 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore
4
+
5
+ #
6
+ # If key is absent, save the result of calling the block.
7
+ # If key is present, block is never called.
8
+ #
9
+ # Ex:
10
+ # rt_store.set(url) do
11
+ # fetcher.get url # will only be called if url isn't in rt_store
12
+ # end
13
+ #
14
+ def set key, force=nil, &block
15
+ return if !force && db.has_key?(key)
16
+ result = block.call() or return
17
+ super(key, result)
18
+ end
19
+
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,33 @@
1
+ require 'tokyocabinet'
2
+ module Monkeyshines
3
+ module Store
4
+ #
5
+ # Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
6
+ #
7
+ class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore
8
+
9
+ # pass in the filename or URI of a tokyo cabinet table-style DB
10
+ # set create_db = true if you want to create a missing DB file
11
+ def initialize db_uri, *args
12
+ self.db = TokyoCabinet::TDB.new
13
+ db.open(db_uri, TokyoCabinet::TDB::OWRITER) or raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}"
14
+ super *args
15
+ end
16
+
17
+
18
+ def each_as klass, &block
19
+ self.each do |key, hsh|
20
+ yield klass.from_hash hsh
21
+ end
22
+ end
23
+ # Delegate to store
24
+ def set(key, val)
25
+ return unless val
26
+ db.put key, val.to_hash.compact
27
+ end
28
+
29
+ def size() db.rnum end
30
+
31
+ end #class
32
+ end
33
+ end
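A hedged sketch for the local table-database store above; the path and record are made up, set expects a value responding to #to_hash, and the call to Hash#compact assumes that helper is available:

    store = Monkeyshines::Store::TokyoTdbKeyStore.new('/data/cache/requests.tdb')
    store.set 'user:24601', user_record    # placeholder object responding to #to_hash
    store['user:24601']                    # => the stored hash
    store.close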
@@ -0,0 +1,57 @@
1
+ require 'tokyotyrant'
2
+ module Monkeyshines
3
+ module Store
4
+
5
+ #
6
+ # Implementation of KeyStore backed by a remote Tokyo Tyrant hash database (RDB)
7
+ #
8
+ class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
9
+ attr_accessor :db_host, :db_port
10
+
11
+ # pass in the host:port uri of the key store.
12
+ def initialize options
13
+ raise "URI for #{self.class} is required" if options[:uri].blank?
14
+ self.db_host, self.db_port = options[:uri].to_s.split(':')
15
+ self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
16
+ super options
17
+ end
18
+
19
+ def db
20
+ return @db if @db
21
+ @db ||= TokyoTyrant::RDB.new
22
+ @db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
23
+ @db
24
+ end
25
+
26
+ def close
27
+ @db.close if @db
28
+ @db = nil
29
+ end
30
+
31
+ # Save the value into the database without waiting for a response.
32
+ def set_nr(key, val)
33
+ db.putnr key, val if val
34
+ end
35
+
36
+ def size() db.rnum end
37
+ def include? *args
38
+ db.has_key? *args
39
+ end
40
+
41
+ # require 'memcache'
42
+ # def initialize db_uri=nil, *args
43
+ # # db_uri ||= ':1978'
44
+ # # self.db_host, self.db_port = db_uri.split(':')
45
+ # self.db = MemCache.new(db_uri, :no_reply => true)
46
+ # if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
47
+ # super *args
48
+ # end
49
+ #
50
+ # def size
51
+ # db.stats
52
+ # end
53
+
54
+ end #class
55
+ end
56
+ end
57
+
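A hedged sketch for the remote tyrant store above, assuming a Tokyo Tyrant server is listening at the hypothetical host tyrant01 on port 1978:

    cache = Monkeyshines::Store::TyrantRdbKeyStore.new(:uri => 'tyrant01:1978')
    cache.set    'seen:http://example.com/a', Time.now.to_i.to_s
    cache.set_nr 'seen:http://example.com/b', Time.now.to_i.to_s   # write without waiting for a reply
    cache.include? 'seen:http://example.com/a'                     # => true
    cache.close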
@@ -0,0 +1,20 @@
1
+ require 'tokyotyrant'
2
+ require 'tyrant_rdb_key_store'
3
+ module Monkeyshines
4
+ module Store
5
+ #
6
+ # Implementation of KeyStore backed by a remote Tokyo Tyrant table database (RDBTBL)
7
+ #
8
+ class TyrantTdbKeyStore < Monkeyshines::Store::TyrantRdbKeyStore
9
+
10
+ def db
11
+ return @db if @db
12
+ @db ||= TokyoTyrant::RDBTBL.new
13
+ @db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
14
+ @db
15
+ end
16
+
17
+ end #class
18
+ end
19
+ end
20
+
data/wukong.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.4.10"
8
+ s.version = "1.4.11"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-07-19}
12
+ s.date = %q{2010-07-30}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
42
42
  "bin/hdp-kill-task",
43
43
  "bin/hdp-ls",
44
44
  "bin/hdp-mkdir",
45
+ "bin/hdp-mkdirp",
45
46
  "bin/hdp-mv",
46
47
  "bin/hdp-parts_to_keys.rb",
47
48
  "bin/hdp-ps",
@@ -151,9 +152,11 @@ Gem::Specification.new do |s|
151
152
  "examples/size.rb",
152
153
  "examples/stats/avg_value_frequency.rb",
153
154
  "examples/stats/data/avg_value_frequency.tsv",
155
+ "examples/store/chunked_store_example.rb",
154
156
  "examples/stupidly_simple_filter.rb",
155
157
  "examples/word_count.rb",
156
158
  "lib/wukong.rb",
159
+ "lib/wukong/and_pig.rb",
157
160
  "lib/wukong/bad_record.rb",
158
161
  "lib/wukong/datatypes.rb",
159
162
  "lib/wukong/datatypes/enum.rb",
@@ -182,12 +185,28 @@ Gem::Specification.new do |s|
182
185
  "lib/wukong/keystore/tyrant_notes.textile",
183
186
  "lib/wukong/logger.rb",
184
187
  "lib/wukong/models/graph.rb",
188
+ "lib/wukong/monitor.rb",
189
+ "lib/wukong/monitor/chunked_store.rb",
190
+ "lib/wukong/monitor/periodic_logger.rb",
191
+ "lib/wukong/monitor/periodic_monitor.rb",
185
192
  "lib/wukong/periodic_monitor.rb",
186
193
  "lib/wukong/rdf.rb",
187
194
  "lib/wukong/schema.rb",
188
195
  "lib/wukong/script.rb",
189
196
  "lib/wukong/script/hadoop_command.rb",
190
197
  "lib/wukong/script/local_command.rb",
198
+ "lib/wukong/store.rb",
199
+ "lib/wukong/store/base.rb",
200
+ "lib/wukong/store/chunked_flat_file_store.rb",
201
+ "lib/wukong/store/conditional_store.rb",
202
+ "lib/wukong/store/factory.rb",
203
+ "lib/wukong/store/flat_file_store.rb",
204
+ "lib/wukong/store/key_store.rb",
205
+ "lib/wukong/store/null_store.rb",
206
+ "lib/wukong/store/read_thru_store.rb",
207
+ "lib/wukong/store/tokyo_tdb_key_store.rb",
208
+ "lib/wukong/store/tyrant_rdb_key_store.rb",
209
+ "lib/wukong/store/tyrant_tdb_key_store.rb",
191
210
  "lib/wukong/streamer.rb",
192
211
  "lib/wukong/streamer/accumulating_reducer.rb",
193
212
  "lib/wukong/streamer/base.rb",
@@ -226,30 +245,31 @@ Gem::Specification.new do |s|
226
245
  "spec/spec_helper.rb",
227
246
  "spec/wukong/encoding_spec.rb",
228
247
  "spec/wukong/script_spec.rb",
248
+ "examples/binning_percentile_estimator.rb",
249
+ "examples/contrib/jeans/normalize.rb",
250
+ "examples/contrib/jeans/sizes.rb",
251
+ "examples/corpus/words_to_bigrams.rb",
252
+ "examples/count_keys.rb",
253
+ "examples/count_keys_at_mapper.rb",
254
+ "examples/keystore/cassandra_batch_test.rb",
255
+ "examples/keystore/conditional_outputter_example.rb",
256
+ "examples/network_graph/adjacency_list.rb",
257
+ "examples/network_graph/breadth_first_search.rb",
258
+ "examples/network_graph/gen_2paths.rb",
259
+ "examples/network_graph/gen_multi_edge.rb",
260
+ "examples/network_graph/gen_symmetric_links.rb",
229
261
  "examples/pagerank/pagerank.rb",
230
262
  "examples/pagerank/pagerank_initialize.rb",
263
+ "examples/rank_and_bin.rb",
231
264
  "examples/sample_records.rb",
232
265
  "examples/server_logs/apache_log_parser.rb",
233
266
  "examples/server_logs/breadcrumbs.rb",
234
267
  "examples/server_logs/user_agent.rb",
235
- "examples/corpus/words_to_bigrams.rb",
236
- "examples/count_keys.rb",
237
- "examples/rank_and_bin.rb",
238
- "examples/binning_percentile_estimator.rb",
239
268
  "examples/size.rb",
240
- "examples/network_graph/breadth_first_search.rb",
241
- "examples/network_graph/gen_symmetric_links.rb",
242
- "examples/network_graph/gen_multi_edge.rb",
243
- "examples/network_graph/adjacency_list.rb",
244
- "examples/network_graph/gen_2paths.rb",
245
- "examples/keystore/cassandra_batch_test.rb",
246
- "examples/keystore/conditional_outputter_example.rb",
247
269
  "examples/stats/avg_value_frequency.rb",
248
- "examples/contrib/jeans/sizes.rb",
249
- "examples/contrib/jeans/normalize.rb",
250
- "examples/word_count.rb",
270
+ "examples/store/chunked_store_example.rb",
251
271
  "examples/stupidly_simple_filter.rb",
252
- "examples/count_keys_at_mapper.rb"
272
+ "examples/word_count.rb"
253
273
  ]
254
274
 
255
275
  if s.respond_to? :specification_version then
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 10
10
- version: 1.4.10
9
+ - 11
10
+ version: 1.4.11
11
11
  platform: ruby
12
12
  authors:
13
13
  - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-19 00:00:00 +00:00
18
+ date: 2010-07-30 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -136,6 +136,7 @@ files:
136
136
  - bin/hdp-kill-task
137
137
  - bin/hdp-ls
138
138
  - bin/hdp-mkdir
139
+ - bin/hdp-mkdirp
139
140
  - bin/hdp-mv
140
141
  - bin/hdp-parts_to_keys.rb
141
142
  - bin/hdp-ps
@@ -245,9 +246,11 @@ files:
245
246
  - examples/size.rb
246
247
  - examples/stats/avg_value_frequency.rb
247
248
  - examples/stats/data/avg_value_frequency.tsv
249
+ - examples/store/chunked_store_example.rb
248
250
  - examples/stupidly_simple_filter.rb
249
251
  - examples/word_count.rb
250
252
  - lib/wukong.rb
253
+ - lib/wukong/and_pig.rb
251
254
  - lib/wukong/bad_record.rb
252
255
  - lib/wukong/datatypes.rb
253
256
  - lib/wukong/datatypes/enum.rb
@@ -276,12 +279,28 @@ files:
276
279
  - lib/wukong/keystore/tyrant_notes.textile
277
280
  - lib/wukong/logger.rb
278
281
  - lib/wukong/models/graph.rb
282
+ - lib/wukong/monitor.rb
283
+ - lib/wukong/monitor/chunked_store.rb
284
+ - lib/wukong/monitor/periodic_logger.rb
285
+ - lib/wukong/monitor/periodic_monitor.rb
279
286
  - lib/wukong/periodic_monitor.rb
280
287
  - lib/wukong/rdf.rb
281
288
  - lib/wukong/schema.rb
282
289
  - lib/wukong/script.rb
283
290
  - lib/wukong/script/hadoop_command.rb
284
291
  - lib/wukong/script/local_command.rb
292
+ - lib/wukong/store.rb
293
+ - lib/wukong/store/base.rb
294
+ - lib/wukong/store/chunked_flat_file_store.rb
295
+ - lib/wukong/store/conditional_store.rb
296
+ - lib/wukong/store/factory.rb
297
+ - lib/wukong/store/flat_file_store.rb
298
+ - lib/wukong/store/key_store.rb
299
+ - lib/wukong/store/null_store.rb
300
+ - lib/wukong/store/read_thru_store.rb
301
+ - lib/wukong/store/tokyo_tdb_key_store.rb
302
+ - lib/wukong/store/tyrant_rdb_key_store.rb
303
+ - lib/wukong/store/tyrant_tdb_key_store.rb
285
304
  - lib/wukong/streamer.rb
286
305
  - lib/wukong/streamer/accumulating_reducer.rb
287
306
  - lib/wukong/streamer/base.rb
@@ -348,27 +367,28 @@ test_files:
348
367
  - spec/spec_helper.rb
349
368
  - spec/wukong/encoding_spec.rb
350
369
  - spec/wukong/script_spec.rb
370
+ - examples/binning_percentile_estimator.rb
371
+ - examples/contrib/jeans/normalize.rb
372
+ - examples/contrib/jeans/sizes.rb
373
+ - examples/corpus/words_to_bigrams.rb
374
+ - examples/count_keys.rb
375
+ - examples/count_keys_at_mapper.rb
376
+ - examples/keystore/cassandra_batch_test.rb
377
+ - examples/keystore/conditional_outputter_example.rb
378
+ - examples/network_graph/adjacency_list.rb
379
+ - examples/network_graph/breadth_first_search.rb
380
+ - examples/network_graph/gen_2paths.rb
381
+ - examples/network_graph/gen_multi_edge.rb
382
+ - examples/network_graph/gen_symmetric_links.rb
351
383
  - examples/pagerank/pagerank.rb
352
384
  - examples/pagerank/pagerank_initialize.rb
385
+ - examples/rank_and_bin.rb
353
386
  - examples/sample_records.rb
354
387
  - examples/server_logs/apache_log_parser.rb
355
388
  - examples/server_logs/breadcrumbs.rb
356
389
  - examples/server_logs/user_agent.rb
357
- - examples/corpus/words_to_bigrams.rb
358
- - examples/count_keys.rb
359
- - examples/rank_and_bin.rb
360
- - examples/binning_percentile_estimator.rb
361
390
  - examples/size.rb
362
- - examples/network_graph/breadth_first_search.rb
363
- - examples/network_graph/gen_symmetric_links.rb
364
- - examples/network_graph/gen_multi_edge.rb
365
- - examples/network_graph/adjacency_list.rb
366
- - examples/network_graph/gen_2paths.rb
367
- - examples/keystore/cassandra_batch_test.rb
368
- - examples/keystore/conditional_outputter_example.rb
369
391
  - examples/stats/avg_value_frequency.rb
370
- - examples/contrib/jeans/sizes.rb
371
- - examples/contrib/jeans/normalize.rb
372
- - examples/word_count.rb
392
+ - examples/store/chunked_store_example.rb
373
393
  - examples/stupidly_simple_filter.rb
374
- - examples/count_keys_at_mapper.rb
394
+ - examples/word_count.rb