wukong 1.4.10 → 1.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.textile CHANGED
@@ -1,3 +1,14 @@
1
+ h2. Wukong v1.4.11 2010-07-30
2
+
3
+ * added the @max_(maps|reduces)_per_(node|cluster)@ jobconfs.
4
+ * added jobconfs for io_job_mb and friends.
5
+ * added a loadable module to convert output data to pig bags and tuples
6
+ * pulled in several methods from active_support, incl. Enumerable#sum
7
+ * Scripts to find percentile rank of elements in a dataset
8
+ * We are starting to move wukong to a model where streaming is from a generic
9
+ source into a generic sink. Several stores have been landed in the code, but
10
+ many are in a half- or un-baked state. Please ignore this for the moment.
11
+
1
12
  h2. Wukong v1.4.8 2010-06-05
2
13
 
3
14
  * made scripts inject a helpful job name using mapred.job.name
data/bin/hdp-mkdirp ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
5
+ # use it, will fail if (it seems) ANY of its spawned subprocesses fails
6
+ #
7
+
8
+ hadoop fs -test -e "$@"
9
+ if [ "$?" != "0" ] ; then
10
+ # echo "File does not exist, making..."
11
+ exec hadoop fs -mkdir "$@"
12
+ fi
data/bin/hdp-rm CHANGED
@@ -1,11 +1,32 @@
1
1
  #!/usr/bin/env bash
2
2
 
3
+ #
4
+ # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
5
+ # is true then we need to ignore directories that don't exist and still return 0.
6
+ #
7
+
8
+ #
9
+ # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
10
+ #
3
11
  if [ "$1" == "-r" ] ; then
12
+ shift
13
+ if [ "$1" == "-skipTrash" ] ; then
4
14
  shift
5
- action=rmr
15
+ hadoop fs -test -e "$@"
16
+ if [ "$?" == "0" ] ; then
17
+ # echo "File exists, skipping trash, removing it..."
18
+ echo hadoop dfs -rmr "$@"
19
+ exec hadoop dfs -rmr "$@"
20
+ fi
21
+ else
22
+ hadoop fs -test -e "$@"
23
+ if [ "$?" == "0" ] ; then
24
+ # echo "File exists, removing it..."
25
+ echo hadoop dfs -rmr "$@"
26
+ exec hadoop dfs -rmr "$@"
27
+ fi
28
+ fi
6
29
  else
7
- action=rm
30
+ echo hadoop dfs -rm "$@"
31
+ exec hadoop dfs -rm "$@"
8
32
  fi
9
- echo hadoop dfs -$action "$@"
10
- # read -p "Hit ctrl-C to abort or enter to do this...."
11
- exec hadoop dfs -$action "$@"
data/bin/hdp-sort CHANGED
@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
13
13
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
 
15
15
  cmd="${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
16
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
17
17
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
18
18
  -jobconf num.key.fields.for.partition=\"$partfields\"
19
19
  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream CHANGED
@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
13
13
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
 
15
15
  cmd="${HADOOP_HOME}/bin/hadoop \
16
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
16
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
17
17
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
18
18
  -jobconf num.key.fields.for.partition=\"$partfields\"
19
19
  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream-flat CHANGED
@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
14
14
  # -jobconf mapred.reduce.tasks=3 \
15
15
 
16
16
  exec ${HADOOP_HOME}/bin/hadoop \
17
- jar ${HADOOP_HOME}/contrib/streaming/hadoop*streaming*.jar \
17
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
18
18
  "$@" \
19
19
  -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
20
20
  -mapper "$map_script" \
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong'
4
+ # require 'wukong/store'
5
+
6
+ require 'configliere'
7
+ Configliere.use :commandline, :define, :config_file
8
+ Settings.read('foo.yaml')
9
+
10
+ # store = ChunkedFlatFileStore.new(Settings)
11
+
12
+ 100.times do |iter|
13
+ # store.save [iter, Time.now.to_flat].join("\t")
14
+ $stdout.puts [iter, Time.now.to_flat].join("\t")
15
+ sleep 2
16
+ end
17
+
18
+
@@ -0,0 +1,23 @@
1
+ module Enumerable
2
+ #
3
+ # Convert an array of values to a string representing it as a pig tuple
4
+ #
5
+ def to_pig_tuple
6
+ map{|*vals| '(' + vals.join(',') + ')' }
7
+ end
8
+
9
+ #
10
+ # Convert an array of values to a string pig format
11
+ # Delegates to to_pig_tuple -- see also to_pig_bag
12
+ #
13
+ def to_pig *args
14
+ to_pig_tuple *args
15
+ end
16
+
17
+ #
18
+ # Convert an array of values to a string representing it as a pig bag
19
+ #
20
+ def to_pig_bag
21
+ '{' + self.join(',') + '}'
22
+ end
23
+ end
@@ -0,0 +1,7 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+ autoload :PeriodicMonitor, 'monkeyshines/monitor/periodic_monitor'
4
+ autoload :PeriodicLogger, 'monkeyshines/monitor/periodic_logger'
5
+ end
6
+ end
7
+
@@ -0,0 +1,23 @@
1
+ require 'monkeyshines/monitor/periodic_monitor'
2
+ module Monkeyshines
3
+ module Monitor
4
+ module ChunkedStore
5
+ attr_accessor :file_pattern
6
+ def initialize file_pattern
7
+ self.file_pattern = file_pattern
8
+ super file_pattern.make
9
+ end
10
+
11
+ def close_and_reopen
12
+ close
13
+ self.filename = file_pattern.make
14
+ dump_file
15
+ end
16
+
17
+ def save *args
18
+ chunk_monitor.periodically{ close_rename_and_open }
19
+ super *args
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,34 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+
4
+ #
5
+ # Emits a log line but only every +iter_interval+ calls or +time_interval+
6
+ # lapse.
7
+ #
8
+ # Since the contents of the block aren't called until the criteria are met,
9
+ # you can put relatively expensive operations in the log without killing
10
+ # your iteration time.
11
+ #
12
+ class PeriodicLogger < PeriodicMonitor
13
+ #
14
+ # Call with a block that returns a string or array to log.
15
+ # If you return nil, only the iteration count, elapsed time and instantaneous rate are logged.
16
+ #
17
+ # Ex: log if it has been at least 5 minutes since last announcement:
18
+ #
19
+ # periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
20
+ # loop do
21
+ # # ... stuff ...
22
+ # periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
23
+ # end
24
+ #
25
+ def periodically &block
26
+ super do
27
+ now = Time.now.utc.to_f
28
+ result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
29
+ Log.info result.join("\t")
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,72 @@
1
+ module Monkeyshines
2
+ module Monitor
3
+ #
4
+ # Accepts a lightweight call every iteration.
5
+ #
6
+ # Once either a time or an iteration criterion is met, executes the block
7
+ # and resets the timer until next execution.
8
+ #
9
+ # Note that the +time_interval+ is measured *execution to execution* and not
10
+ # in multiples of iter_interval. Say I set a time_interval of 300s, and
11
+ # happen to iterate at 297s and 310s after start. Then the monitor will
12
+ # execute at 310s, and the next execution will happen on or after 610s.
13
+ #
14
+ # Also note that when *either* criterion is met, *both* criteria are
15
+ # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
16
+ # and that at 250s I reach iteration 10_000. Then the monitor will execute
17
+ # on or after 20_000 iterations or 550s, whichever happens first.
18
+ #
19
+ class PeriodicMonitor
20
+ attr_accessor :time_interval, :iter_interval
21
+ attr_accessor :last_time, :current_iter, :iter, :started_at
22
+
23
+ def initialize options={}
24
+ self.started_at = Time.now.utc.to_f
25
+ self.last_time = started_at
26
+ self.iter = 0
27
+ self.current_iter = 0
28
+ self.time_interval = options[:time]
29
+ self.iter_interval = options[:iters]
30
+ end
31
+
32
+ # True if the total iteration count is a multiple of +iter_interval+ (nil when no +iter_interval+ is set).
33
+ def enough_iterations?
34
+ iter % iter_interval == 0 if iter_interval
35
+ end
36
+
37
+ # True if more than +time_interval+ has elapsed since last execution.
38
+ def enough_time? now
39
+ (now - last_time) > time_interval if time_interval
40
+ end
41
+
42
+ # Time since monitor was created
43
+ def since
44
+ Time.now.utc.to_f - started_at
45
+ end
46
+ # Overall iterations per second
47
+ def rate
48
+ iter.to_f / since.to_f
49
+ end
50
+ # "Instantaneous" iterations per second
51
+ def inst_rate now
52
+ current_iter.to_f / (now-last_time).to_f
53
+ end
54
+
55
+ #
56
+ # if the interval conditions are met, executes block; otherwise just does
57
+ # bookkeeping and returns.
58
+ #
59
+ def periodically &block
60
+ self.iter += 1
61
+ self.current_iter += 1
62
+ now = Time.now.utc.to_f
63
+ if enough_iterations? || enough_time?(now)
64
+ block.call(iter, (now-last_time))
65
+ self.last_time = now
66
+ self.current_iter = 0
67
+ end
68
+ end
69
+ end
70
+
71
+ end
72
+ end
@@ -32,6 +32,10 @@ module Wukong
32
32
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
33
33
  Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
34
34
  # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
35
+ Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
36
+ Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
37
+ Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
38
+ Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
35
39
 
36
40
  # emit a -jobconf hadoop option if the simplified command line arg is present
37
41
  # if not, the resulting nil will be elided later
@@ -0,0 +1,14 @@
1
+ module Monkeyshines
2
+ module Store
3
+ extend FactoryModule
4
+ autoload :Base, 'monkeyshines/store/base'
5
+ autoload :FlatFileStore, 'monkeyshines/store/flat_file_store'
6
+ autoload :ConditionalStore, 'monkeyshines/store/conditional_store'
7
+ autoload :ChunkedFlatFileStore, 'monkeyshines/store/chunked_flat_file_store'
8
+ autoload :KeyStore, 'monkeyshines/store/key_store'
9
+ autoload :TokyoTdbKeyStore, 'monkeyshines/store/tokyo_tdb_key_store'
10
+ autoload :TyrantTdbKeyStore, 'monkeyshines/store/tyrant_tdb_key_store'
11
+ autoload :TyrantRdbKeyStore, 'monkeyshines/store/tyrant_rdb_key_store'
12
+ autoload :ReadThruStore, 'monkeyshines/store/read_thru_store'
13
+ end
14
+ end
@@ -0,0 +1,29 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class Base
4
+ attr_accessor :options
5
+ def initialize _options={}
6
+ self.options = _options
7
+ Log.info "Creating #{self.class}"
8
+ end
9
+
10
+ #
11
+ def each_as klass, &block
12
+ self.each do |*args|
13
+ begin
14
+ item = klass.new *args[1..-1]
15
+ rescue Exception => e
16
+ Log.info [args, e.to_s, self].join("\t")
17
+ raise e
18
+ end
19
+ yield item
20
+ end
21
+ end
22
+
23
+ def log_line
24
+ nil
25
+ end
26
+
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,37 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ChunkedFlatFileStore < Monkeyshines::Store::FlatFileStore
4
+ attr_accessor :filename_pattern, :chunk_monitor, :handle
5
+
6
+ DEFAULT_OPTIONS = {
7
+ :chunktime => 4*60*60, # default 4 hours
8
+ :pattern => ":rootdir/:date/:handle+:timestamp-:pid.tsv",
9
+ :rootdir => nil,
10
+ :filemode => 'w',
11
+ }
12
+
13
+ def initialize _options
14
+ self.options = DEFAULT_OPTIONS.deep_merge(_options)
15
+ raise "You don't really want a chunk time this small: #{options[:chunktime]}" unless options[:chunktime] > 600
16
+ self.chunk_monitor = Monkeyshines::Monitor::PeriodicMonitor.new( :time => options[:chunktime] )
17
+ self.handle = options[:handle] || Monkeyshines::CONFIG[:handle]
18
+ self.filename_pattern = Monkeyshines::Utils::FilenamePattern.new(options[:pattern], :handle => handle, :rootdir => options[:rootdir])
19
+ super options.merge(:filename => filename_pattern.make())
20
+ self.mkdir!
21
+ end
22
+
23
+ def save *args
24
+ result = super *args
25
+ chunk_monitor.periodically do
26
+ new_filename = filename_pattern.make()
27
+ Log.info "Rotating chunked file #{filename} into #{new_filename}"
28
+ self.close
29
+ @filename = new_filename
30
+ self.mkdir!
31
+ end
32
+ result
33
+ end
34
+
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,57 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ConditionalStore < Monkeyshines::Store::Base
4
+ attr_accessor :options, :cache, :store, :misses
5
+
6
+ DEFAULT_OPTIONS = {
7
+ :cache => { :type => :tyrant_rdb_key_store },
8
+ :store => { :type => :chunked_flat_file_store },
9
+ }
10
+
11
+ #
12
+ #
13
+ # +cache+ must behave like a hash (Hash and
14
+ # Monkeyshines::Store::TyrantRdbKeyStore are both cromulent
15
+ # choices).
16
+ #
17
+ #
18
+ #
19
+ def initialize _options
20
+ self.options = DEFAULT_OPTIONS.deep_merge(_options)
21
+ self.cache = Monkeyshines::Store.create(options[:cache])
22
+ self.store = Monkeyshines::Store.create(options[:store])
23
+ self.misses = 0
24
+ end
25
+
26
+ #
27
+ # If key is absent, save the result of calling the block.
28
+ # If key is present, block is never called.
29
+ #
30
+ # Ex:
31
+ # rt_store.set(url) do
32
+ # fetcher.get url # will only be called if url isn't in rt_store
33
+ # end
34
+ #
35
+ def set key, force=nil, &block
36
+ return if (!force) && cache.include?(key)
37
+ cache_val, store_val = block.call()
38
+ return unless cache_val
39
+ cache.set_nr key, cache_val # update cache
40
+ store << store_val # save value
41
+ self.misses += 1 # track the cache miss
42
+ store_val
43
+ end
44
+
45
+ def size() cache.size end
46
+
47
+ def log_line
48
+ [size, "%8d misses"%misses]
49
+ end
50
+
51
+ def close()
52
+ cache.close
53
+ store.close
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,8 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class Factory
4
+ def self.generate type, opts
5
+ end
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,90 @@
1
+ require 'fileutils'; include FileUtils
2
+
3
+ module Monkeyshines
4
+ module Store
5
+ #
6
+ class FlatFileStore < Store::Base
7
+ attr_accessor :filename, :filemode
8
+
9
+ #
10
+ # +filename_root+ : first part of name for files
11
+ #
12
+ def initialize options={}
13
+ Log.debug "New #{self.class} as #{options.inspect}"
14
+ self.filename = options[:filename] or raise "Missing filename in #{self.class}"
15
+ self.filemode = options[:filemode] || 'r'
16
+ skip!(options[:skip]) if options[:skip]
17
+ end
18
+
19
+ #
20
+ #
21
+ #
22
+ def each &block
23
+ file.each do |line|
24
+ next if line[0..0] == '#'
25
+ attrs = line.chomp.split("\t")
26
+ next if attrs.blank?
27
+ yield *attrs
28
+ end
29
+ end
30
+
31
+ #
32
+ # Read ahead n_lines lines in the file
33
+ #
34
+ def skip! n_lines
35
+ Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
36
+ n_lines.times do
37
+ file.readline
38
+ end
39
+ end
40
+
41
+ #
42
+ # Open the timestamped file,
43
+ # ensuring its directory exists
44
+ #
45
+ def file
46
+ return @file if @file
47
+ Log.info "Opening file #{filename} with mode #{filemode}"
48
+ @file = File.open(filename, filemode)
49
+ end
50
+
51
+ # Close the dump file
52
+ def close
53
+ @file.close if @file
54
+ @file = nil
55
+ end
56
+
57
+ # Ensure the file's directory exists
58
+ def mkdir!
59
+ dir = File.dirname(filename)
60
+ return if File.directory?(dir)
61
+ Log.info "Making directory #{dir}"
62
+ FileUtils.mkdir_p dir
63
+ end
64
+
65
+ # write to the file
66
+ def save obj
67
+ file << obj.to_flat.join("\t")+"\n"
68
+ obj
69
+ end
70
+
71
+ # returns the size of the current file
72
+ def size
73
+ return 0 if !@file
74
+ File.size(filename)
75
+ end
76
+
77
+ def set key, *args, &block
78
+ tok, obj = block.call
79
+ save obj
80
+ end
81
+
82
+ # delegates to +#save+ -- writes the object to the file
83
+ def <<(obj)
84
+ save obj
85
+ end
86
+
87
+ end
88
+ end
89
+ end
90
+
@@ -0,0 +1,51 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class KeyStore < Monkeyshines::Store::Base
4
+ # The actual backing store; should respond to #set and #get methods
5
+ attr_accessor :db
6
+
7
+ #
8
+ # Executes block once for each element in the whole DB, in whatever order
9
+ # the DB thinks you should see it.
10
+ #
11
+ # Your block will see |key, val|
12
+ #
13
+ # key_store.each do |key, val|
14
+ # # ... stuff ...
15
+ # end
16
+ #
17
+ def each &block
18
+ db.iterinit
19
+ loop do
20
+ key = db.iternext or break
21
+ val = db[key]
22
+ yield key, val
23
+ end
24
+ end
25
+
26
+
27
+ # Save the value into the database
28
+ def set(key, val)
29
+ return unless val
30
+ db[key] = val
31
+ end
32
+
33
+ alias_method :save, :set
34
+ def get(key) db[key] end
35
+ def [](key) db[key] end
36
+ def close() db.close end
37
+ def size() db.size end
38
+
39
+ #
40
+ # Load from standard command-line options
41
+ #
42
+ # obvs only works when there's just one store
43
+ #
44
+ def self.new_from_command_line cmdline_opts, default_opts={}
45
+ options = default_opts.merge(cmdline_opts)
46
+ store = self.new(options[:store_db])
47
+ store
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,15 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class NullStore < Monkeyshines::Store::Base
4
+
5
+ def each *args, &block
6
+ end
7
+
8
+
9
+ # Does nothing!
10
+ def set *args
11
+ end
12
+
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,22 @@
1
+ module Monkeyshines
2
+ module Store
3
+ class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore
4
+
5
+ #
6
+ # If key is absent, save the result of calling the block.
7
+ # If key is present, block is never called.
8
+ #
9
+ # Ex:
10
+ # rt_store.set(url) do
11
+ # fetcher.get url # will only be called if url isn't in rt_store
12
+ # end
13
+ #
14
+ def set key, force=nil, &block
15
+ return if !force && db.has_key?(key)
16
+ result = block.call() or return
17
+ super(key, result)
18
+ end
19
+
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,33 @@
1
+ require 'tokyocabinet'
2
+ module Monkeyshines
3
+ module Store
4
+ #
5
+ # Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
6
+ #
7
+ class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore
8
+
9
+ # pass in the filename or URI of a tokyo cabinet table-style DB
10
+ # set create_db = true if you want to create a missing DB file
11
+ def initialize db_uri, *args
12
+ self.db = TokyoCabinet::TDB.new
13
+ db.open(db_uri, TokyoCabinet::TDB::OWRITER) or raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}"
14
+ super *args
15
+ end
16
+
17
+
18
+ def each_as klass, &block
19
+ self.each do |key, hsh|
20
+ yield klass.from_hash hsh
21
+ end
22
+ end
23
+ # Delegate to store
24
+ def set(key, val)
25
+ return unless val
26
+ db.put key, val.to_hash.compact
27
+ end
28
+
29
+ def size() db.rnum end
30
+
31
+ end #class
32
+ end
33
+ end
@@ -0,0 +1,57 @@
1
+ require 'tokyotyrant'
2
+ module Monkeyshines
3
+ module Store
4
+
5
+ #
6
+ # Implementation of KeyStore with a Local TokyoCabinet hash database (RDB)
7
+ #
8
+ class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
9
+ attr_accessor :db_host, :db_port
10
+
11
+ # pass in the host:port uri of the key store.
12
+ def initialize options
13
+ raise "URI for #{self.class} is required" if options[:uri].blank?
14
+ self.db_host, self.db_port = options[:uri].to_s.split(':')
15
+ self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
16
+ super options
17
+ end
18
+
19
+ def db
20
+ return @db if @db
21
+ @db ||= TokyoTyrant::RDB.new
22
+ @db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
23
+ @db
24
+ end
25
+
26
+ def close
27
+ @db.close if @db
28
+ @db = nil
29
+ end
30
+
31
+ # Save the value into the database without waiting for a response.
32
+ def set_nr(key, val)
33
+ db.putnr key, val if val
34
+ end
35
+
36
+ def size() db.rnum end
37
+ def include? *args
38
+ db.has_key? *args
39
+ end
40
+
41
+ # require 'memcache'
42
+ # def initialize db_uri=nil, *args
43
+ # # db_uri ||= ':1978'
44
+ # # self.db_host, self.db_port = db_uri.split(':')
45
+ # self.db = MemCache.new(db_uri, :no_reply => true)
46
+ # if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
47
+ # super *args
48
+ # end
49
+ #
50
+ # def size
51
+ # db.stats
52
+ # end
53
+
54
+ end #class
55
+ end
56
+ end
57
+
@@ -0,0 +1,20 @@
1
+ require 'tokyotyrant'
2
+ require 'tyrant_rdb_key_store'
3
+ module Monkeyshines
4
+ module Store
5
+ #
6
+ # Implementation of KeyStore with a Local TokyoCabinet Table database (RDBTBL)
7
+ #
8
+ class TyrantRdbKeyStore < TyrantRdbKeyStore Monkeyshines::Store::KeyStore
9
+
10
+ def db
11
+ return @db if @db
12
+ @db ||= TokyoTyrant::RDBTBL.new
13
+ @db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
14
+ @db
15
+ end
16
+
17
+ end #class
18
+ end
19
+ end
20
+
data/wukong.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.4.10"
8
+ s.version = "1.4.11"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-07-19}
12
+ s.date = %q{2010-07-30}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
42
42
  "bin/hdp-kill-task",
43
43
  "bin/hdp-ls",
44
44
  "bin/hdp-mkdir",
45
+ "bin/hdp-mkdirp",
45
46
  "bin/hdp-mv",
46
47
  "bin/hdp-parts_to_keys.rb",
47
48
  "bin/hdp-ps",
@@ -151,9 +152,11 @@ Gem::Specification.new do |s|
151
152
  "examples/size.rb",
152
153
  "examples/stats/avg_value_frequency.rb",
153
154
  "examples/stats/data/avg_value_frequency.tsv",
155
+ "examples/store/chunked_store_example.rb",
154
156
  "examples/stupidly_simple_filter.rb",
155
157
  "examples/word_count.rb",
156
158
  "lib/wukong.rb",
159
+ "lib/wukong/and_pig.rb",
157
160
  "lib/wukong/bad_record.rb",
158
161
  "lib/wukong/datatypes.rb",
159
162
  "lib/wukong/datatypes/enum.rb",
@@ -182,12 +185,28 @@ Gem::Specification.new do |s|
182
185
  "lib/wukong/keystore/tyrant_notes.textile",
183
186
  "lib/wukong/logger.rb",
184
187
  "lib/wukong/models/graph.rb",
188
+ "lib/wukong/monitor.rb",
189
+ "lib/wukong/monitor/chunked_store.rb",
190
+ "lib/wukong/monitor/periodic_logger.rb",
191
+ "lib/wukong/monitor/periodic_monitor.rb",
185
192
  "lib/wukong/periodic_monitor.rb",
186
193
  "lib/wukong/rdf.rb",
187
194
  "lib/wukong/schema.rb",
188
195
  "lib/wukong/script.rb",
189
196
  "lib/wukong/script/hadoop_command.rb",
190
197
  "lib/wukong/script/local_command.rb",
198
+ "lib/wukong/store.rb",
199
+ "lib/wukong/store/base.rb",
200
+ "lib/wukong/store/chunked_flat_file_store.rb",
201
+ "lib/wukong/store/conditional_store.rb",
202
+ "lib/wukong/store/factory.rb",
203
+ "lib/wukong/store/flat_file_store.rb",
204
+ "lib/wukong/store/key_store.rb",
205
+ "lib/wukong/store/null_store.rb",
206
+ "lib/wukong/store/read_thru_store.rb",
207
+ "lib/wukong/store/tokyo_tdb_key_store.rb",
208
+ "lib/wukong/store/tyrant_rdb_key_store.rb",
209
+ "lib/wukong/store/tyrant_tdb_key_store.rb",
191
210
  "lib/wukong/streamer.rb",
192
211
  "lib/wukong/streamer/accumulating_reducer.rb",
193
212
  "lib/wukong/streamer/base.rb",
@@ -226,30 +245,31 @@ Gem::Specification.new do |s|
226
245
  "spec/spec_helper.rb",
227
246
  "spec/wukong/encoding_spec.rb",
228
247
  "spec/wukong/script_spec.rb",
248
+ "examples/binning_percentile_estimator.rb",
249
+ "examples/contrib/jeans/normalize.rb",
250
+ "examples/contrib/jeans/sizes.rb",
251
+ "examples/corpus/words_to_bigrams.rb",
252
+ "examples/count_keys.rb",
253
+ "examples/count_keys_at_mapper.rb",
254
+ "examples/keystore/cassandra_batch_test.rb",
255
+ "examples/keystore/conditional_outputter_example.rb",
256
+ "examples/network_graph/adjacency_list.rb",
257
+ "examples/network_graph/breadth_first_search.rb",
258
+ "examples/network_graph/gen_2paths.rb",
259
+ "examples/network_graph/gen_multi_edge.rb",
260
+ "examples/network_graph/gen_symmetric_links.rb",
229
261
  "examples/pagerank/pagerank.rb",
230
262
  "examples/pagerank/pagerank_initialize.rb",
263
+ "examples/rank_and_bin.rb",
231
264
  "examples/sample_records.rb",
232
265
  "examples/server_logs/apache_log_parser.rb",
233
266
  "examples/server_logs/breadcrumbs.rb",
234
267
  "examples/server_logs/user_agent.rb",
235
- "examples/corpus/words_to_bigrams.rb",
236
- "examples/count_keys.rb",
237
- "examples/rank_and_bin.rb",
238
- "examples/binning_percentile_estimator.rb",
239
268
  "examples/size.rb",
240
- "examples/network_graph/breadth_first_search.rb",
241
- "examples/network_graph/gen_symmetric_links.rb",
242
- "examples/network_graph/gen_multi_edge.rb",
243
- "examples/network_graph/adjacency_list.rb",
244
- "examples/network_graph/gen_2paths.rb",
245
- "examples/keystore/cassandra_batch_test.rb",
246
- "examples/keystore/conditional_outputter_example.rb",
247
269
  "examples/stats/avg_value_frequency.rb",
248
- "examples/contrib/jeans/sizes.rb",
249
- "examples/contrib/jeans/normalize.rb",
250
- "examples/word_count.rb",
270
+ "examples/store/chunked_store_example.rb",
251
271
  "examples/stupidly_simple_filter.rb",
252
- "examples/count_keys_at_mapper.rb"
272
+ "examples/word_count.rb"
253
273
  ]
254
274
 
255
275
  if s.respond_to? :specification_version then
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 10
10
- version: 1.4.10
9
+ - 11
10
+ version: 1.4.11
11
11
  platform: ruby
12
12
  authors:
13
13
  - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-19 00:00:00 +00:00
18
+ date: 2010-07-30 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -136,6 +136,7 @@ files:
136
136
  - bin/hdp-kill-task
137
137
  - bin/hdp-ls
138
138
  - bin/hdp-mkdir
139
+ - bin/hdp-mkdirp
139
140
  - bin/hdp-mv
140
141
  - bin/hdp-parts_to_keys.rb
141
142
  - bin/hdp-ps
@@ -245,9 +246,11 @@ files:
245
246
  - examples/size.rb
246
247
  - examples/stats/avg_value_frequency.rb
247
248
  - examples/stats/data/avg_value_frequency.tsv
249
+ - examples/store/chunked_store_example.rb
248
250
  - examples/stupidly_simple_filter.rb
249
251
  - examples/word_count.rb
250
252
  - lib/wukong.rb
253
+ - lib/wukong/and_pig.rb
251
254
  - lib/wukong/bad_record.rb
252
255
  - lib/wukong/datatypes.rb
253
256
  - lib/wukong/datatypes/enum.rb
@@ -276,12 +279,28 @@ files:
276
279
  - lib/wukong/keystore/tyrant_notes.textile
277
280
  - lib/wukong/logger.rb
278
281
  - lib/wukong/models/graph.rb
282
+ - lib/wukong/monitor.rb
283
+ - lib/wukong/monitor/chunked_store.rb
284
+ - lib/wukong/monitor/periodic_logger.rb
285
+ - lib/wukong/monitor/periodic_monitor.rb
279
286
  - lib/wukong/periodic_monitor.rb
280
287
  - lib/wukong/rdf.rb
281
288
  - lib/wukong/schema.rb
282
289
  - lib/wukong/script.rb
283
290
  - lib/wukong/script/hadoop_command.rb
284
291
  - lib/wukong/script/local_command.rb
292
+ - lib/wukong/store.rb
293
+ - lib/wukong/store/base.rb
294
+ - lib/wukong/store/chunked_flat_file_store.rb
295
+ - lib/wukong/store/conditional_store.rb
296
+ - lib/wukong/store/factory.rb
297
+ - lib/wukong/store/flat_file_store.rb
298
+ - lib/wukong/store/key_store.rb
299
+ - lib/wukong/store/null_store.rb
300
+ - lib/wukong/store/read_thru_store.rb
301
+ - lib/wukong/store/tokyo_tdb_key_store.rb
302
+ - lib/wukong/store/tyrant_rdb_key_store.rb
303
+ - lib/wukong/store/tyrant_tdb_key_store.rb
285
304
  - lib/wukong/streamer.rb
286
305
  - lib/wukong/streamer/accumulating_reducer.rb
287
306
  - lib/wukong/streamer/base.rb
@@ -348,27 +367,28 @@ test_files:
348
367
  - spec/spec_helper.rb
349
368
  - spec/wukong/encoding_spec.rb
350
369
  - spec/wukong/script_spec.rb
370
+ - examples/binning_percentile_estimator.rb
371
+ - examples/contrib/jeans/normalize.rb
372
+ - examples/contrib/jeans/sizes.rb
373
+ - examples/corpus/words_to_bigrams.rb
374
+ - examples/count_keys.rb
375
+ - examples/count_keys_at_mapper.rb
376
+ - examples/keystore/cassandra_batch_test.rb
377
+ - examples/keystore/conditional_outputter_example.rb
378
+ - examples/network_graph/adjacency_list.rb
379
+ - examples/network_graph/breadth_first_search.rb
380
+ - examples/network_graph/gen_2paths.rb
381
+ - examples/network_graph/gen_multi_edge.rb
382
+ - examples/network_graph/gen_symmetric_links.rb
351
383
  - examples/pagerank/pagerank.rb
352
384
  - examples/pagerank/pagerank_initialize.rb
385
+ - examples/rank_and_bin.rb
353
386
  - examples/sample_records.rb
354
387
  - examples/server_logs/apache_log_parser.rb
355
388
  - examples/server_logs/breadcrumbs.rb
356
389
  - examples/server_logs/user_agent.rb
357
- - examples/corpus/words_to_bigrams.rb
358
- - examples/count_keys.rb
359
- - examples/rank_and_bin.rb
360
- - examples/binning_percentile_estimator.rb
361
390
  - examples/size.rb
362
- - examples/network_graph/breadth_first_search.rb
363
- - examples/network_graph/gen_symmetric_links.rb
364
- - examples/network_graph/gen_multi_edge.rb
365
- - examples/network_graph/adjacency_list.rb
366
- - examples/network_graph/gen_2paths.rb
367
- - examples/keystore/cassandra_batch_test.rb
368
- - examples/keystore/conditional_outputter_example.rb
369
391
  - examples/stats/avg_value_frequency.rb
370
- - examples/contrib/jeans/sizes.rb
371
- - examples/contrib/jeans/normalize.rb
372
- - examples/word_count.rb
392
+ - examples/store/chunked_store_example.rb
373
393
  - examples/stupidly_simple_filter.rb
374
- - examples/count_keys_at_mapper.rb
394
+ - examples/word_count.rb