wukong 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/README.textile +0 -1
  2. data/TODO.textile +6 -0
  3. data/examples/corpus/dbpedia_abstract_to_sentences.rb +1 -0
  4. data/examples/corpus/sentence_bigrams.rb +53 -0
  5. data/examples/corpus/sentence_coocurrence.rb +1 -5
  6. data/examples/corpus/words_to_bigrams.rb +2 -1
  7. data/examples/ignore_me/counting.rb +1 -2
  8. data/examples/network_graph/adjacency_list.rb +1 -1
  9. data/examples/network_graph/breadth_first_search.rb +1 -1
  10. data/examples/network_graph/gen_2paths.rb +2 -2
  11. data/examples/network_graph/gen_multi_edge.rb +0 -1
  12. data/examples/network_graph/gen_symmetric_links.rb +1 -0
  13. data/examples/pagerank/pagerank.rb +5 -21
  14. data/examples/pagerank/pagerank_initialize.rb +1 -1
  15. data/examples/server_logs/apache_log_parser.rb +8 -48
  16. data/examples/server_logs/logline.rb +37 -13
  17. data/examples/server_logs/nook.rb +48 -0
  18. data/examples/server_logs/nook/faraday_dummy_adapter.rb +94 -0
  19. data/examples/simple_word_count.rb +82 -0
  20. data/examples/size.rb +1 -1
  21. data/examples/stats/binning_percentile_estimator.rb +1 -1
  22. data/examples/stats/rank_and_bin.rb +1 -1
  23. data/examples/stupidly_simple_filter.rb +1 -1
  24. data/lib/wukong.rb +1 -1
  25. data/lib/wukong/extensions.rb +2 -2
  26. data/lib/wukong/extensions/blank.rb +6 -6
  27. data/lib/wukong/extensions/hash.rb +9 -9
  28. data/lib/wukong/extensions/hash_like.rb +2 -2
  29. data/lib/wukong/extensions/symbol.rb +1 -1
  30. data/lib/wukong/logger.rb +1 -1
  31. data/lib/wukong/periodic_monitor.rb +2 -2
  32. data/lib/wukong/script.rb +18 -8
  33. data/lib/wukong/script/emr_command.rb +6 -4
  34. data/lib/wukong/script/hadoop_command.rb +9 -4
  35. data/lib/wukong/script/local_command.rb +7 -1
  36. data/lib/wukong/streamer/base.rb +6 -2
  37. data/wukong.gemspec +11 -3
  38. metadata +11 -34
@@ -19,7 +19,6 @@ The **main documentation** lives on the "Wukong Pages.":http://mrflip.github.com
19
19
  * Wukong is licensed under the "Apache License":http://mrflip.github.com/wukong/LICENSE.html (same as Hadoop)
20
20
  * "More info":http://mrflip.github.com/wukong/moreinfo.html
21
21
 
22
-
23
22
  h2. Help!
24
23
 
25
24
  Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
@@ -3,3 +3,9 @@
3
3
  ** We should be able to specify comma *or* space separated paths; the last
4
4
  space-separated path in Settings.rest becomes the output file, the others are
5
5
  used as the input_file list.
6
+
7
+ at_exit do
8
+ if $!.nil? && $0 == Goliath::Application.app_file
9
+ Application.run!
10
+ end
11
+ end
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  require 'wukong/script'
3
4
 
4
5
  #
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)
3
+ require 'rubygems'
4
+ require 'wukong/script'
5
+ require 'bucket_counter'
6
+
7
+ #
8
+ # Coocurrence counts
9
+ #
10
+
11
+ #
12
+ # Input is a list of document-idx-sentences, each field is tab-separated
13
+ # title idx word_a word_b word_c ...
14
+ #
15
+ # This emits each co-occurring pair exactly once; in the case of a three-word
16
+ # sentence the output would be
17
+ #
18
+ # word_a word_b
19
+ # word_a word_c
20
+ # word_b word_c
21
+ #
22
+ class SentenceBigrams < Wukong::Streamer::RecordStreamer
23
+ def process title, idx, *words
24
+ words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
25
+ yield [word_a, word_b]
26
+ end
27
+ end
28
+ end
29
+
30
+ #
31
+ # Combine multiple bucket counts into a single one
32
+ #
33
+ class CombineBuckets < Wukong::Streamer::AccumulatingReducer
34
+ def get_key *fields
35
+ fields[0..1]
36
+ end
37
+ def start! *args
38
+ @total = 0
39
+ end
40
+ def accumulate *fields
41
+ @total += 1
42
+ end
43
+ def finalize
44
+ yield [@total, key].flatten
45
+ end
46
+ end
47
+
48
+ Wukong.run(
49
+ SentenceBigrams,
50
+ CombineBuckets,
51
+ :io_sort_record_percent => 0.3,
52
+ :io_sort_mb => 300
53
+ )
@@ -26,11 +26,7 @@ class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
26
26
  end
27
27
 
28
28
  def process title, idx, *words
29
- words.each_with_index do |word_a, idx|
30
- words[(idx+1) .. -1].each do |word_b|
31
- @bucket << [word_a, word_b]
32
- end
33
- end
29
+ @bucket << words[0..-2].zip(words[1..-1])
34
30
  dump_bucket if @bucket.full?
35
31
  end
36
32
 
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- require 'wukong'
2
+ require 'rubygems'
3
+ require 'wukong/script'
3
4
 
4
5
  #
5
6
  # Bigram counts
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
-
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
5
4
 
6
5
  require 'bloomfilter-rb'
7
6
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  #
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  #
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
4
 
5
5
  class Edge < Struct.new(:src, :dest)
6
6
  end
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'rubygems'
3
- $: << File.dirname(__FILE__)+'/../../lib'
4
3
  require 'wukong'
5
4
 
6
5
  #
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  $: << File.dirname(__FILE__)+'/../../lib'
3
4
  require 'wukong'
4
5
 
@@ -1,11 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
- #
6
- #
7
- #
8
-
9
5
  module PageRank
10
6
  #
11
7
  # Damping factor (prob. of a 'random' jump)
@@ -13,16 +9,12 @@ module PageRank
13
9
  #
14
10
  DAMPING_FACTOR = 0.85
15
11
 
16
- #
17
12
  # Each user's line looks like
18
- #
19
13
  # user_a pagerank id1,id2,...,idN
20
- #
21
14
  # we need to disperse this user's pagerank to each of id1..idN, and
22
15
  # rendezvous the list of outbound links at user_a's reducer as well.
23
- #
24
16
  module Iterating
25
- class Mapper < Wukong::Streamer::Base
17
+ class PagerankMapper < Wukong::Streamer::Base
26
18
  #
27
19
  # Send pagerank to each page, and send the dests list back to self
28
20
  #
@@ -34,9 +26,7 @@ module PageRank
34
26
  yield_own_dest_list src, dests_str, &block
35
27
  end
36
28
 
37
- #
38
29
  # Take the source node's pagerank and distribute it among all the out-nodes
39
- #
40
30
  def yield_pagerank_shares src, pagerank, dests
41
31
  pagerank_share = pagerank.to_f / dests.length
42
32
  dests.each do |dest|
@@ -44,15 +34,13 @@ module PageRank
44
34
  end
45
35
  end
46
36
 
47
- #
48
37
  # Dispatch this user's out-node list to rendezvous with itself.
49
- #
50
38
  def yield_own_dest_list src, dests_str
51
39
  yield [src, 'd', dests_str]
52
40
  end
53
41
  end
54
42
 
55
- class Reducer < Wukong::Streamer::AccumulatingReducer
43
+ class PagerankReducer < Wukong::Streamer::AccumulatingReducer
56
44
  attr_accessor :node_id, :pagerank, :dests_str
57
45
  # Begin reduction with 0 accumulated pagerank and no dests as yet
58
46
  def start! node_id, *args
@@ -78,11 +66,7 @@ module PageRank
78
66
  end
79
67
  end
80
68
 
81
- class Script < Wukong::Script
82
- def default_options
83
- super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
84
- end
85
- end
86
- Script.new(Mapper, Reducer).run
69
+ Wukong.run(PagerankMapper, PagerankReducer,
70
+ :extra_args => ' -jobconf io.sort.record.percent=0.25 ')
87
71
  end
88
72
  end
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/list_reducer'
5
5
 
@@ -1,58 +1,18 @@
1
- #!/usr/bin/env ruby
1
+ #!/usr/bin/env ruby -E ASCII-8BIT
2
2
  require 'rubygems'
3
3
  require 'wukong/script'
4
+ $: << File.dirname(__FILE__)
5
+ require 'logline'
4
6
 
5
- module ApacheLogParser
6
- class Mapper < Wukong::Streamer::LineStreamer
7
-
8
- #
9
- # Regular expression to parse an apache log line.
10
- #
11
- # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
12
- #
13
- LOG_RE = Regexp.compile(%r{\A
14
- (\S+) # ip 83.240.154.3
15
- \s(\S+) # j1 -
16
- \s(\S+) # j2 -
17
- \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
18
- :(\d+):(\d+):(\d+) # time part :20:37:11
19
- \s(\+.*)\] # timezone +0000]
20
- \s\"(?:(\S+) # http_method "GET
21
- \s(\S+) # path /faq
22
- \s(\S+)|-)" # protocol HTTP/1.1"
23
- \s(\d+) # response_code 200
24
- \s(\d+) # duration 569
25
- \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
26
- \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
27
- \z}x)
28
- MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
29
-
30
- # Use the regex to break line into fields
31
- # Emit each record as flat line
32
- def process line
33
- line.chomp
34
- m = LOG_RE.match(line)
35
- if m
36
- (ip, j1, j2,
37
- ts_day, ts_mo, ts_year,
38
- ts_hour, ts_min, ts_sec, tz,
39
- http_method, path, protocol,
40
- response_code, duration,
41
- referer, ua, *cruft) = m.captures
42
- date = [ts_year, MONTHS[ts_mo], ts_day].join("")
43
- time = [ts_hour, ts_min, ts_sec].join("")
44
- yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
45
- else
46
- yield [:unparseable, line]
47
- end
48
- end
7
+ class ApacheLogParser < Wukong::Streamer::LineStreamer
49
8
 
9
+ # create a Logline object from each record and serialize it flat to disk
10
+ def process line
11
+ yield Logline.parse(line)
50
12
  end
51
13
  end
52
14
 
53
- Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
54
-
55
- # 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
15
+ Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
56
16
 
57
17
 
58
18
 
@@ -1,9 +1,6 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
1
  class Logline < Struct.new(
6
- :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
2
+ :ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
3
+ # 1 2 3 4 5 6 7 8 9 10 11
7
4
 
8
5
  def page_type
9
6
  case
@@ -14,14 +11,41 @@ class Logline < Struct.new(
14
11
  end
15
12
  end
16
13
 
17
- def is_page?
18
- page_type == :page
19
- end
20
- end
14
+ #
15
+ # Regular expression to parse an apache log line.
16
+ #
17
+ # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
18
+ #
19
+ LOG_RE = Regexp.compile(%r{\A
20
+ (\S+) # ip 83.240.154.3
21
+ \s(\S+) # j1 -
22
+ \s(\S+) # j2 -
23
+ \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
24
+ :(\d+):(\d+):(\d+) # time part :20:37:11
25
+ \s(\+.*)\] # timezone +0000]
26
+ \s\"(?:(\S+) # http_method "GET
27
+ \s(\S+) # path /faq
28
+ \s(\S+)|-)" # protocol HTTP/1.1"
29
+ \s(\d+) # response_code 200
30
+ \s(\d+) # size 569
31
+ \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
32
+ \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
33
+ \z}x)
34
+ MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
21
35
 
22
- class PageFilter < Wukong::Streamer::StructStreamer
23
- def process visit, *args
24
- yield visit.ua if visit.
36
+ # Use the regex to break line into fields
37
+ # Emit each record as flat line
38
+ def self.parse line
39
+ m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
40
+ (ip, j1, j2,
41
+ ts_day, ts_mo, ts_year,
42
+ ts_hour, ts_min, ts_sec, tz,
43
+ http_method, path, protocol,
44
+ response_code, size,
45
+ referer, ua, *cruft) = m.captures
46
+ dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
47
+ tm = [ts_hour, ts_min, ts_sec].join("")
48
+ self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
25
49
  end
50
+
26
51
  end
27
- Wukong.run(PageFilter)
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby -E BINARY
2
+ require 'rubygems'
3
+ require 'faraday'
4
+ require 'wukong/script'
5
+ require 'json'
6
+ $: << File.dirname(__FILE__)
7
+ require 'apache_log_parser'
8
+ require 'nook/faraday_dummy_adapter'
9
+
10
+ Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
11
+ Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
12
+
13
+ #
14
+ # A Nook consumes its input stream and, for each input, generates an HTTP
15
+ # request against a remote host. Please use it for good and never for evil.
16
+ #
17
+ # You can use it from your command line:
18
+ # zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
19
+ #
20
+ #
21
+ class NookMapper < ApacheLogParser
22
+ # parse each record into a Logline, replay its request through the fetcher, and emit timing and response details
23
+ def process line
24
+ super(line) do |logline|
25
+ start = Time.now
26
+ resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
27
+ yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
28
+ end
29
+ end
30
+
31
+ def track record
32
+ monitor.periodically do |m|
33
+ m.progress
34
+ end
35
+ end
36
+
37
+ # a mock fetcher with a uniformly distributed variable delay
38
+ def fetcher
39
+ @fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
40
+ f.use Faraday::Adapter::Dummy do |dummy|
41
+ dummy.delay = Proc.new{|env| 0.05 } # 0.2 * rand()
42
+ # dummy.body = Proc.new{|env| env[:url] }
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+ Wukong.run( NookMapper, nil, :sort_fields => 7 )
@@ -0,0 +1,94 @@
1
+
2
+ module Faraday
3
+ class Adapter
4
+
5
+ # test = Faraday::Connection.new do |f|
6
+ # f.use Faraday::Adapter::Dummy do |dummy|
7
+ # dummy.status 404
8
+ # dummy.delay 1
9
+ # end
10
+ # end
11
+ #
12
+ # # this will delay 0.2s, returning 404 with
13
+ # resp = test.get("/your/mom", :dummy_delay => 0.2)
14
+ # resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
15
+ #
16
+ # More example:
17
+ #
18
+ # test = Faraday::Connection.new do |f|
19
+ # f.use Faraday::Adapter::Dummy, :status => 503
20
+ # end
21
+ #
22
+ # test = Faraday::Connection.new do |f|
23
+ # f.use Faraday::Adapter::Dummy do |dummy|
24
+ # dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
25
+ # end
26
+ # end
27
+ #
28
+ class Dummy < Middleware
29
+ include Addressable
30
+ attr_reader :config
31
+ def self.loaded?() false end
32
+
33
+ # gets value from environment if set, configured instance variable otherwise
34
+ def value_for env, key
35
+ val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
36
+ if val.respond_to?(:call)
37
+ val = val.call(env)
38
+ end
39
+ val
40
+ end
41
+
42
+ # With an optional delay, constructs a [status, headers, response] based on the first of:
43
+ # * request header field (Dummy-Status, Dummy-Headers, Dummy-Body)
44
+ # * adapter's configuration:
45
+ # * Unless one of the above is set, body will return a json string taken from the request hash
46
+ #
47
+ def call(env)
48
+ status = value_for(env, :status)
49
+ headers = value_for(env, :headers)
50
+ headers = JSON.load(headers) if headers.is_a? String
51
+ body = value_for(env, :body) ||
52
+ env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
53
+ delay = value_for(env, :delay).to_f
54
+ sleep delay if delay > 0
55
+ headers[:dummy_delay] = delay
56
+ env.update(
57
+ :status => status,
58
+ :response_headers => headers,
59
+ :body => body)
60
+ @app.call(env)
61
+ end
62
+
63
+ class Configurator < Struct.new(:status, :headers, :delay, :body)
64
+ def status(val=nil) self.status = val if val ; super() end
65
+ def headers(val=nil) self.headers = val if val ; super() end
66
+ def body(val=nil) self.body = val if val ; super() end
67
+ def delay(val=nil) self.delay = val if val ; super() end
68
+ def self.from_hash hsh
69
+ new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
70
+ end
71
+ end
72
+
73
+ def initialize(app, defaults={}, &block)
74
+ super(app)
75
+ @config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
76
+ configure(&block) if block
77
+ end
78
+
79
+ def configure
80
+ yield config
81
+ end
82
+
83
+ # same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
84
+ def header_hash_key(str)
85
+ str.to_s.split('_').each{|w| w.capitalize! }.join('-')
86
+ end
87
+
88
+ def create_multipart(env, params, boundary = nil)
89
+ stream = super
90
+ stream.read
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
+ module WordCount
6
+ class Mapper < Wukong::Streamer::LineStreamer
7
+ #
8
+ # Emit each word in each line.
9
+ #
10
+ def process line
11
+ tokenize(line).each{|word| yield [word, 1] }
12
+ end
13
+
14
+ #
15
+ # Split a string into its constituent words.
16
+ #
17
+ # This is pretty simpleminded:
18
+ # * downcase the word
19
+ # * Split at any non-alphanumeric boundary, including '_'
20
+ # * However, preserve the special cases of 's, 'd or 't at the end of a
21
+ # word.
22
+ #
23
+ # tokenize("Ability is a poor man's wealth #johnwoodenquote")
24
+ # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
25
+ #
26
+ def tokenize str
27
+ return [] if str.blank?
28
+ str = str.downcase;
29
+ # kill off all punctuation except [stuff]'s, [stuff]'d or [stuff]'t
30
+ # this includes hyphens (words are split)
31
+ str = str.
32
+ gsub(/[^a-zA-Z0-9\']+/, ' ').
33
+ gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
34
+ # Busticate at whitespace
35
+ words = str.split(/\s+/)
36
+ words.reject!{|w| w.blank? }
37
+ words
38
+ end
39
+ end
40
+
41
+ #
42
+ # A bit kinder to your memory manager: accumulate the sum record-by-record:
43
+ #
44
+ class Reducer2 < Wukong::Streamer::AccumulatingReducer
45
+
46
+ def start!(*args)
47
+ @key_count = 0
48
+ end
49
+
50
+ def accumulate(*args)
51
+ @key_count += 1
52
+ end
53
+
54
+ def finalize
55
+ yield [ key, @key_count ]
56
+ end
57
+ end
58
+
59
+ #
60
+ # You can stack up all the values in a list then sum them at once.
61
+ #
62
+ # This isn't good style, as it means the whole list is held in memory
63
+ #
64
+ class Reducer1 < Wukong::Streamer::ListReducer
65
+ def finalize
66
+ yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
67
+ end
68
+ end
69
+
70
+ #
71
+ # ... easiest of all, though: this is common enough that it's already included
72
+ #
73
+ require 'wukong/streamer/count_keys'
74
+ class Reducer3 < Wukong::Streamer::CountKeys
75
+ end
76
+ end
77
+
78
+ # Execute the script
79
+ Wukong.run(
80
+ WordCount::Mapper,
81
+ WordCount::Reducer2
82
+ )
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  module Size
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/count_keys'
5
5
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/rank_and_bin_reducer'
5
5
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  # Run as (local mode)
@@ -1,5 +1,5 @@
1
- require 'configliere'; Settings.use :define
2
1
  require 'wukong/extensions'
2
+ require 'configliere'; Settings.use :define
3
3
  require 'wukong/datatypes'
4
4
  require 'wukong/periodic_monitor'
5
5
  require 'wukong/logger'
@@ -2,8 +2,8 @@
2
2
  # These pull in the minimal functionality of the extlib|activesupport family of
3
3
  # gems.
4
4
  #
5
- require 'wukong/extensions/blank'
6
- require 'wukong/extensions/class'
5
+ require 'extlib/blank'
6
+ require 'extlib/class'
7
7
  require 'wukong/extensions/enumerable'
8
8
  require 'wukong/extensions/symbol'
9
9
  require 'wukong/extensions/hash'
@@ -15,7 +15,7 @@ class Object
15
15
  # @api public
16
16
  def blank?
17
17
  nil? || (respond_to?(:empty?) && empty?)
18
- end
18
+ end unless method_defined?(:blank?)
19
19
  end # class Object
20
20
 
21
21
  class Numeric
@@ -31,7 +31,7 @@ class Numeric
31
31
  # @api public
32
32
  def blank?
33
33
  false
34
- end
34
+ end unless method_defined?(:blank?)
35
35
  end # class Numeric
36
36
 
37
37
  class NilClass
@@ -45,7 +45,7 @@ class NilClass
45
45
  # @api public
46
46
  def blank?
47
47
  true
48
- end
48
+ end unless method_defined?(:blank?)
49
49
  end # class NilClass
50
50
 
51
51
  class TrueClass
@@ -59,7 +59,7 @@ class TrueClass
59
59
  # @api public
60
60
  def blank?
61
61
  false
62
- end
62
+ end unless method_defined?(:blank?)
63
63
  end # class TrueClass
64
64
 
65
65
  class FalseClass
@@ -73,7 +73,7 @@ class FalseClass
73
73
  # @api public
74
74
  def blank?
75
75
  true
76
- end
76
+ end unless method_defined?(:blank?)
77
77
  end # class FalseClass
78
78
 
79
79
  class String
@@ -89,5 +89,5 @@ class String
89
89
  # @api public
90
90
  def blank?
91
91
  strip.empty?
92
- end
92
+ end unless method_defined?(:blank?)
93
93
  end # class String
@@ -66,11 +66,11 @@ class Hash
66
66
  #
67
67
  def deep_merge hsh2
68
68
  merge hsh2, &Hash::DEEP_MERGER
69
- end
69
+ end unless method_defined?(:deep_merge)
70
70
 
71
71
  def deep_merge! hsh2
72
72
  merge! hsh2, &Hash::DEEP_MERGER
73
- end
73
+ end unless method_defined?(:deep_merge!)
74
74
 
75
75
  #
76
76
  # Treat hash as tree of hashes:
@@ -86,10 +86,10 @@ class Hash
86
86
  val = args.pop
87
87
  last_key = args.pop
88
88
  # dig down to last subtree (building out if necessary)
89
- hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
89
+ hsh = args.empty? ? self : args.inject(self){|h, k| h[k] ||= {} }
90
90
  # set leaf value
91
91
  hsh[last_key] = val
92
- end
92
+ end unless method_defined?(:deep_set)
93
93
 
94
94
  #
95
95
  # Treat hash as tree of hashes:
@@ -107,10 +107,10 @@ class Hash
107
107
  def deep_get *args
108
108
  last_key = args.pop
109
109
  # dig down to last subtree (building out if necessary)
110
- hsh = args.inject(self){|hsh, key| hsh[key] || {} }
110
+ hsh = args.inject(self){|h, k| h[k] || {} }
111
111
  # get leaf value
112
112
  hsh[last_key]
113
- end
113
+ end unless method_defined?(:deep_get)
114
114
 
115
115
 
116
116
  #
@@ -126,20 +126,20 @@ class Hash
126
126
  last_key = args.pop
127
127
  last_hsh = args.empty? ? self : (deep_get(*args)||{})
128
128
  last_hsh.delete(last_key)
129
- end
129
+ end unless method_defined?(:deep_delete)
130
130
 
131
131
  #
132
132
  # remove all key-value pairs where the value is nil
133
133
  #
134
134
  def compact
135
135
  reject{|key,val| val.nil? }
136
- end
136
+ end unless method_defined?(:compact)
137
137
  #
138
138
  # Replace the hash with its compacted self
139
139
  #
140
140
  def compact!
141
141
  replace(compact)
142
- end
142
+ end unless method_defined?(:compact!)
143
143
 
144
144
  #
145
145
  # remove all key-value pairs where the value is blank
@@ -64,7 +64,7 @@ module Wukong
64
64
  # Analagous to Hash#merge
65
65
  #
66
66
  def merge *args
67
- self.dup.merge! *args
67
+ self.dup.merge!(*args)
68
68
  end
69
69
  def merge! hsh, &block
70
70
  raise "can't handle block arg yet" if block
@@ -104,7 +104,7 @@ module Wukong
104
104
  #
105
105
  def from_hash(hsh, has_symbol_keys=false)
106
106
  extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
107
- self.new *hsh.values_of(*extract_keys)
107
+ self.new(*hsh.values_of(*extract_keys))
108
108
  end
109
109
  #
110
110
  # The last portion of the class in underscored form
@@ -7,5 +7,5 @@ class Symbol
7
7
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
8
8
  def to_proc
9
9
  Proc.new { |*args| args.shift.__send__(self, *args) }
10
- end
10
+ end unless method_defined?(:to_proc)
11
11
  end
@@ -13,7 +13,7 @@ module Wukong
13
13
  # I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
14
14
  #
15
15
  def self.logger
16
- return @logger if @logger
16
+ return @logger if defined?(@logger)
17
17
  require 'logger'
18
18
  @logger = Logger.new STDERR
19
19
  @logger.instance_eval do
@@ -28,9 +28,9 @@ class PeriodicMonitor
28
28
  if ready?
29
29
  @last_report = Time.now
30
30
  if block
31
- block.call(iter, *args)
31
+ emit block.call(self, *args)
32
32
  else
33
- self.emit progress(*args)
33
+ emit progress(*args)
34
34
  end
35
35
  end
36
36
  end
@@ -1,4 +1,5 @@
1
1
  require 'pathname'
2
+ require 'wukong/extensions'
2
3
  require 'configliere' ; Settings.use(:commandline, :env_var, :define)
3
4
  require 'wukong'
4
5
  require 'wukong/script/hadoop_command'
@@ -127,7 +128,7 @@ module Wukong
127
128
  def initialize mapper, reducer=nil, extra_options={}
128
129
  Settings.resolve!
129
130
  @options = Settings
130
- options.merge extra_options
131
+ options.merge! extra_options
131
132
  @mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
132
133
  @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
133
134
  @output_path = options.rest.pop
@@ -173,10 +174,14 @@ module Wukong
173
174
  # In hadoop mode, this is given to the hadoop streaming command.
174
175
  # In local mode, it's given to the system() call
175
176
  #
176
- def mapper_commandline
177
+ def mapper_commandline(run_option=:local)
177
178
  if mapper
178
- "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
179
- # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
179
+ case run_option
180
+ when :local then
181
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
182
+ when :hadoop then
183
+ "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
184
+ end
180
185
  else
181
186
  options[:map_command]
182
187
  end
@@ -187,10 +192,14 @@ module Wukong
187
192
  # In hadoop mode, this is given to the hadoop streaming command.
188
193
  # In local mode, it's given to the system() call
189
194
  #
190
- def reducer_commandline
195
+ def reducer_commandline(run_option=:local)
191
196
  if reducer
192
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
193
- # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
197
+ case run_option
198
+ when :local then
199
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
200
+ when :hadoop then
201
+ "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
202
+ end
194
203
  else
195
204
  options[:reduce_command]
196
205
  end
@@ -222,6 +231,7 @@ module Wukong
222
231
  else
223
232
  maybe_overwrite_output_paths! output_path
224
233
  $stdout.puts `#{command}`
234
+ raise "Streaming command failed!" unless $?.success?
225
235
  end
226
236
  end
227
237
 
@@ -242,7 +252,7 @@ module Wukong
242
252
  # the map/reducer phase scripts
243
253
  def non_wukong_params
244
254
  options.
245
- reject{|param, val| options.param_definitions[param][:wukong] }.
255
+ reject{|param, val| options.definition_of(param, :wukong) }.
246
256
  map{|param,val| "--#{param}=#{val}" }.
247
257
  join(" ")
248
258
  end
@@ -13,8 +13,8 @@ Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for El
13
13
  Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
14
14
  Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
15
15
  #
16
- Settings.define :keypair_file, :description => 'AWS Key pair file', :type => :filename
17
- Settings.define :keypair, :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
16
+ Settings.define :key_pair_file, :description => 'AWS Key pair file', :type => :filename
17
+ Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
18
18
  Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
19
19
  Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
20
20
  Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
@@ -45,7 +45,9 @@ module Wukong
45
45
  end
46
46
 
47
47
  def hadoop_options_for_emr_runner
48
- [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
48
+ [hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
49
+ hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
50
+ end.flatten
49
51
  end
50
52
 
51
53
  def execute_emr_runner
@@ -57,7 +59,7 @@ module Wukong
57
59
  command_args << "--create --name=#{job_name}"
58
60
  command_args << Settings.dashed_flag_for(:alive)
59
61
  command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
60
- command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
62
+ command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
61
63
  command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
62
64
  end
63
65
  command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
@@ -25,6 +25,9 @@ module Wukong
25
25
  Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
26
26
  Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
27
27
  Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
28
+ Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
29
+ Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
30
+ Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
28
31
  Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
29
32
  Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
30
33
  Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
@@ -40,8 +43,8 @@ module Wukong
40
43
  # if not, the resulting nil will be elided later
41
44
  def jobconf option
42
45
  if options[option]
43
- # "-jobconf %s=%s" % [options.description_for(option), options[option]]
44
- "-D %s=%s" % [options.description_for(option), options[option]]
46
+ # "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
47
+ "-D %s=%s" % [options.definition_of(option, :description), options[option]]
45
48
  end
46
49
  end
47
50
 
@@ -64,8 +67,8 @@ module Wukong
64
67
  hadoop_jobconf_options,
65
68
  "-D mapred.job.name='#{job_name}'",
66
69
  hadoop_other_args,
67
- "-mapper '#{mapper_commandline}'",
68
- "-reducer '#{reducer_commandline}'",
70
+ "-mapper '#{mapper_commandline(:hadoop)}'",
71
+ "-reducer '#{reducer_commandline(:hadoop)}'",
69
72
  "-input '#{input_paths}'",
70
73
  "-output '#{output_path}'",
71
74
  "-file '#{this_script_filename}'",
@@ -100,6 +103,8 @@ module Wukong
100
103
  :partition_fields, :sort_fields,
101
104
  :reduce_tasks, :respect_exit_status,
102
105
  :reuse_jvms, :timeout,
106
+ :max_tracker_failures, :max_map_attempts,
107
+ :max_reduce_attempts
103
108
  ].map{|opt| jobconf(opt)}
104
109
  jobconf_options.flatten.compact
105
110
  end
@@ -25,7 +25,13 @@ module Wukong
25
25
  @input_paths = input_paths.map(&:strip).join(' ')
26
26
  cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
27
27
  cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
28
- %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
28
+
29
+ if (reducer || options[:reduce_command])
30
+ %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
31
+ else
32
+ %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
33
+ end
34
+
29
35
  end
30
36
 
31
37
  end
@@ -28,11 +28,15 @@ module Wukong
28
28
  process(*record) do |output_record|
29
29
  emit output_record
30
30
  end
31
- monitor.periodically(record.to_s[0..1000])
31
+ track(record)
32
32
  end
33
33
  after_stream
34
34
  end
35
35
 
36
+ def track record
37
+ monitor.periodically(record.to_s[0..1000])
38
+ end
39
+
36
40
  def each_record &block
37
41
  $stdin.each(&block)
38
42
  end
@@ -103,7 +107,7 @@ module Wukong
103
107
  # Creates a new object of this class and injects the given block
104
108
  # as the process method
105
109
  def self.mapper *args, &block
106
- self.new.mapper *args, &block
110
+ self.new.mapper(*args, &block)
107
111
  end
108
112
 
109
113
  # Delegates back to Wukong to run this instance as a mapper
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "2.0.0"
8
+ s.version = "2.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2011-01-29}
12
+ s.date = %q{2011-07-01}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -139,6 +139,7 @@ Gem::Specification.new do |s|
139
139
  "examples/contrib/jeans/sizes.rb",
140
140
  "examples/corpus/bucket_counter.rb",
141
141
  "examples/corpus/dbpedia_abstract_to_sentences.rb",
142
+ "examples/corpus/sentence_bigrams.rb",
142
143
  "examples/corpus/sentence_coocurrence.rb",
143
144
  "examples/corpus/words_to_bigrams.rb",
144
145
  "examples/emr/README.textile",
@@ -162,7 +163,10 @@ Gem::Specification.new do |s|
162
163
  "examples/server_logs/apache_log_parser.rb",
163
164
  "examples/server_logs/breadcrumbs.rb",
164
165
  "examples/server_logs/logline.rb",
166
+ "examples/server_logs/nook.rb",
167
+ "examples/server_logs/nook/faraday_dummy_adapter.rb",
165
168
  "examples/server_logs/user_agent.rb",
169
+ "examples/simple_word_count.rb",
166
170
  "examples/size.rb",
167
171
  "examples/stats/avg_value_frequency.rb",
168
172
  "examples/stats/binning_percentile_estimator.rb",
@@ -252,13 +256,14 @@ Gem::Specification.new do |s|
252
256
  ]
253
257
  s.homepage = %q{http://mrflip.github.com/wukong}
254
258
  s.require_paths = ["lib"]
255
- s.rubygems_version = %q{1.4.2}
259
+ s.rubygems_version = %q{1.5.0}
256
260
  s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
257
261
  s.test_files = [
258
262
  "examples/contrib/jeans/normalize.rb",
259
263
  "examples/contrib/jeans/sizes.rb",
260
264
  "examples/corpus/bucket_counter.rb",
261
265
  "examples/corpus/dbpedia_abstract_to_sentences.rb",
266
+ "examples/corpus/sentence_bigrams.rb",
262
267
  "examples/corpus/sentence_coocurrence.rb",
263
268
  "examples/corpus/words_to_bigrams.rb",
264
269
  "examples/emr/elastic_mapreduce_example.rb",
@@ -275,7 +280,10 @@ Gem::Specification.new do |s|
275
280
  "examples/server_logs/apache_log_parser.rb",
276
281
  "examples/server_logs/breadcrumbs.rb",
277
282
  "examples/server_logs/logline.rb",
283
+ "examples/server_logs/nook.rb",
284
+ "examples/server_logs/nook/faraday_dummy_adapter.rb",
278
285
  "examples/server_logs/user_agent.rb",
286
+ "examples/simple_word_count.rb",
279
287
  "examples/size.rb",
280
288
  "examples/stats/avg_value_frequency.rb",
281
289
  "examples/stats/binning_percentile_estimator.rb",
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
5
4
  prerelease:
6
- segments:
7
- - 2
8
- - 0
9
- - 0
10
- version: 2.0.0
5
+ version: 2.0.1
11
6
  platform: ruby
12
7
  authors:
13
8
  - Philip (flip) Kromer
@@ -15,7 +10,7 @@ autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2011-01-29 00:00:00 -06:00
13
+ date: 2011-07-01 00:00:00 -05:00
19
14
  default_executable:
20
15
  dependencies:
21
16
  - !ruby/object:Gem::Dependency
@@ -26,11 +21,6 @@ dependencies:
26
21
  requirements:
27
22
  - - ">="
28
23
  - !ruby/object:Gem::Version
29
- hash: 13
30
- segments:
31
- - 1
32
- - 2
33
- - 9
34
24
  version: 1.2.9
35
25
  type: :development
36
26
  version_requirements: *id001
@@ -42,9 +32,6 @@ dependencies:
42
32
  requirements:
43
33
  - - ">="
44
34
  - !ruby/object:Gem::Version
45
- hash: 3
46
- segments:
47
- - 0
48
35
  version: "0"
49
36
  type: :development
50
37
  version_requirements: *id002
@@ -56,9 +43,6 @@ dependencies:
56
43
  requirements:
57
44
  - - ">="
58
45
  - !ruby/object:Gem::Version
59
- hash: 3
60
- segments:
61
- - 0
62
46
  version: "0"
63
47
  type: :runtime
64
48
  version_requirements: *id003
@@ -70,9 +54,6 @@ dependencies:
70
54
  requirements:
71
55
  - - ">="
72
56
  - !ruby/object:Gem::Version
73
- hash: 3
74
- segments:
75
- - 0
76
57
  version: "0"
77
58
  type: :runtime
78
59
  version_requirements: *id004
@@ -84,9 +65,6 @@ dependencies:
84
65
  requirements:
85
66
  - - ">="
86
67
  - !ruby/object:Gem::Version
87
- hash: 3
88
- segments:
89
- - 0
90
68
  version: "0"
91
69
  type: :runtime
92
70
  version_requirements: *id005
@@ -98,9 +76,6 @@ dependencies:
98
76
  requirements:
99
77
  - - ">="
100
78
  - !ruby/object:Gem::Version
101
- hash: 3
102
- segments:
103
- - 0
104
79
  version: "0"
105
80
  type: :runtime
106
81
  version_requirements: *id006
@@ -233,6 +208,7 @@ files:
233
208
  - examples/contrib/jeans/sizes.rb
234
209
  - examples/corpus/bucket_counter.rb
235
210
  - examples/corpus/dbpedia_abstract_to_sentences.rb
211
+ - examples/corpus/sentence_bigrams.rb
236
212
  - examples/corpus/sentence_coocurrence.rb
237
213
  - examples/corpus/words_to_bigrams.rb
238
214
  - examples/emr/README.textile
@@ -256,7 +232,10 @@ files:
256
232
  - examples/server_logs/apache_log_parser.rb
257
233
  - examples/server_logs/breadcrumbs.rb
258
234
  - examples/server_logs/logline.rb
235
+ - examples/server_logs/nook.rb
236
+ - examples/server_logs/nook/faraday_dummy_adapter.rb
259
237
  - examples/server_logs/user_agent.rb
238
+ - examples/simple_word_count.rb
260
239
  - examples/size.rb
261
240
  - examples/stats/avg_value_frequency.rb
262
241
  - examples/stats/binning_percentile_estimator.rb
@@ -357,23 +336,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
357
336
  requirements:
358
337
  - - ">="
359
338
  - !ruby/object:Gem::Version
360
- hash: 3
361
- segments:
362
- - 0
363
339
  version: "0"
364
340
  required_rubygems_version: !ruby/object:Gem::Requirement
365
341
  none: false
366
342
  requirements:
367
343
  - - ">="
368
344
  - !ruby/object:Gem::Version
369
- hash: 3
370
- segments:
371
- - 0
372
345
  version: "0"
373
346
  requirements: []
374
347
 
375
348
  rubyforge_project:
376
- rubygems_version: 1.4.2
349
+ rubygems_version: 1.5.0
377
350
  signing_key:
378
351
  specification_version: 3
379
352
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
@@ -382,6 +355,7 @@ test_files:
382
355
  - examples/contrib/jeans/sizes.rb
383
356
  - examples/corpus/bucket_counter.rb
384
357
  - examples/corpus/dbpedia_abstract_to_sentences.rb
358
+ - examples/corpus/sentence_bigrams.rb
385
359
  - examples/corpus/sentence_coocurrence.rb
386
360
  - examples/corpus/words_to_bigrams.rb
387
361
  - examples/emr/elastic_mapreduce_example.rb
@@ -398,7 +372,10 @@ test_files:
398
372
  - examples/server_logs/apache_log_parser.rb
399
373
  - examples/server_logs/breadcrumbs.rb
400
374
  - examples/server_logs/logline.rb
375
+ - examples/server_logs/nook.rb
376
+ - examples/server_logs/nook/faraday_dummy_adapter.rb
401
377
  - examples/server_logs/user_agent.rb
378
+ - examples/simple_word_count.rb
402
379
  - examples/size.rb
403
380
  - examples/stats/avg_value_frequency.rb
404
381
  - examples/stats/binning_percentile_estimator.rb