wukong 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/README.textile +0 -1
  2. data/TODO.textile +6 -0
  3. data/examples/corpus/dbpedia_abstract_to_sentences.rb +1 -0
  4. data/examples/corpus/sentence_bigrams.rb +53 -0
  5. data/examples/corpus/sentence_coocurrence.rb +1 -5
  6. data/examples/corpus/words_to_bigrams.rb +2 -1
  7. data/examples/ignore_me/counting.rb +1 -2
  8. data/examples/network_graph/adjacency_list.rb +1 -1
  9. data/examples/network_graph/breadth_first_search.rb +1 -1
  10. data/examples/network_graph/gen_2paths.rb +2 -2
  11. data/examples/network_graph/gen_multi_edge.rb +0 -1
  12. data/examples/network_graph/gen_symmetric_links.rb +1 -0
  13. data/examples/pagerank/pagerank.rb +5 -21
  14. data/examples/pagerank/pagerank_initialize.rb +1 -1
  15. data/examples/server_logs/apache_log_parser.rb +8 -48
  16. data/examples/server_logs/logline.rb +37 -13
  17. data/examples/server_logs/nook.rb +48 -0
  18. data/examples/server_logs/nook/faraday_dummy_adapter.rb +94 -0
  19. data/examples/simple_word_count.rb +82 -0
  20. data/examples/size.rb +1 -1
  21. data/examples/stats/binning_percentile_estimator.rb +1 -1
  22. data/examples/stats/rank_and_bin.rb +1 -1
  23. data/examples/stupidly_simple_filter.rb +1 -1
  24. data/lib/wukong.rb +1 -1
  25. data/lib/wukong/extensions.rb +2 -2
  26. data/lib/wukong/extensions/blank.rb +6 -6
  27. data/lib/wukong/extensions/hash.rb +9 -9
  28. data/lib/wukong/extensions/hash_like.rb +2 -2
  29. data/lib/wukong/extensions/symbol.rb +1 -1
  30. data/lib/wukong/logger.rb +1 -1
  31. data/lib/wukong/periodic_monitor.rb +2 -2
  32. data/lib/wukong/script.rb +18 -8
  33. data/lib/wukong/script/emr_command.rb +6 -4
  34. data/lib/wukong/script/hadoop_command.rb +9 -4
  35. data/lib/wukong/script/local_command.rb +7 -1
  36. data/lib/wukong/streamer/base.rb +6 -2
  37. data/wukong.gemspec +11 -3
  38. metadata +11 -34
@@ -19,7 +19,6 @@ The **main documentation** lives on the "Wukong Pages.":http://mrflip.github.com
19
19
  * Wukong is licensed under the "Apache License":http://mrflip.github.com/wukong/LICENSE.html (same as Hadoop)
20
20
  * "More info":http://mrflip.github.com/wukong/moreinfo.html
21
21
 
22
-
23
22
  h2. Help!
24
23
 
25
24
  Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
@@ -3,3 +3,9 @@
3
3
  ** We should be able to specify comma *or* space separated paths; the last
4
4
  space-separated path in Settings.rest becomes the output file, the others are
5
5
  used as the input_file list.
6
+
7
+ at_exit do
8
+ if $!.nil? && $0 == Goliath::Application.app_file
9
+ Application.run!
10
+ end
11
+ end
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  require 'wukong/script'
3
4
 
4
5
  #
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)
3
+ require 'rubygems'
4
+ require 'wukong/script'
5
+ require 'bucket_counter'
6
+
7
+ #
8
+ # Coocurrence counts
9
+ #
10
+
11
+ #
12
+ # Input is a list of document-idx-sentences, each field is tab-separated
13
+ # title idx word_a word_b word_c ...
14
+ #
15
+ # This emits each co-occurring pair exactly once; in the case of a three-word
16
+ # sentence the output would be
17
+ #
18
+ # word_a word_b
19
+ # word_a word_c
20
+ # word_b word_c
21
+ #
22
+ class SentenceBigrams < Wukong::Streamer::RecordStreamer
23
+ def process title, idx, *words
24
+ words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
25
+ yield [word_a, word_b]
26
+ end
27
+ end
28
+ end
29
+
30
+ #
31
+ # Combine multiple bucket counts into a single one
32
+ #
33
+ class CombineBuckets < Wukong::Streamer::AccumulatingReducer
34
+ def get_key *fields
35
+ fields[0..1]
36
+ end
37
+ def start! *args
38
+ @total = 0
39
+ end
40
+ def accumulate *fields
41
+ @total += 1
42
+ end
43
+ def finalize
44
+ yield [@total, key].flatten
45
+ end
46
+ end
47
+
48
+ Wukong.run(
49
+ SentenceBigrams,
50
+ CombineBuckets,
51
+ :io_sort_record_percent => 0.3,
52
+ :io_sort_mb => 300
53
+ )
@@ -26,11 +26,7 @@ class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
26
26
  end
27
27
 
28
28
  def process title, idx, *words
29
- words.each_with_index do |word_a, idx|
30
- words[(idx+1) .. -1].each do |word_b|
31
- @bucket << [word_a, word_b]
32
- end
33
- end
29
+ @bucket << words[0..-2].zip(words[1..-1])
34
30
  dump_bucket if @bucket.full?
35
31
  end
36
32
 
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- require 'wukong'
2
+ require 'rubygems'
3
+ require 'wukong/script'
3
4
 
4
5
  #
5
6
  # Bigram counts
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
-
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
5
4
 
6
5
  require 'bloomfilter-rb'
7
6
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  #
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  #
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
4
 
5
5
  class Edge < Struct.new(:src, :dest)
6
6
  end
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'rubygems'
3
- $: << File.dirname(__FILE__)+'/../../lib'
4
3
  require 'wukong'
5
4
 
6
5
  #
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  $: << File.dirname(__FILE__)+'/../../lib'
3
4
  require 'wukong'
4
5
 
@@ -1,11 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
- #
6
- #
7
- #
8
-
9
5
  module PageRank
10
6
  #
11
7
  # Damping factor (prob. of a 'random' jump)
@@ -13,16 +9,12 @@ module PageRank
13
9
  #
14
10
  DAMPING_FACTOR = 0.85
15
11
 
16
- #
17
12
  # Each user's line looks like
18
- #
19
13
  # user_a pagerank id1,id2,...,idN
20
- #
21
14
  # we need to disperse this user's pagerank to each of id1..idN, and
22
15
  # rendezvous the list of outbound links at user_a's reducer as well.
23
- #
24
16
  module Iterating
25
- class Mapper < Wukong::Streamer::Base
17
+ class PagerankMapper < Wukong::Streamer::Base
26
18
  #
27
19
  # Send pagerank to each page, and send the dests list back to self
28
20
  #
@@ -34,9 +26,7 @@ module PageRank
34
26
  yield_own_dest_list src, dests_str, &block
35
27
  end
36
28
 
37
- #
38
29
  # Take the source node's pagerank and distribute it among all the out-nodes
39
- #
40
30
  def yield_pagerank_shares src, pagerank, dests
41
31
  pagerank_share = pagerank.to_f / dests.length
42
32
  dests.each do |dest|
@@ -44,15 +34,13 @@ module PageRank
44
34
  end
45
35
  end
46
36
 
47
- #
48
37
  # Dispatch this user's out-node list to rendezvous with itself.
49
- #
50
38
  def yield_own_dest_list src, dests_str
51
39
  yield [src, 'd', dests_str]
52
40
  end
53
41
  end
54
42
 
55
- class Reducer < Wukong::Streamer::AccumulatingReducer
43
+ class PagerankReducer < Wukong::Streamer::AccumulatingReducer
56
44
  attr_accessor :node_id, :pagerank, :dests_str
57
45
  # Begin reduction with 0 accumulated pagerank and no dests as yet
58
46
  def start! node_id, *args
@@ -78,11 +66,7 @@ module PageRank
78
66
  end
79
67
  end
80
68
 
81
- class Script < Wukong::Script
82
- def default_options
83
- super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
84
- end
85
- end
86
- Script.new(Mapper, Reducer).run
69
+ Wukong.run(PagerankMapper, PagerankReducer,
70
+ :extra_args => ' -jobconf io.sort.record.percent=0.25 ')
87
71
  end
88
72
  end
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/list_reducer'
5
5
 
@@ -1,58 +1,18 @@
1
- #!/usr/bin/env ruby
1
+ #!/usr/bin/env ruby -E ASCII-8BIT
2
2
  require 'rubygems'
3
3
  require 'wukong/script'
4
+ $: << File.dirname(__FILE__)
5
+ require 'logline'
4
6
 
5
- module ApacheLogParser
6
- class Mapper < Wukong::Streamer::LineStreamer
7
-
8
- #
9
- # Regular expression to parse an apache log line.
10
- #
11
- # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
12
- #
13
- LOG_RE = Regexp.compile(%r{\A
14
- (\S+) # ip 83.240.154.3
15
- \s(\S+) # j1 -
16
- \s(\S+) # j2 -
17
- \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
18
- :(\d+):(\d+):(\d+) # time part :20:37:11
19
- \s(\+.*)\] # timezone +0000]
20
- \s\"(?:(\S+) # http_method "GET
21
- \s(\S+) # path /faq
22
- \s(\S+)|-)" # protocol HTTP/1.1"
23
- \s(\d+) # response_code 200
24
- \s(\d+) # duration 569
25
- \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
26
- \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
27
- \z}x)
28
- MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
29
-
30
- # Use the regex to break line into fields
31
- # Emit each record as flat line
32
- def process line
33
- line.chomp
34
- m = LOG_RE.match(line)
35
- if m
36
- (ip, j1, j2,
37
- ts_day, ts_mo, ts_year,
38
- ts_hour, ts_min, ts_sec, tz,
39
- http_method, path, protocol,
40
- response_code, duration,
41
- referer, ua, *cruft) = m.captures
42
- date = [ts_year, MONTHS[ts_mo], ts_day].join("")
43
- time = [ts_hour, ts_min, ts_sec].join("")
44
- yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
45
- else
46
- yield [:unparseable, line]
47
- end
48
- end
7
+ class ApacheLogParser < Wukong::Streamer::LineStreamer
49
8
 
9
+ # create a Logline object from each record and serialize it flat to disk
10
+ def process line
11
+ yield Logline.parse(line)
50
12
  end
51
13
  end
52
14
 
53
- Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
54
-
55
- # 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
15
+ Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
56
16
 
57
17
 
58
18
 
@@ -1,9 +1,6 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
1
  class Logline < Struct.new(
6
- :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
2
+ :ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
3
+ # 1 2 3 4 5 6 7 8 9 10 11
7
4
 
8
5
  def page_type
9
6
  case
@@ -14,14 +11,41 @@ class Logline < Struct.new(
14
11
  end
15
12
  end
16
13
 
17
- def is_page?
18
- page_type == :page
19
- end
20
- end
14
+ #
15
+ # Regular expression to parse an apache log line.
16
+ #
17
+ # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
18
+ #
19
+ LOG_RE = Regexp.compile(%r{\A
20
+ (\S+) # ip 83.240.154.3
21
+ \s(\S+) # j1 -
22
+ \s(\S+) # j2 -
23
+ \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
24
+ :(\d+):(\d+):(\d+) # time part :20:37:11
25
+ \s(\+.*)\] # timezone +0000]
26
+ \s\"(?:(\S+) # http_method "GET
27
+ \s(\S+) # path /faq
28
+ \s(\S+)|-)" # protocol HTTP/1.1"
29
+ \s(\d+) # response_code 200
30
+ \s(\d+) # size 569
31
+ \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
32
+ \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
33
+ \z}x)
34
+ MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
21
35
 
22
- class PageFilter < Wukong::Streamer::StructStreamer
23
- def process visit, *args
24
- yield visit.ua if visit.
36
+ # Use the regex to break line into fields
37
+ # Emit each record as flat line
38
+ def self.parse line
39
+ m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
40
+ (ip, j1, j2,
41
+ ts_day, ts_mo, ts_year,
42
+ ts_hour, ts_min, ts_sec, tz,
43
+ http_method, path, protocol,
44
+ response_code, size,
45
+ referer, ua, *cruft) = m.captures
46
+ dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
47
+ tm = [ts_hour, ts_min, ts_sec].join("")
48
+ self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
25
49
  end
50
+
26
51
  end
27
- Wukong.run(PageFilter)
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby -E BINARY
2
+ require 'rubygems'
3
+ require 'faraday'
4
+ require 'wukong/script'
5
+ require 'json'
6
+ $: << File.dirname(__FILE__)
7
+ require 'apache_log_parser'
8
+ require 'nook/faraday_dummy_adapter'
9
+
10
+ Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
11
+ Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
12
+
13
+ #
14
+ # A Nook consumes its input stream and, for each input, generates an HTTP
15
+ # request against a remote host. Please use it for good and never for evil.
16
+ #
17
+ # You can use it from your command line:
18
+ # zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
19
+ #
20
+ #
21
+ class NookMapper < ApacheLogParser
22
+ # parse each line into a Logline, replay its path as an HTTP GET, and emit the timing, status, body size, path and body
23
+ def process line
24
+ super(line) do |logline|
25
+ start = Time.now
26
+ resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
27
+ yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
28
+ end
29
+ end
30
+
31
+ def track record
32
+ monitor.periodically do |m|
33
+ m.progress
34
+ end
35
+ end
36
+
37
+ # a mock fetcher with a fixed 0.05s delay (swap in 0.2 * rand() for a uniformly distributed variable delay)
38
+ def fetcher
39
+ @fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
40
+ f.use Faraday::Adapter::Dummy do |dummy|
41
+ dummy.delay = Proc.new{|env| 0.05 } # 0.2 * rand()
42
+ # dummy.body = Proc.new{|env| env[:url] }
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+ Wukong.run( NookMapper, nil, :sort_fields => 7 )
@@ -0,0 +1,94 @@
1
+
2
+ module Faraday
3
+ class Adapter
4
+
5
+ # test = Faraday::Connection.new do |f|
6
+ # f.use Faraday::Adapter::Dummy do |dummy|
7
+ # dummy.status 404
8
+ # dummy.delay 1
9
+ # end
10
+ # end
11
+ #
12
+ # # this will delay 0.2s, returning 404 with the request echoed back as a JSON body
13
+ # resp = test.get("/your/mom", :dummy_delay => 0.2)
14
+ # resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
15
+ #
16
+ # More example:
17
+ #
18
+ # test = Faraday::Connection.new do |f|
19
+ # f.use Faraday::Adapter::Dummy, :status => 503
20
+ # end
21
+ #
22
+ # test = Faraday::Connection.new do |f|
23
+ # f.use Faraday::Adapter::Dummy do |dummy|
24
+ # dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
25
+ # end
26
+ # end
27
+ #
28
+ class Dummy < Middleware
29
+ include Addressable
30
+ attr_reader :config
31
+ def self.loaded?() false end
32
+
33
+ # gets value from environment if set, configured instance variable otherwise
34
+ def value_for env, key
35
+ val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
36
+ if val.respond_to?(:call)
37
+ val = val.call(env)
38
+ end
39
+ val
40
+ end
41
+
42
+ # With an optional delay, constructs a [status, headers, response] based on the first of:
43
+ # * request header field (Dummy-Status, Dummy-Headers, Dummy-Body, Dummy-Delay)
44
+ # * adapter's configuration:
45
+ # * Unless one of the above is set, body will return a json string taken from the request hash
46
+ #
47
+ def call(env)
48
+ status = value_for(env, :status)
49
+ headers = value_for(env, :headers)
50
+ headers = JSON.load(headers) if headers.is_a? String
51
+ body = value_for(env, :body) ||
52
+ env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
53
+ delay = value_for(env, :delay).to_f
54
+ sleep delay if delay > 0
55
+ headers[:dummy_delay] = delay
56
+ env.update(
57
+ :status => status,
58
+ :response_headers => headers,
59
+ :body => body)
60
+ @app.call(env)
61
+ end
62
+
63
+ class Configurator < Struct.new(:status, :headers, :delay, :body)
64
+ def status(val=nil) self.status = val if val ; super() end
65
+ def headers(val=nil) self.headers = val if val ; super() end
66
+ def body(val=nil) self.body = val if val ; super() end
67
+ def delay(val=nil) self.delay = val if val ; super() end
68
+ def self.from_hash hsh
69
+ new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
70
+ end
71
+ end
72
+
73
+ def initialize(app, defaults={}, &block)
74
+ super(app)
75
+ @config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
76
+ configure(&block) if block
77
+ end
78
+
79
+ def configure
80
+ yield config
81
+ end
82
+
83
+ # same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
84
+ def header_hash_key(str)
85
+ str.to_s.split('_').each{|w| w.capitalize! }.join('-')
86
+ end
87
+
88
+ def create_multipart(env, params, boundary = nil)
89
+ stream = super
90
+ stream.read
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
+ module WordCount
6
+ class Mapper < Wukong::Streamer::LineStreamer
7
+ #
8
+ # Emit each word in each line.
9
+ #
10
+ def process line
11
+ tokenize(line).each{|word| yield [word, 1] }
12
+ end
13
+
14
+ #
15
+ # Split a string into its constituent words.
16
+ #
17
+ # This is pretty simpleminded:
18
+ # * downcase the word
19
+ # * Split at any non-alphanumeric boundary, including '_'
20
+ # * However, preserve the special cases of 's, 'd or 't at the end of a
21
+ # word.
22
+ #
23
+ # tokenize("Ability is a poor man's wealth #johnwoodenquote")
24
+ # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
25
+ #
26
+ def tokenize str
27
+ return [] if str.blank?
28
+ str = str.downcase;
29
+ # kill off all punctuation except [stuff]'s, [stuff]'d or [stuff]'t
30
+ # this includes hyphens (words are split)
31
+ str = str.
32
+ gsub(/[^a-zA-Z0-9\']+/, ' ').
33
+ gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
34
+ # Busticate at whitespace
35
+ words = str.split(/\s+/)
36
+ words.reject!{|w| w.blank? }
37
+ words
38
+ end
39
+ end
40
+
41
+ #
42
+ # A bit kinder to your memory manager: accumulate the sum record-by-record:
43
+ #
44
+ class Reducer2 < Wukong::Streamer::AccumulatingReducer
45
+
46
+ def start!(*args)
47
+ @key_count = 0
48
+ end
49
+
50
+ def accumulate(*args)
51
+ @key_count += 1
52
+ end
53
+
54
+ def finalize
55
+ yield [ key, @key_count ]
56
+ end
57
+ end
58
+
59
+ #
60
+ # You can stack up all the values in a list then sum them at once.
61
+ #
62
+ # This isn't good style, as it means the whole list is held in memory
63
+ #
64
+ class Reducer1 < Wukong::Streamer::ListReducer
65
+ def finalize
66
+ yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
67
+ end
68
+ end
69
+
70
+ #
71
+ # ... easiest of all, though: this is common enough that it's already included
72
+ #
73
+ require 'wukong/streamer/count_keys'
74
+ class Reducer3 < Wukong::Streamer::CountKeys
75
+ end
76
+ end
77
+
78
+ # Execute the script
79
+ Wukong.run(
80
+ WordCount::Mapper,
81
+ WordCount::Reducer2
82
+ )
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  module Size
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/count_keys'
5
5
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
  require 'wukong/streamer/rank_and_bin_reducer'
5
5
 
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'
2
+ require 'rubygems'
3
3
  require 'wukong/script'
4
4
 
5
5
  # Run as (local mode)
@@ -1,5 +1,5 @@
1
- require 'configliere'; Settings.use :define
2
1
  require 'wukong/extensions'
2
+ require 'configliere'; Settings.use :define
3
3
  require 'wukong/datatypes'
4
4
  require 'wukong/periodic_monitor'
5
5
  require 'wukong/logger'
@@ -2,8 +2,8 @@
2
2
  # These pull in the minimal functionality of the extlib|activesupport family of
3
3
  # gems.
4
4
  #
5
- require 'wukong/extensions/blank'
6
- require 'wukong/extensions/class'
5
+ require 'extlib/blank'
6
+ require 'extlib/class'
7
7
  require 'wukong/extensions/enumerable'
8
8
  require 'wukong/extensions/symbol'
9
9
  require 'wukong/extensions/hash'
@@ -15,7 +15,7 @@ class Object
15
15
  # @api public
16
16
  def blank?
17
17
  nil? || (respond_to?(:empty?) && empty?)
18
- end
18
+ end unless method_defined?(:blank?)
19
19
  end # class Object
20
20
 
21
21
  class Numeric
@@ -31,7 +31,7 @@ class Numeric
31
31
  # @api public
32
32
  def blank?
33
33
  false
34
- end
34
+ end unless method_defined?(:blank?)
35
35
  end # class Numeric
36
36
 
37
37
  class NilClass
@@ -45,7 +45,7 @@ class NilClass
45
45
  # @api public
46
46
  def blank?
47
47
  true
48
- end
48
+ end unless method_defined?(:blank?)
49
49
  end # class NilClass
50
50
 
51
51
  class TrueClass
@@ -59,7 +59,7 @@ class TrueClass
59
59
  # @api public
60
60
  def blank?
61
61
  false
62
- end
62
+ end unless method_defined?(:blank?)
63
63
  end # class TrueClass
64
64
 
65
65
  class FalseClass
@@ -73,7 +73,7 @@ class FalseClass
73
73
  # @api public
74
74
  def blank?
75
75
  true
76
- end
76
+ end unless method_defined?(:blank?)
77
77
  end # class FalseClass
78
78
 
79
79
  class String
@@ -89,5 +89,5 @@ class String
89
89
  # @api public
90
90
  def blank?
91
91
  strip.empty?
92
- end
92
+ end unless method_defined?(:blank?)
93
93
  end # class String
@@ -66,11 +66,11 @@ class Hash
66
66
  #
67
67
  def deep_merge hsh2
68
68
  merge hsh2, &Hash::DEEP_MERGER
69
- end
69
+ end unless method_defined?(:deep_merge)
70
70
 
71
71
  def deep_merge! hsh2
72
72
  merge! hsh2, &Hash::DEEP_MERGER
73
- end
73
+ end unless method_defined?(:deep_merge!)
74
74
 
75
75
  #
76
76
  # Treat hash as tree of hashes:
@@ -86,10 +86,10 @@ class Hash
86
86
  val = args.pop
87
87
  last_key = args.pop
88
88
  # dig down to last subtree (building out if necessary)
89
- hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
89
+ hsh = args.empty? ? self : args.inject(self){|h, k| h[k] ||= {} }
90
90
  # set leaf value
91
91
  hsh[last_key] = val
92
- end
92
+ end unless method_defined?(:deep_set)
93
93
 
94
94
  #
95
95
  # Treat hash as tree of hashes:
@@ -107,10 +107,10 @@ class Hash
107
107
  def deep_get *args
108
108
  last_key = args.pop
109
109
  # dig down to last subtree (building out if necessary)
110
- hsh = args.inject(self){|hsh, key| hsh[key] || {} }
110
+ hsh = args.inject(self){|h, k| h[k] || {} }
111
111
  # get leaf value
112
112
  hsh[last_key]
113
- end
113
+ end unless method_defined?(:deep_get)
114
114
 
115
115
 
116
116
  #
@@ -126,20 +126,20 @@ class Hash
126
126
  last_key = args.pop
127
127
  last_hsh = args.empty? ? self : (deep_get(*args)||{})
128
128
  last_hsh.delete(last_key)
129
- end
129
+ end unless method_defined?(:deep_delete)
130
130
 
131
131
  #
132
132
  # remove all key-value pairs where the value is nil
133
133
  #
134
134
  def compact
135
135
  reject{|key,val| val.nil? }
136
- end
136
+ end unless method_defined?(:compact)
137
137
  #
138
138
  # Replace the hash with its compacted self
139
139
  #
140
140
  def compact!
141
141
  replace(compact)
142
- end
142
+ end unless method_defined?(:compact!)
143
143
 
144
144
  #
145
145
  # remove all key-value pairs where the value is blank
@@ -64,7 +64,7 @@ module Wukong
64
64
  # Analogous to Hash#merge
65
65
  #
66
66
  def merge *args
67
- self.dup.merge! *args
67
+ self.dup.merge!(*args)
68
68
  end
69
69
  def merge! hsh, &block
70
70
  raise "can't handle block arg yet" if block
@@ -104,7 +104,7 @@ module Wukong
104
104
  #
105
105
  def from_hash(hsh, has_symbol_keys=false)
106
106
  extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
107
- self.new *hsh.values_of(*extract_keys)
107
+ self.new(*hsh.values_of(*extract_keys))
108
108
  end
109
109
  #
110
110
  # The last portion of the class in underscored form
@@ -7,5 +7,5 @@ class Symbol
7
7
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
8
8
  def to_proc
9
9
  Proc.new { |*args| args.shift.__send__(self, *args) }
10
- end
10
+ end unless method_defined?(:to_proc)
11
11
  end
@@ -13,7 +13,7 @@ module Wukong
13
13
  # I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
14
14
  #
15
15
  def self.logger
16
- return @logger if @logger
16
+ return @logger if defined?(@logger)
17
17
  require 'logger'
18
18
  @logger = Logger.new STDERR
19
19
  @logger.instance_eval do
@@ -28,9 +28,9 @@ class PeriodicMonitor
28
28
  if ready?
29
29
  @last_report = Time.now
30
30
  if block
31
- block.call(iter, *args)
31
+ emit block.call(self, *args)
32
32
  else
33
- self.emit progress(*args)
33
+ emit progress(*args)
34
34
  end
35
35
  end
36
36
  end
@@ -1,4 +1,5 @@
1
1
  require 'pathname'
2
+ require 'wukong/extensions'
2
3
  require 'configliere' ; Settings.use(:commandline, :env_var, :define)
3
4
  require 'wukong'
4
5
  require 'wukong/script/hadoop_command'
@@ -127,7 +128,7 @@ module Wukong
127
128
  def initialize mapper, reducer=nil, extra_options={}
128
129
  Settings.resolve!
129
130
  @options = Settings
130
- options.merge extra_options
131
+ options.merge! extra_options
131
132
  @mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
132
133
  @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
133
134
  @output_path = options.rest.pop
@@ -173,10 +174,14 @@ module Wukong
173
174
  # In hadoop mode, this is given to the hadoop streaming command.
174
175
  # In local mode, it's given to the system() call
175
176
  #
176
- def mapper_commandline
177
+ def mapper_commandline(run_option=:local)
177
178
  if mapper
178
- "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
179
- # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
179
+ case run_option
180
+ when :local then
181
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
182
+ when :hadoop then
183
+ "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
184
+ end
180
185
  else
181
186
  options[:map_command]
182
187
  end
@@ -187,10 +192,14 @@ module Wukong
187
192
  # In hadoop mode, this is given to the hadoop streaming command.
188
193
  # In local mode, it's given to the system() call
189
194
  #
190
- def reducer_commandline
195
+ def reducer_commandline(run_option=:local)
191
196
  if reducer
192
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
193
- # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
197
+ case run_option
198
+ when :local then
199
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
200
+ when :hadoop then
201
+ "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
202
+ end
194
203
  else
195
204
  options[:reduce_command]
196
205
  end
@@ -222,6 +231,7 @@ module Wukong
222
231
  else
223
232
  maybe_overwrite_output_paths! output_path
224
233
  $stdout.puts `#{command}`
234
+ raise "Streaming command failed!" unless $?.success?
225
235
  end
226
236
  end
227
237
 
@@ -242,7 +252,7 @@ module Wukong
242
252
  # the map/reducer phase scripts
243
253
  def non_wukong_params
244
254
  options.
245
- reject{|param, val| options.param_definitions[param][:wukong] }.
255
+ reject{|param, val| options.definition_of(param, :wukong) }.
246
256
  map{|param,val| "--#{param}=#{val}" }.
247
257
  join(" ")
248
258
  end
@@ -13,8 +13,8 @@ Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for El
13
13
  Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
14
14
  Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
15
15
  #
16
- Settings.define :keypair_file, :description => 'AWS Key pair file', :type => :filename
17
- Settings.define :keypair, :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
16
+ Settings.define :key_pair_file, :description => 'AWS Key pair file', :type => :filename
17
+ Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
18
18
  Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
19
19
  Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
20
20
  Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
@@ -45,7 +45,9 @@ module Wukong
45
45
  end
46
46
 
47
47
  def hadoop_options_for_emr_runner
48
- [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
48
+ [hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
49
+ hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
50
+ end.flatten
49
51
  end
50
52
 
51
53
  def execute_emr_runner
@@ -57,7 +59,7 @@ module Wukong
57
59
  command_args << "--create --name=#{job_name}"
58
60
  command_args << Settings.dashed_flag_for(:alive)
59
61
  command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
60
- command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
62
+ command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
61
63
  command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
62
64
  end
63
65
  command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
@@ -25,6 +25,9 @@ module Wukong
25
25
  Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
26
26
  Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
27
27
  Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
28
+ Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
29
+ Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
30
+ Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
28
31
  Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
29
32
  Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
30
33
  Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
@@ -40,8 +43,8 @@ module Wukong
40
43
  # if not, the resulting nil will be elided later
41
44
  def jobconf option
42
45
  if options[option]
43
- # "-jobconf %s=%s" % [options.description_for(option), options[option]]
44
- "-D %s=%s" % [options.description_for(option), options[option]]
46
+ # "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
47
+ "-D %s=%s" % [options.definition_of(option, :description), options[option]]
45
48
  end
46
49
  end
47
50
 
@@ -64,8 +67,8 @@ module Wukong
64
67
  hadoop_jobconf_options,
65
68
  "-D mapred.job.name='#{job_name}'",
66
69
  hadoop_other_args,
67
- "-mapper '#{mapper_commandline}'",
68
- "-reducer '#{reducer_commandline}'",
70
+ "-mapper '#{mapper_commandline(:hadoop)}'",
71
+ "-reducer '#{reducer_commandline(:hadoop)}'",
69
72
  "-input '#{input_paths}'",
70
73
  "-output '#{output_path}'",
71
74
  "-file '#{this_script_filename}'",
@@ -100,6 +103,8 @@ module Wukong
100
103
  :partition_fields, :sort_fields,
101
104
  :reduce_tasks, :respect_exit_status,
102
105
  :reuse_jvms, :timeout,
106
+ :max_tracker_failures, :max_map_attempts,
107
+ :max_reduce_attempts
103
108
  ].map{|opt| jobconf(opt)}
104
109
  jobconf_options.flatten.compact
105
110
  end
@@ -25,7 +25,13 @@ module Wukong
25
25
  @input_paths = input_paths.map(&:strip).join(' ')
26
26
  cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
27
27
  cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
28
- %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
28
+
29
+ if (reducer || options[:reduce_command])
30
+ %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
31
+ else
32
+ %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
33
+ end
34
+
29
35
  end
30
36
 
31
37
  end
@@ -28,11 +28,15 @@ module Wukong
28
28
  process(*record) do |output_record|
29
29
  emit output_record
30
30
  end
31
- monitor.periodically(record.to_s[0..1000])
31
+ track(record)
32
32
  end
33
33
  after_stream
34
34
  end
35
35
 
36
+ def track record
37
+ monitor.periodically(record.to_s[0..1000])
38
+ end
39
+
36
40
  def each_record &block
37
41
  $stdin.each(&block)
38
42
  end
@@ -103,7 +107,7 @@ module Wukong
103
107
  # Creates a new object of this class and injects the given block
104
108
  # as the process method
105
109
  def self.mapper *args, &block
106
- self.new.mapper *args, &block
110
+ self.new.mapper(*args, &block)
107
111
  end
108
112
 
109
113
  # Delegates back to Wukong to run this instance as a mapper
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "2.0.0"
8
+ s.version = "2.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2011-01-29}
12
+ s.date = %q{2011-07-01}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -139,6 +139,7 @@ Gem::Specification.new do |s|
139
139
  "examples/contrib/jeans/sizes.rb",
140
140
  "examples/corpus/bucket_counter.rb",
141
141
  "examples/corpus/dbpedia_abstract_to_sentences.rb",
142
+ "examples/corpus/sentence_bigrams.rb",
142
143
  "examples/corpus/sentence_coocurrence.rb",
143
144
  "examples/corpus/words_to_bigrams.rb",
144
145
  "examples/emr/README.textile",
@@ -162,7 +163,10 @@ Gem::Specification.new do |s|
162
163
  "examples/server_logs/apache_log_parser.rb",
163
164
  "examples/server_logs/breadcrumbs.rb",
164
165
  "examples/server_logs/logline.rb",
166
+ "examples/server_logs/nook.rb",
167
+ "examples/server_logs/nook/faraday_dummy_adapter.rb",
165
168
  "examples/server_logs/user_agent.rb",
169
+ "examples/simple_word_count.rb",
166
170
  "examples/size.rb",
167
171
  "examples/stats/avg_value_frequency.rb",
168
172
  "examples/stats/binning_percentile_estimator.rb",
@@ -252,13 +256,14 @@ Gem::Specification.new do |s|
252
256
  ]
253
257
  s.homepage = %q{http://mrflip.github.com/wukong}
254
258
  s.require_paths = ["lib"]
255
- s.rubygems_version = %q{1.4.2}
259
+ s.rubygems_version = %q{1.5.0}
256
260
  s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
257
261
  s.test_files = [
258
262
  "examples/contrib/jeans/normalize.rb",
259
263
  "examples/contrib/jeans/sizes.rb",
260
264
  "examples/corpus/bucket_counter.rb",
261
265
  "examples/corpus/dbpedia_abstract_to_sentences.rb",
266
+ "examples/corpus/sentence_bigrams.rb",
262
267
  "examples/corpus/sentence_coocurrence.rb",
263
268
  "examples/corpus/words_to_bigrams.rb",
264
269
  "examples/emr/elastic_mapreduce_example.rb",
@@ -275,7 +280,10 @@ Gem::Specification.new do |s|
275
280
  "examples/server_logs/apache_log_parser.rb",
276
281
  "examples/server_logs/breadcrumbs.rb",
277
282
  "examples/server_logs/logline.rb",
283
+ "examples/server_logs/nook.rb",
284
+ "examples/server_logs/nook/faraday_dummy_adapter.rb",
278
285
  "examples/server_logs/user_agent.rb",
286
+ "examples/simple_word_count.rb",
279
287
  "examples/size.rb",
280
288
  "examples/stats/avg_value_frequency.rb",
281
289
  "examples/stats/binning_percentile_estimator.rb",
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
5
4
  prerelease:
6
- segments:
7
- - 2
8
- - 0
9
- - 0
10
- version: 2.0.0
5
+ version: 2.0.1
11
6
  platform: ruby
12
7
  authors:
13
8
  - Philip (flip) Kromer
@@ -15,7 +10,7 @@ autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2011-01-29 00:00:00 -06:00
13
+ date: 2011-07-01 00:00:00 -05:00
19
14
  default_executable:
20
15
  dependencies:
21
16
  - !ruby/object:Gem::Dependency
@@ -26,11 +21,6 @@ dependencies:
26
21
  requirements:
27
22
  - - ">="
28
23
  - !ruby/object:Gem::Version
29
- hash: 13
30
- segments:
31
- - 1
32
- - 2
33
- - 9
34
24
  version: 1.2.9
35
25
  type: :development
36
26
  version_requirements: *id001
@@ -42,9 +32,6 @@ dependencies:
42
32
  requirements:
43
33
  - - ">="
44
34
  - !ruby/object:Gem::Version
45
- hash: 3
46
- segments:
47
- - 0
48
35
  version: "0"
49
36
  type: :development
50
37
  version_requirements: *id002
@@ -56,9 +43,6 @@ dependencies:
56
43
  requirements:
57
44
  - - ">="
58
45
  - !ruby/object:Gem::Version
59
- hash: 3
60
- segments:
61
- - 0
62
46
  version: "0"
63
47
  type: :runtime
64
48
  version_requirements: *id003
@@ -70,9 +54,6 @@ dependencies:
70
54
  requirements:
71
55
  - - ">="
72
56
  - !ruby/object:Gem::Version
73
- hash: 3
74
- segments:
75
- - 0
76
57
  version: "0"
77
58
  type: :runtime
78
59
  version_requirements: *id004
@@ -84,9 +65,6 @@ dependencies:
84
65
  requirements:
85
66
  - - ">="
86
67
  - !ruby/object:Gem::Version
87
- hash: 3
88
- segments:
89
- - 0
90
68
  version: "0"
91
69
  type: :runtime
92
70
  version_requirements: *id005
@@ -98,9 +76,6 @@ dependencies:
98
76
  requirements:
99
77
  - - ">="
100
78
  - !ruby/object:Gem::Version
101
- hash: 3
102
- segments:
103
- - 0
104
79
  version: "0"
105
80
  type: :runtime
106
81
  version_requirements: *id006
@@ -233,6 +208,7 @@ files:
233
208
  - examples/contrib/jeans/sizes.rb
234
209
  - examples/corpus/bucket_counter.rb
235
210
  - examples/corpus/dbpedia_abstract_to_sentences.rb
211
+ - examples/corpus/sentence_bigrams.rb
236
212
  - examples/corpus/sentence_coocurrence.rb
237
213
  - examples/corpus/words_to_bigrams.rb
238
214
  - examples/emr/README.textile
@@ -256,7 +232,10 @@ files:
256
232
  - examples/server_logs/apache_log_parser.rb
257
233
  - examples/server_logs/breadcrumbs.rb
258
234
  - examples/server_logs/logline.rb
235
+ - examples/server_logs/nook.rb
236
+ - examples/server_logs/nook/faraday_dummy_adapter.rb
259
237
  - examples/server_logs/user_agent.rb
238
+ - examples/simple_word_count.rb
260
239
  - examples/size.rb
261
240
  - examples/stats/avg_value_frequency.rb
262
241
  - examples/stats/binning_percentile_estimator.rb
@@ -357,23 +336,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
357
336
  requirements:
358
337
  - - ">="
359
338
  - !ruby/object:Gem::Version
360
- hash: 3
361
- segments:
362
- - 0
363
339
  version: "0"
364
340
  required_rubygems_version: !ruby/object:Gem::Requirement
365
341
  none: false
366
342
  requirements:
367
343
  - - ">="
368
344
  - !ruby/object:Gem::Version
369
- hash: 3
370
- segments:
371
- - 0
372
345
  version: "0"
373
346
  requirements: []
374
347
 
375
348
  rubyforge_project:
376
- rubygems_version: 1.4.2
349
+ rubygems_version: 1.5.0
377
350
  signing_key:
378
351
  specification_version: 3
379
352
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
@@ -382,6 +355,7 @@ test_files:
382
355
  - examples/contrib/jeans/sizes.rb
383
356
  - examples/corpus/bucket_counter.rb
384
357
  - examples/corpus/dbpedia_abstract_to_sentences.rb
358
+ - examples/corpus/sentence_bigrams.rb
385
359
  - examples/corpus/sentence_coocurrence.rb
386
360
  - examples/corpus/words_to_bigrams.rb
387
361
  - examples/emr/elastic_mapreduce_example.rb
@@ -398,7 +372,10 @@ test_files:
398
372
  - examples/server_logs/apache_log_parser.rb
399
373
  - examples/server_logs/breadcrumbs.rb
400
374
  - examples/server_logs/logline.rb
375
+ - examples/server_logs/nook.rb
376
+ - examples/server_logs/nook/faraday_dummy_adapter.rb
401
377
  - examples/server_logs/user_agent.rb
378
+ - examples/simple_word_count.rb
402
379
  - examples/size.rb
403
380
  - examples/stats/avg_value_frequency.rb
404
381
  - examples/stats/binning_percentile_estimator.rb