wukong 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +0 -1
- data/TODO.textile +6 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +1 -0
- data/examples/corpus/sentence_bigrams.rb +53 -0
- data/examples/corpus/sentence_coocurrence.rb +1 -5
- data/examples/corpus/words_to_bigrams.rb +2 -1
- data/examples/ignore_me/counting.rb +1 -2
- data/examples/network_graph/adjacency_list.rb +1 -1
- data/examples/network_graph/breadth_first_search.rb +1 -1
- data/examples/network_graph/gen_2paths.rb +2 -2
- data/examples/network_graph/gen_multi_edge.rb +0 -1
- data/examples/network_graph/gen_symmetric_links.rb +1 -0
- data/examples/pagerank/pagerank.rb +5 -21
- data/examples/pagerank/pagerank_initialize.rb +1 -1
- data/examples/server_logs/apache_log_parser.rb +8 -48
- data/examples/server_logs/logline.rb +37 -13
- data/examples/server_logs/nook.rb +48 -0
- data/examples/server_logs/nook/faraday_dummy_adapter.rb +94 -0
- data/examples/simple_word_count.rb +82 -0
- data/examples/size.rb +1 -1
- data/examples/stats/binning_percentile_estimator.rb +1 -1
- data/examples/stats/rank_and_bin.rb +1 -1
- data/examples/stupidly_simple_filter.rb +1 -1
- data/lib/wukong.rb +1 -1
- data/lib/wukong/extensions.rb +2 -2
- data/lib/wukong/extensions/blank.rb +6 -6
- data/lib/wukong/extensions/hash.rb +9 -9
- data/lib/wukong/extensions/hash_like.rb +2 -2
- data/lib/wukong/extensions/symbol.rb +1 -1
- data/lib/wukong/logger.rb +1 -1
- data/lib/wukong/periodic_monitor.rb +2 -2
- data/lib/wukong/script.rb +18 -8
- data/lib/wukong/script/emr_command.rb +6 -4
- data/lib/wukong/script/hadoop_command.rb +9 -4
- data/lib/wukong/script/local_command.rb +7 -1
- data/lib/wukong/streamer/base.rb +6 -2
- data/wukong.gemspec +11 -3
- metadata +11 -34
data/README.textile
CHANGED
@@ -19,7 +19,6 @@ The **main documentation** lives on the "Wukong Pages.":http://mrflip.github.com
|
|
19
19
|
* Wukong is licensed under the "Apache License":http://mrflip.github.com/wukong/LICENSE.html (same as Hadoop)
|
20
20
|
* "More info":http://mrflip.github.com/wukong/moreinfo.html
|
21
21
|
|
22
|
-
|
23
22
|
h2. Help!
|
24
23
|
|
25
24
|
Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
|
data/TODO.textile
CHANGED
@@ -3,3 +3,9 @@
|
|
3
3
|
** We should be able to specify comma *or* space separated paths; the last
|
4
4
|
space-separated path in Settings.rest becomes the output file, the others are
|
5
5
|
used as the input_file list.
|
6
|
+
|
7
|
+
at_exit do
|
8
|
+
if $!.nil? && $0 == Goliath::Application.app_file
|
9
|
+
Application.run!
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong/script'
|
5
|
+
require 'bucket_counter'
|
6
|
+
|
7
|
+
#
|
8
|
+
# Coocurrence counts
|
9
|
+
#
|
10
|
+
|
11
|
+
#
|
12
|
+
# Input is a list of document-idx-sentences, each field is tab-separated
|
13
|
+
# title idx word_a word_b word_c ...
|
14
|
+
#
|
15
|
+
# This emits each co-courring pair exactly once; in the case of a three-word
|
16
|
+
# sentence the output would be
|
17
|
+
#
|
18
|
+
# word_a word_b
|
19
|
+
# word_a word_c
|
20
|
+
# word_b word_c
|
21
|
+
#
|
22
|
+
class SentenceBigrams < Wukong::Streamer::RecordStreamer
|
23
|
+
def process title, idx, *words
|
24
|
+
words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
|
25
|
+
yield [word_a, word_b]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Combine multiple bucket counts into a single on
|
32
|
+
#
|
33
|
+
class CombineBuckets < Wukong::Streamer::AccumulatingReducer
|
34
|
+
def get_key *fields
|
35
|
+
fields[0..1]
|
36
|
+
end
|
37
|
+
def start! *args
|
38
|
+
@total = 0
|
39
|
+
end
|
40
|
+
def accumulate *fields
|
41
|
+
@total += 1
|
42
|
+
end
|
43
|
+
def finalize
|
44
|
+
yield [@total, key].flatten
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
Wukong.run(
|
49
|
+
SentenceBigrams,
|
50
|
+
CombineBuckets,
|
51
|
+
:io_sort_record_percent => 0.3,
|
52
|
+
:io_sort_mb => 300
|
53
|
+
)
|
@@ -26,11 +26,7 @@ class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def process title, idx, *words
|
29
|
-
words.
|
30
|
-
words[(idx+1) .. -1].each do |word_b|
|
31
|
-
@bucket << [word_a, word_b]
|
32
|
-
end
|
33
|
-
end
|
29
|
+
@bucket << words[0..-2].zip(words[1..-1])
|
34
30
|
dump_bucket if @bucket.full?
|
35
31
|
end
|
36
32
|
|
@@ -1,11 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
require 'rubygems'
|
3
3
|
require 'wukong/script'
|
4
4
|
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
5
|
module PageRank
|
10
6
|
#
|
11
7
|
# Damping factor (prob. of a 'random' jump)
|
@@ -13,16 +9,12 @@ module PageRank
|
|
13
9
|
#
|
14
10
|
DAMPING_FACTOR = 0.85
|
15
11
|
|
16
|
-
#
|
17
12
|
# Each user's line looks like
|
18
|
-
#
|
19
13
|
# user_a pagerank id1,id2,...,idN
|
20
|
-
#
|
21
14
|
# we need to disperse this user's pagerank to each of id1..idN, and
|
22
15
|
# rendezvous the list of outbound links at user_a's reducer as well.
|
23
|
-
#
|
24
16
|
module Iterating
|
25
|
-
class
|
17
|
+
class PagerankMapper < Wukong::Streamer::Base
|
26
18
|
#
|
27
19
|
# Send pagerank to each page, and send the dests list back to self
|
28
20
|
#
|
@@ -34,9 +26,7 @@ module PageRank
|
|
34
26
|
yield_own_dest_list src, dests_str, &block
|
35
27
|
end
|
36
28
|
|
37
|
-
#
|
38
29
|
# Take the source node's pagerank and distribute it among all the out-nodes
|
39
|
-
#
|
40
30
|
def yield_pagerank_shares src, pagerank, dests
|
41
31
|
pagerank_share = pagerank.to_f / dests.length
|
42
32
|
dests.each do |dest|
|
@@ -44,15 +34,13 @@ module PageRank
|
|
44
34
|
end
|
45
35
|
end
|
46
36
|
|
47
|
-
#
|
48
37
|
# Dispatch this user's out-node list to rendezvous with itself.
|
49
|
-
#
|
50
38
|
def yield_own_dest_list src, dests_str
|
51
39
|
yield [src, 'd', dests_str]
|
52
40
|
end
|
53
41
|
end
|
54
42
|
|
55
|
-
class
|
43
|
+
class PagerankReducer < Wukong::Streamer::AccumulatingReducer
|
56
44
|
attr_accessor :node_id, :pagerank, :dests_str
|
57
45
|
# Begin reduction with 0 accumulated pagerank and no dests as yet
|
58
46
|
def start! node_id, *args
|
@@ -78,11 +66,7 @@ module PageRank
|
|
78
66
|
end
|
79
67
|
end
|
80
68
|
|
81
|
-
|
82
|
-
|
83
|
-
super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
|
84
|
-
end
|
85
|
-
end
|
86
|
-
Script.new(Mapper, Reducer).run
|
69
|
+
Wukong.run(PagerankMapper, PagerankReducer,
|
70
|
+
:extra_args => ' -jobconf io.sort.record.percent=0.25 ')
|
87
71
|
end
|
88
72
|
end
|
@@ -1,58 +1,18 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby -E ASCII-8BIT
|
2
2
|
require 'rubygems'
|
3
3
|
require 'wukong/script'
|
4
|
+
$: << File.dirname(__FILE__)
|
5
|
+
require 'logline'
|
4
6
|
|
5
|
-
|
6
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
-
|
8
|
-
#
|
9
|
-
# Regular expression to parse an apache log line.
|
10
|
-
#
|
11
|
-
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
12
|
-
#
|
13
|
-
LOG_RE = Regexp.compile(%r{\A
|
14
|
-
(\S+) # ip 83.240.154.3
|
15
|
-
\s(\S+) # j1 -
|
16
|
-
\s(\S+) # j2 -
|
17
|
-
\s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
|
18
|
-
:(\d+):(\d+):(\d+) # time part :20:37:11
|
19
|
-
\s(\+.*)\] # timezone +0000]
|
20
|
-
\s\"(?:(\S+) # http_method "GET
|
21
|
-
\s(\S+) # path /faq
|
22
|
-
\s(\S+)|-)" # protocol HTTP/1.1"
|
23
|
-
\s(\d+) # response_code 200
|
24
|
-
\s(\d+) # duration 569
|
25
|
-
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
26
|
-
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
27
|
-
\z}x)
|
28
|
-
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
29
|
-
|
30
|
-
# Use the regex to break line into fields
|
31
|
-
# Emit each record as flat line
|
32
|
-
def process line
|
33
|
-
line.chomp
|
34
|
-
m = LOG_RE.match(line)
|
35
|
-
if m
|
36
|
-
(ip, j1, j2,
|
37
|
-
ts_day, ts_mo, ts_year,
|
38
|
-
ts_hour, ts_min, ts_sec, tz,
|
39
|
-
http_method, path, protocol,
|
40
|
-
response_code, duration,
|
41
|
-
referer, ua, *cruft) = m.captures
|
42
|
-
date = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
43
|
-
time = [ts_hour, ts_min, ts_sec].join("")
|
44
|
-
yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
|
45
|
-
else
|
46
|
-
yield [:unparseable, line]
|
47
|
-
end
|
48
|
-
end
|
7
|
+
class ApacheLogParser < Wukong::Streamer::LineStreamer
|
49
8
|
|
9
|
+
# create a Logline object from each record and serialize it flat to disk
|
10
|
+
def process line
|
11
|
+
yield Logline.parse(line)
|
50
12
|
end
|
51
13
|
end
|
52
14
|
|
53
|
-
Wukong.run(ApacheLogParser
|
54
|
-
|
55
|
-
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
15
|
+
Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
|
56
16
|
|
57
17
|
|
58
18
|
|
@@ -1,9 +1,6 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
1
|
class Logline < Struct.new(
|
6
|
-
|
2
|
+
:ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
|
3
|
+
# 1 2 3 4 5 6 7 8 9 10 11
|
7
4
|
|
8
5
|
def page_type
|
9
6
|
case
|
@@ -14,14 +11,41 @@ class Logline < Struct.new(
|
|
14
11
|
end
|
15
12
|
end
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
#
|
15
|
+
# Regular expression to parse an apache log line.
|
16
|
+
#
|
17
|
+
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
18
|
+
#
|
19
|
+
LOG_RE = Regexp.compile(%r{\A
|
20
|
+
(\S+) # ip 83.240.154.3
|
21
|
+
\s(\S+) # j1 -
|
22
|
+
\s(\S+) # j2 -
|
23
|
+
\s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
|
24
|
+
:(\d+):(\d+):(\d+) # time part :20:37:11
|
25
|
+
\s(\+.*)\] # timezone +0000]
|
26
|
+
\s\"(?:(\S+) # http_method "GET
|
27
|
+
\s(\S+) # path /faq
|
28
|
+
\s(\S+)|-)" # protocol HTTP/1.1"
|
29
|
+
\s(\d+) # response_code 200
|
30
|
+
\s(\d+) # size 569
|
31
|
+
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
32
|
+
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
33
|
+
\z}x)
|
34
|
+
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
21
35
|
|
22
|
-
|
23
|
-
|
24
|
-
|
36
|
+
# Use the regex to break line into fields
|
37
|
+
# Emit each record as flat line
|
38
|
+
def self.parse line
|
39
|
+
m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
|
40
|
+
(ip, j1, j2,
|
41
|
+
ts_day, ts_mo, ts_year,
|
42
|
+
ts_hour, ts_min, ts_sec, tz,
|
43
|
+
http_method, path, protocol,
|
44
|
+
response_code, size,
|
45
|
+
referer, ua, *cruft) = m.captures
|
46
|
+
dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
47
|
+
tm = [ts_hour, ts_min, ts_sec].join("")
|
48
|
+
self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
|
25
49
|
end
|
50
|
+
|
26
51
|
end
|
27
|
-
Wukong.run(PageFilter)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby -E BINARY
|
2
|
+
require 'rubygems'
|
3
|
+
require 'faraday'
|
4
|
+
require 'wukong/script'
|
5
|
+
require 'json'
|
6
|
+
$: << File.dirname(__FILE__)
|
7
|
+
require 'apache_log_parser'
|
8
|
+
require 'nook/faraday_dummy_adapter'
|
9
|
+
|
10
|
+
Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
|
11
|
+
Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
|
12
|
+
|
13
|
+
#
|
14
|
+
# A Nook consumes its input stream and, for each input, generates an HTTP
|
15
|
+
# request against a remote host. Please use it for good and never for evil.
|
16
|
+
#
|
17
|
+
# You can use it from your command line:
|
18
|
+
# zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
|
19
|
+
#
|
20
|
+
#
|
21
|
+
class NookMapper < ApacheLogParser
|
22
|
+
# create a Logline object from each record and serialize it flat to disk
|
23
|
+
def process line
|
24
|
+
super(line) do |logline|
|
25
|
+
start = Time.now
|
26
|
+
resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
|
27
|
+
yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def track record
|
32
|
+
monitor.periodically do |m|
|
33
|
+
m.progress
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# a mock fetcher with a uniformly distributed variable delay
|
38
|
+
def fetcher
|
39
|
+
@fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
|
40
|
+
f.use Faraday::Adapter::Dummy do |dummy|
|
41
|
+
dummy.delay = Proc.new{|env| 0.05 } # 0.2 * rand()
|
42
|
+
# dummy.body = Proc.new{|env| env[:url] }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
Wukong.run( NookMapper, nil, :sort_fields => 7 )
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
module Faraday
|
3
|
+
class Adapter
|
4
|
+
|
5
|
+
# test = Faraday::Connection.new do |f|
|
6
|
+
# f.use Faraday::Adapter::Dummy do |dummy|
|
7
|
+
# dummy.status 404
|
8
|
+
# dummy.delay 1
|
9
|
+
# end
|
10
|
+
# end
|
11
|
+
#
|
12
|
+
# # this will delay 0.2s, returning 404 with
|
13
|
+
# resp = text.get("/your/mom", :dummy_delay => 0.2)
|
14
|
+
# resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
|
15
|
+
#
|
16
|
+
# More example:
|
17
|
+
#
|
18
|
+
# test = Faraday::Connection.new do |f|
|
19
|
+
# f.use Faraday::Adapter::Dummy, :status => 503
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# test = Faraday::Connection.new do |f|
|
23
|
+
# f.use Faraday::Adapter::Dummy do |dummy|
|
24
|
+
# dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
|
25
|
+
# end
|
26
|
+
# end
|
27
|
+
#
|
28
|
+
class Dummy < Middleware
|
29
|
+
include Addressable
|
30
|
+
attr_reader :config
|
31
|
+
def self.loaded?() false end
|
32
|
+
|
33
|
+
# gets value from environment if set, configured instance variable otherwise
|
34
|
+
def value_for env, key
|
35
|
+
val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
|
36
|
+
if val.respond_to?(:call)
|
37
|
+
val = val.call(env)
|
38
|
+
end
|
39
|
+
val
|
40
|
+
end
|
41
|
+
|
42
|
+
# With an optional delay, constructs a [status, headers, response] based on the first of:
|
43
|
+
# * request header field (Dummy-Status, Dummy-Headers, Dummy-Resonse)
|
44
|
+
# * adapter's configuration:
|
45
|
+
# * Unless one of the above is set, body will return a json string taken from the request hash
|
46
|
+
#
|
47
|
+
def call(env)
|
48
|
+
status = value_for(env, :status)
|
49
|
+
headers = value_for(env, :headers)
|
50
|
+
headers = JSON.load(headers) if headers.is_a? String
|
51
|
+
body = value_for(env, :body) ||
|
52
|
+
env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
|
53
|
+
delay = value_for(env, :delay).to_f
|
54
|
+
sleep delay if delay > 0
|
55
|
+
headers[:dummy_delay] = delay
|
56
|
+
env.update(
|
57
|
+
:status => status,
|
58
|
+
:response_headers => headers,
|
59
|
+
:body => body)
|
60
|
+
@app.call(env)
|
61
|
+
end
|
62
|
+
|
63
|
+
class Configurator < Struct.new(:status, :headers, :delay, :body)
|
64
|
+
def status(val=nil) self.status = val if val ; super() end
|
65
|
+
def headers(val=nil) self.headers = val if val ; super() end
|
66
|
+
def body(val=nil) self.body = val if val ; super() end
|
67
|
+
def delay(val=nil) self.delay = val if val ; super() end
|
68
|
+
def self.from_hash hsh
|
69
|
+
new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def initialize(app, defaults={}, &block)
|
74
|
+
super(app)
|
75
|
+
@config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
|
76
|
+
configure(&block) if block
|
77
|
+
end
|
78
|
+
|
79
|
+
def configure
|
80
|
+
yield config
|
81
|
+
end
|
82
|
+
|
83
|
+
# same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
|
84
|
+
def header_hash_key(str)
|
85
|
+
str.to_s.split('_').each{|w| w.capitalize! }.join('-')
|
86
|
+
end
|
87
|
+
|
88
|
+
def create_multipart(env, params, boundary = nil)
|
89
|
+
stream = super
|
90
|
+
stream.read
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
module WordCount
|
6
|
+
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
+
#
|
8
|
+
# Emit each word in each line.
|
9
|
+
#
|
10
|
+
def process line
|
11
|
+
tokenize(line).each{|word| yield [word, 1] }
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Split a string into its constituent words.
|
16
|
+
#
|
17
|
+
# This is pretty simpleminded:
|
18
|
+
# * downcase the word
|
19
|
+
# * Split at any non-alphanumeric boundary, including '_'
|
20
|
+
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
21
|
+
# word.
|
22
|
+
#
|
23
|
+
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
24
|
+
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
25
|
+
#
|
26
|
+
def tokenize str
|
27
|
+
return [] if str.blank?
|
28
|
+
str = str.downcase;
|
29
|
+
# kill off all punctuation except [stuff]'s or [stuff]'t
|
30
|
+
# this includes hyphens (words are split)
|
31
|
+
str = str.
|
32
|
+
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
33
|
+
gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
34
|
+
# Busticate at whitespace
|
35
|
+
words = str.split(/\s+/)
|
36
|
+
words.reject!{|w| w.blank? }
|
37
|
+
words
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
43
|
+
#
|
44
|
+
class Reducer2 < Wukong::Streamer::AccumulatingReducer
|
45
|
+
|
46
|
+
def start!(*args)
|
47
|
+
@key_count = 0
|
48
|
+
end
|
49
|
+
|
50
|
+
def accumulate(*args)
|
51
|
+
@key_count += 1
|
52
|
+
end
|
53
|
+
|
54
|
+
def finalize
|
55
|
+
yield [ key, @key_count ]
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
#
|
60
|
+
# You can stack up all the values in a list then sum them at once.
|
61
|
+
#
|
62
|
+
# This isn't good style, as it means the whole list is held in memory
|
63
|
+
#
|
64
|
+
class Reducer1 < Wukong::Streamer::ListReducer
|
65
|
+
def finalize
|
66
|
+
yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# ... easiest of all, though: this is common enough that it's already included
|
72
|
+
#
|
73
|
+
require 'wukong/streamer/count_keys'
|
74
|
+
class Reducer3 < Wukong::Streamer::CountKeys
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Execute the script
|
79
|
+
Wukong.run(
|
80
|
+
WordCount::Mapper,
|
81
|
+
WordCount::Reducer2
|
82
|
+
)
|
data/examples/size.rb
CHANGED
data/lib/wukong.rb
CHANGED
data/lib/wukong/extensions.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
# These pull in the minimal functionality of the extlib|activesupport family of
|
3
3
|
# gems.
|
4
4
|
#
|
5
|
-
require '
|
6
|
-
require '
|
5
|
+
require 'extlib/blank'
|
6
|
+
require 'extlib/class'
|
7
7
|
require 'wukong/extensions/enumerable'
|
8
8
|
require 'wukong/extensions/symbol'
|
9
9
|
require 'wukong/extensions/hash'
|
@@ -15,7 +15,7 @@ class Object
|
|
15
15
|
# @api public
|
16
16
|
def blank?
|
17
17
|
nil? || (respond_to?(:empty?) && empty?)
|
18
|
-
end
|
18
|
+
end unless method_defined?(:blank?)
|
19
19
|
end # class Object
|
20
20
|
|
21
21
|
class Numeric
|
@@ -31,7 +31,7 @@ class Numeric
|
|
31
31
|
# @api public
|
32
32
|
def blank?
|
33
33
|
false
|
34
|
-
end
|
34
|
+
end unless method_defined?(:blank?)
|
35
35
|
end # class Numeric
|
36
36
|
|
37
37
|
class NilClass
|
@@ -45,7 +45,7 @@ class NilClass
|
|
45
45
|
# @api public
|
46
46
|
def blank?
|
47
47
|
true
|
48
|
-
end
|
48
|
+
end unless method_defined?(:blank?)
|
49
49
|
end # class NilClass
|
50
50
|
|
51
51
|
class TrueClass
|
@@ -59,7 +59,7 @@ class TrueClass
|
|
59
59
|
# @api public
|
60
60
|
def blank?
|
61
61
|
false
|
62
|
-
end
|
62
|
+
end unless method_defined?(:blank?)
|
63
63
|
end # class TrueClass
|
64
64
|
|
65
65
|
class FalseClass
|
@@ -73,7 +73,7 @@ class FalseClass
|
|
73
73
|
# @api public
|
74
74
|
def blank?
|
75
75
|
true
|
76
|
-
end
|
76
|
+
end unless method_defined?(:blank?)
|
77
77
|
end # class FalseClass
|
78
78
|
|
79
79
|
class String
|
@@ -89,5 +89,5 @@ class String
|
|
89
89
|
# @api public
|
90
90
|
def blank?
|
91
91
|
strip.empty?
|
92
|
-
end
|
92
|
+
end unless method_defined?(:blank?)
|
93
93
|
end # class String
|
@@ -66,11 +66,11 @@ class Hash
|
|
66
66
|
#
|
67
67
|
def deep_merge hsh2
|
68
68
|
merge hsh2, &Hash::DEEP_MERGER
|
69
|
-
end
|
69
|
+
end unless method_defined?(:deep_merge)
|
70
70
|
|
71
71
|
def deep_merge! hsh2
|
72
72
|
merge! hsh2, &Hash::DEEP_MERGER
|
73
|
-
end
|
73
|
+
end unless method_defined?(:deep_merge!)
|
74
74
|
|
75
75
|
#
|
76
76
|
# Treat hash as tree of hashes:
|
@@ -86,10 +86,10 @@ class Hash
|
|
86
86
|
val = args.pop
|
87
87
|
last_key = args.pop
|
88
88
|
# dig down to last subtree (building out if necessary)
|
89
|
-
hsh = args.empty? ? self : args.inject(self){|
|
89
|
+
hsh = args.empty? ? self : args.inject(self){|h, k| h[k] ||= {} }
|
90
90
|
# set leaf value
|
91
91
|
hsh[last_key] = val
|
92
|
-
end
|
92
|
+
end unless method_defined?(:deep_set)
|
93
93
|
|
94
94
|
#
|
95
95
|
# Treat hash as tree of hashes:
|
@@ -107,10 +107,10 @@ class Hash
|
|
107
107
|
def deep_get *args
|
108
108
|
last_key = args.pop
|
109
109
|
# dig down to last subtree (building out if necessary)
|
110
|
-
hsh = args.inject(self){|
|
110
|
+
hsh = args.inject(self){|h, k| h[k] || {} }
|
111
111
|
# get leaf value
|
112
112
|
hsh[last_key]
|
113
|
-
end
|
113
|
+
end unless method_defined?(:deep_get)
|
114
114
|
|
115
115
|
|
116
116
|
#
|
@@ -126,20 +126,20 @@ class Hash
|
|
126
126
|
last_key = args.pop
|
127
127
|
last_hsh = args.empty? ? self : (deep_get(*args)||{})
|
128
128
|
last_hsh.delete(last_key)
|
129
|
-
end
|
129
|
+
end unless method_defined?(:deep_delete)
|
130
130
|
|
131
131
|
#
|
132
132
|
# remove all key-value pairs where the value is nil
|
133
133
|
#
|
134
134
|
def compact
|
135
135
|
reject{|key,val| val.nil? }
|
136
|
-
end
|
136
|
+
end unless method_defined?(:compact)
|
137
137
|
#
|
138
138
|
# Replace the hash with its compacted self
|
139
139
|
#
|
140
140
|
def compact!
|
141
141
|
replace(compact)
|
142
|
-
end
|
142
|
+
end unless method_defined?(:compact!)
|
143
143
|
|
144
144
|
#
|
145
145
|
# remove all key-value pairs where the value is blank
|
@@ -64,7 +64,7 @@ module Wukong
|
|
64
64
|
# Analagous to Hash#merge
|
65
65
|
#
|
66
66
|
def merge *args
|
67
|
-
self.dup.merge!
|
67
|
+
self.dup.merge!(*args)
|
68
68
|
end
|
69
69
|
def merge! hsh, &block
|
70
70
|
raise "can't handle block arg yet" if block
|
@@ -104,7 +104,7 @@ module Wukong
|
|
104
104
|
#
|
105
105
|
def from_hash(hsh, has_symbol_keys=false)
|
106
106
|
extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
|
107
|
-
self.new
|
107
|
+
self.new(*hsh.values_of(*extract_keys))
|
108
108
|
end
|
109
109
|
#
|
110
110
|
# The last portion of the class in underscored form
|
data/lib/wukong/logger.rb
CHANGED
data/lib/wukong/script.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'pathname'
|
2
|
+
require 'wukong/extensions'
|
2
3
|
require 'configliere' ; Settings.use(:commandline, :env_var, :define)
|
3
4
|
require 'wukong'
|
4
5
|
require 'wukong/script/hadoop_command'
|
@@ -127,7 +128,7 @@ module Wukong
|
|
127
128
|
def initialize mapper, reducer=nil, extra_options={}
|
128
129
|
Settings.resolve!
|
129
130
|
@options = Settings
|
130
|
-
options.merge extra_options
|
131
|
+
options.merge! extra_options
|
131
132
|
@mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
|
132
133
|
@reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
|
133
134
|
@output_path = options.rest.pop
|
@@ -173,10 +174,14 @@ module Wukong
|
|
173
174
|
# In hadoop mode, this is given to the hadoop streaming command.
|
174
175
|
# In local mode, it's given to the system() call
|
175
176
|
#
|
176
|
-
def mapper_commandline
|
177
|
+
def mapper_commandline(run_option=:local)
|
177
178
|
if mapper
|
178
|
-
|
179
|
-
|
179
|
+
case run_option
|
180
|
+
when :local then
|
181
|
+
"#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
|
182
|
+
when :hadoop then
|
183
|
+
"#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
|
184
|
+
end
|
180
185
|
else
|
181
186
|
options[:map_command]
|
182
187
|
end
|
@@ -187,10 +192,14 @@ module Wukong
|
|
187
192
|
# In hadoop mode, this is given to the hadoop streaming command.
|
188
193
|
# In local mode, it's given to the system() call
|
189
194
|
#
|
190
|
-
def reducer_commandline
|
195
|
+
def reducer_commandline(run_option=:local)
|
191
196
|
if reducer
|
192
|
-
|
193
|
-
|
197
|
+
case run_option
|
198
|
+
when :local then
|
199
|
+
"#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
|
200
|
+
when :hadoop then
|
201
|
+
"#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
|
202
|
+
end
|
194
203
|
else
|
195
204
|
options[:reduce_command]
|
196
205
|
end
|
@@ -222,6 +231,7 @@ module Wukong
|
|
222
231
|
else
|
223
232
|
maybe_overwrite_output_paths! output_path
|
224
233
|
$stdout.puts `#{command}`
|
234
|
+
raise "Streaming command failed!" unless $?.success?
|
225
235
|
end
|
226
236
|
end
|
227
237
|
|
@@ -242,7 +252,7 @@ module Wukong
|
|
242
252
|
# the map/reducer phase scripts
|
243
253
|
def non_wukong_params
|
244
254
|
options.
|
245
|
-
reject{|param, val| options.
|
255
|
+
reject{|param, val| options.definition_of(param, :wukong) }.
|
246
256
|
map{|param,val| "--#{param}=#{val}" }.
|
247
257
|
join(" ")
|
248
258
|
end
|
@@ -13,8 +13,8 @@ Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for El
|
|
13
13
|
Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
|
14
14
|
Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
|
15
15
|
#
|
16
|
-
Settings.define :
|
17
|
-
Settings.define :
|
16
|
+
Settings.define :key_pair_file, :description => 'AWS Key pair file', :type => :filename
|
17
|
+
Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
|
18
18
|
Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
|
19
19
|
Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
|
20
20
|
Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
|
@@ -45,7 +45,9 @@ module Wukong
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def hadoop_options_for_emr_runner
|
48
|
-
[hadoop_jobconf_options, hadoop_other_args].flatten.compact.map
|
48
|
+
[hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
|
49
|
+
hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
|
50
|
+
end.flatten
|
49
51
|
end
|
50
52
|
|
51
53
|
def execute_emr_runner
|
@@ -57,7 +59,7 @@ module Wukong
|
|
57
59
|
command_args << "--create --name=#{job_name}"
|
58
60
|
command_args << Settings.dashed_flag_for(:alive)
|
59
61
|
command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
|
60
|
-
command_args << Settings.dashed_flags(:availability_zone, :
|
62
|
+
command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
|
61
63
|
command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
|
62
64
|
end
|
63
65
|
command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
@@ -25,6 +25,9 @@ module Wukong
|
|
25
25
|
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
26
26
|
Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
|
27
27
|
Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
|
28
|
+
Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
|
29
|
+
Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
|
30
|
+
Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
|
28
31
|
Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
|
29
32
|
Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
|
30
33
|
Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
|
@@ -40,8 +43,8 @@ module Wukong
|
|
40
43
|
# if not, the resulting nil will be elided later
|
41
44
|
def jobconf option
|
42
45
|
if options[option]
|
43
|
-
# "-jobconf %s=%s" % [options.
|
44
|
-
"-D %s=%s" % [options.
|
46
|
+
# "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
|
47
|
+
"-D %s=%s" % [options.definition_of(option, :description), options[option]]
|
45
48
|
end
|
46
49
|
end
|
47
50
|
|
@@ -64,8 +67,8 @@ module Wukong
|
|
64
67
|
hadoop_jobconf_options,
|
65
68
|
"-D mapred.job.name='#{job_name}'",
|
66
69
|
hadoop_other_args,
|
67
|
-
"-mapper '#{mapper_commandline}'",
|
68
|
-
"-reducer '#{reducer_commandline}'",
|
70
|
+
"-mapper '#{mapper_commandline(:hadoop)}'",
|
71
|
+
"-reducer '#{reducer_commandline(:hadoop)}'",
|
69
72
|
"-input '#{input_paths}'",
|
70
73
|
"-output '#{output_path}'",
|
71
74
|
"-file '#{this_script_filename}'",
|
@@ -100,6 +103,8 @@ module Wukong
|
|
100
103
|
:partition_fields, :sort_fields,
|
101
104
|
:reduce_tasks, :respect_exit_status,
|
102
105
|
:reuse_jvms, :timeout,
|
106
|
+
:max_tracker_failures, :max_map_attempts,
|
107
|
+
:max_reduce_attempts
|
103
108
|
].map{|opt| jobconf(opt)}
|
104
109
|
jobconf_options.flatten.compact
|
105
110
|
end
|
@@ -25,7 +25,13 @@ module Wukong
|
|
25
25
|
@input_paths = input_paths.map(&:strip).join(' ')
|
26
26
|
cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
|
27
27
|
cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
|
28
|
-
|
28
|
+
|
29
|
+
if (reducer || options[:reduce_command])
|
30
|
+
%Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
|
31
|
+
else
|
32
|
+
%Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
|
33
|
+
end
|
34
|
+
|
29
35
|
end
|
30
36
|
|
31
37
|
end
|
data/lib/wukong/streamer/base.rb
CHANGED
@@ -28,11 +28,15 @@ module Wukong
|
|
28
28
|
process(*record) do |output_record|
|
29
29
|
emit output_record
|
30
30
|
end
|
31
|
-
|
31
|
+
track(record)
|
32
32
|
end
|
33
33
|
after_stream
|
34
34
|
end
|
35
35
|
|
36
|
+
def track record
|
37
|
+
monitor.periodically(record.to_s[0..1000])
|
38
|
+
end
|
39
|
+
|
36
40
|
def each_record &block
|
37
41
|
$stdin.each(&block)
|
38
42
|
end
|
@@ -103,7 +107,7 @@ module Wukong
|
|
103
107
|
# Creates a new object of this class and injects the given block
|
104
108
|
# as the process method
|
105
109
|
def self.mapper *args, &block
|
106
|
-
self.new.mapper
|
110
|
+
self.new.mapper(*args, &block)
|
107
111
|
end
|
108
112
|
|
109
113
|
# Delegates back to Wukong to run this instance as a mapper
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "2.0.0"
|
8
|
+
s.version = "2.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2011-01
|
12
|
+
s.date = %q{2011-07-01}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
@@ -139,6 +139,7 @@ Gem::Specification.new do |s|
|
|
139
139
|
"examples/contrib/jeans/sizes.rb",
|
140
140
|
"examples/corpus/bucket_counter.rb",
|
141
141
|
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
142
|
+
"examples/corpus/sentence_bigrams.rb",
|
142
143
|
"examples/corpus/sentence_coocurrence.rb",
|
143
144
|
"examples/corpus/words_to_bigrams.rb",
|
144
145
|
"examples/emr/README.textile",
|
@@ -162,7 +163,10 @@ Gem::Specification.new do |s|
|
|
162
163
|
"examples/server_logs/apache_log_parser.rb",
|
163
164
|
"examples/server_logs/breadcrumbs.rb",
|
164
165
|
"examples/server_logs/logline.rb",
|
166
|
+
"examples/server_logs/nook.rb",
|
167
|
+
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
165
168
|
"examples/server_logs/user_agent.rb",
|
169
|
+
"examples/simple_word_count.rb",
|
166
170
|
"examples/size.rb",
|
167
171
|
"examples/stats/avg_value_frequency.rb",
|
168
172
|
"examples/stats/binning_percentile_estimator.rb",
|
@@ -252,13 +256,14 @@ Gem::Specification.new do |s|
|
|
252
256
|
]
|
253
257
|
s.homepage = %q{http://mrflip.github.com/wukong}
|
254
258
|
s.require_paths = ["lib"]
|
255
|
-
s.rubygems_version = %q{1.
|
259
|
+
s.rubygems_version = %q{1.5.0}
|
256
260
|
s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
|
257
261
|
s.test_files = [
|
258
262
|
"examples/contrib/jeans/normalize.rb",
|
259
263
|
"examples/contrib/jeans/sizes.rb",
|
260
264
|
"examples/corpus/bucket_counter.rb",
|
261
265
|
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
266
|
+
"examples/corpus/sentence_bigrams.rb",
|
262
267
|
"examples/corpus/sentence_coocurrence.rb",
|
263
268
|
"examples/corpus/words_to_bigrams.rb",
|
264
269
|
"examples/emr/elastic_mapreduce_example.rb",
|
@@ -275,7 +280,10 @@ Gem::Specification.new do |s|
|
|
275
280
|
"examples/server_logs/apache_log_parser.rb",
|
276
281
|
"examples/server_logs/breadcrumbs.rb",
|
277
282
|
"examples/server_logs/logline.rb",
|
283
|
+
"examples/server_logs/nook.rb",
|
284
|
+
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
278
285
|
"examples/server_logs/user_agent.rb",
|
286
|
+
"examples/simple_word_count.rb",
|
279
287
|
"examples/size.rb",
|
280
288
|
"examples/stats/avg_value_frequency.rb",
|
281
289
|
"examples/stats/binning_percentile_estimator.rb",
|
metadata
CHANGED
@@ -1,13 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 15
|
5
4
|
prerelease:
|
6
|
-
|
7
|
-
- 2
|
8
|
-
- 0
|
9
|
-
- 0
|
10
|
-
version: 2.0.0
|
5
|
+
version: 2.0.1
|
11
6
|
platform: ruby
|
12
7
|
authors:
|
13
8
|
- Philip (flip) Kromer
|
@@ -15,7 +10,7 @@ autorequire:
|
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
12
|
|
18
|
-
date: 2011-01
|
13
|
+
date: 2011-07-01 00:00:00 -05:00
|
19
14
|
default_executable:
|
20
15
|
dependencies:
|
21
16
|
- !ruby/object:Gem::Dependency
|
@@ -26,11 +21,6 @@ dependencies:
|
|
26
21
|
requirements:
|
27
22
|
- - ">="
|
28
23
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 13
|
30
|
-
segments:
|
31
|
-
- 1
|
32
|
-
- 2
|
33
|
-
- 9
|
34
24
|
version: 1.2.9
|
35
25
|
type: :development
|
36
26
|
version_requirements: *id001
|
@@ -42,9 +32,6 @@ dependencies:
|
|
42
32
|
requirements:
|
43
33
|
- - ">="
|
44
34
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
|
-
segments:
|
47
|
-
- 0
|
48
35
|
version: "0"
|
49
36
|
type: :development
|
50
37
|
version_requirements: *id002
|
@@ -56,9 +43,6 @@ dependencies:
|
|
56
43
|
requirements:
|
57
44
|
- - ">="
|
58
45
|
- !ruby/object:Gem::Version
|
59
|
-
hash: 3
|
60
|
-
segments:
|
61
|
-
- 0
|
62
46
|
version: "0"
|
63
47
|
type: :runtime
|
64
48
|
version_requirements: *id003
|
@@ -70,9 +54,6 @@ dependencies:
|
|
70
54
|
requirements:
|
71
55
|
- - ">="
|
72
56
|
- !ruby/object:Gem::Version
|
73
|
-
hash: 3
|
74
|
-
segments:
|
75
|
-
- 0
|
76
57
|
version: "0"
|
77
58
|
type: :runtime
|
78
59
|
version_requirements: *id004
|
@@ -84,9 +65,6 @@ dependencies:
|
|
84
65
|
requirements:
|
85
66
|
- - ">="
|
86
67
|
- !ruby/object:Gem::Version
|
87
|
-
hash: 3
|
88
|
-
segments:
|
89
|
-
- 0
|
90
68
|
version: "0"
|
91
69
|
type: :runtime
|
92
70
|
version_requirements: *id005
|
@@ -98,9 +76,6 @@ dependencies:
|
|
98
76
|
requirements:
|
99
77
|
- - ">="
|
100
78
|
- !ruby/object:Gem::Version
|
101
|
-
hash: 3
|
102
|
-
segments:
|
103
|
-
- 0
|
104
79
|
version: "0"
|
105
80
|
type: :runtime
|
106
81
|
version_requirements: *id006
|
@@ -233,6 +208,7 @@ files:
|
|
233
208
|
- examples/contrib/jeans/sizes.rb
|
234
209
|
- examples/corpus/bucket_counter.rb
|
235
210
|
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
211
|
+
- examples/corpus/sentence_bigrams.rb
|
236
212
|
- examples/corpus/sentence_coocurrence.rb
|
237
213
|
- examples/corpus/words_to_bigrams.rb
|
238
214
|
- examples/emr/README.textile
|
@@ -256,7 +232,10 @@ files:
|
|
256
232
|
- examples/server_logs/apache_log_parser.rb
|
257
233
|
- examples/server_logs/breadcrumbs.rb
|
258
234
|
- examples/server_logs/logline.rb
|
235
|
+
- examples/server_logs/nook.rb
|
236
|
+
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
259
237
|
- examples/server_logs/user_agent.rb
|
238
|
+
- examples/simple_word_count.rb
|
260
239
|
- examples/size.rb
|
261
240
|
- examples/stats/avg_value_frequency.rb
|
262
241
|
- examples/stats/binning_percentile_estimator.rb
|
@@ -357,23 +336,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
357
336
|
requirements:
|
358
337
|
- - ">="
|
359
338
|
- !ruby/object:Gem::Version
|
360
|
-
hash: 3
|
361
|
-
segments:
|
362
|
-
- 0
|
363
339
|
version: "0"
|
364
340
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
365
341
|
none: false
|
366
342
|
requirements:
|
367
343
|
- - ">="
|
368
344
|
- !ruby/object:Gem::Version
|
369
|
-
hash: 3
|
370
|
-
segments:
|
371
|
-
- 0
|
372
345
|
version: "0"
|
373
346
|
requirements: []
|
374
347
|
|
375
348
|
rubyforge_project:
|
376
|
-
rubygems_version: 1.
|
349
|
+
rubygems_version: 1.5.0
|
377
350
|
signing_key:
|
378
351
|
specification_version: 3
|
379
352
|
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
|
@@ -382,6 +355,7 @@ test_files:
|
|
382
355
|
- examples/contrib/jeans/sizes.rb
|
383
356
|
- examples/corpus/bucket_counter.rb
|
384
357
|
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
358
|
+
- examples/corpus/sentence_bigrams.rb
|
385
359
|
- examples/corpus/sentence_coocurrence.rb
|
386
360
|
- examples/corpus/words_to_bigrams.rb
|
387
361
|
- examples/emr/elastic_mapreduce_example.rb
|
@@ -398,7 +372,10 @@ test_files:
|
|
398
372
|
- examples/server_logs/apache_log_parser.rb
|
399
373
|
- examples/server_logs/breadcrumbs.rb
|
400
374
|
- examples/server_logs/logline.rb
|
375
|
+
- examples/server_logs/nook.rb
|
376
|
+
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
401
377
|
- examples/server_logs/user_agent.rb
|
378
|
+
- examples/simple_word_count.rb
|
402
379
|
- examples/size.rb
|
403
380
|
- examples/stats/avg_value_frequency.rb
|
404
381
|
- examples/stats/binning_percentile_estimator.rb
|