wukong 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +0 -1
- data/TODO.textile +6 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +1 -0
- data/examples/corpus/sentence_bigrams.rb +53 -0
- data/examples/corpus/sentence_coocurrence.rb +1 -5
- data/examples/corpus/words_to_bigrams.rb +2 -1
- data/examples/ignore_me/counting.rb +1 -2
- data/examples/network_graph/adjacency_list.rb +1 -1
- data/examples/network_graph/breadth_first_search.rb +1 -1
- data/examples/network_graph/gen_2paths.rb +2 -2
- data/examples/network_graph/gen_multi_edge.rb +0 -1
- data/examples/network_graph/gen_symmetric_links.rb +1 -0
- data/examples/pagerank/pagerank.rb +5 -21
- data/examples/pagerank/pagerank_initialize.rb +1 -1
- data/examples/server_logs/apache_log_parser.rb +8 -48
- data/examples/server_logs/logline.rb +37 -13
- data/examples/server_logs/nook.rb +48 -0
- data/examples/server_logs/nook/faraday_dummy_adapter.rb +94 -0
- data/examples/simple_word_count.rb +82 -0
- data/examples/size.rb +1 -1
- data/examples/stats/binning_percentile_estimator.rb +1 -1
- data/examples/stats/rank_and_bin.rb +1 -1
- data/examples/stupidly_simple_filter.rb +1 -1
- data/lib/wukong.rb +1 -1
- data/lib/wukong/extensions.rb +2 -2
- data/lib/wukong/extensions/blank.rb +6 -6
- data/lib/wukong/extensions/hash.rb +9 -9
- data/lib/wukong/extensions/hash_like.rb +2 -2
- data/lib/wukong/extensions/symbol.rb +1 -1
- data/lib/wukong/logger.rb +1 -1
- data/lib/wukong/periodic_monitor.rb +2 -2
- data/lib/wukong/script.rb +18 -8
- data/lib/wukong/script/emr_command.rb +6 -4
- data/lib/wukong/script/hadoop_command.rb +9 -4
- data/lib/wukong/script/local_command.rb +7 -1
- data/lib/wukong/streamer/base.rb +6 -2
- data/wukong.gemspec +11 -3
- metadata +11 -34
data/README.textile
CHANGED
@@ -19,7 +19,6 @@ The **main documentation** lives on the "Wukong Pages.":http://mrflip.github.com
|
|
19
19
|
* Wukong is licensed under the "Apache License":http://mrflip.github.com/wukong/LICENSE.html (same as Hadoop)
|
20
20
|
* "More info":http://mrflip.github.com/wukong/moreinfo.html
|
21
21
|
|
22
|
-
|
23
22
|
h2. Help!
|
24
23
|
|
25
24
|
Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
|
data/TODO.textile
CHANGED
@@ -3,3 +3,9 @@
|
|
3
3
|
** We should be able to specify comma *or* space separated paths; the last
|
4
4
|
space-separated path in Settings.rest becomes the output file, the others are
|
5
5
|
used as the input_file list.
|
6
|
+
|
7
|
+
at_exit do
|
8
|
+
if $!.nil? && $0 == Goliath::Application.app_file
|
9
|
+
Application.run!
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong/script'
|
5
|
+
require 'bucket_counter'
|
6
|
+
|
7
|
+
#
|
8
|
+
# Coocurrence counts
|
9
|
+
#
|
10
|
+
|
11
|
+
#
|
12
|
+
# Input is a list of document-idx-sentences, each field is tab-separated
|
13
|
+
# title idx word_a word_b word_c ...
|
14
|
+
#
|
15
|
+
# This emits each co-occurring pair exactly once; in the case of a three-word
|
16
|
+
# sentence the output would be
|
17
|
+
#
|
18
|
+
# word_a word_b
|
19
|
+
# word_a word_c
|
20
|
+
# word_b word_c
|
21
|
+
#
|
22
|
+
class SentenceBigrams < Wukong::Streamer::RecordStreamer
|
23
|
+
def process title, idx, *words
|
24
|
+
words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
|
25
|
+
yield [word_a, word_b]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Combine multiple bucket counts into a single one
|
32
|
+
#
|
33
|
+
class CombineBuckets < Wukong::Streamer::AccumulatingReducer
|
34
|
+
def get_key *fields
|
35
|
+
fields[0..1]
|
36
|
+
end
|
37
|
+
def start! *args
|
38
|
+
@total = 0
|
39
|
+
end
|
40
|
+
def accumulate *fields
|
41
|
+
@total += 1
|
42
|
+
end
|
43
|
+
def finalize
|
44
|
+
yield [@total, key].flatten
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
Wukong.run(
|
49
|
+
SentenceBigrams,
|
50
|
+
CombineBuckets,
|
51
|
+
:io_sort_record_percent => 0.3,
|
52
|
+
:io_sort_mb => 300
|
53
|
+
)
|
@@ -26,11 +26,7 @@ class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def process title, idx, *words
|
29
|
-
words.
|
30
|
-
words[(idx+1) .. -1].each do |word_b|
|
31
|
-
@bucket << [word_a, word_b]
|
32
|
-
end
|
33
|
-
end
|
29
|
+
@bucket << words[0..-2].zip(words[1..-1])
|
34
30
|
dump_bucket if @bucket.full?
|
35
31
|
end
|
36
32
|
|
@@ -1,11 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
require 'rubygems'
|
3
3
|
require 'wukong/script'
|
4
4
|
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
5
|
module PageRank
|
10
6
|
#
|
11
7
|
# Damping factor (prob. of a 'random' jump)
|
@@ -13,16 +9,12 @@ module PageRank
|
|
13
9
|
#
|
14
10
|
DAMPING_FACTOR = 0.85
|
15
11
|
|
16
|
-
#
|
17
12
|
# Each user's line looks like
|
18
|
-
#
|
19
13
|
# user_a pagerank id1,id2,...,idN
|
20
|
-
#
|
21
14
|
# we need to disperse this user's pagerank to each of id1..idN, and
|
22
15
|
# rendezvous the list of outbound links at user_a's reducer as well.
|
23
|
-
#
|
24
16
|
module Iterating
|
25
|
-
class
|
17
|
+
class PagerankMapper < Wukong::Streamer::Base
|
26
18
|
#
|
27
19
|
# Send pagerank to each page, and send the dests list back to self
|
28
20
|
#
|
@@ -34,9 +26,7 @@ module PageRank
|
|
34
26
|
yield_own_dest_list src, dests_str, &block
|
35
27
|
end
|
36
28
|
|
37
|
-
#
|
38
29
|
# Take the source node's pagerank and distribute it among all the out-nodes
|
39
|
-
#
|
40
30
|
def yield_pagerank_shares src, pagerank, dests
|
41
31
|
pagerank_share = pagerank.to_f / dests.length
|
42
32
|
dests.each do |dest|
|
@@ -44,15 +34,13 @@ module PageRank
|
|
44
34
|
end
|
45
35
|
end
|
46
36
|
|
47
|
-
#
|
48
37
|
# Dispatch this user's out-node list to rendezvous with itself.
|
49
|
-
#
|
50
38
|
def yield_own_dest_list src, dests_str
|
51
39
|
yield [src, 'd', dests_str]
|
52
40
|
end
|
53
41
|
end
|
54
42
|
|
55
|
-
class
|
43
|
+
class PagerankReducer < Wukong::Streamer::AccumulatingReducer
|
56
44
|
attr_accessor :node_id, :pagerank, :dests_str
|
57
45
|
# Begin reduction with 0 accumulated pagerank and no dests as yet
|
58
46
|
def start! node_id, *args
|
@@ -78,11 +66,7 @@ module PageRank
|
|
78
66
|
end
|
79
67
|
end
|
80
68
|
|
81
|
-
|
82
|
-
|
83
|
-
super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
|
84
|
-
end
|
85
|
-
end
|
86
|
-
Script.new(Mapper, Reducer).run
|
69
|
+
Wukong.run(PagerankMapper, PagerankReducer,
|
70
|
+
:extra_args => ' -jobconf io.sort.record.percent=0.25 ')
|
87
71
|
end
|
88
72
|
end
|
@@ -1,58 +1,18 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby -E ASCII-8BIT
|
2
2
|
require 'rubygems'
|
3
3
|
require 'wukong/script'
|
4
|
+
$: << File.dirname(__FILE__)
|
5
|
+
require 'logline'
|
4
6
|
|
5
|
-
|
6
|
-
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
-
|
8
|
-
#
|
9
|
-
# Regular expression to parse an apache log line.
|
10
|
-
#
|
11
|
-
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
12
|
-
#
|
13
|
-
LOG_RE = Regexp.compile(%r{\A
|
14
|
-
(\S+) # ip 83.240.154.3
|
15
|
-
\s(\S+) # j1 -
|
16
|
-
\s(\S+) # j2 -
|
17
|
-
\s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
|
18
|
-
:(\d+):(\d+):(\d+) # time part :20:37:11
|
19
|
-
\s(\+.*)\] # timezone +0000]
|
20
|
-
\s\"(?:(\S+) # http_method "GET
|
21
|
-
\s(\S+) # path /faq
|
22
|
-
\s(\S+)|-)" # protocol HTTP/1.1"
|
23
|
-
\s(\d+) # response_code 200
|
24
|
-
\s(\d+) # duration 569
|
25
|
-
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
26
|
-
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
27
|
-
\z}x)
|
28
|
-
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
29
|
-
|
30
|
-
# Use the regex to break line into fields
|
31
|
-
# Emit each record as flat line
|
32
|
-
def process line
|
33
|
-
line.chomp
|
34
|
-
m = LOG_RE.match(line)
|
35
|
-
if m
|
36
|
-
(ip, j1, j2,
|
37
|
-
ts_day, ts_mo, ts_year,
|
38
|
-
ts_hour, ts_min, ts_sec, tz,
|
39
|
-
http_method, path, protocol,
|
40
|
-
response_code, duration,
|
41
|
-
referer, ua, *cruft) = m.captures
|
42
|
-
date = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
43
|
-
time = [ts_hour, ts_min, ts_sec].join("")
|
44
|
-
yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
|
45
|
-
else
|
46
|
-
yield [:unparseable, line]
|
47
|
-
end
|
48
|
-
end
|
7
|
+
class ApacheLogParser < Wukong::Streamer::LineStreamer
|
49
8
|
|
9
|
+
# create a Logline object from each record and serialize it flat to disk
|
10
|
+
def process line
|
11
|
+
yield Logline.parse(line)
|
50
12
|
end
|
51
13
|
end
|
52
14
|
|
53
|
-
Wukong.run(ApacheLogParser
|
54
|
-
|
55
|
-
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
15
|
+
Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__
|
56
16
|
|
57
17
|
|
58
18
|
|
@@ -1,9 +1,6 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'wukong/script'
|
4
|
-
|
5
1
|
class Logline < Struct.new(
|
6
|
-
|
2
|
+
:ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
|
3
|
+
# 1 2 3 4 5 6 7 8 9 10 11
|
7
4
|
|
8
5
|
def page_type
|
9
6
|
case
|
@@ -14,14 +11,41 @@ class Logline < Struct.new(
|
|
14
11
|
end
|
15
12
|
end
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
#
|
15
|
+
# Regular expression to parse an apache log line.
|
16
|
+
#
|
17
|
+
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
18
|
+
#
|
19
|
+
LOG_RE = Regexp.compile(%r{\A
|
20
|
+
(\S+) # ip 83.240.154.3
|
21
|
+
\s(\S+) # j1 -
|
22
|
+
\s(\S+) # j2 -
|
23
|
+
\s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
|
24
|
+
:(\d+):(\d+):(\d+) # time part :20:37:11
|
25
|
+
\s(\+.*)\] # timezone +0000]
|
26
|
+
\s\"(?:(\S+) # http_method "GET
|
27
|
+
\s(\S+) # path /faq
|
28
|
+
\s(\S+)|-)" # protocol HTTP/1.1"
|
29
|
+
\s(\d+) # response_code 200
|
30
|
+
\s(\d+) # size 569
|
31
|
+
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
32
|
+
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
33
|
+
\z}x)
|
34
|
+
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
21
35
|
|
22
|
-
|
23
|
-
|
24
|
-
|
36
|
+
# Use the regex to break line into fields
|
37
|
+
# Emit each record as flat line
|
38
|
+
def self.parse line
|
39
|
+
m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
|
40
|
+
(ip, j1, j2,
|
41
|
+
ts_day, ts_mo, ts_year,
|
42
|
+
ts_hour, ts_min, ts_sec, tz,
|
43
|
+
http_method, path, protocol,
|
44
|
+
response_code, size,
|
45
|
+
referer, ua, *cruft) = m.captures
|
46
|
+
dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
47
|
+
tm = [ts_hour, ts_min, ts_sec].join("")
|
48
|
+
self.new( ip, dt, tm, http_method, protocol, path, response_code, size, referer, ua, tz, j1, j2 )
|
25
49
|
end
|
50
|
+
|
26
51
|
end
|
27
|
-
Wukong.run(PageFilter)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby -E BINARY
|
2
|
+
require 'rubygems'
|
3
|
+
require 'faraday'
|
4
|
+
require 'wukong/script'
|
5
|
+
require 'json'
|
6
|
+
$: << File.dirname(__FILE__)
|
7
|
+
require 'apache_log_parser'
|
8
|
+
require 'nook/faraday_dummy_adapter'
|
9
|
+
|
10
|
+
Settings.define :target_host, :default => 'localhost', :description => "The host name or IP address to target"
|
11
|
+
Settings.define :target_scheme, :default => 'http', :description => "Request scheme (http, https)"
|
12
|
+
|
13
|
+
#
|
14
|
+
# A Nook consumes its input stream and, for each input, generates an HTTP
|
15
|
+
# request against a remote host. Please use it for good and never for evil.
|
16
|
+
#
|
17
|
+
# You can use it from your command line:
|
18
|
+
# zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --host=http://my_own_host.com
|
19
|
+
#
|
20
|
+
#
|
21
|
+
class NookMapper < ApacheLogParser
|
22
|
+
# parse each record into a Logline, request its path through the fetcher, and emit the timing, status, size, path and body of the response
|
23
|
+
def process line
|
24
|
+
super(line) do |logline|
|
25
|
+
start = Time.now
|
26
|
+
resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
|
27
|
+
yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def track record
|
32
|
+
monitor.periodically do |m|
|
33
|
+
m.progress
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# a mock fetcher with a fixed 0.05 second delay (use 0.2 * rand() instead for a uniformly distributed variable delay)
|
38
|
+
def fetcher
|
39
|
+
@fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
|
40
|
+
f.use Faraday::Adapter::Dummy do |dummy|
|
41
|
+
dummy.delay = Proc.new{|env| 0.05 } # 0.2 * rand()
|
42
|
+
# dummy.body = Proc.new{|env| env[:url] }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
Wukong.run( NookMapper, nil, :sort_fields => 7 )
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
module Faraday
|
3
|
+
class Adapter
|
4
|
+
|
5
|
+
# test = Faraday::Connection.new do |f|
|
6
|
+
# f.use Faraday::Adapter::Dummy do |dummy|
|
7
|
+
# dummy.status 404
|
8
|
+
# dummy.delay 1
|
9
|
+
# end
|
10
|
+
# end
|
11
|
+
#
|
12
|
+
# # this will delay 0.2s, returning 404 with
|
13
|
+
# resp = test.get("/your/mom", :dummy_delay => 0.2)
|
14
|
+
# resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
|
15
|
+
#
|
16
|
+
# More example:
|
17
|
+
#
|
18
|
+
# test = Faraday::Connection.new do |f|
|
19
|
+
# f.use Faraday::Adapter::Dummy, :status => 503
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# test = Faraday::Connection.new do |f|
|
23
|
+
# f.use Faraday::Adapter::Dummy do |dummy|
|
24
|
+
# dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
|
25
|
+
# end
|
26
|
+
# end
|
27
|
+
#
|
28
|
+
class Dummy < Middleware
|
29
|
+
include Addressable
|
30
|
+
attr_reader :config
|
31
|
+
def self.loaded?() false end
|
32
|
+
|
33
|
+
# gets value from environment if set, configured instance variable otherwise
|
34
|
+
def value_for env, key
|
35
|
+
val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
|
36
|
+
if val.respond_to?(:call)
|
37
|
+
val = val.call(env)
|
38
|
+
end
|
39
|
+
val
|
40
|
+
end
|
41
|
+
|
42
|
+
# With an optional delay, constructs a [status, headers, response] based on the first of:
|
43
|
+
# * request header field (Dummy-Status, Dummy-Headers, Dummy-Body)
|
44
|
+
# * adapter's configuration:
|
45
|
+
# * Unless one of the above is set, body will return a json string taken from the request hash
|
46
|
+
#
|
47
|
+
def call(env)
|
48
|
+
status = value_for(env, :status)
|
49
|
+
headers = value_for(env, :headers)
|
50
|
+
headers = JSON.load(headers) if headers.is_a? String
|
51
|
+
body = value_for(env, :body) ||
|
52
|
+
env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
|
53
|
+
delay = value_for(env, :delay).to_f
|
54
|
+
sleep delay if delay > 0
|
55
|
+
headers[:dummy_delay] = delay
|
56
|
+
env.update(
|
57
|
+
:status => status,
|
58
|
+
:response_headers => headers,
|
59
|
+
:body => body)
|
60
|
+
@app.call(env)
|
61
|
+
end
|
62
|
+
|
63
|
+
class Configurator < Struct.new(:status, :headers, :delay, :body)
|
64
|
+
def status(val=nil) self.status = val if val ; super() end
|
65
|
+
def headers(val=nil) self.headers = val if val ; super() end
|
66
|
+
def body(val=nil) self.body = val if val ; super() end
|
67
|
+
def delay(val=nil) self.delay = val if val ; super() end
|
68
|
+
def self.from_hash hsh
|
69
|
+
new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def initialize(app, defaults={}, &block)
|
74
|
+
super(app)
|
75
|
+
@config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
|
76
|
+
configure(&block) if block
|
77
|
+
end
|
78
|
+
|
79
|
+
def configure
|
80
|
+
yield config
|
81
|
+
end
|
82
|
+
|
83
|
+
# same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
|
84
|
+
def header_hash_key(str)
|
85
|
+
str.to_s.split('_').each{|w| w.capitalize! }.join('-')
|
86
|
+
end
|
87
|
+
|
88
|
+
def create_multipart(env, params, boundary = nil)
|
89
|
+
stream = super
|
90
|
+
stream.read
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
module WordCount
|
6
|
+
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
+
#
|
8
|
+
# Emit each word in each line.
|
9
|
+
#
|
10
|
+
def process line
|
11
|
+
tokenize(line).each{|word| yield [word, 1] }
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Split a string into its constituent words.
|
16
|
+
#
|
17
|
+
# This is pretty simpleminded:
|
18
|
+
# * downcase the word
|
19
|
+
# * Split at any non-alphanumeric boundary, including '_'
|
20
|
+
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
21
|
+
# word.
|
22
|
+
#
|
23
|
+
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
24
|
+
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
25
|
+
#
|
26
|
+
def tokenize str
|
27
|
+
return [] if str.blank?
|
28
|
+
str = str.downcase;
|
29
|
+
# kill off all punctuation except [stuff]'s or [stuff]'t
|
30
|
+
# this includes hyphens (words are split)
|
31
|
+
str = str.
|
32
|
+
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
33
|
+
gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
34
|
+
# Busticate at whitespace
|
35
|
+
words = str.split(/\s+/)
|
36
|
+
words.reject!{|w| w.blank? }
|
37
|
+
words
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
43
|
+
#
|
44
|
+
class Reducer2 < Wukong::Streamer::AccumulatingReducer
|
45
|
+
|
46
|
+
def start!(*args)
|
47
|
+
@key_count = 0
|
48
|
+
end
|
49
|
+
|
50
|
+
def accumulate(*args)
|
51
|
+
@key_count += 1
|
52
|
+
end
|
53
|
+
|
54
|
+
def finalize
|
55
|
+
yield [ key, @key_count ]
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
#
|
60
|
+
# You can stack up all the values in a list then sum them at once.
|
61
|
+
#
|
62
|
+
# This isn't good style, as it means the whole list is held in memory
|
63
|
+
#
|
64
|
+
class Reducer1 < Wukong::Streamer::ListReducer
|
65
|
+
def finalize
|
66
|
+
yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# ... easiest of all, though: this is common enough that it's already included
|
72
|
+
#
|
73
|
+
require 'wukong/streamer/count_keys'
|
74
|
+
class Reducer3 < Wukong::Streamer::CountKeys
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Execute the script
|
79
|
+
Wukong.run(
|
80
|
+
WordCount::Mapper,
|
81
|
+
WordCount::Reducer2
|
82
|
+
)
|
data/examples/size.rb
CHANGED
data/lib/wukong.rb
CHANGED
data/lib/wukong/extensions.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
# These pull in the minimal functionality of the extlib|activesupport family of
|
3
3
|
# gems.
|
4
4
|
#
|
5
|
-
require '
|
6
|
-
require '
|
5
|
+
require 'extlib/blank'
|
6
|
+
require 'extlib/class'
|
7
7
|
require 'wukong/extensions/enumerable'
|
8
8
|
require 'wukong/extensions/symbol'
|
9
9
|
require 'wukong/extensions/hash'
|
@@ -15,7 +15,7 @@ class Object
|
|
15
15
|
# @api public
|
16
16
|
def blank?
|
17
17
|
nil? || (respond_to?(:empty?) && empty?)
|
18
|
-
end
|
18
|
+
end unless method_defined?(:blank?)
|
19
19
|
end # class Object
|
20
20
|
|
21
21
|
class Numeric
|
@@ -31,7 +31,7 @@ class Numeric
|
|
31
31
|
# @api public
|
32
32
|
def blank?
|
33
33
|
false
|
34
|
-
end
|
34
|
+
end unless method_defined?(:blank?)
|
35
35
|
end # class Numeric
|
36
36
|
|
37
37
|
class NilClass
|
@@ -45,7 +45,7 @@ class NilClass
|
|
45
45
|
# @api public
|
46
46
|
def blank?
|
47
47
|
true
|
48
|
-
end
|
48
|
+
end unless method_defined?(:blank?)
|
49
49
|
end # class NilClass
|
50
50
|
|
51
51
|
class TrueClass
|
@@ -59,7 +59,7 @@ class TrueClass
|
|
59
59
|
# @api public
|
60
60
|
def blank?
|
61
61
|
false
|
62
|
-
end
|
62
|
+
end unless method_defined?(:blank?)
|
63
63
|
end # class TrueClass
|
64
64
|
|
65
65
|
class FalseClass
|
@@ -73,7 +73,7 @@ class FalseClass
|
|
73
73
|
# @api public
|
74
74
|
def blank?
|
75
75
|
true
|
76
|
-
end
|
76
|
+
end unless method_defined?(:blank?)
|
77
77
|
end # class FalseClass
|
78
78
|
|
79
79
|
class String
|
@@ -89,5 +89,5 @@ class String
|
|
89
89
|
# @api public
|
90
90
|
def blank?
|
91
91
|
strip.empty?
|
92
|
-
end
|
92
|
+
end unless method_defined?(:blank?)
|
93
93
|
end # class String
|
@@ -66,11 +66,11 @@ class Hash
|
|
66
66
|
#
|
67
67
|
def deep_merge hsh2
|
68
68
|
merge hsh2, &Hash::DEEP_MERGER
|
69
|
-
end
|
69
|
+
end unless method_defined?(:deep_merge)
|
70
70
|
|
71
71
|
def deep_merge! hsh2
|
72
72
|
merge! hsh2, &Hash::DEEP_MERGER
|
73
|
-
end
|
73
|
+
end unless method_defined?(:deep_merge!)
|
74
74
|
|
75
75
|
#
|
76
76
|
# Treat hash as tree of hashes:
|
@@ -86,10 +86,10 @@ class Hash
|
|
86
86
|
val = args.pop
|
87
87
|
last_key = args.pop
|
88
88
|
# dig down to last subtree (building out if necessary)
|
89
|
-
hsh = args.empty? ? self : args.inject(self){|
|
89
|
+
hsh = args.empty? ? self : args.inject(self){|h, k| h[k] ||= {} }
|
90
90
|
# set leaf value
|
91
91
|
hsh[last_key] = val
|
92
|
-
end
|
92
|
+
end unless method_defined?(:deep_set)
|
93
93
|
|
94
94
|
#
|
95
95
|
# Treat hash as tree of hashes:
|
@@ -107,10 +107,10 @@ class Hash
|
|
107
107
|
def deep_get *args
|
108
108
|
last_key = args.pop
|
109
109
|
# dig down to last subtree (building out if necessary)
|
110
|
-
hsh = args.inject(self){|
|
110
|
+
hsh = args.inject(self){|h, k| h[k] || {} }
|
111
111
|
# get leaf value
|
112
112
|
hsh[last_key]
|
113
|
-
end
|
113
|
+
end unless method_defined?(:deep_get)
|
114
114
|
|
115
115
|
|
116
116
|
#
|
@@ -126,20 +126,20 @@ class Hash
|
|
126
126
|
last_key = args.pop
|
127
127
|
last_hsh = args.empty? ? self : (deep_get(*args)||{})
|
128
128
|
last_hsh.delete(last_key)
|
129
|
-
end
|
129
|
+
end unless method_defined?(:deep_delete)
|
130
130
|
|
131
131
|
#
|
132
132
|
# remove all key-value pairs where the value is nil
|
133
133
|
#
|
134
134
|
def compact
|
135
135
|
reject{|key,val| val.nil? }
|
136
|
-
end
|
136
|
+
end unless method_defined?(:compact)
|
137
137
|
#
|
138
138
|
# Replace the hash with its compacted self
|
139
139
|
#
|
140
140
|
def compact!
|
141
141
|
replace(compact)
|
142
|
-
end
|
142
|
+
end unless method_defined?(:compact!)
|
143
143
|
|
144
144
|
#
|
145
145
|
# remove all key-value pairs where the value is blank
|
@@ -64,7 +64,7 @@ module Wukong
|
|
64
64
|
# Analogous to Hash#merge
|
65
65
|
#
|
66
66
|
def merge *args
|
67
|
-
self.dup.merge!
|
67
|
+
self.dup.merge!(*args)
|
68
68
|
end
|
69
69
|
def merge! hsh, &block
|
70
70
|
raise "can't handle block arg yet" if block
|
@@ -104,7 +104,7 @@ module Wukong
|
|
104
104
|
#
|
105
105
|
def from_hash(hsh, has_symbol_keys=false)
|
106
106
|
extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
|
107
|
-
self.new
|
107
|
+
self.new(*hsh.values_of(*extract_keys))
|
108
108
|
end
|
109
109
|
#
|
110
110
|
# The last portion of the class in underscored form
|
data/lib/wukong/logger.rb
CHANGED
data/lib/wukong/script.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'pathname'
|
2
|
+
require 'wukong/extensions'
|
2
3
|
require 'configliere' ; Settings.use(:commandline, :env_var, :define)
|
3
4
|
require 'wukong'
|
4
5
|
require 'wukong/script/hadoop_command'
|
@@ -127,7 +128,7 @@ module Wukong
|
|
127
128
|
def initialize mapper, reducer=nil, extra_options={}
|
128
129
|
Settings.resolve!
|
129
130
|
@options = Settings
|
130
|
-
options.merge extra_options
|
131
|
+
options.merge! extra_options
|
131
132
|
@mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
|
132
133
|
@reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
|
133
134
|
@output_path = options.rest.pop
|
@@ -173,10 +174,14 @@ module Wukong
|
|
173
174
|
# In hadoop mode, this is given to the hadoop streaming command.
|
174
175
|
# In local mode, it's given to the system() call
|
175
176
|
#
|
176
|
-
def mapper_commandline
|
177
|
+
def mapper_commandline(run_option=:local)
|
177
178
|
if mapper
|
178
|
-
|
179
|
-
|
179
|
+
case run_option
|
180
|
+
when :local then
|
181
|
+
"#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
|
182
|
+
when :hadoop then
|
183
|
+
"#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
|
184
|
+
end
|
180
185
|
else
|
181
186
|
options[:map_command]
|
182
187
|
end
|
@@ -187,10 +192,14 @@ module Wukong
|
|
187
192
|
# In hadoop mode, this is given to the hadoop streaming command.
|
188
193
|
# In local mode, it's given to the system() call
|
189
194
|
#
|
190
|
-
def reducer_commandline
|
195
|
+
def reducer_commandline(run_option=:local)
|
191
196
|
if reducer
|
192
|
-
|
193
|
-
|
197
|
+
case run_option
|
198
|
+
when :local then
|
199
|
+
"#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
|
200
|
+
when :hadoop then
|
201
|
+
"#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
|
202
|
+
end
|
194
203
|
else
|
195
204
|
options[:reduce_command]
|
196
205
|
end
|
@@ -222,6 +231,7 @@ module Wukong
|
|
222
231
|
else
|
223
232
|
maybe_overwrite_output_paths! output_path
|
224
233
|
$stdout.puts `#{command}`
|
234
|
+
raise "Streaming command failed!" unless $?.success?
|
225
235
|
end
|
226
236
|
end
|
227
237
|
|
@@ -242,7 +252,7 @@ module Wukong
|
|
242
252
|
# the map/reducer phase scripts
|
243
253
|
def non_wukong_params
|
244
254
|
options.
|
245
|
-
reject{|param, val| options.
|
255
|
+
reject{|param, val| options.definition_of(param, :wukong) }.
|
246
256
|
map{|param,val| "--#{param}=#{val}" }.
|
247
257
|
join(" ")
|
248
258
|
end
|
@@ -13,8 +13,8 @@ Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for El
|
|
13
13
|
Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
|
14
14
|
Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
|
15
15
|
#
|
16
|
-
Settings.define :
|
17
|
-
Settings.define :
|
16
|
+
Settings.define :key_pair_file, :description => 'AWS Key pair file', :type => :filename
|
17
|
+
Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
|
18
18
|
Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
|
19
19
|
Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
|
20
20
|
Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
|
@@ -45,7 +45,9 @@ module Wukong
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def hadoop_options_for_emr_runner
|
48
|
-
[hadoop_jobconf_options, hadoop_other_args].flatten.compact.map
|
48
|
+
[hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
|
49
|
+
hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
|
50
|
+
end.flatten
|
49
51
|
end
|
50
52
|
|
51
53
|
def execute_emr_runner
|
@@ -57,7 +59,7 @@ module Wukong
|
|
57
59
|
command_args << "--create --name=#{job_name}"
|
58
60
|
command_args << Settings.dashed_flag_for(:alive)
|
59
61
|
command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
|
60
|
-
command_args << Settings.dashed_flags(:availability_zone, :
|
62
|
+
command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
|
61
63
|
command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
|
62
64
|
end
|
63
65
|
command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
@@ -25,6 +25,9 @@ module Wukong
|
|
25
25
|
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
26
26
|
Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
|
27
27
|
Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
|
28
|
+
Settings.define :max_tracker_failures, :jobconf => true, :description => 'mapred.max.tracker.failures', :wukong => true
|
29
|
+
Settings.define :max_map_attempts, :jobconf => true, :description => 'mapred.map.max.attempts', :wukong => true
|
30
|
+
Settings.define :max_reduce_attempts, :jobconf => true, :description => 'mapred.reduce.max.attempts', :wukong => true
|
28
31
|
Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
|
29
32
|
Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
|
30
33
|
Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
|
@@ -40,8 +43,8 @@ module Wukong
|
|
40
43
|
# if not, the resulting nil will be elided later
|
41
44
|
def jobconf option
|
42
45
|
if options[option]
|
43
|
-
# "-jobconf %s=%s" % [options.
|
44
|
-
"-D %s=%s" % [options.
|
46
|
+
# "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
|
47
|
+
"-D %s=%s" % [options.definition_of(option, :description), options[option]]
|
45
48
|
end
|
46
49
|
end
|
47
50
|
|
@@ -64,8 +67,8 @@ module Wukong
|
|
64
67
|
hadoop_jobconf_options,
|
65
68
|
"-D mapred.job.name='#{job_name}'",
|
66
69
|
hadoop_other_args,
|
67
|
-
"-mapper '#{mapper_commandline}'",
|
68
|
-
"-reducer '#{reducer_commandline}'",
|
70
|
+
"-mapper '#{mapper_commandline(:hadoop)}'",
|
71
|
+
"-reducer '#{reducer_commandline(:hadoop)}'",
|
69
72
|
"-input '#{input_paths}'",
|
70
73
|
"-output '#{output_path}'",
|
71
74
|
"-file '#{this_script_filename}'",
|
@@ -100,6 +103,8 @@ module Wukong
|
|
100
103
|
:partition_fields, :sort_fields,
|
101
104
|
:reduce_tasks, :respect_exit_status,
|
102
105
|
:reuse_jvms, :timeout,
|
106
|
+
:max_tracker_failures, :max_map_attempts,
|
107
|
+
:max_reduce_attempts
|
103
108
|
].map{|opt| jobconf(opt)}
|
104
109
|
jobconf_options.flatten.compact
|
105
110
|
end
|
@@ -25,7 +25,13 @@ module Wukong
|
|
25
25
|
@input_paths = input_paths.map(&:strip).join(' ')
|
26
26
|
cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
|
27
27
|
cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
|
28
|
-
|
28
|
+
|
29
|
+
if (reducer || options[:reduce_command])
|
30
|
+
%Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
|
31
|
+
else
|
32
|
+
%Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
|
33
|
+
end
|
34
|
+
|
29
35
|
end
|
30
36
|
|
31
37
|
end
|
data/lib/wukong/streamer/base.rb
CHANGED
@@ -28,11 +28,15 @@ module Wukong
|
|
28
28
|
process(*record) do |output_record|
|
29
29
|
emit output_record
|
30
30
|
end
|
31
|
-
|
31
|
+
track(record)
|
32
32
|
end
|
33
33
|
after_stream
|
34
34
|
end
|
35
35
|
|
36
|
+
def track record
|
37
|
+
monitor.periodically(record.to_s[0..1000])
|
38
|
+
end
|
39
|
+
|
36
40
|
def each_record &block
|
37
41
|
$stdin.each(&block)
|
38
42
|
end
|
@@ -103,7 +107,7 @@ module Wukong
|
|
103
107
|
# Creates a new object of this class and injects the given block
|
104
108
|
# as the process method
|
105
109
|
def self.mapper *args, &block
|
106
|
-
self.new.mapper
|
110
|
+
self.new.mapper(*args, &block)
|
107
111
|
end
|
108
112
|
|
109
113
|
# Delegates back to Wukong to run this instance as a mapper
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "2.0.0"
|
8
|
+
s.version = "2.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2011-01
|
12
|
+
s.date = %q{2011-07-01}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
@@ -139,6 +139,7 @@ Gem::Specification.new do |s|
|
|
139
139
|
"examples/contrib/jeans/sizes.rb",
|
140
140
|
"examples/corpus/bucket_counter.rb",
|
141
141
|
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
142
|
+
"examples/corpus/sentence_bigrams.rb",
|
142
143
|
"examples/corpus/sentence_coocurrence.rb",
|
143
144
|
"examples/corpus/words_to_bigrams.rb",
|
144
145
|
"examples/emr/README.textile",
|
@@ -162,7 +163,10 @@ Gem::Specification.new do |s|
|
|
162
163
|
"examples/server_logs/apache_log_parser.rb",
|
163
164
|
"examples/server_logs/breadcrumbs.rb",
|
164
165
|
"examples/server_logs/logline.rb",
|
166
|
+
"examples/server_logs/nook.rb",
|
167
|
+
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
165
168
|
"examples/server_logs/user_agent.rb",
|
169
|
+
"examples/simple_word_count.rb",
|
166
170
|
"examples/size.rb",
|
167
171
|
"examples/stats/avg_value_frequency.rb",
|
168
172
|
"examples/stats/binning_percentile_estimator.rb",
|
@@ -252,13 +256,14 @@ Gem::Specification.new do |s|
|
|
252
256
|
]
|
253
257
|
s.homepage = %q{http://mrflip.github.com/wukong}
|
254
258
|
s.require_paths = ["lib"]
|
255
|
-
s.rubygems_version = %q{1.
|
259
|
+
s.rubygems_version = %q{1.5.0}
|
256
260
|
s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
|
257
261
|
s.test_files = [
|
258
262
|
"examples/contrib/jeans/normalize.rb",
|
259
263
|
"examples/contrib/jeans/sizes.rb",
|
260
264
|
"examples/corpus/bucket_counter.rb",
|
261
265
|
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
266
|
+
"examples/corpus/sentence_bigrams.rb",
|
262
267
|
"examples/corpus/sentence_coocurrence.rb",
|
263
268
|
"examples/corpus/words_to_bigrams.rb",
|
264
269
|
"examples/emr/elastic_mapreduce_example.rb",
|
@@ -275,7 +280,10 @@ Gem::Specification.new do |s|
|
|
275
280
|
"examples/server_logs/apache_log_parser.rb",
|
276
281
|
"examples/server_logs/breadcrumbs.rb",
|
277
282
|
"examples/server_logs/logline.rb",
|
283
|
+
"examples/server_logs/nook.rb",
|
284
|
+
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
278
285
|
"examples/server_logs/user_agent.rb",
|
286
|
+
"examples/simple_word_count.rb",
|
279
287
|
"examples/size.rb",
|
280
288
|
"examples/stats/avg_value_frequency.rb",
|
281
289
|
"examples/stats/binning_percentile_estimator.rb",
|
metadata
CHANGED
@@ -1,13 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 15
|
5
4
|
prerelease:
|
6
|
-
|
7
|
-
- 2
|
8
|
-
- 0
|
9
|
-
- 0
|
10
|
-
version: 2.0.0
|
5
|
+
version: 2.0.1
|
11
6
|
platform: ruby
|
12
7
|
authors:
|
13
8
|
- Philip (flip) Kromer
|
@@ -15,7 +10,7 @@ autorequire:
|
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
12
|
|
18
|
-
date: 2011-01
|
13
|
+
date: 2011-07-01 00:00:00 -05:00
|
19
14
|
default_executable:
|
20
15
|
dependencies:
|
21
16
|
- !ruby/object:Gem::Dependency
|
@@ -26,11 +21,6 @@ dependencies:
|
|
26
21
|
requirements:
|
27
22
|
- - ">="
|
28
23
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 13
|
30
|
-
segments:
|
31
|
-
- 1
|
32
|
-
- 2
|
33
|
-
- 9
|
34
24
|
version: 1.2.9
|
35
25
|
type: :development
|
36
26
|
version_requirements: *id001
|
@@ -42,9 +32,6 @@ dependencies:
|
|
42
32
|
requirements:
|
43
33
|
- - ">="
|
44
34
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
|
-
segments:
|
47
|
-
- 0
|
48
35
|
version: "0"
|
49
36
|
type: :development
|
50
37
|
version_requirements: *id002
|
@@ -56,9 +43,6 @@ dependencies:
|
|
56
43
|
requirements:
|
57
44
|
- - ">="
|
58
45
|
- !ruby/object:Gem::Version
|
59
|
-
hash: 3
|
60
|
-
segments:
|
61
|
-
- 0
|
62
46
|
version: "0"
|
63
47
|
type: :runtime
|
64
48
|
version_requirements: *id003
|
@@ -70,9 +54,6 @@ dependencies:
|
|
70
54
|
requirements:
|
71
55
|
- - ">="
|
72
56
|
- !ruby/object:Gem::Version
|
73
|
-
hash: 3
|
74
|
-
segments:
|
75
|
-
- 0
|
76
57
|
version: "0"
|
77
58
|
type: :runtime
|
78
59
|
version_requirements: *id004
|
@@ -84,9 +65,6 @@ dependencies:
|
|
84
65
|
requirements:
|
85
66
|
- - ">="
|
86
67
|
- !ruby/object:Gem::Version
|
87
|
-
hash: 3
|
88
|
-
segments:
|
89
|
-
- 0
|
90
68
|
version: "0"
|
91
69
|
type: :runtime
|
92
70
|
version_requirements: *id005
|
@@ -98,9 +76,6 @@ dependencies:
|
|
98
76
|
requirements:
|
99
77
|
- - ">="
|
100
78
|
- !ruby/object:Gem::Version
|
101
|
-
hash: 3
|
102
|
-
segments:
|
103
|
-
- 0
|
104
79
|
version: "0"
|
105
80
|
type: :runtime
|
106
81
|
version_requirements: *id006
|
@@ -233,6 +208,7 @@ files:
|
|
233
208
|
- examples/contrib/jeans/sizes.rb
|
234
209
|
- examples/corpus/bucket_counter.rb
|
235
210
|
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
211
|
+
- examples/corpus/sentence_bigrams.rb
|
236
212
|
- examples/corpus/sentence_coocurrence.rb
|
237
213
|
- examples/corpus/words_to_bigrams.rb
|
238
214
|
- examples/emr/README.textile
|
@@ -256,7 +232,10 @@ files:
|
|
256
232
|
- examples/server_logs/apache_log_parser.rb
|
257
233
|
- examples/server_logs/breadcrumbs.rb
|
258
234
|
- examples/server_logs/logline.rb
|
235
|
+
- examples/server_logs/nook.rb
|
236
|
+
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
259
237
|
- examples/server_logs/user_agent.rb
|
238
|
+
- examples/simple_word_count.rb
|
260
239
|
- examples/size.rb
|
261
240
|
- examples/stats/avg_value_frequency.rb
|
262
241
|
- examples/stats/binning_percentile_estimator.rb
|
@@ -357,23 +336,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
357
336
|
requirements:
|
358
337
|
- - ">="
|
359
338
|
- !ruby/object:Gem::Version
|
360
|
-
hash: 3
|
361
|
-
segments:
|
362
|
-
- 0
|
363
339
|
version: "0"
|
364
340
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
365
341
|
none: false
|
366
342
|
requirements:
|
367
343
|
- - ">="
|
368
344
|
- !ruby/object:Gem::Version
|
369
|
-
hash: 3
|
370
|
-
segments:
|
371
|
-
- 0
|
372
345
|
version: "0"
|
373
346
|
requirements: []
|
374
347
|
|
375
348
|
rubyforge_project:
|
376
|
-
rubygems_version: 1.
|
349
|
+
rubygems_version: 1.5.0
|
377
350
|
signing_key:
|
378
351
|
specification_version: 3
|
379
352
|
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
|
@@ -382,6 +355,7 @@ test_files:
|
|
382
355
|
- examples/contrib/jeans/sizes.rb
|
383
356
|
- examples/corpus/bucket_counter.rb
|
384
357
|
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
358
|
+
- examples/corpus/sentence_bigrams.rb
|
385
359
|
- examples/corpus/sentence_coocurrence.rb
|
386
360
|
- examples/corpus/words_to_bigrams.rb
|
387
361
|
- examples/emr/elastic_mapreduce_example.rb
|
@@ -398,7 +372,10 @@ test_files:
|
|
398
372
|
- examples/server_logs/apache_log_parser.rb
|
399
373
|
- examples/server_logs/breadcrumbs.rb
|
400
374
|
- examples/server_logs/logline.rb
|
375
|
+
- examples/server_logs/nook.rb
|
376
|
+
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
401
377
|
- examples/server_logs/user_agent.rb
|
378
|
+
- examples/simple_word_count.rb
|
402
379
|
- examples/size.rb
|
403
380
|
- examples/stats/avg_value_frequency.rb
|
404
381
|
- examples/stats/binning_percentile_estimator.rb
|