wukong 1.4.6 → 1.4.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +12 -0
- data/README.textile +1 -0
- data/examples/apache_log_parser.rb +49 -40
- data/examples/server_logs/breadcrumbs.rb +36 -0
- data/examples/server_logs/user_agent.rb +40 -0
- data/examples/stats/avg_value_frequency.rb +86 -0
- data/examples/stats/data/avg_value_frequency.tsv +3 -0
- data/lib/wukong/script.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +4 -1
- data/lib/wukong/streamer.rb +1 -0
- data/lib/wukong/streamer/base.rb +6 -3
- data/lib/wukong/streamer/counting_reducer.rb +25 -0
- data/wukong.gemspec +11 -3
- metadata +38 -17
data/CHANGELOG.textile
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
h2. Wukong v1.4.7 2010-03-05
|
2
|
+
|
3
|
+
Lots more examples:
|
4
|
+
* examples/stats/avg_value_frequency.rb does an Average Value Frequency histogram
|
5
|
+
* examples/server_logs has a quite useful apache log file parser
|
6
|
+
* Made the base streamer use each_record, opening the door for alternative record injection (eg Datamapper!)
|
7
|
+
* wukong/streamer/counting_reducer.rb is an um reducer and it counts things.
|
8
|
+
|
9
|
+
h2. Wukong v1.4.6 2010-01-26
|
10
|
+
|
11
|
+
* A HELLA AWESOME working example from retail web analytics by @lenbust
|
12
|
+
|
1
13
|
h2. Wukong v1.4.5 2010-01-18
|
2
14
|
|
3
15
|
* In @--run=local@ mode, you can use '-' alone as a filename to indicate STDIN / STDOUT as input/output respectively.
|
data/README.textile
CHANGED
@@ -220,6 +220,7 @@ Patches submitted by:
|
|
220
220
|
* ruby interpreter path fix by "Yuichiro MASUI":http://github.com/masuidrive - masui at masuidrive.jp - http://blog.masuidrive.jp/
|
221
221
|
|
222
222
|
Thanks to:
|
223
|
+
* "Fredrik Möllerstrand (@lenbust)":http://twitter.com/lenbust for the examples/contrib/jeans working example
|
223
224
|
* "Brad Heintz":http://www.bradheintz.com/no1thing/talks/ for his early feedback
|
224
225
|
* "Phil Ripperger":http://blog.pdatasolutions.com for his "wukong in the Amazon AWS cloud":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart tutorial.
|
225
226
|
|
@@ -1,22 +1,45 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'rubygems'
|
3
4
|
require 'wukong'
|
4
5
|
|
6
|
+
MONTHS = {
|
7
|
+
'Jan' => '01',
|
8
|
+
'Feb' => '02',
|
9
|
+
'Mar' => '03',
|
10
|
+
'Apr' => '04',
|
11
|
+
'May' => '05',
|
12
|
+
'Jun' => '06',
|
13
|
+
'Jul' => '07',
|
14
|
+
'Aug' => '08',
|
15
|
+
'Sep' => '09',
|
16
|
+
'Oct' => '10',
|
17
|
+
'Nov' => '11',
|
18
|
+
'Dec' => '12',
|
19
|
+
}
|
5
20
|
module ApacheLogParser
|
6
21
|
class Mapper < Wukong::Streamer::LineStreamer
|
7
22
|
|
8
|
-
#
|
9
|
-
#
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
\s(\
|
16
|
-
\s
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
#
|
24
|
+
# Regular expression to parse an apache log line.
|
25
|
+
#
|
26
|
+
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
27
|
+
#
|
28
|
+
LOG_RE = Regexp.compile(%r{\A
|
29
|
+
(\S+) # ip 83.240.154.3
|
30
|
+
\s(\S+) # j1 -
|
31
|
+
\s(\S+) # j2 -
|
32
|
+
\s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
|
33
|
+
:(\d+):(\d+):(\d+) # time part :20:37:11
|
34
|
+
\s(\+.*)\] # timezone +0000]
|
35
|
+
\s\"(?:(\S+) # http_method "GET
|
36
|
+
\s(\S+) # path /faq
|
37
|
+
\s(\S+)|-)" # protocol HTTP/1.1"
|
38
|
+
\s(\d+) # response_code 200
|
39
|
+
\s(\d+) # duration 569
|
40
|
+
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
41
|
+
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
42
|
+
\z}x)
|
20
43
|
|
21
44
|
# Use the regex to break line into fields
|
22
45
|
# Emit each record as flat line
|
@@ -24,42 +47,28 @@ module ApacheLogParser
|
|
24
47
|
line.chomp
|
25
48
|
m = LOG_RE.match(line)
|
26
49
|
if m
|
27
|
-
ip, j1, j2,
|
28
|
-
|
29
|
-
|
30
|
-
|
50
|
+
(ip, j1, j2,
|
51
|
+
ts_day, ts_mo, ts_year,
|
52
|
+
ts_hour, ts_min, ts_sec, req_tz,
|
53
|
+
http_method, path, protocol,
|
54
|
+
response_code, duration,
|
55
|
+
referer, ua, *cruft) = m.captures
|
56
|
+
# DateTime.parse("#{datepart} #{timepart}").to_flat # this takes way too long
|
57
|
+
req_date = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
58
|
+
req_time = [ts_hour, ts_min, ts_sec].join("")
|
59
|
+
yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
|
31
60
|
else
|
32
61
|
yield [:unparseable, line]
|
33
62
|
end
|
34
63
|
end
|
35
64
|
|
36
|
-
|
37
|
-
def parse_request req
|
38
|
-
m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
|
39
|
-
if m
|
40
|
-
[''] + m.captures
|
41
|
-
else
|
42
|
-
[req, '', '', '']
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
65
|
end
|
66
|
+
end
|
47
67
|
|
68
|
+
Wukong::Script.new(ApacheLogParser::Mapper, nil, :sort_fields => 7).run
|
69
|
+
|
70
|
+
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
48
71
|
|
49
|
-
class Reducer < Wukong::Streamer::LineStreamer
|
50
|
-
end
|
51
72
|
|
52
|
-
# Execute the script
|
53
|
-
class Script < Wukong::Script
|
54
|
-
def reduce_command
|
55
|
-
"/usr/bin/uniq"
|
56
|
-
end
|
57
|
-
def default_options
|
58
|
-
super.merge :sort_fields => 8 # , :reduce_tasks => 0
|
59
|
-
end
|
60
|
-
end
|
61
73
|
|
62
|
-
Script.new(Mapper,nil).run
|
63
|
-
end
|
64
74
|
|
65
|
-
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# Group all visitors, and then troll through all the pages they've visited
|
3
|
+
# breaking each into distinct visits (where more than an [hour|day|whatever]
|
4
|
+
# separates subsequent pageviews).
|
5
|
+
#
|
6
|
+
|
7
|
+
#
|
8
|
+
# Mapper parses log files and creates a visitor_id from the visitor's user_id,
|
9
|
+
# cookie or ip. It emits
|
10
|
+
#
|
11
|
+
# <visitor_id> <datetime> <url_path>
|
12
|
+
#
|
13
|
+
# where the partition key is visitor_id, and we sort by visitor_id and datetime.
|
14
|
+
#
|
15
|
+
|
16
|
+
#
|
17
|
+
# Reducer:
|
18
|
+
#
|
19
|
+
# The reducer is given all page requests for the given visitor id, sorted by
|
20
|
+
# timestamp.
|
21
|
+
#
|
22
|
+
# It groups pageviews into visits (pageviews separated by more than DISTINCT_VISIT_TIMEGAP)
|
23
|
+
# and emits
|
24
|
+
#
|
25
|
+
# trail <visitor_id> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
26
|
+
#
|
27
|
+
# where the last is a comma-separated string of URL encoded paths (any internal comma is converted to %2C).
|
28
|
+
#
|
29
|
+
# You can instead emit
|
30
|
+
#
|
31
|
+
# page_trails <page1> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
32
|
+
# page_trails <page2> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
33
|
+
# ....
|
34
|
+
# page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
35
|
+
#
|
36
|
+
# to discover all trails passing through a given page.
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# For later, if we want to parse user agents:
|
4
|
+
# http://code.google.com/p/browserscope/source/browse/trunk/models/user_agent.py
|
5
|
+
# http://www.useragentstring.com/pages/All/
|
6
|
+
# http://github.com/jaxn/parse-user-agent
|
7
|
+
# http://code.google.com/p/browserscope/wiki/UserAgentParsing
|
8
|
+
# http://code.google.com/p/ua-parser/source/browse/
|
9
|
+
# http://github.com/shenoudab/active_device/tree/master/lib/active_device/
|
10
|
+
|
11
|
+
|
12
|
+
#
|
13
|
+
# * Mozilla based
|
14
|
+
# * Mozilla version
|
15
|
+
# * X11 based
|
16
|
+
# * Security
|
17
|
+
# * OS
|
18
|
+
# * CPU family
|
19
|
+
# * Language Tag
|
20
|
+
# * Renderer (i.e. Webkit, Trident, Presto)
|
21
|
+
# * Renderer Version
|
22
|
+
# * I don't see a utility for the "KHTML" and "like Gecko" bits, but whatever.
|
23
|
+
# * Based on
|
24
|
+
# * Browser Build (not really sure about this either)
|
25
|
+
|
26
|
+
# * Browser Family (i.e. Firefox, IE, Chrome, etc..)
|
27
|
+
# * Project Name (optional, i.e. Namoroka, Shiretoko)
|
28
|
+
# * Major Version
|
29
|
+
# * Minor Version
|
30
|
+
# * Version Third Bit
|
31
|
+
# * Version Fourth Bit
|
32
|
+
# * Open Question: How should we handle the "alpha/beta" bit, like apre1? I'm inclined to say we put it in its own datapoint and let people group together how ever they want, but not leave it attached to any of the version bits.
|
33
|
+
|
34
|
+
# Bot
|
35
|
+
# Brand
|
36
|
+
# Browser
|
37
|
+
# Engine
|
38
|
+
# Handset
|
39
|
+
# Model
|
40
|
+
# OS
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# run like so:
|
3
|
+
# $> ruby avg_value_frequency.rb --histogram --run=local data/avg_value_frequency.tsv data/avf_out.tsv
|
4
|
+
require 'rubygems'
|
5
|
+
require 'wukong'
|
6
|
+
|
7
|
+
#
|
8
|
+
# Calculate the average value frequency (AVF) for each data row. AVF for a data
|
9
|
+
# point with m attributes is defined as:
|
10
|
+
#
|
11
|
+
# avf = (1/m)* sum (frequencies of attributes 1..m)
|
12
|
+
#
|
13
|
+
# so with the data
|
14
|
+
#
|
15
|
+
# 1 15 30 25
|
16
|
+
# 2 10 10 20
|
17
|
+
# 3 50 30 30
|
18
|
+
#
|
19
|
+
# for the first row, avf = (1/3)*(1+2+1) ~= 1.33. An outlier is identified by
|
20
|
+
# a low AVF.
|
21
|
+
#
|
22
|
+
module AverageValueFrequency
|
23
|
+
# Names for each column's attribute, in order
|
24
|
+
ATTR_NAMES = %w[length width height]
|
25
|
+
|
26
|
+
class HistogramMapper < Wukong::Streamer::RecordStreamer
|
27
|
+
# unroll each row from
|
28
|
+
# [id, val1, val2, ....]
|
29
|
+
# into
|
30
|
+
# [attr1, val1]
|
31
|
+
# [attr2, val2]
|
32
|
+
# ...
|
33
|
+
def process id, *values
|
34
|
+
ATTR_NAMES.zip(values).each do |attr, val|
|
35
|
+
yield [attr, val]
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
#
|
41
|
+
# Build a histogram of values
|
42
|
+
#
|
43
|
+
class HistogramReducer < Wukong::Streamer::CountingReducer
|
44
|
+
# use the attr and val as the key
|
45
|
+
def get_key attr, val=nil, *_
|
46
|
+
[attr, val]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
class AvfRecordMapper < Wukong::Streamer::RecordStreamer
|
51
|
+
# average the frequency of each value
|
52
|
+
def process id, *values
|
53
|
+
sum = 0.0
|
54
|
+
ATTR_NAMES.zip(values).each do |attr, val|
|
55
|
+
sum += histogram[ [attr, val] ].to_i
|
56
|
+
end
|
57
|
+
avf = sum / ATTR_NAMES.length.to_f
|
58
|
+
yield [id, avf, *values]
|
59
|
+
end
|
60
|
+
|
61
|
+
# Load the histogram from a tab-separated file with
|
62
|
+
# attr val freq
|
63
|
+
def histogram
|
64
|
+
return @histogram if @histogram
|
65
|
+
@histogram = { }
|
66
|
+
File.open(options[:histogram_file]).each do |line|
|
67
|
+
attr, val, freq = line.chomp.split("\t")
|
68
|
+
@histogram[ [attr, val] ] = freq
|
69
|
+
end
|
70
|
+
@histogram
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
Settings.use :commandline, :define
|
76
|
+
Settings.define :histogram, :description => "Run the first pass to calculate a histogram"
|
77
|
+
Settings.define :avf, :description => "Run the second pass, to run back over the records with the histogram and find the AVF for each row."
|
78
|
+
Settings.define :histogram_file, :description => "File to load the histogram from (supply name of the output file from first pass)"
|
79
|
+
Settings.resolve!
|
80
|
+
if Settings[:histogram]
|
81
|
+
Wukong::Script.new(AverageValueFrequency::HistogramMapper, AverageValueFrequency::HistogramReducer).run
|
82
|
+
elsif Settings[:avf]
|
83
|
+
Wukong::Script.new(AverageValueFrequency::AvfRecordMapper, nil).run
|
84
|
+
else
|
85
|
+
raise "Please specify either --histogram (for first round) or --avf (second round)"
|
86
|
+
end
|
data/lib/wukong/script.rb
CHANGED
@@ -121,6 +121,7 @@ module Wukong
|
|
121
121
|
def initialize mapper_klass, reducer_klass, extra_options={}
|
122
122
|
self.options = Settings.dup
|
123
123
|
options.resolve!
|
124
|
+
options.merge! self.default_options
|
124
125
|
options.merge! extra_options
|
125
126
|
self.mapper_klass = mapper_klass
|
126
127
|
self.reducer_klass = reducer_klass
|
@@ -27,6 +27,8 @@ module Wukong
|
|
27
27
|
Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
|
28
28
|
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
|
29
29
|
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
|
30
|
+
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
31
|
+
# mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
30
32
|
|
31
33
|
# emit a -jobconf hadoop option if the simplified command line arg is present
|
32
34
|
# if not, the resulting nil will be elided later
|
@@ -66,7 +68,8 @@ module Wukong
|
|
66
68
|
end
|
67
69
|
|
68
70
|
def hadoop_other_args
|
69
|
-
extra_str_args
|
71
|
+
extra_str_args = [ options[:extra_args] ]
|
72
|
+
extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
|
70
73
|
options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
|
71
74
|
options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
|
72
75
|
extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
|
data/lib/wukong/streamer.rb
CHANGED
@@ -11,5 +11,6 @@ module Wukong
|
|
11
11
|
autoload :AccumulatingReducer, 'wukong/streamer/accumulating_reducer'
|
12
12
|
autoload :ListReducer, 'wukong/streamer/list_reducer'
|
13
13
|
autoload :UniqByLastReducer, 'wukong/streamer/uniq_by_last_reducer'
|
14
|
+
autoload :CountingReducer, 'wukong/streamer/counting_reducer'
|
14
15
|
end
|
15
16
|
end
|
data/lib/wukong/streamer/base.rb
CHANGED
@@ -19,9 +19,8 @@ module Wukong
|
|
19
19
|
def stream
|
20
20
|
Log.info("Streaming on:\t%s" % [Script.input_file]) unless Script.input_file.blank?
|
21
21
|
before_stream
|
22
|
-
|
23
|
-
record = recordize(line.chomp)
|
24
|
-
next unless record
|
22
|
+
each_record do |line|
|
23
|
+
record = recordize(line.chomp) or next
|
25
24
|
process(*record) do |output_record|
|
26
25
|
emit output_record
|
27
26
|
end
|
@@ -29,6 +28,10 @@ module Wukong
|
|
29
28
|
after_stream
|
30
29
|
end
|
31
30
|
|
31
|
+
def each_record &block
|
32
|
+
$stdin.each(&block)
|
33
|
+
end
|
34
|
+
|
32
35
|
# Called exactly once, before streaming begins
|
33
36
|
def before_stream
|
34
37
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Wukong
|
2
|
+
module Streamer
|
3
|
+
|
4
|
+
#
|
5
|
+
# Count the number of records for each key.
|
6
|
+
#
|
7
|
+
class CountingReducer < AccumulatingReducer
|
8
|
+
attr_accessor :count
|
9
|
+
|
10
|
+
# start the sum with 0 for each key
|
11
|
+
def start! *_
|
12
|
+
self.count = 0
|
13
|
+
end
|
14
|
+
# ... and count the number of records for this key
|
15
|
+
def accumulate *_
|
16
|
+
self.count += 1
|
17
|
+
end
|
18
|
+
# emit [key, count]
|
19
|
+
def finalize
|
20
|
+
yield [key, count].flatten
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
end
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "1.4.
|
8
|
+
s.version = "1.4.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-03-04}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it’s efficient to process by lines
|
@@ -109,7 +109,11 @@ Gem::Specification.new do |s|
|
|
109
109
|
"examples/rank_and_bin.rb",
|
110
110
|
"examples/run_all.sh",
|
111
111
|
"examples/sample_records.rb",
|
112
|
+
"examples/server_logs/breadcrumbs.rb",
|
113
|
+
"examples/server_logs/user_agent.rb",
|
112
114
|
"examples/size.rb",
|
115
|
+
"examples/stats/avg_value_frequency.rb",
|
116
|
+
"examples/stats/data/avg_value_frequency.tsv",
|
113
117
|
"examples/word_count.rb",
|
114
118
|
"lib/wukong.rb",
|
115
119
|
"lib/wukong/bad_record.rb",
|
@@ -145,6 +149,7 @@ Gem::Specification.new do |s|
|
|
145
149
|
"lib/wukong/streamer/base.rb",
|
146
150
|
"lib/wukong/streamer/count_keys.rb",
|
147
151
|
"lib/wukong/streamer/count_lines.rb",
|
152
|
+
"lib/wukong/streamer/counting_reducer.rb",
|
148
153
|
"lib/wukong/streamer/filter.rb",
|
149
154
|
"lib/wukong/streamer/line_streamer.rb",
|
150
155
|
"lib/wukong/streamer/list_reducer.rb",
|
@@ -170,7 +175,7 @@ Gem::Specification.new do |s|
|
|
170
175
|
s.homepage = %q{http://mrflip.github.com/wukong}
|
171
176
|
s.rdoc_options = ["--charset=UTF-8"]
|
172
177
|
s.require_paths = ["lib"]
|
173
|
-
s.rubygems_version = %q{1.3.
|
178
|
+
s.rubygems_version = %q{1.3.6}
|
174
179
|
s.summary = %q{Wukong makes Hadoop so easy a chimpanzee can use it.}
|
175
180
|
s.test_files = [
|
176
181
|
"spec/spec_helper.rb",
|
@@ -193,7 +198,10 @@ Gem::Specification.new do |s|
|
|
193
198
|
"examples/pagerank/pagerank_initialize.rb",
|
194
199
|
"examples/rank_and_bin.rb",
|
195
200
|
"examples/sample_records.rb",
|
201
|
+
"examples/server_logs/breadcrumbs.rb",
|
202
|
+
"examples/server_logs/user_agent.rb",
|
196
203
|
"examples/size.rb",
|
204
|
+
"examples/stats/avg_value_frequency.rb",
|
197
205
|
"examples/word_count.rb"
|
198
206
|
]
|
199
207
|
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 4
|
8
|
+
- 7
|
9
|
+
version: 1.4.7
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Philip (flip) Kromer
|
@@ -9,39 +14,45 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date: 2010-
|
17
|
+
date: 2010-03-04 00:00:00 -06:00
|
13
18
|
default_executable:
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: addressable
|
17
|
-
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
23
29
|
version: "0"
|
24
|
-
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
25
32
|
- !ruby/object:Gem::Dependency
|
26
33
|
name: extlib
|
27
|
-
|
28
|
-
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
36
|
requirements:
|
31
37
|
- - ">="
|
32
38
|
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
33
41
|
version: "0"
|
34
|
-
|
42
|
+
type: :runtime
|
43
|
+
version_requirements: *id002
|
35
44
|
- !ruby/object:Gem::Dependency
|
36
45
|
name: htmlentities
|
37
|
-
|
38
|
-
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
prerelease: false
|
47
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
48
|
requirements:
|
41
49
|
- - ">="
|
42
50
|
- !ruby/object:Gem::Version
|
51
|
+
segments:
|
52
|
+
- 0
|
43
53
|
version: "0"
|
44
|
-
|
54
|
+
type: :runtime
|
55
|
+
version_requirements: *id003
|
45
56
|
description: " Treat your dataset like a:\n\n * stream of lines when it\xE2\x80\x99s efficient to process by lines\n * stream of field arrays when it\xE2\x80\x99s efficient to deal directly with fields\n * stream of lightweight objects when it\xE2\x80\x99s efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
|
46
57
|
email: flip@infochimps.org
|
47
58
|
executables:
|
@@ -141,7 +152,11 @@ files:
|
|
141
152
|
- examples/rank_and_bin.rb
|
142
153
|
- examples/run_all.sh
|
143
154
|
- examples/sample_records.rb
|
155
|
+
- examples/server_logs/breadcrumbs.rb
|
156
|
+
- examples/server_logs/user_agent.rb
|
144
157
|
- examples/size.rb
|
158
|
+
- examples/stats/avg_value_frequency.rb
|
159
|
+
- examples/stats/data/avg_value_frequency.tsv
|
145
160
|
- examples/word_count.rb
|
146
161
|
- lib/wukong.rb
|
147
162
|
- lib/wukong/bad_record.rb
|
@@ -177,6 +192,7 @@ files:
|
|
177
192
|
- lib/wukong/streamer/base.rb
|
178
193
|
- lib/wukong/streamer/count_keys.rb
|
179
194
|
- lib/wukong/streamer/count_lines.rb
|
195
|
+
- lib/wukong/streamer/counting_reducer.rb
|
180
196
|
- lib/wukong/streamer/filter.rb
|
181
197
|
- lib/wukong/streamer/line_streamer.rb
|
182
198
|
- lib/wukong/streamer/list_reducer.rb
|
@@ -211,18 +227,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
211
227
|
requirements:
|
212
228
|
- - ">="
|
213
229
|
- !ruby/object:Gem::Version
|
230
|
+
segments:
|
231
|
+
- 0
|
214
232
|
version: "0"
|
215
|
-
version:
|
216
233
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
217
234
|
requirements:
|
218
235
|
- - ">="
|
219
236
|
- !ruby/object:Gem::Version
|
237
|
+
segments:
|
238
|
+
- 0
|
220
239
|
version: "0"
|
221
|
-
version:
|
222
240
|
requirements: []
|
223
241
|
|
224
242
|
rubyforge_project:
|
225
|
-
rubygems_version: 1.3.
|
243
|
+
rubygems_version: 1.3.6
|
226
244
|
signing_key:
|
227
245
|
specification_version: 3
|
228
246
|
summary: Wukong makes Hadoop so easy a chimpanzee can use it.
|
@@ -247,5 +265,8 @@ test_files:
|
|
247
265
|
- examples/pagerank/pagerank_initialize.rb
|
248
266
|
- examples/rank_and_bin.rb
|
249
267
|
- examples/sample_records.rb
|
268
|
+
- examples/server_logs/breadcrumbs.rb
|
269
|
+
- examples/server_logs/user_agent.rb
|
250
270
|
- examples/size.rb
|
271
|
+
- examples/stats/avg_value_frequency.rb
|
251
272
|
- examples/word_count.rb
|