wukong 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Wukong
  #
  # Assembles the shell command that launches a job through hadoop streaming.
  #
  # The methods here read +options+ (a hash keyed by the simplified option
  # names below), +map_command+ and +reduce_command+; all three must be
  # supplied by the object this module is mixed into.
  #
  module HadoopCommand

    # ===========================================================================
    #
    # Hadoop Options
    #

    #
    # Translate the simplified args to their hairy-assed hadoop equivalents
    #
    HADOOP_OPTIONS_MAP = {
      :max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
      :max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
      :map_tasks              => 'mapred.map.tasks',
      :reduce_tasks           => 'mapred.reduce.tasks',
      :sort_fields            => 'stream.num.map.output.key.fields',
      :key_field_separator    => 'map.output.key.field.separator',
      :partition_fields       => 'num.key.fields.for.partition',
      :output_field_separator => 'stream.map.output.field.separator',
      :map_speculative        => 'mapred.map.tasks.speculative.execution',
      :timeout                => 'mapred.task.timeout',
    }

    #
    # Emit a -jobconf hadoop option if the simplified command line arg is
    # present.
    #
    # Returns the "-jobconf name=value" string, or nil when +options+ has no
    # truthy value for +option+; hadoop_command drops the nils when it
    # assembles the final command.
    #
    def jobconf(option)
      return unless options[option]
      "-jobconf %s=%s" % [HADOOP_OPTIONS_MAP[option], options[option]]
    end

    # Define what fields hadoop should treat as the keys
    def hadoop_sort_args
      [
        jobconf(:key_field_separator),
        jobconf(:sort_fields),
      ]
    end

    #
    # Define what fields hadoop should use to distribute records to reducers.
    #
    # Returns nil (no partitioner arguments at all) unless
    # options[:partition_fields] is set.
    #
    def hadoop_partition_args
      return unless options[:partition_fields]
      [
        '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
        jobconf(:output_field_separator),
        jobconf(:partition_fields),
      ]
    end

    # Emit options for setting the number of mappers and reducers.
    def hadoop_num_tasks_args
      [
        jobconf(:max_node_map_tasks),
        jobconf(:max_node_reduce_tasks),
        jobconf(:map_tasks),
        jobconf(:reduce_tasks)
      ]
    end

    #
    # Remaining arguments: the free-form options[:extra_args] value followed
    # by the speculative-execution and timeout settings.
    #
    def hadoop_other_args
      extra_str_args = [ options[:extra_args] ]
      extra_hsh_args = [:map_speculative, :timeout].map{|opt| jobconf(opt) }
      extra_str_args + extra_hsh_args
    end

    #
    # Assemble the hadoop command to execute
    #
    # Returns one string: the individual arguments joined by a backslash-newline
    # so the command can be pasted into a shell as-is.
    #
    # Raises a RuntimeError if Wukong::CONFIG[:hadoop_home] is nil.
    #
    def hadoop_command(input_path, output_path)
      # If this is wrong, create a config/wukong-site.rb or
      # otherwise set Wukong::CONFIG[:hadoop_home] to the
      # root of your hadoop install.
      hadoop_home = Wukong::CONFIG[:hadoop_home]
      if hadoop_home.nil?
        raise "Wukong::CONFIG[:hadoop_home] is not set: point it at the root of your hadoop install"
      end
      [
        "#{hadoop_home}/bin/hadoop",
        "jar #{hadoop_home}/contrib/streaming/hadoop-*-streaming.jar",
        hadoop_partition_args,
        hadoop_sort_args,
        hadoop_num_tasks_args,
        "-mapper '#{map_command}'",
        "-reducer '#{reduce_command}'",
        "-input '#{input_path}'",
        "-output '#{output_path}'",
        hadoop_other_args,
      ].flatten.compact.join(" \t\\\n ")   # nils are the options that were not given
    end

  end
end
|
93
|
+
|
94
|
+
|
95
|
+
# -inputformat <name of inputformat (class)> (“auto” by default)
|
96
|
+
# -input <additional DFS input path>
|
97
|
+
# -python <python command to use on nodes> (“python” by default)
|
98
|
+
# -name <job name> (“program.py” by default)
|
99
|
+
# -numMapTasks <number>
|
100
|
+
# -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
|
101
|
+
# -priority <priority value> (“NORMAL” by default)
|
102
|
+
# -libjar <path to jar> (this jar gets put in the class path)
|
103
|
+
# -libegg <path to egg> (this egg gets put in the Python path)
|
104
|
+
# -file <local file> (this file will be put in the dir where the python program gets executed)
|
105
|
+
# -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
|
106
|
+
# -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
|
107
|
+
# -cmdenv <env var name>=<value>
|
108
|
+
# -jobconf <property name>=<value>
|
109
|
+
# -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
|
110
|
+
# -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
|
111
|
+
# -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Wukong
  #
  # Builds the shell pipeline used to run a job on the local machine.
  #
  # Relies on +map_command+ and +reduce_command+ from the object this module
  # is mixed into.
  #
  module LocalCommand

    # ===========================================================================
    #
    # Local execution Options
    #

    #
    # Returns the pipeline as a single string: cat the input, run the mapper,
    # sort, run the reducer, and redirect into +output_path+. Only the output
    # path is wrapped in single quotes; the input path is interpolated bare.
    #
    def local_command(input_path, output_path)
      " cat #{input_path} | #{map_command} | sort | #{reduce_command} > '#{output_path}'"
    end

  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Wukong
  #
  # Namespace for the streamer classes. Each constant is registered with
  # autoload, so its file is only required the first time the constant is
  # referenced.
  #
  module Streamer
    autoload :Base,                       'wukong/streamer/base'
    autoload :LineStreamer,               'wukong/streamer/line_streamer'
    autoload :StructStreamer,             'wukong/streamer/struct_streamer'
    #
    autoload :Filter,                     'wukong/streamer/filter'
    autoload :PreprocessWithPipeStreamer, 'wukong/streamer/preprocess_with_pipe_streamer'
    #
    autoload :AccumulatingReducer,        'wukong/streamer/accumulating_reducer'
    autoload :ListReducer,                'wukong/streamer/list_reducer'
    autoload :UniqByLastReducer,          'wukong/streamer/uniq_by_last_reducer'
    autoload :CountKeys,                  'wukong/streamer/count_keys'
    autoload :CountLines,                 'wukong/streamer/count_lines'
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module Wukong
  module Streamer

    #
    # AccumulatingReducer makes it easy to apply one operation across all
    # occurrences of each key
    #
    # On the first record of a key, AccumulatingReducer calls start!; on every
    # record (including the first) it calls accumulate; and once a record with
    # a different key arrives, or the input ends, it calls finalize for the
    # key just completed.
    #
    # See ListReducer and CountKeys for examples
    #
    # Make sure you don't have the bad luck, bad judgement or bad approach to
    # accumulate more data than your box can hold before finalizing.
    #
    class AccumulatingReducer < Wukong::Streamer::Base
      # The key currently being accumulated; :__first_pass__ until the first
      # record has been seen.
      attr_accessor :key

      def initialize(options)
        super options
        self.key = :__first_pass__
      end

      #
      # override for multiple-field keys, etc.
      #
      # Note that get_key is called by +process+ -- so the arguments have
      # already been +recordize+d. In particular, if you are using
      # StructRecordizer (or StructStreamer), you can write this as
      #
      #   def get_key(thing) thing.id.to_i ; end
      #
      # or whatever
      def get_key(*record)
        record.first
      end

      #
      # Accumulate all records for a given key.
      #
      # When a record with a different key is seen, finalize processing of the
      # previous key and adopt the new key.
      #
      def process(*args, &block)
        this_key = get_key(*args)
        if this_key != key                            # if this is a new key,
          finalize(&block) unless key == :__first_pass__  # process what we've collected so far
          self.key = this_key                         # adopt the new key
          start!(*args)                               # and set up for the next accumulation
        end
        # collect the current record
        accumulate(*args, &block)
      end

      #
      # start! is called on the first record of the new key
      #
      # You must override this method; the default implementation raises.
      #
      def start!(*args)
        raise %Q{start! is the new reset! -- it has args now, namely the first
record of the new key. It doesn\'t want #super either}
      end

      #
      # Override this to accumulate each record for the given key in turn.
      #
      def accumulate(*args, &block)
        raise "override the accumulate method in your subclass"
      end

      #
      # Called once all records for the current key have been accumulated.
      # It is invoked with a block; yield each output record to it.
      #
      # You must override this method.
      #
      def finalize
        raise "override the finalize method in your subclass"
      end

      #
      # Must make sure to finalize the last-seen accumulation.
      #
      # Nothing is finalized when no record was ever processed: in that case
      # start! never ran, so there is no accumulation to flush.
      #
      def stream
        super
        finalize { |record| emit record } unless key == :__first_pass__
      end
    end

  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Base class for streamers: reads lines from $stdin, turns each one into a
    # record, hands the record to +#process+, and emits whatever +#process+
    # yields.
    #
    class Base

      # Options, initially set from the command-line args -- see
      # Script#process_argv!
      attr_accessor :options

      #
      # Accepts option hash from script runner
      #
      def initialize(options = {})
        self.options = options
      end

      #
      # Pass each record to +#process+
      #
      # Lines are chomped before being recordized; a line whose record is nil
      # or false is skipped. Every record yielded by +#process+ is passed to
      # +#emit+.
      #
      def stream
        before_stream
        $stdin.each do |line|
          record = recordize(line.chomp)
          next unless record
          process(*record) do |output_record|
            emit output_record
          end
        end
        after_stream
      end

      # Called exactly once, before streaming begins
      def before_stream
      end

      # Called exactly once, after streaming completes
      def after_stream
      end

      #
      # Default recordizer: returns array of fields by splitting at tabs
      #
      # Returns nil if the line cannot be split, which makes +#stream+ skip it.
      #
      def recordize(line)
        line.split("\t")
      rescue StandardError
        nil
      end

      #
      # Serializes the record to output.
      #
      # Emits a single line of tab-separated fields created by calling #to_flat
      # on the record and joining with "\t".
      #
      # Does no escaping or processing of the record -- that's to_flat's job, or
      # yours if you override this method.
      #
      def emit(record)
        puts record.to_flat.join("\t")
      end

      #
      # Process each record in turn, yielding the records to emit
      #
      def process(*args, &block)
        raise "override the process method in your implementation: it should process each record."
      end

      #
      # To track processing errors inline,
      # pass the line back to bad_record!
      #
      # Warns with the (truncated) inspected args, then prints a tab-separated
      # line whose first field is "bad_record-" followed by +key+. The key is
      # interpolated, so symbols and other non-String keys are accepted.
      #
      def bad_record!(key, *args)
        warn "Bad record #{args.inspect[0..400]}"
        puts ["bad_record-#{key}", *args].join("\t")
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Reducer that yields, for every distinct key, the key together with the
    # number of records that carried it.
    #
    class CountKeys < Wukong::Streamer::AccumulatingReducer
      # Running tally of records seen for the current key
      attr_accessor :key_count

      # The tally as a right-aligned, ten-character-wide integer string
      def formatted_key_count
        format("%10d", key_count.to_i)
      end

      # A new key begins: the tally starts over from zero
      def start!(*args)
        self.key_count = 0
      end

      # Another record for the current key: bump the tally by one
      def accumulate(*vals)
        self.key_count = key_count + 1
      end

      # Yield a two-element record: the key and its formatted count
      def finalize
        count_record = [key, formatted_key_count]
        yield count_record
      end
    end

  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Wukong
  module Streamer
    #
    # For each identical line in the map phase output, emit one representative
    # line followed by the count of occurrences (separated by a tab).
    #
    # (This is the functional equivalent of +'uniq -c'+)
    #
    class CountLines < Wukong::Streamer::Base
      #
      # Returns +item+, a tab, and +key_count+ as a right-aligned,
      # ten-character-wide integer.
      #
      def formatted_count(item, key_count)
        "%s\t%10d" % [item, key_count.to_i]
      end

      #
      # Delegate to +uniq -c+, but put the count last for idempotence.
      #
      # The child process reads this process's standard input. Its output is
      # consumed one line at a time, so the full result is never held in
      # memory at once.
      #
      def stream
        IO.popen('/usr/bin/uniq -c') do |uniq_output|
          uniq_output.each_line do |line|
            # uniq -c prints "<padding><count> <line>"; split off the count
            key_count, item = line.strip.split(/\s+/, 2)
            puts formatted_count(item, key_count)
          end
        end
      end
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Mixin that passes through only the records selected by #emit?
    #
    # Including this module gives your streamer its +#process+ method; the
    # including class has to supply +#emit?+.
    #
    module Filter
      #
      # Yields the record (as the array of its fields) when emit? returns a
      # truthy value for it, and does nothing otherwise.
      #
      # Define emit? in the including class to choose which records survive.
      #
      def process(*record, &block)
        return unless emit?(record)
        yield record
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Reducer that gathers every record seen for a key into +values+.
    #
    # No finalize is defined here, so a subclass still has to provide one to
    # do something with the collected list.
    #
    class ListReducer < Wukong::Streamer::AccumulatingReducer
      # Records collected so far for the current key
      attr_accessor :values

      # A new key begins: start again with an empty list
      def start!(*args)
        self.values = Array.new
      end

      # Append the current record (as an array of its fields) to the list
      def accumulate(*record)
        values.push(record)
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Wukong
  module Streamer
    module PreprocessWithPipeStreamer
      #
      # Runs STDIN through a shell command and then begins processing.
      #
      # If you don't need to do anything to the output of the command, just
      # inherit from Wukong::Script and override the #map_command.
      #
      # You must provide a @preprocess_pipe_command@ method that returns a shell
      # command to run the input through.
      #
      # Each line of the command's output is chomped and turned into a record
      # with +itemize+ when the including object defines it, and with
      # +recordize+ otherwise. Blank records are skipped; every other record is
      # handed to +process+, and whatever +process+ yields is passed to +emit+.
      #
      def stream
        IO.popen(preprocess_pipe_command) do |pipe|
          pipe.each_line do |raw_line|
            line = raw_line.chomp
            # itemize is the older name for this hook; honour it if present
            item = respond_to?(:itemize, true) ? itemize(line) : recordize(line)
            next if item.blank?
            process(*item) { |record| emit record }
          end
        end
      end
    end
  end
end
|