wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
module Wukong
  #
  # Builds the shell command line that launches a hadoop streaming job.
  #
  # The including object is expected to provide +options+ (a hash-like object
  # keyed by the symbols below), +map_command+ and +reduce_command+.
  #
  module HadoopCommand

    # ===========================================================================
    #
    # Hadoop Options
    #

    #
    # Maps each simplified option name to the hadoop property it sets.
    #
    HADOOP_OPTIONS_MAP = {
      :max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
      :max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
      :map_tasks              => 'mapred.map.tasks',
      :reduce_tasks           => 'mapred.reduce.tasks',
      :sort_fields            => 'stream.num.map.output.key.fields',
      :key_field_separator    => 'map.output.key.field.separator',
      :partition_fields       => 'num.key.fields.for.partition',
      :output_field_separator => 'stream.map.output.field.separator',
      :map_speculative        => 'mapred.map.tasks.speculative.execution',
      :timeout                => 'mapred.task.timeout',
    }

    # Returns a "-jobconf property=value" string when +option+ has a truthy
    # value in +options+, and nil otherwise. The nils are removed when the
    # full command is assembled.
    def jobconf option
      value = options[option]
      return unless value
      "-jobconf #{HADOOP_OPTIONS_MAP[option]}=#{value}"
    end

    # Arguments telling hadoop which fields to treat as the sort key.
    def hadoop_sort_args
      [:key_field_separator, :sort_fields].map { |opt| jobconf(opt) }
    end

    # Arguments telling hadoop which fields decide the reducer a record is
    # sent to. Returns nil unless the :partition_fields option is set.
    def hadoop_partition_args
      return unless options[:partition_fields]
      partitioner = '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
      [partitioner, jobconf(:output_field_separator), jobconf(:partition_fields)]
    end

    # Arguments setting the number of mappers and reducers.
    def hadoop_num_tasks_args
      [:max_node_map_tasks, :max_node_reduce_tasks, :map_tasks, :reduce_tasks].map do |opt|
        jobconf(opt)
      end
    end

    # The free-form :extra_args option followed by the remaining -jobconf
    # settings (speculative execution and task timeout).
    def hadoop_other_args
      [options[:extra_args], jobconf(:map_speculative), jobconf(:timeout)]
    end

    #
    # Assemble the hadoop command to execute.
    #
    # Nested argument lists are flattened, unset (nil) arguments are dropped,
    # and the remaining pieces are joined with a backslash-newline so the
    # result reads as one multi-line shell command.
    #
    def hadoop_command input_path, output_path
      # If this is wrong, create a config/wukong-site.rb or otherwise set
      # Wukong::CONFIG[:hadoop_home] to the root of your hadoop install.
      hadoop_home    = Wukong::CONFIG[:hadoop_home]
      hadoop_program = hadoop_home + '/bin/hadoop'

      pieces = [hadoop_program]
      pieces << "jar #{hadoop_home}/contrib/streaming/hadoop-*-streaming.jar"
      pieces << hadoop_partition_args
      pieces << hadoop_sort_args
      pieces << hadoop_num_tasks_args
      pieces << "-mapper '#{map_command}'"
      pieces << "-reducer '#{reduce_command}'"
      pieces << "-input '#{input_path}'"
      pieces << "-output '#{output_path}'"
      pieces << hadoop_other_args
      pieces.flatten.compact.join(" \t\\\n ")
    end

  end
end
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# -inputformat <name of inputformat (class)> (“auto” by default)
|
|
96
|
+
# -input <additional DFS input path>
|
|
97
|
+
# -python <python command to use on nodes> (“python” by default)
|
|
98
|
+
# -name <job name> (“program.py” by default)
|
|
99
|
+
# -numMapTasks <number>
|
|
100
|
+
# -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
|
|
101
|
+
# -priority <priority value> (“NORMAL” by default)
|
|
102
|
+
# -libjar <path to jar> (this jar gets put in the class path)
|
|
103
|
+
# -libegg <path to egg> (this egg gets put in the Python path)
|
|
104
|
+
# -file <local file> (this file will be put in the dir where the python program gets executed)
|
|
105
|
+
# -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
|
|
106
|
+
# -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
|
|
107
|
+
# -cmdenv <env var name>=<value>
|
|
108
|
+
# -jobconf <property name>=<value>
|
|
109
|
+
# -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
|
|
110
|
+
# -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
|
|
111
|
+
# -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
module Wukong
  #
  # Builds the shell pipeline used to run a job locally, without hadoop.
  #
  # The including object is expected to provide +map_command+ and
  # +reduce_command+.
  #
  module LocalCommand

    # ===========================================================================
    #
    # Local execution Options
    #

    # Returns a shell pipeline string: cat the input, run the mapper, sort,
    # run the reducer, and redirect into +output_path+. The output path is
    # single-quoted; the input path is interpolated as given.
    def local_command input_path, output_path
      stages = [
        " cat #{input_path}",
        "#{map_command}",
        "sort",
        "#{reduce_command} > '#{output_path}'",
      ]
      stages.join(" | ")
    end

  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
module Wukong
  #
  # Namespace for the streamer classes and mixins. Each constant is
  # registered with autoload, so its file is only required the first time
  # the constant is referenced.
  #
  module Streamer
    autoload :Base,                       'wukong/streamer/base'
    autoload :LineStreamer,               'wukong/streamer/line_streamer'
    autoload :StructStreamer,             'wukong/streamer/struct_streamer'
    #
    autoload :Filter,                     'wukong/streamer/filter'
    autoload :PreprocessWithPipeStreamer, 'wukong/streamer/preprocess_with_pipe_streamer'
    #
    autoload :AccumulatingReducer,        'wukong/streamer/accumulating_reducer'
    autoload :ListReducer,                'wukong/streamer/list_reducer'
    autoload :UniqByLastReducer,          'wukong/streamer/uniq_by_last_reducer'
    # These ship alongside the files above but were previously unregistered,
    # so referencing them raised NameError unless required by hand.
    autoload :CountKeys,                  'wukong/streamer/count_keys'
    autoload :CountLines,                 'wukong/streamer/count_lines'
  end
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
module Wukong
  module Streamer

    #
    # AccumulatingReducer makes it easy to apply one operation across all
    # occurrences of each key.
    #
    # For every record, #process derives the record's key with #get_key. When
    # that key differs from the previous record's key it calls #finalize for
    # the group collected so far, adopts the new key and calls #start! with the
    # first record of the new group. It then calls #accumulate with the record.
    # Records sharing a key are therefore only accumulated together when they
    # arrive contiguously.
    #
    # Subclasses must override #start!, #accumulate and #finalize. See
    # ListReducer and CountKeys for examples.
    #
    # Make sure you don't have the bad luck, bad judgement or bad approach to
    # accumulate more data than your box can hold before finalizing.
    #
    class AccumulatingReducer < Wukong::Streamer::Base
      # Key of the group currently being accumulated; holds the sentinel
      # :__first_pass__ until the first record has been processed.
      attr_accessor :key

      # Accepts the same options hash as Base; it defaults to an empty hash
      # so the reducer can be built without arguments, just like Base.
      def initialize options={}
        super options
        self.key = :__first_pass__
      end

      #
      # Returns the key for a record -- by default its first field.
      #
      # Override for multiple-field keys, etc.
      #
      # Note that get_key is called by +process+ -- so the arguments have
      # already been +recordize+d. In particular, if you are using
      # StructRecordizer (or StructStreamer), you can write this as
      #
      #   def get_key(thing) thing.id.to_i ; end
      #
      # or whatever
      def get_key *record
        record.first
      end

      #
      # Accumulate all records for a given key.
      #
      # When the first record of a new key is seen, finalize processing of the
      # previous key's records (passing along the caller's block) and adopt
      # the new key.
      #
      def process *args, &block
        this_key = get_key(*args)
        if this_key != self.key               # if this is a new key,
          unless self.key == :__first_pass__  # and there is a previous group,
            finalize(&block)                  # process what we've collected so far
          end
          self.key = this_key                 # adopt the new key
          start!(*args)                       # and set up for the next accumulation
        end
        # collect the current record
        accumulate(*args, &block)
      end

      #
      # start! is called on the first record of the new key, before that
      # record is accumulated. Subclasses must override it; this default
      # only raises.
      #
      def start! *args
        raise %Q{start! is the new reset! -- it has args now, namely the first
          record of the new key. It doesn\'t want #super either}
      end

      #
      # Override this to accumulate each record for the given key in turn.
      #
      def accumulate *args, &block
        raise "override the accumulate method in your subclass"
      end

      #
      # Called once per key, after its last record has been accumulated, with
      # a block that accepts the records to emit.
      #
      # You must override this method.
      #
      def finalize
        raise "override the finalize method in your subclass"
      end

      #
      # Streams the input, then finalizes the last-seen accumulation, emitting
      # whatever it yields.
      #
      # If no record was processed at all there is no accumulation to
      # finalize: start! never ran, so finalizing would act on uninitialized
      # state and emit a bogus row for the :__first_pass__ sentinel.
      #
      def stream
        super
        return if self.key == :__first_pass__
        finalize(){|record| emit record }
      end
    end

  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Base class for streamers: reads lines from $stdin, turns each line into
    # a record, hands the record to #process and emits every record that
    # #process yields.
    #
    class Base

      # Options, initially set from the command-line args -- see
      # Script#process_argv!
      attr_accessor :options

      #
      # Accepts option hash from script runner
      #
      def initialize options={}
        self.options = options
      end

      #
      # Pass each record to +#process+
      #
      # Each line of $stdin is chomped and recordized; lines whose record is
      # nil or false are skipped. Records yielded by #process are sent to
      # #emit. #before_stream and #after_stream bracket the loop.
      #
      def stream
        before_stream
        $stdin.each do |line|
          record = recordize(line.chomp)
          next unless record
          process(*record) do |output_record|
            emit output_record
          end
        end
        after_stream
      end

      # Called exactly once, before streaming begins
      def before_stream
      end

      # Called exactly once, after streaming completes
      def after_stream
      end

      #
      # Default recordizer: returns array of fields by splitting at tabs
      #
      # Best-effort: if the line cannot be split (any StandardError), returns
      # nil, which makes #stream skip the line.
      #
      def recordize line
        line.split("\t")
      rescue StandardError
        nil
      end

      #
      # Serializes the record to output.
      #
      # Emits a single line of tab-separated fields created by calling #to_flat
      # on the record and joining with "\t".
      #
      # Does no escaping or processing of the record -- that's to_flat's job, or
      # yours if you override this method.
      #
      def emit record
        puts record.to_flat.join("\t")
      end

      #
      # Process each record in turn, yielding the records to emit
      #
      def process *args, &block
        raise "override the process method in your implementation: it should process each record."
      end

      #
      # To track processing errors inline,
      # pass the line back to bad_record!
      #
      # Writes a warning (the inspected args, truncated to 401 characters) and
      # prints a tab-separated line whose first field is "bad_record-<key>".
      # The key is interpolated, so symbols and other non-String keys are
      # accepted as well as strings.
      #
      def bad_record! key, *args
        warn "Bad record #{args.inspect[0..400]}"
        puts ["bad_record-#{key}", *args].join("\t")
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Reducer that yields each key together with how many records were
    # accumulated for it.
    #
    class CountKeys < Wukong::Streamer::AccumulatingReducer
      # Number of records accumulated for the current key.
      attr_accessor :key_count

      # The current count as a decimal integer, right-aligned in ten columns.
      def formatted_key_count
        format("%10d", key_count.to_i)
      end

      # A new key begins: start counting from zero again.
      def start! *args
        self.key_count = 0
      end

      # One more record seen for the current key.
      def accumulate *vals
        self.key_count = key_count + 1
      end

      # Yields a two-element array: the key and its formatted count.
      def finalize
        yield [key, formatted_key_count]
      end
    end

  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # For each identical line in the map phase output, emit one representative
    # line followed by the count of occurrences (separated by a tab).
    #
    # (This is the functional equivalent of +'uniq -c'+)
    #
    class CountLines < Wukong::Streamer::Base
      # Matches one line of `uniq -c` output: optional padding, the count, a
      # single separating blank, then the original line exactly as it was.
      UNIQ_COUNT_LINE = /\A\s*(\d+) ?(.*)\z/m

      # Formats +item+, a tab, and the count right-aligned in ten columns.
      def formatted_count item, key_count
        "%s\t%10d" % [item, key_count.to_i]
      end

      #
      # Delegate to +uniq -c+, but put the count last for idempotence.
      #
      # Only the count's padding and the single blank that uniq inserts after
      # the count are removed. The item's own leading and trailing whitespace
      # is kept, so empty first or last tab-separated fields survive instead
      # of being stripped and shifting the remaining columns.
      #
      def stream
        %x{/usr/bin/uniq -c}.split("\n").each do |line|
          parsed = UNIQ_COUNT_LINE.match(line)
          next unless parsed # not a "count line" pair; nothing to report
          key_count, item = parsed[1], parsed[2]
          puts formatted_count(item, key_count)
        end
      end
    end

  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Pass along only the records selected by the #emit? method.
    #
    # This is a mixin: including this module in your streamer
    # implements its +#process+ method.
    #
    module Filter
      #
      # Yields the record (as an array of its fields) when emit?(record) is
      # truthy; otherwise does nothing.
      #
      # Define the emit? method in the including streamer.
      #
      def process *record, &block
        return unless emit?(record)
        yield record
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Reducer that gathers every record seen for a key into the +values+
    # list. No finalize method is defined in this class.
    #
    class ListReducer < Wukong::Streamer::AccumulatingReducer
      # Records accumulated so far for the current key.
      attr_accessor :values

      # A new key begins: start over with an empty list.
      def start! *args
        self.values = []
      end

      # Append this record (an array of its fields) to the list.
      def accumulate *record
        values.push(record)
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Wukong
  module Streamer
    #
    # Mixin that replaces a streamer's +#stream+ with one that processes the
    # output of a shell command.
    #
    module PreprocessWithPipeStreamer
      #
      # Runs STDIN through a shell command and then begins processing.
      #
      # If you don't need to do anything to the output of the command, just
      # inherit from Wukong::Script and override the #map_command.
      #
      # You must provide a @preprocess_pipe_command@ method that returns a shell
      # command to run the input through.
      #
      # Each line of the command's output is turned into a record: by a legacy
      # +itemize+ method when the including object defines one (it receives the
      # raw line), otherwise by +recordize+ on the chomped line. Blank records
      # are skipped. Records yielded by +process+ are passed to +emit+.
      #
      def stream
        # each_line, not each: String#each does not exist on ruby 1.9 and later
        `#{preprocess_pipe_command}`.each_line do |line|
          item = respond_to?(:itemize, true) ? itemize(line) : recordize(line.chomp)
          next if item.blank?
          process(*item) do |output_record|
            emit output_record
          end
        end
      end
    end
  end
end
|