wukong 1.5.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
data/examples/word_count.rb
CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-require 'wukong'
+require 'wukong/script'
 
 module WordCount
   class Mapper < Wukong::Streamer::LineStreamer
@@ -10,22 +10,22 @@ module WordCount
     # This is pretty simpleminded:
     # * downcase the word
     # * Split at any non-alphanumeric boundary, including '_'
-    # * However, preserve the special cases of 's or 't at the end of a
+    # * However, preserve the special cases of 's, 'd or 't at the end of a
     #   word.
     #
-    #   tokenize("
-    #   # => ["
+    #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
+    #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
     #
     def tokenize str
-      return []
+      return [] if str.blank?
      str = str.downcase;
      # kill off all punctuation except [stuff]'s or [stuff]'t
      # this includes hyphens (words are split)
      str = str.
        gsub(/[^a-zA-Z0-9\']+/, ' ').
-        gsub(/(\w)\'([
+        gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
      # Busticate at whitespace
-      words = str.
+      words = str.split(/\s+/)
      words.reject!{|w| w.blank? }
      words
    end
@@ -39,31 +39,13 @@ module WordCount
  end
 
  #
-  #
+  # You can stack up all the values in a list then sum them at once.
  #
-
-    attr_accessor :key_count
-    def process word, count
-      @last_word ||= word
-      if (@last_word == word)
-        self.key_count += 1
-      else
-        yield [ @last_word, key_count ]
-        @last_word = word
-      end
-    end
-    def stream
-      emit @last_word, key_count
-    end
-  end
-
-  #
-  # You can stack up all the values in a list then sum them at once:
+  # This isn't good style, as it means the whole list is held in memory
  #
-  require 'active_support/core_ext/enumerable'
  class Reducer1 < Wukong::Streamer::ListReducer
    def finalize
-      yield [ key, values.map(&:last).map(&:to_i).
+      yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
    end
  end
 
@@ -71,11 +53,10 @@ module WordCount
  # A bit kinder to your memory manager: accumulate the sum record-by-record:
  #
  class Reducer2 < Wukong::Streamer::AccumulatingReducer
-
-    def
-    def accumulate(*args) self.key_count += 1 end
+    def start!(*args)     @key_count  = 0 end
+    def accumulate(*args) @key_count += 1 end
    def finalize
-      yield [ key, key_count ]
+      yield [ key, @key_count ]
    end
  end
 
@@ -85,11 +66,10 @@ module WordCount
  require 'wukong/streamer/count_keys'
  class Reducer3 < Wukong::Streamer::CountKeys
  end
-
 end
 
 # Execute the script
-Wukong
+Wukong.run(
   WordCount::Mapper,
-  WordCount::
-  )
+  WordCount::Reducer
+  )
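Taken together, these hunks show the wukong 2.0 calling convention: require 'wukong/script' instead of 'wukong', keep per-key state in instance variables, and hand the mapper and reducer classes to Wukong.run. A minimal sketch of that pattern follows; the command-line flag names for choosing local vs. hadoop mode are not shown in this file and are an assumption, only the run modes themselves appear in the script.rb diff below.

    #!/usr/bin/env ruby
    require 'rubygems'
    require 'wukong/script'

    module WordCount
      class Mapper < Wukong::Streamer::LineStreamer
        # emit a [word, 1] pair for every whitespace-separated token
        def process line
          line.to_s.downcase.split(/\s+/).each{|word| yield [word, 1] unless word.empty? }
        end
      end

      class Reducer < Wukong::Streamer::AccumulatingReducer
        def start!(*args)     @count  = 0 end
        def accumulate(*args) @count += 1 end
        def finalize
          yield [key, @count]
        end
      end
    end

    # input paths and the output path come from the command line (see script.rb)
    Wukong.run(WordCount::Mapper, WordCount::Reducer)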
data/lib/wukong/and_pig.rb
CHANGED
@@ -2,19 +2,13 @@ module Enumerable
   #
   # Convert an array of values to a string representing it as a pig tuple
   #
-  # def to_pig_tuple
-  #   map{|*vals| '(' + vals.join(',') + ')' }
-  # end
-
-  #
-  # Convert an array to a pig tuple
-  #
   def to_pig_tuple
     '(' + self.join(',') + ')'
   end
+
   #
   # Convert an array of values to a string pig format
-  #
+  # see also to_pig_bag
   #
   def to_pig *args
     to_pig_tuple *args
@@ -23,13 +17,6 @@ module Enumerable
   #
   # Convert an array of values to a string representing it as a pig bag
   #
-  # def to_pig_bag
-  #   '{' + self.join(',') + '}'
-  # end
-
-  #
-  # Convert and array of values to a string representing it as a pig bag
-  #
   def to_pig_bag
     '{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
   end
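The surviving helpers behave exactly as the remaining comments describe. A quick sketch of the expected output, assuming the module is loaded via require 'wukong/and_pig':

    require 'rubygems'
    require 'wukong/and_pig'       # mixes the helpers above into Enumerable

    [1, 2, 3].to_pig_tuple         # => "(1,2,3)"
    [1, 2, 3].to_pig               # => "(1,2,3)"  -- delegates to to_pig_tuple
    [[1, 2], [3, 4]].to_pig_bag    # => "{(1,2),(3,4)}"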
data/lib/wukong/logger.rb
CHANGED
@@ -13,37 +13,15 @@ module Wukong
   #   I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
   #
   def self.logger
-    @logger
-  end
-
-  #
-  # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
-  # friendly) output lines
-  #
-  def self.default_log4r_logger logger_handle='wukong'
-    require 'log4r'
-    lgr = Log4r::Logger.new logger_handle
-    outputter = Log4r::Outputter.stderr
-    # Define timestamp formatter method
-    ::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
-    # 2009-07-25T00:12:05Z INFO PID\t
-    outputter.formatter = Log4r::PatternFormatter.new(
-      :pattern => "%d %.4l #{Process.pid}\t%.2000m",
-      :date_method => :utc_iso8601
-    )
-    lgr.outputters = outputter
-    lgr
-  end
-
-  def self.default_ruby_logger
+    return @logger if @logger
     require 'logger'
-    logger = Logger.new STDERR
-    logger.instance_eval do
+    @logger = Logger.new STDERR
+    @logger.instance_eval do
       def dump *args
         debug args.inspect
       end
     end
-    logger
+    @logger
   end
 
   def self.logger= logger
@@ -54,6 +32,7 @@ end
 #
 # A convenient logger.
 #
-#
+# define Log yourself to prevent its creation
 #
-Log
+Log = Wukong.logger unless defined?(Log)
+
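Net effect: Wukong.logger is now a lazily built, memoized plain ruby Logger writing to STDERR, and the Log constant is bound to it unless you define Log first. A small usage sketch (assuming the logger is loaded by require 'wukong'):

    require 'rubygems'
    # Define Log yourself *before* requiring wukong to supply your own logger;
    # otherwise Log is bound to the memoized STDERR Logger built above.
    require 'wukong'

    Log.info "finished loading the lexicon"
    Log.dump %w[some record fields], 'label'   # the injected #dump debug-logs args.inspect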
data/lib/wukong/periodic_monitor.rb
CHANGED
@@ -1,4 +1,5 @@
-Settings.define :log_interval, :default =>
+Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
+Settings.define :log_seconds,  :default => 30,     :type => Integer, :description => 'How many seconds between log statements'
 
 #
 # Periodic monitor
@@ -9,40 +10,48 @@ Settings.define :log_interval, :default => 1000, :type => Integer, :description
 class PeriodicMonitor
   attr_reader   :iter, :start_time, :options
   attr_accessor :interval
+  attr_accessor :time_interval
 
   def initialize extra_options={}
-    @options
+    @options = {}
     @options.deep_merge!( extra_options || {} )
-    @iter
-    @start_time
-    @
-    @interval
+    @iter          = 0
+    @start_time    = now
+    @last_report   = @start_time
+    @interval      = (options[:log_interval] || Settings[:log_interval]).to_i
+    @interval      = 1000 unless @interval >= 1
+    @time_interval = (options[:log_seconds]  || Settings[:log_seconds]).to_i
   end
 
   def periodically *args, &block
     incr!
     if ready?
+      @last_report = Time.now
       if block
         block.call(iter, *args)
       else
-
+        self.emit progress(*args)
       end
     end
   end
 
+  def emit log_line
+    Log.info log_line
+  end
+
   def incr!
     @iter += 1
   end
 
   def ready?
-    iter % @interval == 0
+    (iter % @interval == 0) || (since > time_interval)
   end
 
   def progress *stuff
     [
       "%15d" % iter,
       "%7.1f"% elapsed_time, "sec",
-      "%7.1f"%
+      "%7.1f"% rate, "/sec",
       now.to_flat,
       *stuff
     ].flatten.join("\t")
@@ -51,7 +60,13 @@ class PeriodicMonitor
   def elapsed_time
     now - start_time
   end
+  def since
+    now - @last_report
+  end
   def now
     Time.now.utc
   end
+  def rate
+    iter.to_f / elapsed_time
+  end
 end
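The monitor now reports on whichever threshold trips first: every :log_interval records (default 10,000) or every :log_seconds seconds (default 30), and the default report goes through Log.info. A usage sketch outside a streamer; the input path is illustrative and the require line assumes the monitor is pulled in with the rest of wukong:

    require 'rubygems'
    require 'wukong'   # assumed to load periodic_monitor, Settings and Log

    monitor = PeriodicMonitor.new
    File.foreach('ratings.tsv') do |line|
      # ... do work on the line ...
      monitor.periodically(line.chomp[0..80])   # logs iteration count, elapsed sec, rate/sec
    end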
data/lib/wukong/script/hadoop_command.rb
CHANGED
@@ -12,27 +12,27 @@ module Wukong
   #
   # Translate simplified args to their hairy hadoop equivalents
   #
-  Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
-  Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
-  Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
-  Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
-  Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
-  Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
-  Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
-  Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
-  Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
-  Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
-  Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
-  Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
   Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
   Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
   Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
-  Settings.define :
-  Settings.define :
-  Settings.define :
+  Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
+  Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
+  Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
   Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
+  Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+  Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
+  Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
   Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
+  Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+  Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
   Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
+  Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
+  Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
+  Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
+  Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
+  Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
+  Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
+  Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
   Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
   Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
 
@@ -60,7 +60,7 @@ module Wukong
     # Use Settings[:hadoop_home] to set the path your config install.
     hadoop_commandline = [
       hadoop_runner,
-      "jar #{
+      "jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
       hadoop_jobconf_options,
       "-D mapred.job.name='#{job_name}'",
       hadoop_other_args,
@@ -68,6 +68,7 @@ module Wukong
       "-reducer '#{reducer_commandline}'",
       "-input '#{input_paths}'",
       "-output '#{output_path}'",
+      "-file '#{this_script_filename}'",
       hadoop_recycle_env,
     ].flatten.compact.join(" \t\\\n  ")
     Log.info "  Launching hadoop!"
@@ -79,8 +80,8 @@ module Wukong
     # Fixup these options
     options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
     options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-    # If no
-    options[:reduce_tasks] = 0 if (!
+    # If no reducer and no reduce_command, then skip the reduce phase
+    options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
     # Fields hadoop should use to distribute records to reducers
     unless options[:partition_fields].blank?
       jobconf_options += [
@@ -89,23 +90,24 @@ module Wukong
       ]
     end
     jobconf_options += [
-      :
-      :
-      :
-      :
-      :
-      :min_split_size,
-      :
-      :
-      :
+      :io_sort_mb, :io_sort_record_percent,
+      :map_speculative, :map_tasks,
+      :max_maps_per_cluster, :max_maps_per_node,
+      :max_node_map_tasks, :max_node_reduce_tasks,
+      :max_reduces_per_cluster, :max_reduces_per_node,
+      :max_record_length, :min_split_size,
+      :output_field_separator, :key_field_separator,
+      :partition_fields, :sort_fields,
+      :reduce_tasks, :respect_exit_status,
+      :reuse_jvms, :timeout,
     ].map{|opt| jobconf(opt)}
     jobconf_options.flatten.compact
   end
 
   def hadoop_other_args
     extra_str_args  = [ options[:extra_args] ]
-    if
-      extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{
+    if options.split_on_xml_tag
+      extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
     end
     extra_str_args << ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
     extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
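The reshuffled Settings.define list is the full set of simplified flags that get translated into hadoop jobconf arguments (each hadoop property name lives in the setting's :description), and the added -file flag ships the script itself to the cluster. A hedged sketch of the streaming invocation this assembles; the hadoop_runner path, job name format, and the exact -D lines produced by the jobconf helper are assumptions, only the mapred.job.name -D and the flag order are visible in these hunks:

    # Rough shape of the generated command (values illustrative, not verbatim):
    #
    #   /usr/local/share/hadoop/bin/hadoop \
    #     jar /usr/local/share/hadoop/contrib/streaming/hadoop-*streaming*.jar \
    #     -D mapred.reduce.tasks=32 \
    #     -D mapred.job.name='word_count.rb---input_dir---output_dir' \
    #     -mapper  'ruby word_count.rb --map ...' \
    #     -reducer 'ruby word_count.rb --reduce ...' \
    #     -input   'input_dir'  -output 'output_dir' \
    #     -file    'word_count.rb'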
data/lib/wukong/script.rb
CHANGED
@@ -1,8 +1,10 @@
 require 'pathname'
+require 'configliere' ; Settings.use(:commandline, :env_var, :define)
+require 'wukong'
 require 'wukong/script/hadoop_command'
 require 'wukong/script/local_command'
-require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
 require 'rbconfig' # for uncovering ruby_interpreter_path
+require 'wukong/streamer' ; include Wukong::Streamer
 module Wukong
   # == How to run a Wukong script
   #
@@ -63,7 +65,7 @@ module Wukong
   class Script
     include Wukong::HadoopCommand
     include Wukong::LocalCommand
-    attr_reader :
+    attr_reader :mapper, :reducer, :options
     attr_reader :input_paths, :output_path
 
     # ---------------------------------------------------------------------------
@@ -122,12 +124,12 @@ module Wukong
     #   end
     #   MyScript.new(MyMapper, nil).run
     #
-    def initialize
+    def initialize mapper, reducer=nil, extra_options={}
       Settings.resolve!
-      @options = Settings
-      options.merge
-      @
-      @
+      @options = Settings
+      options.merge extra_options
+      @mapper  = (case mapper  when Class then mapper.new  when nil then nil else mapper  ; end)
+      @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
       @output_path = options.rest.pop
       @input_paths = options.rest.reject(&:blank?)
       if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
@@ -142,8 +144,8 @@ module Wukong
     #
     def run
       case run_mode
-      when 'map'              then
-      when 'reduce'           then
+      when 'map'              then mapper.stream
+      when 'reduce'           then reducer.stream
       when 'local'            then execute_local_workflow
       when 'cassandra'        then execute_hadoop_workflow
       when 'hadoop', 'mapred' then execute_hadoop_workflow
@@ -172,8 +174,9 @@ module Wukong
     # In local mode, it's given to the system() call
     #
     def mapper_commandline
-      if
+      if mapper
         "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
       else
         options[:map_command]
       end
@@ -185,8 +188,9 @@ module Wukong
     # In local mode, it's given to the system() call
     #
     def reducer_commandline
-      if
-
+      if reducer
+        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
       else
         options[:reduce_command]
       end
@@ -228,8 +232,9 @@ module Wukong
     #
     def maybe_overwrite_output_paths! output_path
       if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
-
-
+        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
+        Log.info "Removing output file #{output_path}: #{cmd}"
+        puts `#{cmd}`
       end
     end
 
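Script#initialize now takes the mapper and reducer directly -- a class (instantiated for you), an already-built instance, or nil -- plus an options hash merged into Settings. A sketch of driving it without going through Wukong.run; the IdentityMapper class and the 'uniq -c' reduce command are made up for illustration:

    require 'rubygems'
    require 'wukong/script'

    class IdentityMapper < Wukong::Streamer::LineStreamer
      def process line
        yield line
      end
    end

    # nil means "no ruby reducer"; here an external :reduce_command is read out
    # of the merged options hash by reducer_commandline instead.
    Wukong::Script.new(IdentityMapper, nil, :reduce_command => 'uniq -c').run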
data/lib/wukong/store/cassandra_model.rb
CHANGED
@@ -26,10 +26,11 @@ module Wukong
     #
     def to_db_hash
       db_hsh = {}
-
+      each_pair{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
       db_hsh
     end
 
+
     module ClassMethods
       # Cassandra column family -- taken from the class name by default.
       def table_name
data/lib/wukong/streamer/accumulating_reducer.rb
CHANGED
@@ -15,10 +15,6 @@
   #
   class AccumulatingReducer < Wukong::Streamer::Base
     attr_accessor :key
-    def initialize options
-      super options
-      self.key = :__first_pass__
-    end
 
     #
     # override for multiple-field keys, etc.
@@ -57,15 +53,12 @@
     # start! is called on the the first record of the new key
     #
     def start! *args
-      raise %Q{start! is the new reset! -- it has args now, namely the first
-      record of the new key. It doesn\'t want #super either}
     end
 
     #
     # Override this to accumulate each record for the given key in turn.
     #
     def accumulate *args, &block
-      raise "override the accumulate method in your subclass"
     end
 
     #
@@ -73,7 +66,11 @@
     # You must override this method.
     #
     def finalize
-
+    end
+
+    # make a sentinel
+    def before_stream
+      self.key = :__first_pass__
     end
 
     # Finalize the last-seen group.
@@ -82,6 +79,5 @@
       super *args
     end
   end
-
 end
 end
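The base class no longer raises from its hook methods and now parks the :__first_pass__ sentinel in before_stream; subclasses simply override the three-phase contract of start!, accumulate, and finalize. A sketch of that contract (the two-field key/value record layout is an assumption for the example):

    class MaxValueReducer < Wukong::Streamer::AccumulatingReducer
      # called with the first record of each new key
      def start! *args
        @max = nil
      end
      # called once per record for the current key
      def accumulate key, val
        v = val.to_f
        @max = v if @max.nil? || (v > @max)
      end
      # called after the last record for the key
      def finalize
        yield [key, @max]
      end
    end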
data/lib/wukong/streamer/base.rb
CHANGED
@@ -4,13 +4,17 @@ module Wukong
 
     # Options, initially set from the command-line args -- see
     # Script#process_argv!
-
+    attr_reader :own_options
 
     #
     # Accepts option hash from script runner
     #
     def initialize options={}
-
+      @own_options = options
+    end
+
+    def options
+      Settings.deep_merge own_options
     end
 
     #
@@ -24,6 +28,7 @@ module Wukong
         process(*record) do |output_record|
           emit output_record
         end
+        monitor.periodically(record.to_s[0..1000])
       end
       after_stream
     end
@@ -64,7 +69,6 @@ module Wukong
     # Process each record in turn, yielding the records to emit
     #
     def process *args, &block
-      raise "override the process method in your implementation: it should process each record."
     end
 
     #
@@ -75,6 +79,43 @@ module Wukong
       warn "Bad record #{args.inspect[0..400]}"
       puts ["bad_record-"+key, *args].join("\t")
     end
+
+    # A periodic logger to track progress
+    def monitor
+      @monitor ||= PeriodicMonitor.new
+    end
+
+    # Defines a process method on the fly to execute the given mapper.
+    #
+    # This is still experimental.
+    # Among other limitations, you can't use ++yield++ -- you have to call
+    # emit() directly.
+    def mapper &mapper_block
+      @mapper_block = mapper_block.to_proc
+      self.instance_eval do
+        def process *args, &block
+          instance_exec(*args, &@mapper_block)
+        end
+      end
+      self
+    end
+
+    # Creates a new object of this class and injects the given block
+    # as the process method
+    def self.mapper *args, &block
+      self.new.mapper *args, &block
+    end
+
+    # Delegates back to Wukong to run this instance as a mapper
+    def run options={}
+      Wukong.run(self, nil, options)
+    end
+
+    # Creates a new object of this class and runs it
+    def self.run options={}
+      Wukong.run(self.new, nil, options)
+    end
+
   end
 end
 end
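The new block form wires the given block in as #process and hands the instance back, so a one-off mapper can be declared and run in a single expression. Per the comment above, it is experimental and you must call emit rather than yield. A sketch, assuming LineStreamer inherits these class methods from Base:

    require 'rubygems'
    require 'wukong/script'

    # LineStreamer feeds each raw line to the block; emit writes the result out.
    Wukong::Streamer::LineStreamer.mapper do |line|
      cleaned = line.to_s.strip.downcase
      emit cleaned unless cleaned.empty?
    end.run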
data/lib/wukong/streamer/counting_reducer.rb
CHANGED
@@ -1,23 +1,23 @@
 module Wukong
   module Streamer
-
     #
-    #
+    # Emit each unique key and the count of its occurrences
     #
-    class CountingReducer < AccumulatingReducer
-      attr_accessor :count
+    class CountingReducer < Wukong::Streamer::AccumulatingReducer
 
-      #
-      def start! *
-
+      # reset the counter to zero
+      def start! *args
+        @count = 0
       end
-
-
-
+
+      # record one more for this key
+      def accumulate *vals
+        @count += 1
       end
-
+
+      # emit each key field and the count, tab-separated.
       def finalize
-        yield [key, count]
+        yield [key, @count]
       end
     end
 