wukong 1.5.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
data/examples/word_count.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
module WordCount
|
6
6
|
class Mapper < Wukong::Streamer::LineStreamer
|
@@ -10,22 +10,22 @@ module WordCount
|
|
10
10
|
# This is pretty simpleminded:
|
11
11
|
# * downcase the word
|
12
12
|
# * Split at any non-alphanumeric boundary, including '_'
|
13
|
-
# * However, preserve the special cases of 's or 't at the end of a
|
13
|
+
# * However, preserve the special cases of 's, 'd or 't at the end of a
|
14
14
|
# word.
|
15
15
|
#
|
16
|
-
# tokenize("
|
17
|
-
# # => ["
|
16
|
+
# tokenize("Ability is a poor man's wealth #johnwoodenquote")
|
17
|
+
# # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
|
18
18
|
#
|
19
19
|
def tokenize str
|
20
|
-
return []
|
20
|
+
return [] if str.blank?
|
21
21
|
str = str.downcase;
|
22
22
|
# kill off all punctuation except [stuff]'s, [stuff]'d or [stuff]'t
|
23
23
|
# this includes hyphens (words are split)
|
24
24
|
str = str.
|
25
25
|
gsub(/[^a-zA-Z0-9\']+/, ' ').
|
26
|
-
gsub(/(\w)\'([
|
26
|
+
gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
|
27
27
|
# Busticate at whitespace
|
28
|
-
words = str.
|
28
|
+
words = str.split(/\s+/)
|
29
29
|
words.reject!{|w| w.blank? }
|
30
30
|
words
|
31
31
|
end
|
@@ -39,31 +39,13 @@ module WordCount
|
|
39
39
|
end
|
40
40
|
|
41
41
|
#
|
42
|
-
#
|
42
|
+
# You can stack up all the values in a list then sum them at once.
|
43
43
|
#
|
44
|
-
|
45
|
-
attr_accessor :key_count
|
46
|
-
def process word, count
|
47
|
-
@last_word ||= word
|
48
|
-
if (@last_word == word)
|
49
|
-
self.key_count += 1
|
50
|
-
else
|
51
|
-
yield [ @last_word, key_count ]
|
52
|
-
@last_word = word
|
53
|
-
end
|
54
|
-
end
|
55
|
-
def stream
|
56
|
-
emit @last_word, key_count
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# You can stack up all the values in a list then sum them at once:
|
44
|
+
# This isn't good style, as it means the whole list is held in memory
|
62
45
|
#
|
63
|
-
require 'active_support/core_ext/enumerable'
|
64
46
|
class Reducer1 < Wukong::Streamer::ListReducer
|
65
47
|
def finalize
|
66
|
-
yield [ key, values.map(&:last).map(&:to_i).
|
48
|
+
yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
|
67
49
|
end
|
68
50
|
end
|
69
51
|
|
@@ -71,11 +53,10 @@ module WordCount
|
|
71
53
|
# A bit kinder to your memory manager: accumulate the sum record-by-record:
|
72
54
|
#
|
73
55
|
class Reducer2 < Wukong::Streamer::AccumulatingReducer
|
74
|
-
|
75
|
-
def
|
76
|
-
def accumulate(*args) self.key_count += 1 end
|
56
|
+
def start!(*args) @key_count = 0 end
|
57
|
+
def accumulate(*args) @key_count += 1 end
|
77
58
|
def finalize
|
78
|
-
yield [ key, key_count ]
|
59
|
+
yield [ key, @key_count ]
|
79
60
|
end
|
80
61
|
end
|
81
62
|
|
@@ -85,11 +66,10 @@ module WordCount
|
|
85
66
|
require 'wukong/streamer/count_keys'
|
86
67
|
class Reducer3 < Wukong::Streamer::CountKeys
|
87
68
|
end
|
88
|
-
|
89
69
|
end
|
90
70
|
|
91
71
|
# Execute the script
|
92
|
-
Wukong
|
72
|
+
Wukong.run(
|
93
73
|
WordCount::Mapper,
|
94
|
-
WordCount::
|
95
|
-
)
|
74
|
+
WordCount::Reducer
|
75
|
+
)
|
data/lib/wukong/and_pig.rb
CHANGED
@@ -2,19 +2,13 @@ module Enumerable
|
|
2
2
|
#
|
3
3
|
# Convert an array of values to a string representing it as a pig tuple
|
4
4
|
#
|
5
|
-
# def to_pig_tuple
|
6
|
-
# map{|*vals| '(' + vals.join(',') + ')' }
|
7
|
-
# end
|
8
|
-
|
9
|
-
#
|
10
|
-
# Convert an array to a pig tuple
|
11
|
-
#
|
12
5
|
def to_pig_tuple
|
13
6
|
'(' + self.join(',') + ')'
|
14
7
|
end
|
8
|
+
|
15
9
|
#
|
16
10
|
# Convert an array of values to a string pig format
|
17
|
-
#
|
11
|
+
# see also to_pig_bag
|
18
12
|
#
|
19
13
|
def to_pig *args
|
20
14
|
to_pig_tuple *args
|
@@ -23,13 +17,6 @@ module Enumerable
|
|
23
17
|
#
|
24
18
|
# Convert an array of values to a string representing it as a pig bag
|
25
19
|
#
|
26
|
-
# def to_pig_bag
|
27
|
-
# '{' + self.join(',') + '}'
|
28
|
-
# end
|
29
|
-
|
30
|
-
#
|
31
|
-
# Convert and array of values to a string representing it as a pig bag
|
32
|
-
#
|
33
20
|
def to_pig_bag
|
34
21
|
'{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
|
35
22
|
end
|
data/lib/wukong/logger.rb
CHANGED
@@ -13,37 +13,15 @@ module Wukong
|
|
13
13
|
# I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
|
14
14
|
#
|
15
15
|
def self.logger
|
16
|
-
@logger
|
17
|
-
end
|
18
|
-
|
19
|
-
#
|
20
|
-
# Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
|
21
|
-
# friendly) output lines
|
22
|
-
#
|
23
|
-
def self.default_log4r_logger logger_handle='wukong'
|
24
|
-
require 'log4r'
|
25
|
-
lgr = Log4r::Logger.new logger_handle
|
26
|
-
outputter = Log4r::Outputter.stderr
|
27
|
-
# Define timestamp formatter method
|
28
|
-
::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
|
29
|
-
# 2009-07-25T00:12:05Z INFO PID\t
|
30
|
-
outputter.formatter = Log4r::PatternFormatter.new(
|
31
|
-
:pattern => "%d %.4l #{Process.pid}\t%.2000m",
|
32
|
-
:date_method => :utc_iso8601
|
33
|
-
)
|
34
|
-
lgr.outputters = outputter
|
35
|
-
lgr
|
36
|
-
end
|
37
|
-
|
38
|
-
def self.default_ruby_logger
|
16
|
+
return @logger if @logger
|
39
17
|
require 'logger'
|
40
|
-
logger = Logger.new STDERR
|
41
|
-
logger.instance_eval do
|
18
|
+
@logger = Logger.new STDERR
|
19
|
+
@logger.instance_eval do
|
42
20
|
def dump *args
|
43
21
|
debug args.inspect
|
44
22
|
end
|
45
23
|
end
|
46
|
-
logger
|
24
|
+
@logger
|
47
25
|
end
|
48
26
|
|
49
27
|
def self.logger= logger
|
@@ -54,6 +32,7 @@ end
|
|
54
32
|
#
|
55
33
|
# A convenient logger.
|
56
34
|
#
|
57
|
-
#
|
35
|
+
# define Log yourself to prevent its creation
|
58
36
|
#
|
59
|
-
Log
|
37
|
+
Log = Wukong.logger unless defined?(Log)
|
38
|
+
|
@@ -1,4 +1,5 @@
|
|
1
|
-
Settings.define :log_interval, :default =>
|
1
|
+
Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
|
2
|
+
Settings.define :log_seconds, :default => 30, :type => Integer, :description => 'How many seconds between log statements'
|
2
3
|
|
3
4
|
#
|
4
5
|
# Periodic monitor
|
@@ -9,40 +10,48 @@ Settings.define :log_interval, :default => 1000, :type => Integer, :description
|
|
9
10
|
class PeriodicMonitor
|
10
11
|
attr_reader :iter, :start_time, :options
|
11
12
|
attr_accessor :interval
|
13
|
+
attr_accessor :time_interval
|
12
14
|
|
13
15
|
def initialize extra_options={}
|
14
|
-
@options
|
16
|
+
@options = {}
|
15
17
|
@options.deep_merge!( extra_options || {} )
|
16
|
-
@iter
|
17
|
-
@start_time
|
18
|
-
@
|
19
|
-
@interval
|
18
|
+
@iter = 0
|
19
|
+
@start_time = now
|
20
|
+
@last_report = @start_time
|
21
|
+
@interval = (options[:log_interval] || Settings[:log_interval]).to_i
|
22
|
+
@interval = 1000 unless @interval >= 1
|
23
|
+
@time_interval = (options[:log_seconds] || Settings[:log_seconds]).to_i
|
20
24
|
end
|
21
25
|
|
22
26
|
def periodically *args, &block
|
23
27
|
incr!
|
24
28
|
if ready?
|
29
|
+
@last_report = Time.now
|
25
30
|
if block
|
26
31
|
block.call(iter, *args)
|
27
32
|
else
|
28
|
-
|
33
|
+
self.emit progress(*args)
|
29
34
|
end
|
30
35
|
end
|
31
36
|
end
|
32
37
|
|
38
|
+
def emit log_line
|
39
|
+
Log.info log_line
|
40
|
+
end
|
41
|
+
|
33
42
|
def incr!
|
34
43
|
@iter += 1
|
35
44
|
end
|
36
45
|
|
37
46
|
def ready?
|
38
|
-
iter % @interval == 0
|
47
|
+
(iter % @interval == 0) || (since > time_interval)
|
39
48
|
end
|
40
49
|
|
41
50
|
def progress *stuff
|
42
51
|
[
|
43
52
|
"%15d" % iter,
|
44
53
|
"%7.1f"% elapsed_time, "sec",
|
45
|
-
"%7.1f"%
|
54
|
+
"%7.1f"% rate, "/sec",
|
46
55
|
now.to_flat,
|
47
56
|
*stuff
|
48
57
|
].flatten.join("\t")
|
@@ -51,7 +60,13 @@ class PeriodicMonitor
|
|
51
60
|
def elapsed_time
|
52
61
|
now - start_time
|
53
62
|
end
|
63
|
+
def since
|
64
|
+
now - @last_report
|
65
|
+
end
|
54
66
|
def now
|
55
67
|
Time.now.utc
|
56
68
|
end
|
69
|
+
def rate
|
70
|
+
iter.to_f / elapsed_time
|
71
|
+
end
|
57
72
|
end
|
@@ -12,27 +12,27 @@ module Wukong
|
|
12
12
|
#
|
13
13
|
# Translate simplified args to their hairy hadoop equivalents
|
14
14
|
#
|
15
|
-
Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
|
16
|
-
Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
|
17
|
-
Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
|
18
|
-
Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
|
19
|
-
Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
|
20
|
-
Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
|
21
|
-
Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
|
22
|
-
Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
|
23
|
-
Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
|
24
|
-
Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
|
25
|
-
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
|
26
|
-
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
|
27
15
|
Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
|
28
16
|
Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
|
29
17
|
Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
|
30
|
-
Settings.define :
|
31
|
-
Settings.define :
|
32
|
-
Settings.define :
|
18
|
+
Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
|
19
|
+
Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
|
20
|
+
Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
|
33
21
|
Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
|
22
|
+
Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
|
23
|
+
Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
|
24
|
+
Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
|
34
25
|
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
26
|
+
Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
|
27
|
+
Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
|
35
28
|
Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
|
29
|
+
Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
|
30
|
+
Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
|
31
|
+
Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
|
32
|
+
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
|
33
|
+
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
|
34
|
+
Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
|
35
|
+
Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
|
36
36
|
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
37
37
|
Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
|
38
38
|
|
@@ -60,7 +60,7 @@ module Wukong
|
|
60
60
|
# Use Settings[:hadoop_home] to set the path to your hadoop install.
|
61
61
|
hadoop_commandline = [
|
62
62
|
hadoop_runner,
|
63
|
-
"jar #{
|
63
|
+
"jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
|
64
64
|
hadoop_jobconf_options,
|
65
65
|
"-D mapred.job.name='#{job_name}'",
|
66
66
|
hadoop_other_args,
|
@@ -68,6 +68,7 @@ module Wukong
|
|
68
68
|
"-reducer '#{reducer_commandline}'",
|
69
69
|
"-input '#{input_paths}'",
|
70
70
|
"-output '#{output_path}'",
|
71
|
+
"-file '#{this_script_filename}'",
|
71
72
|
hadoop_recycle_env,
|
72
73
|
].flatten.compact.join(" \t\\\n ")
|
73
74
|
Log.info " Launching hadoop!"
|
@@ -79,8 +80,8 @@ module Wukong
|
|
79
80
|
# Fixup these options
|
80
81
|
options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
|
81
82
|
options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
|
82
|
-
# If no
|
83
|
-
options[:reduce_tasks] = 0 if (!
|
83
|
+
# If no reducer and no reduce_command, then skip the reduce phase
|
84
|
+
options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
|
84
85
|
# Fields hadoop should use to distribute records to reducers
|
85
86
|
unless options[:partition_fields].blank?
|
86
87
|
jobconf_options += [
|
@@ -89,23 +90,24 @@ module Wukong
|
|
89
90
|
]
|
90
91
|
end
|
91
92
|
jobconf_options += [
|
92
|
-
:
|
93
|
-
:
|
94
|
-
:
|
95
|
-
:
|
96
|
-
:
|
97
|
-
:min_split_size,
|
98
|
-
:
|
99
|
-
:
|
100
|
-
:
|
93
|
+
:io_sort_mb, :io_sort_record_percent,
|
94
|
+
:map_speculative, :map_tasks,
|
95
|
+
:max_maps_per_cluster, :max_maps_per_node,
|
96
|
+
:max_node_map_tasks, :max_node_reduce_tasks,
|
97
|
+
:max_reduces_per_cluster, :max_reduces_per_node,
|
98
|
+
:max_record_length, :min_split_size,
|
99
|
+
:output_field_separator, :key_field_separator,
|
100
|
+
:partition_fields, :sort_fields,
|
101
|
+
:reduce_tasks, :respect_exit_status,
|
102
|
+
:reuse_jvms, :timeout,
|
101
103
|
].map{|opt| jobconf(opt)}
|
102
104
|
jobconf_options.flatten.compact
|
103
105
|
end
|
104
106
|
|
105
107
|
def hadoop_other_args
|
106
108
|
extra_str_args = [ options[:extra_args] ]
|
107
|
-
if
|
108
|
-
extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{
|
109
|
+
if options.split_on_xml_tag
|
110
|
+
extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
|
109
111
|
end
|
110
112
|
extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
|
111
113
|
extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
|
data/lib/wukong/script.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'pathname'
|
2
|
+
require 'configliere' ; Settings.use(:commandline, :env_var, :define)
|
3
|
+
require 'wukong'
|
2
4
|
require 'wukong/script/hadoop_command'
|
3
5
|
require 'wukong/script/local_command'
|
4
|
-
require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
|
5
6
|
require 'rbconfig' # for uncovering ruby_interpreter_path
|
7
|
+
require 'wukong/streamer' ; include Wukong::Streamer
|
6
8
|
module Wukong
|
7
9
|
# == How to run a Wukong script
|
8
10
|
#
|
@@ -63,7 +65,7 @@ module Wukong
|
|
63
65
|
class Script
|
64
66
|
include Wukong::HadoopCommand
|
65
67
|
include Wukong::LocalCommand
|
66
|
-
attr_reader :
|
68
|
+
attr_reader :mapper, :reducer, :options
|
67
69
|
attr_reader :input_paths, :output_path
|
68
70
|
|
69
71
|
# ---------------------------------------------------------------------------
|
@@ -122,12 +124,12 @@ module Wukong
|
|
122
124
|
# end
|
123
125
|
# MyScript.new(MyMapper, nil).run
|
124
126
|
#
|
125
|
-
def initialize
|
127
|
+
def initialize mapper, reducer=nil, extra_options={}
|
126
128
|
Settings.resolve!
|
127
|
-
@options = Settings
|
128
|
-
options.merge
|
129
|
-
@
|
130
|
-
@
|
129
|
+
@options = Settings
|
130
|
+
options.merge extra_options
|
131
|
+
@mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
|
132
|
+
@reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
|
131
133
|
@output_path = options.rest.pop
|
132
134
|
@input_paths = options.rest.reject(&:blank?)
|
133
135
|
if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
|
@@ -142,8 +144,8 @@ module Wukong
|
|
142
144
|
#
|
143
145
|
def run
|
144
146
|
case run_mode
|
145
|
-
when 'map' then
|
146
|
-
when 'reduce' then
|
147
|
+
when 'map' then mapper.stream
|
148
|
+
when 'reduce' then reducer.stream
|
147
149
|
when 'local' then execute_local_workflow
|
148
150
|
when 'cassandra' then execute_hadoop_workflow
|
149
151
|
when 'hadoop', 'mapred' then execute_hadoop_workflow
|
@@ -172,8 +174,9 @@ module Wukong
|
|
172
174
|
# In local mode, it's given to the system() call
|
173
175
|
#
|
174
176
|
def mapper_commandline
|
175
|
-
if
|
177
|
+
if mapper
|
176
178
|
"#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
|
179
|
+
# "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
|
177
180
|
else
|
178
181
|
options[:map_command]
|
179
182
|
end
|
@@ -185,8 +188,9 @@ module Wukong
|
|
185
188
|
# In local mode, it's given to the system() call
|
186
189
|
#
|
187
190
|
def reducer_commandline
|
188
|
-
if
|
189
|
-
|
191
|
+
if reducer
|
192
|
+
"#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
|
193
|
+
# "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
|
190
194
|
else
|
191
195
|
options[:reduce_command]
|
192
196
|
end
|
@@ -228,8 +232,9 @@ module Wukong
|
|
228
232
|
#
|
229
233
|
def maybe_overwrite_output_paths! output_path
|
230
234
|
if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
|
231
|
-
|
232
|
-
|
235
|
+
cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
|
236
|
+
Log.info "Removing output file #{output_path}: #{cmd}"
|
237
|
+
puts `#{cmd}`
|
233
238
|
end
|
234
239
|
end
|
235
240
|
|
@@ -26,10 +26,11 @@ module Wukong
|
|
26
26
|
#
|
27
27
|
def to_db_hash
|
28
28
|
db_hsh = {}
|
29
|
-
|
29
|
+
each_pair{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
|
30
30
|
db_hsh
|
31
31
|
end
|
32
32
|
|
33
|
+
|
33
34
|
module ClassMethods
|
34
35
|
# Cassandra column family -- taken from the class name by default.
|
35
36
|
def table_name
|
@@ -15,10 +15,6 @@
|
|
15
15
|
#
|
16
16
|
class AccumulatingReducer < Wukong::Streamer::Base
|
17
17
|
attr_accessor :key
|
18
|
-
def initialize options
|
19
|
-
super options
|
20
|
-
self.key = :__first_pass__
|
21
|
-
end
|
22
18
|
|
23
19
|
#
|
24
20
|
# override for multiple-field keys, etc.
|
@@ -57,15 +53,12 @@
|
|
57
53
|
# start! is called on the first record of the new key
|
58
54
|
#
|
59
55
|
def start! *args
|
60
|
-
raise %Q{start! is the new reset! -- it has args now, namely the first
|
61
|
-
record of the new key. It doesn\'t want #super either}
|
62
56
|
end
|
63
57
|
|
64
58
|
#
|
65
59
|
# Override this to accumulate each record for the given key in turn.
|
66
60
|
#
|
67
61
|
def accumulate *args, &block
|
68
|
-
raise "override the accumulate method in your subclass"
|
69
62
|
end
|
70
63
|
|
71
64
|
#
|
@@ -73,7 +66,11 @@
|
|
73
66
|
# You must override this method.
|
74
67
|
#
|
75
68
|
def finalize
|
76
|
-
|
69
|
+
end
|
70
|
+
|
71
|
+
# make a sentinel
|
72
|
+
def before_stream
|
73
|
+
self.key = :__first_pass__
|
77
74
|
end
|
78
75
|
|
79
76
|
# Finalize the last-seen group.
|
@@ -82,6 +79,5 @@
|
|
82
79
|
super *args
|
83
80
|
end
|
84
81
|
end
|
85
|
-
|
86
82
|
end
|
87
83
|
end
|
data/lib/wukong/streamer/base.rb
CHANGED
@@ -4,13 +4,17 @@ module Wukong
|
|
4
4
|
|
5
5
|
# Options, initially set from the command-line args -- see
|
6
6
|
# Script#process_argv!
|
7
|
-
|
7
|
+
attr_reader :own_options
|
8
8
|
|
9
9
|
#
|
10
10
|
# Accepts option hash from script runner
|
11
11
|
#
|
12
12
|
def initialize options={}
|
13
|
-
|
13
|
+
@own_options = options
|
14
|
+
end
|
15
|
+
|
16
|
+
def options
|
17
|
+
Settings.deep_merge own_options
|
14
18
|
end
|
15
19
|
|
16
20
|
#
|
@@ -24,6 +28,7 @@ module Wukong
|
|
24
28
|
process(*record) do |output_record|
|
25
29
|
emit output_record
|
26
30
|
end
|
31
|
+
monitor.periodically(record.to_s[0..1000])
|
27
32
|
end
|
28
33
|
after_stream
|
29
34
|
end
|
@@ -64,7 +69,6 @@ module Wukong
|
|
64
69
|
# Process each record in turn, yielding the records to emit
|
65
70
|
#
|
66
71
|
def process *args, &block
|
67
|
-
raise "override the process method in your implementation: it should process each record."
|
68
72
|
end
|
69
73
|
|
70
74
|
#
|
@@ -75,6 +79,43 @@ module Wukong
|
|
75
79
|
warn "Bad record #{args.inspect[0..400]}"
|
76
80
|
puts ["bad_record-"+key, *args].join("\t")
|
77
81
|
end
|
82
|
+
|
83
|
+
# A periodic logger to track progress
|
84
|
+
def monitor
|
85
|
+
@monitor ||= PeriodicMonitor.new
|
86
|
+
end
|
87
|
+
|
88
|
+
# Defines a process method on the fly to execute the given mapper.
|
89
|
+
#
|
90
|
+
# This is still experimental.
|
91
|
+
# Among other limitations, you can't use ++yield++ -- you have to call
|
92
|
+
# emit() directly.
|
93
|
+
def mapper &mapper_block
|
94
|
+
@mapper_block = mapper_block.to_proc
|
95
|
+
self.instance_eval do
|
96
|
+
def process *args, &block
|
97
|
+
instance_exec(*args, &@mapper_block)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
self
|
101
|
+
end
|
102
|
+
|
103
|
+
# Creates a new object of this class and injects the given block
|
104
|
+
# as the process method
|
105
|
+
def self.mapper *args, &block
|
106
|
+
self.new.mapper *args, &block
|
107
|
+
end
|
108
|
+
|
109
|
+
# Delegates back to Wukong to run this instance as a mapper
|
110
|
+
def run options={}
|
111
|
+
Wukong.run(self, nil, options)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Creates a new object of this class and runs it
|
115
|
+
def self.run options={}
|
116
|
+
Wukong.run(self.new, nil, options)
|
117
|
+
end
|
118
|
+
|
78
119
|
end
|
79
120
|
end
|
80
121
|
end
|
@@ -1,23 +1,23 @@
|
|
1
1
|
module Wukong
|
2
2
|
module Streamer
|
3
|
-
|
4
3
|
#
|
5
|
-
#
|
4
|
+
# Emit each unique key and the count of its occurrences
|
6
5
|
#
|
7
|
-
class CountingReducer < AccumulatingReducer
|
8
|
-
attr_accessor :count
|
6
|
+
class CountingReducer < Wukong::Streamer::AccumulatingReducer
|
9
7
|
|
10
|
-
#
|
11
|
-
def start! *
|
12
|
-
|
8
|
+
# reset the counter to zero
|
9
|
+
def start! *args
|
10
|
+
@count = 0
|
13
11
|
end
|
14
|
-
|
15
|
-
|
16
|
-
|
12
|
+
|
13
|
+
# record one more for this key
|
14
|
+
def accumulate *vals
|
15
|
+
@count += 1
|
17
16
|
end
|
18
|
-
|
17
|
+
|
18
|
+
# emit each key field and the count, tab-separated.
|
19
19
|
def finalize
|
20
|
-
yield [key, count]
|
20
|
+
yield [key, @count]
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|