wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'rubygems'
3
- require 'wukong'
3
+ require 'wukong/script'
4
4
 
5
5
  module WordCount
6
6
  class Mapper < Wukong::Streamer::LineStreamer
@@ -10,22 +10,22 @@ module WordCount
10
10
  # This is pretty simpleminded:
11
11
  # * downcase the word
12
12
  # * Split at any non-alphanumeric boundary, including '_'
13
- # * However, preserve the special cases of 's or 't at the end of a
13
+ # * However, preserve the special cases of 's, 'd or 't at the end of a
14
14
  # word.
15
15
  #
16
- # tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
17
- # # => ["jim's", "dawd", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
16
+ # tokenize("Ability is a poor man's wealth #johnwoodenquote")
17
+ # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
18
18
  #
19
19
  def tokenize str
20
- return [] unless str
20
+ return [] if str.blank?
21
21
  str = str.downcase;
22
22
  # kill off all punctuation except [stuff]'s or [stuff]'t
23
23
  # this includes hyphens (words are split)
24
24
  str = str.
25
25
  gsub(/[^a-zA-Z0-9\']+/, ' ').
26
- gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
26
+ gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
27
27
  # Busticate at whitespace
28
- words = str.strip.split(/\s+/)
28
+ words = str.split(/\s+/)
29
29
  words.reject!{|w| w.blank? }
30
30
  words
31
31
  end
@@ -39,31 +39,13 @@ module WordCount
39
39
  end
40
40
 
41
41
  #
42
- # Accumulate the sum record-by-record:
42
+ # You can stack up all the values in a list then sum them at once.
43
43
  #
44
- class Reducer0 < Wukong::Streamer::Base
45
- attr_accessor :key_count
46
- def process word, count
47
- @last_word ||= word
48
- if (@last_word == word)
49
- self.key_count += 1
50
- else
51
- yield [ @last_word, key_count ]
52
- @last_word = word
53
- end
54
- end
55
- def stream
56
- emit @last_word, key_count
57
- end
58
- end
59
-
60
- #
61
- # You can stack up all the values in a list then sum them at once:
44
+ # This isn't good style, as it means the whole list is held in memory
62
45
  #
63
- require 'active_support/core_ext/enumerable'
64
46
  class Reducer1 < Wukong::Streamer::ListReducer
65
47
  def finalize
66
- yield [ key, values.map(&:last).map(&:to_i).sum ]
48
+ yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
67
49
  end
68
50
  end
69
51
 
@@ -71,11 +53,10 @@ module WordCount
71
53
  # A bit kinder to your memory manager: accumulate the sum record-by-record:
72
54
  #
73
55
  class Reducer2 < Wukong::Streamer::AccumulatingReducer
74
- attr_accessor :key_count
75
- def start!(*args) self.key_count = 0 end
76
- def accumulate(*args) self.key_count += 1 end
56
+ def start!(*args) @key_count = 0 end
57
+ def accumulate(*args) @key_count += 1 end
77
58
  def finalize
78
- yield [ key, key_count ]
59
+ yield [ key, @key_count ]
79
60
  end
80
61
  end
81
62
 
@@ -85,11 +66,10 @@ module WordCount
85
66
  require 'wukong/streamer/count_keys'
86
67
  class Reducer3 < Wukong::Streamer::CountKeys
87
68
  end
88
-
89
69
  end
90
70
 
91
71
  # Execute the script
92
- Wukong::Script.new(
72
+ Wukong.run(
93
73
  WordCount::Mapper,
94
- WordCount::Reducer1
95
- ).run
74
+ WordCount::Reducer
75
+ )
@@ -2,19 +2,13 @@ module Enumerable
2
2
  #
3
3
  # Convert an array of values to a string representing it as a pig tuple
4
4
  #
5
- # def to_pig_tuple
6
- # map{|*vals| '(' + vals.join(',') + ')' }
7
- # end
8
-
9
- #
10
- # Convert an array to a pig tuple
11
- #
12
5
  def to_pig_tuple
13
6
  '(' + self.join(',') + ')'
14
7
  end
8
+
15
9
  #
16
10
  # Convert an array of values to a string pig format
17
- # Delegates to to_pig_tuple -- see also to_pig_bag
11
+ # see also to_pig_bag
18
12
  #
19
13
  def to_pig *args
20
14
  to_pig_tuple *args
@@ -23,13 +17,6 @@ module Enumerable
23
17
  #
24
18
  # Convert an array of values to a string representing it as a pig bag
25
19
  #
26
- # def to_pig_bag
27
- # '{' + self.join(',') + '}'
28
- # end
29
-
30
- #
31
- # Convert and array of values to a string representing it as a pig bag
32
- #
33
20
  def to_pig_bag
34
21
  '{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
35
22
  end
data/lib/wukong/logger.rb CHANGED
@@ -13,37 +13,15 @@ module Wukong
13
13
  # I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
14
14
  #
15
15
  def self.logger
16
- @logger ||= default_ruby_logger
17
- end
18
-
19
- #
20
- # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
21
- # friendly) output lines
22
- #
23
- def self.default_log4r_logger logger_handle='wukong'
24
- require 'log4r'
25
- lgr = Log4r::Logger.new logger_handle
26
- outputter = Log4r::Outputter.stderr
27
- # Define timestamp formatter method
28
- ::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
29
- # 2009-07-25T00:12:05Z INFO PID\t
30
- outputter.formatter = Log4r::PatternFormatter.new(
31
- :pattern => "%d %.4l #{Process.pid}\t%.2000m",
32
- :date_method => :utc_iso8601
33
- )
34
- lgr.outputters = outputter
35
- lgr
36
- end
37
-
38
- def self.default_ruby_logger
16
+ return @logger if @logger
39
17
  require 'logger'
40
- logger = Logger.new STDERR
41
- logger.instance_eval do
18
+ @logger = Logger.new STDERR
19
+ @logger.instance_eval do
42
20
  def dump *args
43
21
  debug args.inspect
44
22
  end
45
23
  end
46
- logger
24
+ @logger
47
25
  end
48
26
 
49
27
  def self.logger= logger
@@ -54,6 +32,7 @@ end
54
32
  #
55
33
  # A convenient logger.
56
34
  #
57
- # Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
35
+ # define Log yourself to prevent its creation
58
36
  #
59
- Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
37
+ Log = Wukong.logger unless defined?(Log)
38
+
@@ -1,4 +1,5 @@
1
- Settings.define :log_interval, :default => 1000, :type => Integer, :description => 'How many iterations between log statements'
1
+ Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
2
+ Settings.define :log_seconds, :default => 30, :type => Integer, :description => 'How many seconds between log statements'
2
3
 
3
4
  #
4
5
  # Periodic monitor
@@ -9,40 +10,48 @@ Settings.define :log_interval, :default => 1000, :type => Integer, :description
9
10
  class PeriodicMonitor
10
11
  attr_reader :iter, :start_time, :options
11
12
  attr_accessor :interval
13
+ attr_accessor :time_interval
12
14
 
13
15
  def initialize extra_options={}
14
- @options = {}
16
+ @options = {}
15
17
  @options.deep_merge!( extra_options || {} )
16
- @iter = 0
17
- @start_time = now
18
- @interval = (options[:log_interval] || Settings[:log_interval]).to_i
19
- @interval = 1000 unless @interval >= 1
18
+ @iter = 0
19
+ @start_time = now
20
+ @last_report = @start_time
21
+ @interval = (options[:log_interval] || Settings[:log_interval]).to_i
22
+ @interval = 1000 unless @interval >= 1
23
+ @time_interval = (options[:log_seconds] || Settings[:log_seconds]).to_i
20
24
  end
21
25
 
22
26
  def periodically *args, &block
23
27
  incr!
24
28
  if ready?
29
+ @last_report = Time.now
25
30
  if block
26
31
  block.call(iter, *args)
27
32
  else
28
- $stderr.puts progress(*args)
33
+ self.emit progress(*args)
29
34
  end
30
35
  end
31
36
  end
32
37
 
38
+ def emit log_line
39
+ Log.info log_line
40
+ end
41
+
33
42
  def incr!
34
43
  @iter += 1
35
44
  end
36
45
 
37
46
  def ready?
38
- iter % @interval == 0
47
+ (iter % @interval == 0) || (since > time_interval)
39
48
  end
40
49
 
41
50
  def progress *stuff
42
51
  [
43
52
  "%15d" % iter,
44
53
  "%7.1f"% elapsed_time, "sec",
45
- "%7.1f"%(iter.to_f / elapsed_time), "/sec",
54
+ "%7.1f"% rate, "/sec",
46
55
  now.to_flat,
47
56
  *stuff
48
57
  ].flatten.join("\t")
@@ -51,7 +60,13 @@ class PeriodicMonitor
51
60
  def elapsed_time
52
61
  now - start_time
53
62
  end
63
+ def since
64
+ now - @last_report
65
+ end
54
66
  def now
55
67
  Time.now.utc
56
68
  end
69
+ def rate
70
+ iter.to_f / elapsed_time
71
+ end
57
72
  end
@@ -49,6 +49,7 @@ module Wukong
49
49
  end
50
50
 
51
51
  def execute_emr_runner
52
+ # fix_paths!
52
53
  command_args = []
53
54
  if Settings.jobflow
54
55
  command_args << Settings.dashed_flag_for(:jobflow)
@@ -12,27 +12,27 @@ module Wukong
12
12
  #
13
13
  # Translate simplified args to their hairy hadoop equivalents
14
14
  #
15
- Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
16
- Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
17
- Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
18
- Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
19
- Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
20
- Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
21
- Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
22
- Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
23
- Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
24
- Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
25
- Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
26
- Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
27
15
  Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
28
16
  Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
29
17
  Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
30
- Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
31
- Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
32
- Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
18
+ Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
19
+ Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
20
+ Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
33
21
  Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
22
+ Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
23
+ Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
24
+ Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
34
25
  Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
26
+ Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
27
+ Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
35
28
  Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
29
+ Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
30
+ Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
31
+ Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
32
+ Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
33
+ Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
34
+ Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
35
+ Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
36
36
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
37
37
  Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
38
38
 
@@ -60,7 +60,7 @@ module Wukong
60
60
  # Use Settings[:hadoop_home] to set the path your config install.
61
61
  hadoop_commandline = [
62
62
  hadoop_runner,
63
- "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
63
+ "jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
64
64
  hadoop_jobconf_options,
65
65
  "-D mapred.job.name='#{job_name}'",
66
66
  hadoop_other_args,
@@ -68,6 +68,7 @@ module Wukong
68
68
  "-reducer '#{reducer_commandline}'",
69
69
  "-input '#{input_paths}'",
70
70
  "-output '#{output_path}'",
71
+ "-file '#{this_script_filename}'",
71
72
  hadoop_recycle_env,
72
73
  ].flatten.compact.join(" \t\\\n ")
73
74
  Log.info " Launching hadoop!"
@@ -79,8 +80,8 @@ module Wukong
79
80
  # Fixup these options
80
81
  options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
81
82
  options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
82
- # If no reducer_klass and no reduce_command, then skip the reduce phase
83
- options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
83
+ # If no reducer and no reduce_command, then skip the reduce phase
84
+ options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
84
85
  # Fields hadoop should use to distribute records to reducers
85
86
  unless options[:partition_fields].blank?
86
87
  jobconf_options += [
@@ -89,23 +90,24 @@ module Wukong
89
90
  ]
90
91
  end
91
92
  jobconf_options += [
92
- :key_field_separator, :sort_fields,
93
- :map_tasks, :reduce_tasks,
94
- :max_node_map_tasks, :max_node_reduce_tasks,
95
- :max_reduces_per_node, :max_reduces_per_cluster,
96
- :max_maps_per_node, :max_maps_per_cluster,
97
- :min_split_size,
98
- :map_speculative,
99
- :timeout,
100
- :reuse_jvms, :respect_exit_status
93
+ :io_sort_mb, :io_sort_record_percent,
94
+ :map_speculative, :map_tasks,
95
+ :max_maps_per_cluster, :max_maps_per_node,
96
+ :max_node_map_tasks, :max_node_reduce_tasks,
97
+ :max_reduces_per_cluster, :max_reduces_per_node,
98
+ :max_record_length, :min_split_size,
99
+ :output_field_separator, :key_field_separator,
100
+ :partition_fields, :sort_fields,
101
+ :reduce_tasks, :respect_exit_status,
102
+ :reuse_jvms, :timeout,
101
103
  ].map{|opt| jobconf(opt)}
102
104
  jobconf_options.flatten.compact
103
105
  end
104
106
 
105
107
  def hadoop_other_args
106
108
  extra_str_args = [ options[:extra_args] ]
107
- if Settings.split_on_xml_tag
108
- extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
109
+ if options.split_on_xml_tag
110
+ extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
109
111
  end
110
112
  extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
111
113
  extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
data/lib/wukong/script.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  require 'pathname'
2
+ require 'configliere' ; Settings.use(:commandline, :env_var, :define)
3
+ require 'wukong'
2
4
  require 'wukong/script/hadoop_command'
3
5
  require 'wukong/script/local_command'
4
- require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
5
6
  require 'rbconfig' # for uncovering ruby_interpreter_path
7
+ require 'wukong/streamer' ; include Wukong::Streamer
6
8
  module Wukong
7
9
  # == How to run a Wukong script
8
10
  #
@@ -63,7 +65,7 @@ module Wukong
63
65
  class Script
64
66
  include Wukong::HadoopCommand
65
67
  include Wukong::LocalCommand
66
- attr_reader :mapper_klass, :reducer_klass, :options
68
+ attr_reader :mapper, :reducer, :options
67
69
  attr_reader :input_paths, :output_path
68
70
 
69
71
  # ---------------------------------------------------------------------------
@@ -122,12 +124,12 @@ module Wukong
122
124
  # end
123
125
  # MyScript.new(MyMapper, nil).run
124
126
  #
125
- def initialize mapper_klass, reducer_klass=nil, extra_options={}
127
+ def initialize mapper, reducer=nil, extra_options={}
126
128
  Settings.resolve!
127
- @options = Settings.dup
128
- options.merge! extra_options
129
- @mapper_klass = mapper_klass
130
- @reducer_klass = reducer_klass
129
+ @options = Settings
130
+ options.merge extra_options
131
+ @mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
132
+ @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
131
133
  @output_path = options.rest.pop
132
134
  @input_paths = options.rest.reject(&:blank?)
133
135
  if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
@@ -142,8 +144,8 @@ module Wukong
142
144
  #
143
145
  def run
144
146
  case run_mode
145
- when 'map' then mapper_klass.new(self.options).stream
146
- when 'reduce' then reducer_klass.new(self.options).stream
147
+ when 'map' then mapper.stream
148
+ when 'reduce' then reducer.stream
147
149
  when 'local' then execute_local_workflow
148
150
  when 'cassandra' then execute_hadoop_workflow
149
151
  when 'hadoop', 'mapred' then execute_hadoop_workflow
@@ -172,8 +174,9 @@ module Wukong
172
174
  # In local mode, it's given to the system() call
173
175
  #
174
176
  def mapper_commandline
175
- if mapper_klass
177
+ if mapper
176
178
  "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
179
+ # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
177
180
  else
178
181
  options[:map_command]
179
182
  end
@@ -185,8 +188,9 @@ module Wukong
185
188
  # In local mode, it's given to the system() call
186
189
  #
187
190
  def reducer_commandline
188
- if reducer_klass
189
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
191
+ if reducer
192
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
193
+ # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
190
194
  else
191
195
  options[:reduce_command]
192
196
  end
@@ -228,8 +232,9 @@ module Wukong
228
232
  #
229
233
  def maybe_overwrite_output_paths! output_path
230
234
  if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
231
- Log.info "Removing output file #{output_path}"
232
- `hdp-rm -r '#{output_path}'`
235
+ cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
236
+ Log.info "Removing output file #{output_path}: #{cmd}"
237
+ puts `#{cmd}`
233
238
  end
234
239
  end
235
240
 
@@ -26,10 +26,11 @@ module Wukong
26
26
  #
27
27
  def to_db_hash
28
28
  db_hsh = {}
29
- to_hash.each{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
29
+ each_pair{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
30
30
  db_hsh
31
31
  end
32
32
 
33
+
33
34
  module ClassMethods
34
35
  # Cassandra column family -- taken from the class name by default.
35
36
  def table_name
@@ -15,10 +15,6 @@
15
15
  #
16
16
  class AccumulatingReducer < Wukong::Streamer::Base
17
17
  attr_accessor :key
18
- def initialize options
19
- super options
20
- self.key = :__first_pass__
21
- end
22
18
 
23
19
  #
24
20
  # override for multiple-field keys, etc.
@@ -57,15 +53,12 @@
57
53
  # start! is called on the the first record of the new key
58
54
  #
59
55
  def start! *args
60
- raise %Q{start! is the new reset! -- it has args now, namely the first
61
- record of the new key. It doesn\'t want #super either}
62
56
  end
63
57
 
64
58
  #
65
59
  # Override this to accumulate each record for the given key in turn.
66
60
  #
67
61
  def accumulate *args, &block
68
- raise "override the accumulate method in your subclass"
69
62
  end
70
63
 
71
64
  #
@@ -73,7 +66,11 @@
73
66
  # You must override this method.
74
67
  #
75
68
  def finalize
76
- raise "override the finalize method in your subclass"
69
+ end
70
+
71
+ # make a sentinel
72
+ def before_stream
73
+ self.key = :__first_pass__
77
74
  end
78
75
 
79
76
  # Finalize the last-seen group.
@@ -82,6 +79,5 @@
82
79
  super *args
83
80
  end
84
81
  end
85
-
86
82
  end
87
83
  end
@@ -4,13 +4,17 @@ module Wukong
4
4
 
5
5
  # Options, initially set from the command-line args -- see
6
6
  # Script#process_argv!
7
- attr_accessor :options
7
+ attr_reader :own_options
8
8
 
9
9
  #
10
10
  # Accepts option hash from script runner
11
11
  #
12
12
  def initialize options={}
13
- self.options = options
13
+ @own_options = options
14
+ end
15
+
16
+ def options
17
+ Settings.deep_merge own_options
14
18
  end
15
19
 
16
20
  #
@@ -24,6 +28,7 @@ module Wukong
24
28
  process(*record) do |output_record|
25
29
  emit output_record
26
30
  end
31
+ monitor.periodically(record.to_s[0..1000])
27
32
  end
28
33
  after_stream
29
34
  end
@@ -64,7 +69,6 @@ module Wukong
64
69
  # Process each record in turn, yielding the records to emit
65
70
  #
66
71
  def process *args, &block
67
- raise "override the process method in your implementation: it should process each record."
68
72
  end
69
73
 
70
74
  #
@@ -75,6 +79,43 @@ module Wukong
75
79
  warn "Bad record #{args.inspect[0..400]}"
76
80
  puts ["bad_record-"+key, *args].join("\t")
77
81
  end
82
+
83
+ # A periodic logger to track progress
84
+ def monitor
85
+ @monitor ||= PeriodicMonitor.new
86
+ end
87
+
88
+ # Defines a process method on the fly to execute the given mapper.
89
+ #
90
+ # This is still experimental.
91
+ # Among other limitations, you can't use ++yield++ -- you have to call
92
+ # emit() directly.
93
+ def mapper &mapper_block
94
+ @mapper_block = mapper_block.to_proc
95
+ self.instance_eval do
96
+ def process *args, &block
97
+ instance_exec(*args, &@mapper_block)
98
+ end
99
+ end
100
+ self
101
+ end
102
+
103
+ # Creates a new object of this class and injects the given block
104
+ # as the process method
105
+ def self.mapper *args, &block
106
+ self.new.mapper *args, &block
107
+ end
108
+
109
+ # Delegates back to Wukong to run this instance as a mapper
110
+ def run options={}
111
+ Wukong.run(self, nil, options)
112
+ end
113
+
114
+ # Creates a new object of this class and runs it
115
+ def self.run options={}
116
+ Wukong.run(self.new, nil, options)
117
+ end
118
+
78
119
  end
79
120
  end
80
121
  end
@@ -1,23 +1,23 @@
1
1
  module Wukong
2
2
  module Streamer
3
-
4
3
  #
5
- # Count the number of records for each key.
4
+ # Emit each unique key and the count of its occurrences
6
5
  #
7
- class CountingReducer < AccumulatingReducer
8
- attr_accessor :count
6
+ class CountingReducer < Wukong::Streamer::AccumulatingReducer
9
7
 
10
- # start the sum with 0 for each key
11
- def start! *_
12
- self.count = 0
8
+ # reset the counter to zero
9
+ def start! *args
10
+ @count = 0
13
11
  end
14
- # ... and count the number of records for this key
15
- def accumulate *_
16
- self.count += 1
12
+
13
+ # record one more for this key
14
+ def accumulate *vals
15
+ @count += 1
17
16
  end
18
- # emit [key, count]
17
+
18
+ # emit each key field and the count, tab-separated.
19
19
  def finalize
20
- yield [key, count].flatten
20
+ yield [key, @count]
21
21
  end
22
22
  end
23
23
 
@@ -12,8 +12,8 @@ module Wukong
12
12
  #
13
13
  # Subclass and re-define the emit? method
14
14
  #
15
- def process *record, &block
16
- yield record if emit?(record)
15
+ def process *record
16
+ yield record if emit?(*record)
17
17
  end
18
18
  end
19
19
  end