wukong 1.5.4 → 2.0.0

Files changed (87)
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
data/examples/word_count.rb CHANGED
@@ -1,6 +1,6 @@
  #!/usr/bin/env ruby
  require 'rubygems'
- require 'wukong'
+ require 'wukong/script'
 
  module WordCount
    class Mapper < Wukong::Streamer::LineStreamer
@@ -10,22 +10,22 @@ module WordCount
      # This is pretty simpleminded:
      # * downcase the word
      # * Split at any non-alphanumeric boundary, including '_'
-     # * However, preserve the special cases of 's or 't at the end of a
+     # * However, preserve the special cases of 's, 'd or 't at the end of a
      #   word.
      #
-     #   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
-     #   # => ["jim's", "dawd", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
+     #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
+     #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
      #
      def tokenize str
-       return [] unless str
+       return [] if str.blank?
        str = str.downcase;
        # kill off all punctuation except [stuff]'s or [stuff]'t
        # this includes hyphens (words are split)
        str = str.
          gsub(/[^a-zA-Z0-9\']+/, ' ').
-         gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+         gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
        # Busticate at whitespace
-       words = str.strip.split(/\s+/)
+       words = str.split(/\s+/)
        words.reject!{|w| w.blank? }
        words
      end
@@ -39,31 +39,13 @@ module WordCount
    end
 
    #
-   # Accumulate the sum record-by-record:
+   # You can stack up all the values in a list then sum them at once.
    #
-   class Reducer0 < Wukong::Streamer::Base
-     attr_accessor :key_count
-     def process word, count
-       @last_word ||= word
-       if (@last_word == word)
-         self.key_count += 1
-       else
-         yield [ @last_word, key_count ]
-         @last_word = word
-       end
-     end
-     def stream
-       emit @last_word, key_count
-     end
-   end
-
-   #
-   # You can stack up all the values in a list then sum them at once:
+   # This isn't good style, as it means the whole list is held in memory
    #
-   require 'active_support/core_ext/enumerable'
    class Reducer1 < Wukong::Streamer::ListReducer
      def finalize
-       yield [ key, values.map(&:last).map(&:to_i).sum ]
+       yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
      end
    end
 
@@ -71,11 +53,10 @@ module WordCount
    # A bit kinder to your memory manager: accumulate the sum record-by-record:
    #
    class Reducer2 < Wukong::Streamer::AccumulatingReducer
-     attr_accessor :key_count
-     def start!(*args)     self.key_count = 0  end
-     def accumulate(*args) self.key_count += 1 end
+     def start!(*args)     @key_count = 0  end
+     def accumulate(*args) @key_count += 1 end
      def finalize
-       yield [ key, key_count ]
+       yield [ key, @key_count ]
      end
    end
 
@@ -85,11 +66,10 @@ module WordCount
    require 'wukong/streamer/count_keys'
    class Reducer3 < Wukong::Streamer::CountKeys
    end
-
  end
 
  # Execute the script
- Wukong::Script.new(
+ Wukong.run(
    WordCount::Mapper,
-   WordCount::Reducer1
- ).run
+   WordCount::Reducer
+ )
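
The word_count example captures the headline API change in 2.0.0: scripts now require 'wukong/script' and hand their mapper and reducer straight to Wukong.run instead of building a Wukong::Script by hand. A minimal sketch of the new launch pattern -- the MyMapper/MyReducer names are placeholders, and the --run flag in the comment follows wukong 1.x convention rather than anything shown in this diff:

    #!/usr/bin/env ruby
    require 'rubygems'
    require 'wukong/script'

    class MyMapper < Wukong::Streamer::LineStreamer
      # receives each input line; yield as many records as you like
      def process line
        yield [line.strip.length, line.strip]
      end
    end

    class MyReducer < Wukong::Streamer::ListReducer
      # key and values are provided by ListReducer; emit one record per key
      def finalize
        yield [key, values.size]
      end
    end

    # run it with e.g.  ./my_script.rb --run input.tsv output_dir
    Wukong.run(MyMapper, MyReducer)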
data/lib/wukong/and_pig.rb CHANGED
@@ -2,19 +2,13 @@ module Enumerable
  #
  # Convert an array of values to a string representing it as a pig tuple
  #
- # def to_pig_tuple
- #   map{|*vals| '(' + vals.join(',') + ')' }
- # end
-
- #
- # Convert an array to a pig tuple
- #
  def to_pig_tuple
    '(' + self.join(',') + ')'
  end
+
  #
  # Convert an array of values to a string pig format
- # Delegates to to_pig_tuple -- see also to_pig_bag
+ # see also to_pig_bag
  #
  def to_pig *args
    to_pig_tuple *args
@@ -23,13 +17,6 @@ module Enumerable
  #
  # Convert an array of values to a string representing it as a pig bag
  #
- # def to_pig_bag
- #   '{' + self.join(',') + '}'
- # end
-
- #
- # Convert and array of values to a string representing it as a pig bag
- #
  def to_pig_bag
    '{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
  end
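
For reference, here is roughly what the surviving helpers produce -- a quick sketch, assuming the file is loaded via its path under lib:

    require 'rubygems'
    require 'wukong/and_pig'

    [1, 2, 3].to_pig_tuple            # => "(1,2,3)"
    [1, 2, 3].to_pig                  # same thing -- delegates to to_pig_tuple
    [[1, 2], [3, 4]].to_pig_bag       # => "{(1,2),(3,4)}"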
data/lib/wukong/logger.rb CHANGED
@@ -13,37 +13,15 @@ module Wukong
  #    I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
  #
  def self.logger
-   @logger ||= default_ruby_logger
- end
-
- #
- # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
- # friendly) output lines
- #
- def self.default_log4r_logger logger_handle='wukong'
-   require 'log4r'
-   lgr = Log4r::Logger.new logger_handle
-   outputter = Log4r::Outputter.stderr
-   # Define timestamp formatter method
-   ::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
-   # 2009-07-25T00:12:05Z INFO PID\t
-   outputter.formatter = Log4r::PatternFormatter.new(
-     :pattern => "%d %.4l #{Process.pid}\t%.2000m",
-     :date_method => :utc_iso8601
-   )
-   lgr.outputters = outputter
-   lgr
- end
-
- def self.default_ruby_logger
+   return @logger if @logger
    require 'logger'
-   logger = Logger.new STDERR
-   logger.instance_eval do
+   @logger = Logger.new STDERR
+   @logger.instance_eval do
      def dump *args
        debug args.inspect
      end
    end
-   logger
+   @logger
  end
 
  def self.logger= logger
@@ -54,6 +32,7 @@ end
  #
  # A convenient logger.
  #
- # Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
+ # define Log yourself to prevent its creation
  #
- Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
+ Log = Wukong.logger unless defined?(Log)
+
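
In practice the Log constant is what scripts touch. A brief sketch of typical use under the slimmed-down 2.0.0 logger, assuming nothing else has defined Log first:

    require 'rubygems'
    require 'wukong'

    Log.info "processed batch 7"       # plain ruby Logger writing to STDERR
    Log.dump({:user => 'joe'}, 12)     # wukong's helper: logs args.inspect at DEBUG level

    # To use your own logger, assign it (or define Log before requiring wukong):
    # Wukong.logger = Logger.new('/var/log/wukong.log')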
data/lib/wukong/periodic_monitor.rb CHANGED
@@ -1,4 +1,5 @@
- Settings.define :log_interval, :default => 1000, :type => Integer, :description => 'How many iterations between log statements'
+ Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
+ Settings.define :log_seconds, :default => 30, :type => Integer, :description => 'How many seconds between log statements'
 
  #
  # Periodic monitor
@@ -9,40 +10,48 @@ Settings.define :log_interval, :default => 1000, :type => Integer, :description
  class PeriodicMonitor
    attr_reader :iter, :start_time, :options
    attr_accessor :interval
+   attr_accessor :time_interval
 
    def initialize extra_options={}
-     @options = {}
+     @options = {}
      @options.deep_merge!( extra_options || {} )
-     @iter = 0
-     @start_time = now
-     @interval = (options[:log_interval] || Settings[:log_interval]).to_i
-     @interval = 1000 unless @interval >= 1
+     @iter = 0
+     @start_time = now
+     @last_report = @start_time
+     @interval = (options[:log_interval] || Settings[:log_interval]).to_i
+     @interval = 1000 unless @interval >= 1
+     @time_interval = (options[:log_seconds] || Settings[:log_seconds]).to_i
    end
 
    def periodically *args, &block
      incr!
      if ready?
+       @last_report = Time.now
        if block
         block.call(iter, *args)
        else
-        $stderr.puts progress(*args)
+        self.emit progress(*args)
        end
      end
    end
 
+   def emit log_line
+     Log.info log_line
+   end
+
    def incr!
      @iter += 1
    end
 
    def ready?
-     iter % @interval == 0
+     (iter % @interval == 0) || (since > time_interval)
    end
 
    def progress *stuff
      [
        "%15d" % iter,
        "%7.1f"% elapsed_time, "sec",
-       "%7.1f"%(iter.to_f / elapsed_time), "/sec",
+       "%7.1f"% rate, "/sec",
        now.to_flat,
        *stuff
      ].flatten.join("\t")
@@ -51,7 +60,13 @@ class PeriodicMonitor
    def elapsed_time
      now - start_time
    end
+   def since
+     now - @last_report
+   end
    def now
      Time.now.utc
    end
+   def rate
+     iter.to_f / elapsed_time
+   end
  end
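
With the new defaults the monitor reports every 10,000 records or every 30 seconds, whichever comes first, and sends the report through Log.info. A rough usage sketch (the input file name is made up):

    require 'rubygems'
    require 'wukong'

    monitor = PeriodicMonitor.new    # or PeriodicMonitor.new(:log_interval => 1000)

    File.foreach('huge_file.tsv') do |line|
      # ... do the real work on line ...
      monitor.periodically(line[0..80])   # tab-joined: iter, elapsed sec, rate/sec, timestamp, snippet
    end

    # or take over reporting yourself; the block only fires when a report is due
    # monitor.periodically{|iter| $stderr.puts "seen #{iter} lines" }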
data/lib/wukong/script/emr_command.rb CHANGED
@@ -49,6 +49,7 @@ module Wukong
    end
 
    def execute_emr_runner
+     # fix_paths!
      command_args = []
      if Settings.jobflow
        command_args << Settings.dashed_flag_for(:jobflow)
data/lib/wukong/script/hadoop_command.rb CHANGED
@@ -12,27 +12,27 @@ module Wukong
  #
  # Translate simplified args to their hairy hadoop equivalents
  #
- Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
- Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
- Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
- Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
- Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
- Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
- Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
- Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
- Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
- Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
- Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
- Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
  Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
  Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
  Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
- Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
- Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
- Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+ Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
+ Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
+ Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
  Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
+ Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+ Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
+ Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
  Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
+ Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+ Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
  Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
+ Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
+ Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
+ Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
+ Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
+ Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
+ Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
+ Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
  Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
 
@@ -60,7 +60,7 @@ module Wukong
    # Use Settings[:hadoop_home] to set the path your config install.
    hadoop_commandline = [
      hadoop_runner,
-     "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+     "jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
      hadoop_jobconf_options,
      "-D mapred.job.name='#{job_name}'",
      hadoop_other_args,
@@ -68,6 +68,7 @@ module Wukong
      "-reducer '#{reducer_commandline}'",
      "-input '#{input_paths}'",
      "-output '#{output_path}'",
+     "-file '#{this_script_filename}'",
      hadoop_recycle_env,
    ].flatten.compact.join(" \t\\\n ")
    Log.info " Launching hadoop!"
@@ -79,8 +80,8 @@ module Wukong
    # Fixup these options
    options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
    options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-   # If no reducer_klass and no reduce_command, then skip the reduce phase
-   options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+   # If no reducer and no reduce_command, then skip the reduce phase
+   options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
    # Fields hadoop should use to distribute records to reducers
    unless options[:partition_fields].blank?
      jobconf_options += [
@@ -89,23 +90,24 @@ module Wukong
      ]
    end
    jobconf_options += [
-     :key_field_separator, :sort_fields,
-     :map_tasks, :reduce_tasks,
-     :max_node_map_tasks, :max_node_reduce_tasks,
-     :max_reduces_per_node, :max_reduces_per_cluster,
-     :max_maps_per_node, :max_maps_per_cluster,
-     :min_split_size,
-     :map_speculative,
-     :timeout,
-     :reuse_jvms, :respect_exit_status
+     :io_sort_mb, :io_sort_record_percent,
+     :map_speculative, :map_tasks,
+     :max_maps_per_cluster, :max_maps_per_node,
+     :max_node_map_tasks, :max_node_reduce_tasks,
+     :max_reduces_per_cluster, :max_reduces_per_node,
+     :max_record_length, :min_split_size,
+     :output_field_separator, :key_field_separator,
+     :partition_fields, :sort_fields,
+     :reduce_tasks, :respect_exit_status,
+     :reuse_jvms, :timeout,
    ].map{|opt| jobconf(opt)}
    jobconf_options.flatten.compact
  end
 
  def hadoop_other_args
    extra_str_args = [ options[:extra_args] ]
-   if Settings.split_on_xml_tag
-     extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
+   if options.split_on_xml_tag
+     extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
    end
    extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
    extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
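
Each of those Settings.define lines doubles as a command-line flag, so the simplified names map straight onto Hadoop jobconf values. A hedged example invocation -- the --run flag and the script name follow general wukong convention rather than anything shown in this diff:

    ./word_count.rb --run \
      --reduce_tasks=20 --partition_fields=2 --sort_fields=3 \
      --timeout=600000 \
      input_dir output_dir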
data/lib/wukong/script.rb CHANGED
@@ -1,8 +1,10 @@
  require 'pathname'
+ require 'configliere' ; Settings.use(:commandline, :env_var, :define)
+ require 'wukong'
  require 'wukong/script/hadoop_command'
  require 'wukong/script/local_command'
- require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
  require 'rbconfig' # for uncovering ruby_interpreter_path
+ require 'wukong/streamer' ; include Wukong::Streamer
  module Wukong
    # == How to run a Wukong script
    #
@@ -63,7 +65,7 @@ module Wukong
    class Script
      include Wukong::HadoopCommand
      include Wukong::LocalCommand
-     attr_reader :mapper_klass, :reducer_klass, :options
+     attr_reader :mapper, :reducer, :options
      attr_reader :input_paths, :output_path
 
      # ---------------------------------------------------------------------------
@@ -122,12 +124,12 @@ module Wukong
      #   end
      #   MyScript.new(MyMapper, nil).run
      #
-     def initialize mapper_klass, reducer_klass=nil, extra_options={}
+     def initialize mapper, reducer=nil, extra_options={}
        Settings.resolve!
-       @options = Settings.dup
-       options.merge! extra_options
-       @mapper_klass = mapper_klass
-       @reducer_klass = reducer_klass
+       @options = Settings
+       options.merge extra_options
+       @mapper = (case mapper when Class then mapper.new when nil then nil else mapper ; end)
+       @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
        @output_path = options.rest.pop
        @input_paths = options.rest.reject(&:blank?)
        if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
@@ -142,8 +144,8 @@ module Wukong
      #
      def run
        case run_mode
-       when 'map' then mapper_klass.new(self.options).stream
-       when 'reduce' then reducer_klass.new(self.options).stream
+       when 'map' then mapper.stream
+       when 'reduce' then reducer.stream
        when 'local' then execute_local_workflow
        when 'cassandra' then execute_hadoop_workflow
        when 'hadoop', 'mapred' then execute_hadoop_workflow
@@ -172,8 +174,9 @@ module Wukong
      # In local mode, it's given to the system() call
      #
      def mapper_commandline
-       if mapper_klass
+       if mapper
          "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+         # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
        else
          options[:map_command]
        end
@@ -185,8 +188,9 @@ module Wukong
      # In local mode, it's given to the system() call
      #
      def reducer_commandline
-       if reducer_klass
-         "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+       if reducer
+         "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+         # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
        else
          options[:reduce_command]
        end
@@ -228,8 +232,9 @@ module Wukong
      #
      def maybe_overwrite_output_paths! output_path
        if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
-         Log.info "Removing output file #{output_path}"
-         `hdp-rm -r '#{output_path}'`
+         cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
+         Log.info "Removing output file #{output_path}: #{cmd}"
+         puts `#{cmd}`
        end
      end
 
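
The net effect for callers: Script (and the Wukong.run shortcut seen in the word_count example) now accepts either classes or ready-made instances, and merges any extra options into the global Settings. A small sketch with placeholder class names:

    class IdentityMapper < Wukong::Streamer::LineStreamer
      def process(line) yield line ; end
    end

    # pass a class and it is instantiated for you ...
    Wukong.run(IdentityMapper, nil)

    # ... or hand over an instance you've configured yourself, plus extra options
    Wukong.run(IdentityMapper.new, nil, :job_name => 'identity_pass')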
data/lib/wukong/store/cassandra_model.rb CHANGED
@@ -26,10 +26,11 @@ module Wukong
    #
    def to_db_hash
      db_hsh = {}
-     to_hash.each{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
+     each_pair{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
      db_hsh
    end
 
+
    module ClassMethods
      # Cassandra column family -- taken from the class name by default.
      def table_name
data/lib/wukong/streamer/accumulating_reducer.rb CHANGED
@@ -15,10 +15,6 @@
  #
  class AccumulatingReducer < Wukong::Streamer::Base
    attr_accessor :key
-   def initialize options
-     super options
-     self.key = :__first_pass__
-   end
 
    #
    # override for multiple-field keys, etc.
@@ -57,15 +53,12 @@
    # start! is called on the the first record of the new key
    #
    def start! *args
-     raise %Q{start! is the new reset! -- it has args now, namely the first
-       record of the new key. It doesn\'t want #super either}
    end
 
    #
    # Override this to accumulate each record for the given key in turn.
    #
    def accumulate *args, &block
-     raise "override the accumulate method in your subclass"
    end
 
    #
@@ -73,7 +66,11 @@
    # You must override this method.
    #
    def finalize
-     raise "override the finalize method in your subclass"
+   end
+
+   # make a sentinel
+   def before_stream
+     self.key = :__first_pass__
    end
 
    # Finalize the last-seen group.
@@ -82,6 +79,5 @@
      super *args
    end
  end
-
 end
 end
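
Since the hook methods no longer raise, a subclass only has to supply the parts it cares about. A rough sketch of a reducer that gathers every value seen for a key (the field layout is invented for illustration):

    class ValueLister < Wukong::Streamer::AccumulatingReducer
      def start!(*record)       # called on the first record of each new key
        @vals = []
      end
      def accumulate(*record)   # called on every record for the key, key field included
        @vals << record.last
      end
      def finalize              # called once per key; emit the rolled-up record
        yield [key, @vals.join(',')]
      end
    end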
data/lib/wukong/streamer/base.rb CHANGED
@@ -4,13 +4,17 @@ module Wukong
 
    # Options, initially set from the command-line args -- see
    # Script#process_argv!
-   attr_accessor :options
+   attr_reader :own_options
 
    #
    # Accepts option hash from script runner
    #
    def initialize options={}
-     self.options = options
+     @own_options = options
+   end
+
+   def options
+     Settings.deep_merge own_options
    end
 
    #
@@ -24,6 +28,7 @@ module Wukong
      process(*record) do |output_record|
        emit output_record
      end
+     monitor.periodically(record.to_s[0..1000])
    end
    after_stream
  end
@@ -64,7 +69,6 @@ module Wukong
    # Process each record in turn, yielding the records to emit
    #
    def process *args, &block
-     raise "override the process method in your implementation: it should process each record."
    end
 
    #
@@ -75,6 +79,43 @@ module Wukong
      warn "Bad record #{args.inspect[0..400]}"
      puts ["bad_record-"+key, *args].join("\t")
    end
+
+   # A periodic logger to track progress
+   def monitor
+     @monitor ||= PeriodicMonitor.new
+   end
+
+   # Defines a process method on the fly to execute the given mapper.
+   #
+   # This is still experimental.
+   # Among other limitations, you can't use ++yield++ -- you have to call
+   # emit() directly.
+   def mapper &mapper_block
+     @mapper_block = mapper_block.to_proc
+     self.instance_eval do
+       def process *args, &block
+         instance_exec(*args, &@mapper_block)
+       end
+     end
+     self
+   end
+
+   # Creates a new object of this class and injects the given block
+   # as the process method
+   def self.mapper *args, &block
+     self.new.mapper *args, &block
+   end
+
+   # Delegates back to Wukong to run this instance as a mapper
+   def run options={}
+     Wukong.run(self, nil, options)
+   end
+
+   # Creates a new object of this class and runs it
+   def self.run options={}
+     Wukong.run(self.new, nil, options)
+   end
+
    end
  end
 end
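
Those last additions make a class-less, block-style mapper possible. A tentative sketch of what that looks like -- per the comment above, you must call emit rather than yield inside the block, and the feature is flagged as experimental:

    #!/usr/bin/env ruby
    require 'rubygems'
    require 'wukong/script'   # also pulls in Wukong::Streamer and includes it at top level

    # builds a LineStreamer whose process method is the block, then runs it as a map-only job
    LineStreamer.mapper do |line|
      emit [line.strip.length, line.strip]
    end.run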
data/lib/wukong/streamer/counting_reducer.rb CHANGED
@@ -1,23 +1,23 @@
  module Wukong
    module Streamer
-
      #
-     # Count the number of records for each key.
+     # Emit each unique key and the count of its occurrences
      #
-     class CountingReducer < AccumulatingReducer
-       attr_accessor :count
+     class CountingReducer < Wukong::Streamer::AccumulatingReducer
 
-       # start the sum with 0 for each key
-       def start! *_
-         self.count = 0
+       # reset the counter to zero
+       def start! *args
+         @count = 0
        end
-       # ... and count the number of records for this key
-       def accumulate *_
-         self.count += 1
+
+       # record one more for this key
+       def accumulate *vals
+         @count += 1
        end
-       # emit [key, count]
+
+       # emit each key field and the count, tab-separated.
       def finalize
-        yield [key, count].flatten
+        yield [key, @count]
       end
     end
 
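
CountingReducer can be used as-is. A small sketch pairing it with a token-emitting mapper -- the class names are placeholders, and this assumes wukong/streamer loads the reducer for you:

    require 'rubygems'
    require 'wukong/script'

    class TokenMapper < Wukong::Streamer::LineStreamer
      def process line
        line.split.each{|token| yield [token.downcase] }
      end
    end

    # produces   token <tab> count   pairs without any reducer subclassing
    Wukong.run(TokenMapper, Wukong::Streamer::CountingReducer)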
data/lib/wukong/streamer/filter.rb CHANGED
@@ -12,8 +12,8 @@ module Wukong
    #
    # Subclass and re-define the emit? method
    #
-   def process *record, &block
-     yield record if emit?(record)
+   def process *record
+     yield record if emit?(*record)
    end
  end
 end
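
Note the behavioral change: emit? now receives the record fields splatted rather than as a single array. A quick sketch of a filter written against the new signature (the field layout is invented):

    # keep only records whose status field says 'active'
    class ActiveOnlyFilter < Wukong::Streamer::Filter
      def emit?(id, status, *rest)
        status == 'active'
      end
    end

    # Wukong.run(ActiveOnlyFilter, nil)   # run it as a map-only job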