wukong 1.4.2 → 1.4.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ australia 253 499 671 663 710 687 774 654 627 422 376 132 25
2
+ spain 37 102 257 177 118 90 144 183 210 222 162 93 17
3
+ sweden 32 167 306 334 314 287 330 366 415 343 266 130 51
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+ # run like so:
3
+ # $> ruby normalize.rb --run=local data/sizes.tsv data/normalized_sizes.tsv
4
+ require 'rubygems'
5
+ require 'wukong'
6
+ require 'active_support/core_ext/enumerable' # for array#sum
7
+
8
+ module Normalize
9
+ class Mapper < Wukong::Streamer::RecordStreamer
10
+ def process(country, *sizes)
11
+ sizes.map!(&:to_i)
12
+ sum = sizes.sum.to_f
13
+ normalized = sizes.map{|x| 100 * x/sum }
14
+ s = normalized.join(",")
15
+ yield [country, s]
16
+ end
17
+ end
18
+ end
19
+
20
+ Wukong::Script.new(Normalize::Mapper, nil).run
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+ # run like so:
3
+ # $> ruby sizes.rb --run=local data/orders.tsv data/sizes
4
+ require 'rubygems'
5
+ require 'wukong'
6
+
7
+ module JeanSizes
8
+ class Mapper < Wukong::Streamer::RecordStreamer
9
+ def process(code,model,time,country,reg,col, n1,c1, venue,n3,n4, *sizes)
10
+ yield [country, *sizes]
11
+ end
12
+ end
13
+
14
+ #
15
+ # This uses a ListReducer. It's nice and simple, but requires first
16
+ # accumulating each key's records in memory.
17
+ #
18
+ class JeansListReducer < Wukong::Streamer::ListReducer
19
+ def finalize
20
+ return if values.empty?
21
+ sums = []; 13.times{ sums << 0 }
22
+ values.each do |country, *sizes|
23
+ sizes.map!(&:to_i)
24
+ sums = sums.zip(sizes).map{|sum, val| sum + val }
25
+ end
26
+ yield [key, *sums]
27
+ end
28
+ end
29
+
30
+
31
+ #
32
+ # This uses an AccumulatingReducer directly.
33
+ # It has the advantage of a minimal footprint.
34
+ #
35
+ class JeansAccumulatingReducer < Wukong::Streamer::AccumulatingReducer
36
+ attr_accessor :sums
37
+
38
+ # start the sum with 0 for each size
39
+ def start! *_
40
+ self.sums = []; 13.times{ self.sums << 0 }
41
+ end
42
+ # accumulate each size count into the running per-size sums
43
+ def accumulate country, *sizes
44
+ sizes.map!(&:to_i)
45
+ self.sums = self.sums.zip(sizes).map{|sum, val| sum + val }
46
+ end
47
+ # emit [country, size_0_sum, size_1_sum, ...]
48
+ def finalize
49
+ yield [key, sums].flatten
50
+ end
51
+ end
52
+
53
+ end
54
+
55
+ Wukong::Script.new(JeanSizes::Mapper, JeanSizes::JeansListReducer).run
data/examples/foo.rb ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.expand_path("~/ics/backend/configliere/lib")
3
+
4
+ require "wukong"
5
+
6
+ p Wukong::Script.new(nil,nil).options
7
+ p Wukong::Script.new(nil,nil).non_wukong_params
8
+
9
+ Wukong::Script.new(nil,nil).run
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
2
+ require 'rubygems'
3
3
  require 'wukong'
4
4
 
5
5
  module WordCount
data/lib/wukong.rb CHANGED
@@ -1,9 +1,9 @@
1
- require 'wukong/boot'
2
1
  require 'wukong/extensions'
3
2
  require 'wukong/datatypes'
4
3
  require 'wukong/logger'
5
4
  require 'wukong/bad_record'
6
5
  autoload :TypedStruct, 'wukong/typed_struct'
6
+ require 'configliere'; Configliere.use :define
7
7
  module Wukong
8
8
  autoload :Dfs, 'wukong/dfs'
9
9
  autoload :Script, 'wukong/script'
@@ -40,9 +40,9 @@ class Hash
40
40
  end
41
41
 
42
42
  # lambda for recursive merges
43
- Hash::DEEP_MERGER = proc do |key,v1,v2|
43
+ ::Hash::DEEP_MERGER = proc do |key,v1,v2|
44
44
  (v1.respond_to?(:merge) && v2.respond_to?(:merge)) ? v1.merge(v2.compact, &Hash::DEEP_MERGER) : (v2.nil? ? v1 : v2)
45
- end
45
+ end unless defined?(::Hash::DEEP_MERGER)
46
46
 
47
47
  #
48
48
  # Merge hashes recursively.
@@ -72,36 +72,60 @@ class Hash
72
72
  merge! hsh2, &Hash::DEEP_MERGER
73
73
  end
74
74
 
75
-
76
75
  #
77
76
  # Treat hash as tree of hashes:
78
77
  #
79
78
  # x = { 1 => :val, :subhash => { 1 => :val1 } }
80
- # x.deep_set(:subhash, 3, 4)
81
- # # => { 1 => :val, :subhash => { 1 => :val1, 3 => 4 } }
79
+ # x.deep_set(:subhash, :cat, :hat)
80
+ # # => { 1 => :val, :subhash => { 1 => :val1, :cat => :hat } }
82
81
  # x.deep_set(:subhash, 1, :newval)
83
- # # => { 1 => :val, :subhash => { 1 => :newval, 3 => 4 } }
82
+ # # => { 1 => :val, :subhash => { 1 => :newval, :cat => :hat } }
84
83
  #
85
84
  #
86
85
  def deep_set *args
87
- hsh = self
88
- head_keys = args[0..-3]
89
- last_key = args[-2]
90
- val = args[-1]
91
- # grab last subtree (building out if necessary)
92
- head_keys.each{|key| hsh = (hsh[key] ||= {}) }
86
+ val = args.pop
87
+ last_key = args.pop
88
+ # dig down to last subtree (building out if necessary)
89
+ hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
93
90
  # set leaf value
94
91
  hsh[last_key] = val
95
92
  end
96
93
 
97
- # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
98
- def reverse_merge(other_hash)
99
- other_hash.merge(self)
94
+ #
95
+ # Treat hash as tree of hashes:
96
+ #
97
+ # x = { 1 => :val, :subhash => { 1 => :val1 } }
98
+ # x.deep_get(:subhash, 1)
99
+ # # => :val1
100
+ # x.deep_get(:subhash, 2)
101
+ # # => nil
102
+ # x.deep_get(:subhash, 2, 3)
103
+ # # => nil
104
+ # x.deep_get(1)
105
+ # # => :val
106
+ #
107
+ def deep_get *args
108
+ last_key = args.pop
109
+ # dig down to last subtree (a missing level reads as an empty hash; nothing is built out)
110
+ hsh = args.inject(self){|hsh, key| hsh[key] || {} }
111
+ # get leaf value
112
+ hsh[last_key]
100
113
  end
101
114
 
102
- # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
103
- def reverse_merge!(other_hash)
104
- replace(reverse_merge(other_hash))
115
+
116
+ #
117
+ # Treat hash as tree of hashes:
118
+ #
119
+ # x = { 1 => :val, :subhash => { 1 => :val1, 2 => :val2 } }
120
+ # x.deep_delete(:subhash, 1)
121
+ # #=> :val1
122
+ # x
123
+ # #=> { 1 => :val, :subhash => { 2 => :val2 } }
124
+ #
125
+ def deep_delete *args
126
+ last_key = args.pop
127
+ last_hsh = args.empty? ? self : (deep_get(*args)||{})
128
+ last_hsh.delete(last_key)
105
129
  end
106
130
 
107
131
  #
@@ -117,4 +141,14 @@ class Hash
117
141
  replace(compact)
118
142
  end
119
143
 
144
+ # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
145
+ def reverse_merge(other_hash)
146
+ other_hash.merge(self)
147
+ end
148
+
149
+ # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
150
+ def reverse_merge!(other_hash)
151
+ replace(reverse_merge(other_hash))
152
+ end
153
+
120
154
  end
data/lib/wukong/schema.rb CHANGED
@@ -9,16 +9,16 @@ class << Integer ; def to_sql() 'INT' end ; end
9
9
  class << Bignum ; def to_sql() 'BIGINT' end ; end
10
10
  class << String ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
11
11
  class << Symbol ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
12
- class << BigDecimal ; def to_pig() 'DECIMAL' end ; end if defined?(BigDecimal)
13
- class << EpochTime ; def to_pig() 'INT' end ; end if defined?(EpochTime)
14
- class << FilePath ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
15
- class << Flag ; def to_pig() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
16
- class << IPAddress ; def to_pig() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
17
- class << URI ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
18
- class << Csv ; def to_pig() 'TEXT' end ; end if defined?(Csv)
19
- class << Yaml ; def to_pig() 'TEXT' end ; end if defined?(Yaml)
20
- class << Json ; def to_pig() 'TEXT' end ; end if defined?(Json)
21
- class << Regex ; def to_pig() 'TEXT' end ; end if defined?(Regex)
12
+ class << BigDecimal ; def to_sql() 'DECIMAL' end ; end if defined?(BigDecimal)
13
+ class << EpochTime ; def to_sql() 'INT' end ; end if defined?(EpochTime)
14
+ class << FilePath ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
15
+ class << Flag ; def to_sql() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
16
+ class << IPAddress ; def to_sql() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
17
+ class << URI ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
18
+ class << Csv ; def to_sql() 'TEXT' end ; end if defined?(Csv)
19
+ class << Yaml ; def to_sql() 'TEXT' end ; end if defined?(Yaml)
20
+ class << Json ; def to_sql() 'TEXT' end ; end if defined?(Json)
21
+ class << Regex ; def to_sql() 'TEXT' end ; end if defined?(Regex)
22
22
  class String ; def to_sql() self ; end ; end
23
23
  class Symbol ; def to_sql() self.to_s.upcase ; end ; end
24
24
 
data/lib/wukong/script.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'pathname'
2
2
  require 'wukong/script/hadoop_command'
3
3
  require 'wukong/script/local_command'
4
- require 'rbconfig'
4
+ require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
5
+ require 'rbconfig' # for uncovering ruby_interpreter_path
5
6
  module Wukong
6
7
 
7
8
  # == How to run a Wukong script
@@ -58,6 +59,38 @@ module Wukong
58
59
  include Wukong::LocalCommand
59
60
  attr_accessor :mapper_klass, :reducer_klass, :options
60
61
 
62
+ # ---------------------------------------------------------------------------
63
+ #
64
+ # Default options for Wukong
65
+ # http://github.com/infochimps/wukong
66
+ #
67
+ # If you set an environment variable WUKONG_CONFIG, *or* if the file
68
+ # $HOME/.wukong.rb exists, that file will be +require+'d as well.
69
+ #
70
+ # Important values to set:
71
+ #
72
+ # * hadoop_home -- Path to root of hadoop install. If your hadoop runner is
73
+ # /usr/local/share/hadoop/bin/hadoop
74
+ # then your hadoop_home is
75
+ # /usr/local/share/hadoop.
76
+ # You can also set a :hadoop_runner that gives the full path to the hadoop script
77
+ #
78
+ # * default_run_mode -- Whether to run using hadoop (and
79
+ # thus, requiring a working hadoop install), or to run in local mode
80
+ # (script --map | sort | script --reduce)
81
+ #
82
+ Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
83
+ Settings.define :default_mapper, :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
84
+ Settings.define :default_reducer, :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
85
+ Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
86
+ Settings.define :hadoop_runner, :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
87
+ Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
88
+ Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
89
+ Settings.define :run, :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
90
+ Settings.define :local, :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
91
+ Settings.define :hadoop, :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
92
+ Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
93
+
61
94
  #
62
95
  # Instantiate the Script with the Mapper and the Reducer class (each a
63
96
  # Wukong::Streamer) it should call back.
@@ -86,8 +119,9 @@ module Wukong
86
119
  # MyScript.new(MyMapper, nil).run
87
120
  #
88
121
  def initialize mapper_klass, reducer_klass, extra_options={}
89
- self.options = default_options.merge(extra_options)
90
- process_argv!
122
+ self.options = Settings.dup
123
+ options.resolve!
124
+ options.merge! extra_options
91
125
  self.mapper_klass = mapper_klass
92
126
  self.reducer_klass = reducer_klass
93
127
  # If no reducer_klass and no reduce_command, then skip the reduce phase
@@ -97,65 +131,12 @@ module Wukong
97
131
  #
98
132
  # Gives default options. Command line parameters take precedence
99
133
  #
100
- # MAKE SURE YOU CALL SUPER: write your script according to the patter
134
+ # MAKE SURE YOU CALL SUPER: write your script according to the pattern
101
135
  #
102
136
  # super.merge :my_option => :val
103
137
  #
104
138
  def default_options
105
- Wukong::CONFIG[:runner_defaults] || {}
106
- end
107
-
108
- # Options that don't need to go in the :all_args hash
109
- def std_options
110
- @std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
111
- end
112
-
113
- #
114
- # Parse the command-line args into the options hash.
115
- #
116
- # I should not reinvent the wheel.
117
- # Yet: here we are.
118
- #
119
- # '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
120
- # '--' After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
121
- #
122
- # options[:all_args] contains all arguments that are not in std_options
123
- # options[:rest] contains all arguments following the first non-flag (or the '--')
124
- #
125
- def process_argv!
126
- options[:all_args] = []
127
- options[:rest] = []
128
- args = ARGV.dup
129
- while (! args.blank?) do
130
- arg = args.shift
131
- case
132
- when arg == '--'
133
- options[:rest] += args
134
- when arg =~ /\A--(\w+)(?:=(.+))?\z/
135
- opt, val = [$1, $2]
136
- opt = opt.to_sym
137
- val ||= true
138
- self.options[opt] = val
139
- options[:all_args] << arg unless std_options.include?(opt)
140
- else
141
- options[:all_args] << arg
142
- options[:rest] << arg
143
- end
144
- # p [options, arg, args]
145
- end
146
- options[:all_args] = options[:all_args].join(" ")
147
- end
148
-
149
- def this_script_filename
150
- Pathname.new($0).realpath
151
- end
152
-
153
- def ruby_interpreter_path
154
- Pathname.new(
155
- File.join(Config::CONFIG["bindir"],
156
- Config::CONFIG["RUBY_INSTALL_NAME"]+
157
- Config::CONFIG["EXEEXT"])
158
- ).realpath
139
+ {}
159
140
  end
160
141
 
161
142
  #
@@ -164,8 +145,8 @@ module Wukong
164
145
  def map_command
165
146
  case
166
147
  when mapper_klass
167
- "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
168
- else options[:map_command] || Wukong::CONFIG[:default_mapper] end
148
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
149
+ else options[:map_command] || options[:default_mapper] end
169
150
  end
170
151
 
171
152
  #
@@ -175,8 +156,8 @@ module Wukong
175
156
  def reduce_command
176
157
  case
177
158
  when reducer_klass
178
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
179
- else options[:reduce_command] || Wukong::CONFIG[:default_reducer] end
159
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
160
+ else options[:reduce_command] || options[:default_reducer] end
180
161
  end
181
162
 
182
163
  #
@@ -197,14 +178,16 @@ module Wukong
197
178
  end
198
179
 
199
180
  def run_mode
181
+ return 'local' if options[:local]
182
+ return 'hadoop' if options[:hadoop]
200
183
  # if only --run is given, assume default run mode
201
- options[:run] = Wukong::CONFIG[:default_run_mode] if (options[:run] == true)
184
+ options[:run] = options[:default_run_mode] if (options[:run] == true)
202
185
  options[:run].to_s
203
186
  end
204
187
 
205
188
  def input_output_paths
206
189
  # input / output paths
207
- input_path, output_path = options[:rest][0..1]
190
+ input_path, output_path = options.rest[0..1]
208
191
  raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
209
192
  [input_path, output_path]
210
193
  end
@@ -216,6 +199,29 @@ module Wukong
216
199
  end
217
200
  end
218
201
 
202
+ # Reassemble every option that is not flagged :wukong into "--param=value"
203
+ # flags, joined by spaces, for the map and reduce phase scripts
204
+ def non_wukong_params
205
+ options.
206
+ reject{|param, val| options.param_definitions[param][:wukong] }.
207
+ map{|param,val| "--#{param}=#{val}" }.
208
+ join(" ")
209
+ end
210
+
211
+ # the full, real path to the script file
212
+ def this_script_filename
213
+ Pathname.new($0).realpath
214
+ end
215
+
216
+ # use the full ruby interpreter path to run slave processes
217
+ def ruby_interpreter_path
218
+ Pathname.new(
219
+ File.join(Config::CONFIG["bindir"],
220
+ Config::CONFIG["RUBY_INSTALL_NAME"]+
221
+ Config::CONFIG["EXEEXT"])
222
+ ).realpath
223
+ end
224
+
219
225
  #
220
226
  # Execute the runner phase
221
227
  #
@@ -243,41 +249,15 @@ module Wukong
243
249
  when options[:run]
244
250
  exec_hadoop_streaming
245
251
  else
246
- self.help # Normant Vincent Peale is proud of you
252
+ options.dump_help %Q{Please specify a run mode: you probably want to start with
253
+ #{$0} --run --local input.tsv output.tsv
254
+ although
255
+ cat input.tsv | #{$0} --map > mapped.tsv
256
+ or
257
+ cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
258
+ can be useful for initial testing.}
247
259
  end
248
260
  end
249
-
250
- #
251
- # Command line usage
252
- #
253
- def help
254
- $stderr.puts "#{self.class} script"
255
- $stderr.puts %Q{
256
- #{$0} --run=hadoop input_hdfs_path output_hdfs_dir # run the script with hadoop streaming
257
- #{$0} --run=local input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
258
- #{$0} --run input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
259
- #{$0} --map
260
- #{$0} --reduce # dispatch to the mapper or reducer
261
-
262
- All flags must precede the input and output paths.
263
- Additional flags:
264
- --dry_run
265
- Hadoop Options (see hadoop documentation)
266
- --max_node_map_tasks => 'mapred.tasktracker.map.tasks.maximum',
267
- --max_node_reduce_tasks => 'mapred.tasktracker.reduce.tasks.maximum',
268
- --map_tasks => 'mapred.map.tasks',
269
- --reduce_tasks => 'mapred.reduce.tasks',
270
- --sort_fields => 'stream.num.map.output.key.fields',
271
- --key_field_separator => 'map.output.key.field.separator',
272
- --partition_fields => 'num.key.fields.for.partition',
273
- --output_field_separator => 'stream.map.output.field.separator',
274
- --map_speculative => 'mapred.map.tasks.speculative.execution',
275
- --timeout => 'mapred.task.timeout',
276
- --reuse_jvms => 'mapred.job.reuse.jvm.num.tasks',
277
- --ignore_exit_status => 'stream.non.zero.exit.status.is.failure',
278
- You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
279
- }
280
- end
281
261
  end
282
262
 
283
263
  end