wukong 1.4.2 → 1.4.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -15,26 +15,24 @@ module Wukong
15
15
  #
16
16
  # Translate the simplified args to their hairy-assed hadoop equivalents
17
17
  #
18
- HADOOP_OPTIONS_MAP = {
19
- :max_node_map_tasks => 'mapred.tasktracker.map.tasks.maximum',
20
- :max_node_reduce_tasks => 'mapred.tasktracker.reduce.tasks.maximum',
21
- :map_tasks => 'mapred.map.tasks',
22
- :reduce_tasks => 'mapred.reduce.tasks',
23
- :sort_fields => 'stream.num.map.output.key.fields',
24
- :key_field_separator => 'map.output.key.field.separator',
25
- :partition_fields => 'num.key.fields.for.partition',
26
- :output_field_separator => 'stream.map.output.field.separator',
27
- :map_speculative => 'mapred.map.tasks.speculative.execution',
28
- :timeout => 'mapred.task.timeout',
29
- :reuse_jvms => 'mapred.job.reuse.jvm.num.tasks',
30
- :respect_exit_status => 'stream.non.zero.exit.is.failure',
31
- }
18
+ Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
19
+ Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
20
+ Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
21
+ Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
22
+ Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
23
+ Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
24
+ Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
25
+ Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
26
+ Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
27
+ Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
28
+ Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
29
+ Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
32
30
 
33
31
  # emit a -jobconf hadoop option if the simplified command line arg is present
34
32
  # if not, the resulting nil will be elided later
35
33
  def jobconf option
36
34
  if options[option]
37
- "-jobconf %s=%s" % [HADOOP_OPTIONS_MAP[option], options[option]]
35
+ "-jobconf %s=%s" % [options.description_for(option), options[option]]
38
36
  end
39
37
  end
40
38
 
@@ -81,17 +79,21 @@ module Wukong
81
79
  end.compact
82
80
  end
83
81
 
82
+ # The path to the hadoop runner script
83
+ def hadoop_runner
84
+ options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
85
+ end
86
+
84
87
  #
85
88
  # Assemble the hadoop command to execute
86
89
  #
87
90
  def hadoop_command input_path, output_path
88
91
  # If this is wrong, create a config/wukong-site.rb or
89
- # otherwise set Wukong::CONFIG[:hadoop_home] to the
92
+ # otherwise set Settings[:hadoop_home] to the
90
93
  # root of your hadoop install.
91
- hadoop_program = Wukong::CONFIG[:hadoop_home]+'/bin/hadoop'
92
94
  [
93
- hadoop_program,
94
- "jar #{Wukong::CONFIG[:hadoop_home]}/contrib/streaming/hadoop-*-streaming.jar",
95
+ hadoop_runner,
96
+ "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*-streaming.jar",
95
97
  hadoop_partition_args,
96
98
  hadoop_sort_args,
97
99
  hadoop_num_tasks_args,
@@ -6,8 +6,16 @@ module Wukong
6
6
  # Local execution Options
7
7
  #
8
8
 
9
+ # Program (including any arguments) used to sort input between the mapper and
10
+ # reducer in local mode. Override this to run, for example, 'sort -n' (numeric sort).
11
+ def sort_command
12
+ 'sort'
13
+ end
14
+
9
15
  def local_command input_path, output_path
10
- %Q{ cat #{input_path} | #{map_command} | sort | #{reduce_command} > '#{output_path}'}
16
+ cmd_input_str = (input_path == '-') ? "" : "cat '#{input_path}' | "
17
+ cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
18
+ %Q{ #{cmd_input_str} #{map_command} | #{sort_command} | #{reduce_command} #{cmd_output_str} }
11
19
  end
12
20
 
13
21
  end
@@ -17,7 +17,7 @@ module Wukong
17
17
  # Pass each record to +#process+
18
18
  #
19
19
  def stream
20
- Log.info("Streaming on:\t%s" % [Script.input_file])
20
+ Log.info("Streaming on:\t%s" % [Script.input_file]) unless Script.input_file.blank?
21
21
  before_stream
22
22
  $stdin.each do |line|
23
23
  record = recordize(line.chomp)
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
data/spec/spec_helper.rb CHANGED
@@ -0,0 +1,11 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+
4
+ require 'wukong'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+
8
+ Spec::Runner.configure do |config|
9
+
10
+ end
11
+
@@ -0,0 +1,36 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'rubygems'
3
+ require 'wukong/encoding'
4
+
5
+ describe "Wukong encoding" do
6
+ before do
7
+ end
8
+
9
+ it 'en/decodes to xml by default' do
10
+ Wukong.encode_str("&").should == '&amp;'
11
+ Wukong.decode_str("&amp;").should == '&'
12
+ end
13
+ it 'en/decodes to xml with :xml' do
14
+ Wukong.encode_str("&", :xml).should == '&amp;'
15
+ Wukong.decode_str("&amp;", :xml).should == '&'
16
+ end
17
+ it 'url en/decodes with :url' do
18
+ Wukong.encode_str("&", :url).should == '%26'
19
+ Wukong.decode_str("%26", :url).should == '&'
20
+ end
21
+ { "'" => "'", "\t" => "	", "\n" => "
", nil => '',}.each do |raw, enc|
22
+ it 'encodes #{raw} to #{enc}' do
23
+ Wukong.encode_str(raw, :xml).should == enc
24
+ end
25
+ it 'decodes #{enc} to #{raw}' do
26
+ Wukong.decode_str(enc, :xml).should == raw.to_s
27
+ end
28
+ end
29
+ ["normal_string with %punctuation should `not be molested", ""].each do |str|
30
+ it 'doesn\'t change #{str}' do
31
+ Wukong.encode_str(str, :xml).should == str
32
+ end
33
+ end
34
+
35
+
36
+ end
@@ -0,0 +1,80 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe "Wukong::Script" do
4
+ before do
5
+ ARGV.replace []
6
+ @script = Wukong::Script.new 'mapper', 'reducer'
7
+ end
8
+
9
+ describe 'initialize' do
10
+ it 'sets :reduce_tasks to 0 if reducer_klass is nil and no reduce_command or explicit setting' do
11
+ @script = Wukong::Script.new 'mapper', nil
12
+ @script.options[:reduce_tasks].should == 0
13
+ end
14
+ it 'respects :reduce_tasks if set even if reducer_klass is nil' do
15
+ @script = Wukong::Script.new 'mapper', nil, :reduce_tasks => 1
16
+ @script.options[:reduce_tasks].should == 1
17
+ end
18
+ it "doesn't set :reduce_tasks reduce_command is given" do
19
+ @script = Wukong::Script.new 'mapper', nil, :reduce_command => 1
20
+ @script.options[:reduce_tasks].should be_nil
21
+ end
22
+ it 'sets mapper_klass in initializer' do
23
+ @script.mapper_klass.should == 'mapper'
24
+ end
25
+ it 'sets reducer_klass in initializer' do
26
+ @script.reducer_klass.should == 'reducer'
27
+ end
28
+ end
29
+
30
+ describe 'child processes' do
31
+ it 'calls self if a mapper_klass is set' do
32
+ @script.should_receive(:ruby_interpreter_path).and_return('/path/to/ruby')
33
+ @script.should_receive(:this_script_filename).and_return('/path/to/this_script')
34
+ @script.map_command.should == %Q{/path/to/ruby /path/to/this_script --map }
35
+ end
36
+ it 'calls default_mapper if nil mapper_klass and no map_command is set' do
37
+ @script = Wukong::Script.new nil, 'reducer', :default_mapper => 'default_mapper'
38
+ @script.map_command.should == 'default_mapper'
39
+ end
40
+ it 'calls map_command if nil mapper_klass and map_command is set' do
41
+ @script = Wukong::Script.new nil, 'reducer', :map_command => 'map_command', :default_mapper => 'default_mapper'
42
+ @script.map_command.should == 'map_command'
43
+ end
44
+
45
+ it 'calls self if a reducer_klass is set' do
46
+ @script.should_receive(:ruby_interpreter_path).and_return('/path/to/ruby')
47
+ @script.should_receive(:this_script_filename).and_return('/path/to/this_script')
48
+ @script.reduce_command.should == %Q{/path/to/ruby /path/to/this_script --reduce }
49
+ end
50
+ it 'calls default_reducer if nil reducer_klass and no reduce_command is set' do
51
+ @script = Wukong::Script.new 'mapper', nil, :default_reducer => 'default_reducer'
52
+ @script.reduce_command.should == 'default_reducer'
53
+ end
54
+ it 'calls reduce_command if nil reducer_klass and reduce_command is set' do
55
+ @script = Wukong::Script.new 'mapper', nil, :reduce_command => 'reduce_command', :default_reducer => 'default_reducer'
56
+ @script.reduce_command.should == 'reduce_command'
57
+ end
58
+
59
+ it 'runs script | sort | script when in local mode' do
60
+ @script.should_receive(:run_mode).and_return('local')
61
+ @script.should_receive(:map_command).and_return('map_command')
62
+ @script.should_receive(:reduce_command).and_return('reduce_command')
63
+ @script.runner_command("/path/in", "/path/out").should == %Q{ cat '/path/in' | map_command | sort | reduce_command > '/path/out'}
64
+ end
65
+
66
+ it 'calls out to hadoop when in non-local mode' do
67
+ @script.should_receive(:run_mode).and_return('hadoop')
68
+ @script.should_receive(:hadoop_command).and_return('hadoop_command whee!')
69
+ @script.runner_command("/path/in", "/path/out").should == 'hadoop_command whee!'
70
+ end
71
+ end
72
+
73
+ describe 'runner phase'
74
+ it 'preserves non-internal-to-wukong params in non_wukong_params' do
75
+ @script.options[:foo] = 'bar'
76
+ @script.non_wukong_params.should == "--foo=bar"
77
+ end
78
+
79
+
80
+ end
data/wukong.gemspec CHANGED
@@ -1,15 +1,15 @@
1
1
  # Generated by jeweler
2
- # DO NOT EDIT THIS FILE
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.4.2"
8
+ s.version = "1.4.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2009-12-15}
12
+ s.date = %q{2010-01-26}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it’s efficient to process by lines
@@ -25,7 +25,8 @@ Gem::Specification.new do |s|
25
25
  "README.textile"
26
26
  ]
27
27
  s.files = [
28
- "INSTALL.textile",
28
+ "CHANGELOG.textile",
29
+ "INSTALL.textile",
29
30
  "LICENSE.textile",
30
31
  "README.textile",
31
32
  "bin/cutc",
@@ -59,47 +60,40 @@ Gem::Specification.new do |s|
59
60
  "bin/wu-plus",
60
61
  "bin/wu-sum",
61
62
  "docpages/INSTALL.textile",
62
- "docpages/INSTALL.textile",
63
- "docpages/LICENSE.textile",
64
63
  "docpages/LICENSE.textile",
65
64
  "docpages/README-wulign.textile",
66
- "docpages/README-wulign.textile",
67
65
  "docpages/UsingWukong-part1-get_ready.textile",
68
- "docpages/UsingWukong-part1-get_ready.textile",
69
- "docpages/UsingWukong-part2-ThinkingBigData.textile",
70
66
  "docpages/UsingWukong-part2-ThinkingBigData.textile",
71
67
  "docpages/UsingWukong-part3-parsing.textile",
72
- "docpages/UsingWukong-part3-parsing.textile",
73
68
  "docpages/_config.yml",
74
69
  "docpages/bigdata-tips.textile",
75
- "docpages/bigdata-tips.textile",
76
70
  "docpages/code/api_response_example.txt",
77
71
  "docpages/code/parser_skeleton.rb",
78
72
  "docpages/diagrams/MapReduceDiagram.graffle",
79
73
  "docpages/favicon.ico",
80
74
  "docpages/gem.css",
81
75
  "docpages/hadoop-tips.textile",
82
- "docpages/hadoop-tips.textile",
83
- "docpages/index.textile",
84
76
  "docpages/index.textile",
85
77
  "docpages/intro.textile",
86
- "docpages/intro.textile",
87
- "docpages/moreinfo.textile",
88
78
  "docpages/moreinfo.textile",
89
79
  "docpages/news.html",
90
80
  "docpages/pig/PigLatinExpressionsList.txt",
91
81
  "docpages/pig/PigLatinReferenceManual.html",
92
82
  "docpages/pig/PigLatinReferenceManual.txt",
93
83
  "docpages/tutorial.textile",
94
- "docpages/tutorial.textile",
95
- "docpages/usage.textile",
96
84
  "docpages/usage.textile",
97
85
  "docpages/wutils.textile",
98
- "docpages/wutils.textile",
99
86
  "examples/README.txt",
100
87
  "examples/apache_log_parser.rb",
88
+ "examples/contrib/jeans/README.markdown",
89
+ "examples/contrib/jeans/data/normalized_sizes",
90
+ "examples/contrib/jeans/data/orders.tsv",
91
+ "examples/contrib/jeans/data/sizes",
92
+ "examples/contrib/jeans/normalize.rb",
93
+ "examples/contrib/jeans/sizes.rb",
101
94
  "examples/count_keys.rb",
102
95
  "examples/count_keys_at_mapper.rb",
96
+ "examples/foo.rb",
103
97
  "examples/graph/adjacency_list.rb",
104
98
  "examples/graph/breadth_first_search.rb",
105
99
  "examples/graph/gen_2paths.rb",
@@ -108,7 +102,6 @@ Gem::Specification.new do |s|
108
102
  "examples/package-local.rb",
109
103
  "examples/package.rb",
110
104
  "examples/pagerank/README.textile",
111
- "examples/pagerank/README.textile",
112
105
  "examples/pagerank/gen_initial_pagerank_graph.pig",
113
106
  "examples/pagerank/pagerank.rb",
114
107
  "examples/pagerank/pagerank_initialize.rb",
@@ -120,7 +113,6 @@ Gem::Specification.new do |s|
120
113
  "examples/word_count.rb",
121
114
  "lib/wukong.rb",
122
115
  "lib/wukong/bad_record.rb",
123
- "lib/wukong/boot.rb",
124
116
  "lib/wukong/datatypes.rb",
125
117
  "lib/wukong/datatypes/enum.rb",
126
118
  "lib/wukong/datatypes/fake_types.rb",
@@ -165,12 +157,14 @@ Gem::Specification.new do |s|
165
157
  "lib/wukong/streamer/uniq_by_last_reducer.rb",
166
158
  "lib/wukong/typed_struct.rb",
167
159
  "lib/wukong/wukong_class.rb",
168
- "spec/bin/hdp-wc_spec.rb",
169
160
  "spec/data/a_atsigns_b.tsv",
170
161
  "spec/data/a_follows_b.tsv",
171
162
  "spec/data/tweet.tsv",
172
163
  "spec/data/twitter_user.tsv",
164
+ "spec/spec.opts",
173
165
  "spec/spec_helper.rb",
166
+ "spec/wukong/encoding_spec.rb",
167
+ "spec/wukong/script_spec.rb",
174
168
  "wukong.gemspec"
175
169
  ]
176
170
  s.homepage = %q{http://mrflip.github.com/wukong}
@@ -179,11 +173,15 @@ Gem::Specification.new do |s|
179
173
  s.rubygems_version = %q{1.3.5}
180
174
  s.summary = %q{Wukong makes Hadoop so easy a chimpanzee can use it.}
181
175
  s.test_files = [
182
- "spec/bin/hdp-wc_spec.rb",
183
- "spec/spec_helper.rb",
176
+ "spec/spec_helper.rb",
177
+ "spec/wukong/encoding_spec.rb",
178
+ "spec/wukong/script_spec.rb",
184
179
  "examples/apache_log_parser.rb",
180
+ "examples/contrib/jeans/normalize.rb",
181
+ "examples/contrib/jeans/sizes.rb",
185
182
  "examples/count_keys.rb",
186
183
  "examples/count_keys_at_mapper.rb",
184
+ "examples/foo.rb",
187
185
  "examples/graph/adjacency_list.rb",
188
186
  "examples/graph/breadth_first_search.rb",
189
187
  "examples/graph/gen_2paths.rb",
@@ -218,3 +216,4 @@ Gem::Specification.new do |s|
218
216
  s.add_dependency(%q<htmlentities>, [">= 0"])
219
217
  end
220
218
  end
219
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.4.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip (flip) Kromer
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-15 00:00:00 -06:00
12
+ date: 2010-01-26 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -57,6 +57,7 @@ extra_rdoc_files:
57
57
  - LICENSE.textile
58
58
  - README.textile
59
59
  files:
60
+ - CHANGELOG.textile
60
61
  - INSTALL.textile
61
62
  - LICENSE.textile
62
63
  - README.textile
@@ -116,8 +117,15 @@ files:
116
117
  - docpages/wutils.textile
117
118
  - examples/README.txt
118
119
  - examples/apache_log_parser.rb
120
+ - examples/contrib/jeans/README.markdown
121
+ - examples/contrib/jeans/data/normalized_sizes
122
+ - examples/contrib/jeans/data/orders.tsv
123
+ - examples/contrib/jeans/data/sizes
124
+ - examples/contrib/jeans/normalize.rb
125
+ - examples/contrib/jeans/sizes.rb
119
126
  - examples/count_keys.rb
120
127
  - examples/count_keys_at_mapper.rb
128
+ - examples/foo.rb
121
129
  - examples/graph/adjacency_list.rb
122
130
  - examples/graph/breadth_first_search.rb
123
131
  - examples/graph/gen_2paths.rb
@@ -137,7 +145,6 @@ files:
137
145
  - examples/word_count.rb
138
146
  - lib/wukong.rb
139
147
  - lib/wukong/bad_record.rb
140
- - lib/wukong/boot.rb
141
148
  - lib/wukong/datatypes.rb
142
149
  - lib/wukong/datatypes/enum.rb
143
150
  - lib/wukong/datatypes/fake_types.rb
@@ -182,12 +189,14 @@ files:
182
189
  - lib/wukong/streamer/uniq_by_last_reducer.rb
183
190
  - lib/wukong/typed_struct.rb
184
191
  - lib/wukong/wukong_class.rb
185
- - spec/bin/hdp-wc_spec.rb
186
192
  - spec/data/a_atsigns_b.tsv
187
193
  - spec/data/a_follows_b.tsv
188
194
  - spec/data/tweet.tsv
189
195
  - spec/data/twitter_user.tsv
196
+ - spec/spec.opts
190
197
  - spec/spec_helper.rb
198
+ - spec/wukong/encoding_spec.rb
199
+ - spec/wukong/script_spec.rb
191
200
  - wukong.gemspec
192
201
  has_rdoc: true
193
202
  homepage: http://mrflip.github.com/wukong
@@ -218,11 +227,15 @@ signing_key:
218
227
  specification_version: 3
219
228
  summary: Wukong makes Hadoop so easy a chimpanzee can use it.
220
229
  test_files:
221
- - spec/bin/hdp-wc_spec.rb
222
230
  - spec/spec_helper.rb
231
+ - spec/wukong/encoding_spec.rb
232
+ - spec/wukong/script_spec.rb
223
233
  - examples/apache_log_parser.rb
234
+ - examples/contrib/jeans/normalize.rb
235
+ - examples/contrib/jeans/sizes.rb
224
236
  - examples/count_keys.rb
225
237
  - examples/count_keys_at_mapper.rb
238
+ - examples/foo.rb
226
239
  - examples/graph/adjacency_list.rb
227
240
  - examples/graph/breadth_first_search.rb
228
241
  - examples/graph/gen_2paths.rb