wukong 1.5.3 → 1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/CHANGELOG.textile +4 -0
  2. data/bin/hdp-bin +44 -0
  3. data/bin/hdp-ls +2 -1
  4. data/docpages/avro/performance.textile +36 -0
  5. data/examples/cassandra_streaming/avromapper.rb +85 -0
  6. data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
  7. data/examples/cassandra_streaming/cassandra.avpr +468 -0
  8. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
  9. data/examples/cassandra_streaming/catter.sh +45 -0
  10. data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
  11. data/examples/cassandra_streaming/client_schema.avpr +211 -0
  12. data/examples/cassandra_streaming/client_schema.textile +318 -0
  13. data/examples/cassandra_streaming/foofile.avr +0 -0
  14. data/examples/cassandra_streaming/pymap.sh +1 -0
  15. data/examples/cassandra_streaming/pyreduce.sh +1 -0
  16. data/examples/cassandra_streaming/smutation.avpr +188 -0
  17. data/examples/cassandra_streaming/streamer.sh +51 -0
  18. data/examples/cassandra_streaming/struct_loader.rb +24 -0
  19. data/examples/cassandra_streaming/tuning.textile +73 -0
  20. data/examples/emr/README-elastic_map_reduce.textile +26 -0
  21. data/examples/emr/dot_wukong_dir/credentials.json +7 -0
  22. data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
  23. data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
  24. data/examples/emr/elastic_mapreduce_example.rb +1 -0
  25. data/lib/wukong/encoding/asciize.rb +108 -0
  26. data/lib/wukong/extensions/date_time.rb +33 -7
  27. data/lib/wukong/extensions/emittable.rb +12 -25
  28. data/lib/wukong/extensions/hash_like.rb +13 -6
  29. data/lib/wukong/filename_pattern.rb +8 -7
  30. data/lib/wukong/schema.rb +47 -0
  31. data/lib/wukong/script.rb +7 -0
  32. data/lib/wukong/script/cassandra_loader_script.rb +40 -0
  33. data/lib/wukong/script/emr_command.rb +74 -43
  34. data/lib/wukong/script/hadoop_command.rb +89 -72
  35. data/lib/wukong/store.rb +2 -7
  36. data/lib/wukong/store/cassandra.rb +10 -0
  37. data/lib/wukong/store/cassandra/streaming.rb +75 -0
  38. data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
  39. data/lib/wukong/store/cassandra_model.rb +90 -0
  40. data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
  41. data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
  42. data/wukong.gemspec +32 -4
  43. metadata +33 -14
data/lib/wukong/extensions/emittable.rb
@@ -1,4 +1,3 @@
-
 Object.class_eval do
   def to_flat() [to_s] end
 end
@@ -54,29 +53,17 @@ Hash.class_eval do
   end
 end
 
-class Time
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class DateTime < Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
+class Integer
+  #
+  # Express boolean as 1 (true) or 0 (false). In contravention of typical ruby
+  # semantics (but in a way that is more robust for wukong-like batch
+  # processing), the number 0, the string '0', nil and false are all considered
+  # false. (This also makes the method idempotent: repeated calls give same result.)
+  #
+  def self.unbooleanize bool
+    case bool
+    when 0, '0', false, nil then 0
+    else 1
+    end
   end
 end
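
The new Integer.unbooleanize collapses truthiness to 0/1; a quick sketch of its behavior (runnable once the class above is loaded):

    Integer.unbooleanize(false)  # => 0
    Integer.unbooleanize('0')    # => 0
    Integer.unbooleanize(nil)    # => 0
    Integer.unbooleanize(0)      # => 0
    Integer.unbooleanize(true)   # => 1
    Integer.unbooleanize(17)     # => 1
    Integer.unbooleanize(Integer.unbooleanize('0'))  # => 0 -- idempotent, as the comment promises
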
data/lib/wukong/extensions/hash_like.rb
@@ -103,16 +103,23 @@ module Wukong
     # otherwise they must be uniformly strings
     #
     def from_hash(hsh, has_symbol_keys=false)
-      keys = self.keys
-      keys = keys.map(&:to_sym) if has_symbol_keys
-      self.new *hsh.values_of(*keys)
+      extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
+      self.new *hsh.values_of(*extract_keys)
     end
     #
     # The last portion of the class in underscored form
-    # note memoization
+    # memoized
     #
-    def self.resource_name
-      @resource_name ||= self.to_s.gsub(%r{.*::}, '').underscore.to_sym
+    def resource_name
+      @resource_name ||= self.class_basename.underscore.to_sym
+    end
+    # The last portion of the class name
+    # memoized
+    #
+    # @example
+    #   This::That::TheOther.new.class_basename # => TheOther
+    def class_basename
+      @class_basename ||= self.to_s.gsub(%r{.*::}, '')
     end
   end
 
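
A rough illustration of the from_hash change (the Point struct is invented, the mixin name is assumed from this file's path, and Hash#values_of is wukong's values_at-style extension):

    Point = Struct.new(:x, :y)
    Point.send(:include, Wukong::HashLike)        # assumed module name
    Point.from_hash({ 'x' => 1, 'y' => 2 })       # string keys -- the default
    Point.from_hash({ :x => 1, :y => 2 }, true)   # symbol keys must now be flagged
    # => #<struct Point x=1, y=2> either way; keys are normalized with to_s/to_sym explicitly
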
data/lib/wukong/filename_pattern.rb
@@ -16,12 +16,12 @@ module Wukong
     # walk through pattern, replacing tokens (eg :time or :pid) with the
     # corresponding value.
     #
+    # Don't use ':' in a pattern except to introduce a token
+    # and separate tokens with '-', '+' '/' or '.'
+    #
     def make token_vals={}
       token_vals = token_val_defaults.merge token_vals
       token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
-      # CHH_NOTE: The following is broken for patterns that need a ":" or
-      # patterns that need text following a token with no special chars in
-      # between.
       val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
       val
     end
@@ -39,7 +39,7 @@ module Wukong
       case token
       when :pid           then pid
       when :hostname      then hostname
-      when :handle then token_vals[:handle]
+      when :handle        then token_vals[:handle]
       when :handle_prefix then token_vals[:handle].to_s[0..5]
       when :timestamp     then token_vals[:timestamp]
       when :date          then token_vals[:timestamp][ 0..7]
@@ -56,7 +56,7 @@ module Wukong
 
     # Memoized: the hostname for the machine running this script.
     def hostname
-      @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
+      @hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
     end
     # Memoized: the Process ID for this invocation.
     def pid
@@ -64,9 +64,10 @@ module Wukong
     end
 
     # Characters deemed safe in a filename;
-    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/\;'
+    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
+    RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
     def self.sanitize str
-      str.gsub(%r{[^#{SAFE_CHARS}]+}, '-')
+      str.gsub(RE_SAFE_FILENAME, '-')
     end
 
   end
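
The token substitution in #make is ultimately one gsub over the pattern; here is a standalone sketch of the mechanism (pattern and values invented, not wukong's API):

    pattern    = ':handle/:date/:handle-:timestamp.tsv'
    token_vals = { :handle => 'twitter_search', :timestamp => '20101101123456' }
    token_vals[:date] = token_vals[:timestamp][0..7]  # :date is the first 8 chars of :timestamp
    puts pattern.gsub(/:(\w+)/){ token_vals[$1.to_sym] }
    # => twitter_search/20101101/twitter_search-20101101123456.tsv
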
data/lib/wukong/schema.rb
@@ -50,6 +50,35 @@ class << Yaml       ; def to_pig() 'chararray' end ; end if defined?(Yaml)
 class << Json       ; def to_pig() 'chararray' end ; end if defined?(Json)
 class << Regex      ; def to_pig() 'chararray' end ; end if defined?(Regex)
 
+
+#
+# Basic types: Avro conversion
+#
+class << Integer    ; def to_avro() 'int'       end ; end
+class << Bignum     ; def to_avro() 'long'      end ; end
+class << Float      ; def to_avro() 'float'     end ; end
+class << Symbol     ; def to_avro() 'string'    end ; end
+class << Date       ; def to_avro() 'long'      end ; end
+class << Time       ; def to_avro() 'long'      end ; end
+class << DateTime   ; def to_avro() 'long'      end ; end
+class << String     ; def to_avro() 'string'    end ; end
+class << Text       ; def to_avro() 'string'    end ; end if defined?(Text)
+class << Blob       ; def to_avro() 'bytearray' end ; end if defined?(Blob)
+class << Boolean    ; def to_avro() 'bytearray' end ; end if defined?(Boolean)
+class String        ; def to_avro() self.to_s ; end ; end
+class Symbol        ; def to_avro() self.to_s ; end ; end
+
+class << BigDecimal ; def to_avro() 'long'      end ; end if defined?(BigDecimal)
+class << EpochTime  ; def to_avro() 'integer'   end ; end if defined?(EpochTime)
+class << FilePath   ; def to_avro() 'string'    end ; end if defined?(FilePath)
+class << Flag       ; def to_avro() 'string'    end ; end if defined?(Flag)
+class << IPAddress  ; def to_avro() 'string'    end ; end if defined?(IPAddress)
+class << URI        ; def to_avro() 'string'    end ; end if defined?(URI)
+class << Csv        ; def to_avro() 'string'    end ; end if defined?(Csv)
+class << Yaml       ; def to_avro() 'string'    end ; end if defined?(Yaml)
+class << Json       ; def to_avro() 'string'    end ; end if defined?(Json)
+class << Regex      ; def to_avro() 'string'    end ; end if defined?(Regex)
+
 module Wukong
   #
   # Export model's structure for loading and manipulating in other frameworks,
@@ -208,6 +237,24 @@ module Wukong
       str.join("\n")
     end
 
+
+
+
+    #
+    # Avro
+    #
+    def to_avro
+      require 'json' # yikes
+      h = {}
+      h[:name]   = self.name
+      h[:type]   = "record"
+      h[:fields] = []
+      members.zip(mtypes).each do |member, type|
+        h[:fields] << {:name => member.to_s, :type => type.to_avro}
+      end
+      h.to_json
+    end
+
   end
   # standard stanza for making methods appear on the class itself on include
   def self.included base
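
Given the type mappings above, to_avro renders a model as an Avro record schema in JSON. A sketch for a hypothetical model (class, fields, and mixin name invented; members/mtypes are the accessors this file already uses):

    # class TwitterUser < Struct.new(:id, :screen_name) ; include Wukong::Schema ; end
    # members == [:id, :screen_name], mtypes == [Integer, String]
    TwitterUser.to_avro
    # => '{"name":"TwitterUser","type":"record","fields":[
    #       {"name":"id","type":"int"},{"name":"screen_name","type":"string"}]}'
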
data/lib/wukong/script.rb
@@ -145,6 +145,7 @@ module Wukong
       when 'map'              then mapper_klass.new(self.options).stream
       when 'reduce'           then reducer_klass.new(self.options).stream
       when 'local'            then execute_local_workflow
+      when 'cassandra'        then execute_hadoop_workflow
       when 'hadoop', 'mapred' then execute_hadoop_workflow
       when 'emr'
         require 'wukong/script/emr_command'
@@ -196,6 +197,12 @@ module Wukong
       "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
     end
 
+    # Wrapper for dangerous operations to catch errors
+    def safely action, &block
+      begin
+        block.call
+      rescue StandardError => e ; handle_error(action, e); end
+    end
 
     protected
 
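
A sketch of how the new #safely wrapper reads at a call site (the S3 copy is illustrative, not from this diff):

    safely("copying script to s3") do
      S3Util.store(this_script_filename, mapper_s3_uri)
    end
    # any StandardError raised in the block becomes
    # handle_error("copying script to s3", err) rather than aborting the run
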
data/lib/wukong/script/cassandra_loader_script.rb (new file)
@@ -0,0 +1,40 @@
+Settings.define :cassandra_keyspace,   :required => true, :description => "The keyspace to bulk load"
+Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
+Settings.define :cassandra_home,       :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
+
+module Wukong
+  class CassandraScript < Wukong::Script
+    def hadoop_other_args *args
+      opts = super(*args)
+      opts << "-D stream.map.output=\'cassandra_avro_output\'"
+      opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
+      opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
+      opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
+      opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
+      opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
+      opts << "-D cassandra.thrift.port=\'9160\'"
+      # opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
+      # ORDER MATTERS
+      opts << "-libjars \'#{cassandra_jars}\'"
+      opts << "-file \'#{avro_schema}\'"
+      opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
+      opts
+    end
+
+    #
+    # Return paths to cassandra jars as a string
+    #
+    def cassandra_jars
+      jars = []
+      Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
+        jars << jar
+      end
+      jars.join(',')
+    end
+
+    def avro_schema
+      File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
+    end
+
+  end
+end
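
Combined with the new 'cassandra' run mode in script.rb above, a loader built on CassandraScript would be launched roughly like this (the example script ships in this gem's examples directory; keyspace and column-family values are invented):

    ruby examples/cassandra_streaming/struct_loader.rb --run=cassandra \
        --cassandra_keyspace=Twitter --cassandra_col_family=Users \
        input_path output_path
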
data/lib/wukong/script/emr_command.rb
@@ -1,16 +1,26 @@
 require 'right_aws'
 require 'configliere/config_block'
-Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+#
+EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
+#
 Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
 Settings.define :access_key,           :description => 'AWS Access key',        :env_var => 'AWS_ACCESS_KEY_ID'
 Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
 Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
-Settings.define :emr_root,             :description => 'S3 url to use as the base for Elastic MapReduce storage'
-Settings.define :key_pair_file,        :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
-Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
-Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
+Settings.define :emr_root,             :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
+Settings.define :emr_data_root,        :description => 'Optional '
+Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
+Settings.define :emr_extra_args,       :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
+Settings.define :alive,                :description => 'Whether to keep machine running after job invocation', :type => :boolean
+#
+Settings.define :keypair_file,         :description => 'AWS Key pair file', :type => :filename
+Settings.define :keypair,              :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
+Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
 Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
-Settings.define :jobflow
+Settings.define :jobflow,              :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
+#
+Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
+
 module Wukong
   #
   # EMR Options
@@ -26,39 +36,46 @@ module Wukong
       Log.info "  Copying this script to the cloud."
       S3Util.store(this_script_filename, mapper_s3_uri)
       S3Util.store(this_script_filename, reducer_s3_uri)
-      S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+      S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
+    end
+
+    def copy_jars_to_cloud
+      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
+      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
+    end
+
+    def hadoop_options_for_emr_runner
+      [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
     end
 
     def execute_emr_runner
       command_args = []
-      command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
-      command_args += emr_credentials
       if Settings.jobflow
         command_args << Settings.dashed_flag_for(:jobflow)
       else
-        command_args << Settings.dashed_flag_for(:alive)
         command_args << "--create --name=#{job_name}"
-        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
+        command_args << Settings.dashed_flag_for(:alive)
+        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
+        command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
+        command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
       end
+      command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
+      command_args += emr_credentials
      command_args += [
-        "--bootstrap-action=#{bootstrap_s3_uri}",
         "--log-uri=#{log_s3_uri}",
         "--stream",
         "--mapper=#{mapper_s3_uri} ",
         "--reducer=#{reducer_s3_uri} ",
-        "--input=#{input_paths} --output=#{output_path}",
-        # to specify zero reducers:
-        # "--arg '-D mapred.reduce.tasks=0'"
+        "--input=#{input_paths.join(",")} --output=#{output_path}",
       ]
+      # eg to specify zero reducers:
+      #   Settings[:emr_extra_args] = "--arg '-D mapred.reduce.tasks=0'"
+      command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
+      command_args += hadoop_options_for_emr_runner
       Log.info 'Follow along at http://localhost:9000/job'
       execute_command!( File.expand_path(Settings.emr_runner), *command_args )
     end
 
-    def emr_ship_jars
-      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
-      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
-    end
-
     def emr_credentials
       command_args = []
       if Settings.emr_credentials_file
@@ -66,7 +83,6 @@ module Wukong
       else
         command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
       end
-      command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
       command_args
     end
 
@@ -75,58 +91,73 @@ module Wukong
       File.basename($0,'.rb')
     end
 
+    # Produces an s3 URI within the Wukong emr sandbox from a set of path
+    # segments
+    #
+    # @example
+    #   Settings.emr_root = 's3://emr.yourmom.com/wukong'
+    #   emr_s3_path('log', 'my_happy_job', 'run-97.log')
+    #   # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
+    #
+    def emr_s3_path *path_segs
+      File.join(Settings.emr_root, path_segs.flatten.compact)
+    end
+
     def mapper_s3_uri
-      emr_s3_path(job_handle+'-mapper.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
     end
     def reducer_s3_uri
-      emr_s3_path(job_handle+'-reducer.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
     end
     def log_s3_uri
-      emr_s3_path('log', job_handle)
+      emr_s3_path(job_handle, 'log', 'emr_jobs')
     end
     def bootstrap_s3_uri
-      emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
+      emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
     end
     def wukong_libs_s3_uri
-      emr_s3_path('bin', "wukong-libs.jar")
-    end
-
-    def emr_s3_path *path_segs
-      File.join(Settings.emr_root, path_segs.flatten.compact)
+      emr_s3_path(job_handle, 'code', "wukong-libs.jar")
     end
 
-    module ClassMethods
-
-      # Standard hack to create ClassMethods-on-include
-      def self.included base
-        base.class_eval do
-          extend ClassMethods
-        end
+    ABSOLUTE_URI = %r{^/|^\w+://}
+    #
+    # Walk through the input paths and the output path. Prepends
+    # Settings.emr_data_root to any that does NOT look like
+    # an absolute path ("/foo") or a URI ("s3://yourmom/data")
+    #
+    def fix_paths!
+      return if Settings.emr_data_root.blank?
+      unless input_paths.blank?
+        @input_paths = input_paths.map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
+      end
+      unless output_path.blank?
+        @output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
       end
     end
 
+    #
+    # Simple class to coordinate s3 operations
+    #
     class S3Util
      # class methods
      class << self
        def s3
          @s3 ||= RightAws::S3Interface.new(
            Settings.access_key, Settings.secret_access_key,
-            {:multi_thread => true, :logger => Log})
+            {:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
        end
-
        def bucket_and_path_from_uri uri
          uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
        end
-
        def store filename, uri
-          Log.debug "  #{filename} => #{uri}"
          dest_bucket, dest_key = bucket_and_path_from_uri(uri)
-          contents = File.open(filename)
+          Log.debug "  #{filename} => #{dest_bucket} / #{dest_key}"
+          contents = File.read(filename)
          s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
        end
-
      end
    end
+
  end
 Script.class_eval do
   include EmrCommand
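
To make fix_paths! concrete (values invented; the bucket reuses the yourmom example from the comments above):

    Settings.emr_data_root = 's3://emr.yourmom.com/wukong/data'
    # --input ripd/twitter          => s3://emr.yourmom.com/wukong/data/ripd/twitter
    # --input /fixed/path           => unchanged (matches ABSOLUTE_URI: leading '/')
    # --input s3://elsewhere/input  => unchanged (matches ABSOLUTE_URI: scheme://)
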
data/lib/wukong/script/hadoop_command.rb
@@ -32,16 +32,28 @@ module Wukong
    Settings.define :max_maps_per_node,    :jobconf => true, :description => 'mapred.max.maps.per.node',    :wukong => true
    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
    Settings.define :max_record_length,    :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-    Settings.define :min_input_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
+    Settings.define :min_split_size,       :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
    Settings.define :noempty,              :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
+    Settings.define :split_on_xml_tag,     :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
+
+    # emit a -jobconf hadoop option if the simplified command line arg is present
+    # if not, the resulting nil will be elided later
+    def jobconf option
+      if options[option]
+        # "-jobconf %s=%s" % [options.description_for(option), options[option]]
+        "-D %s=%s" % [options.description_for(option), options[option]]
+      end
+    end
 
    #
    # Assemble the hadoop command to execute
    # and launch the hadoop runner to execute the script across all tasktrackers
    #
+    # FIXME: Should add some simple logic to ensure that commands are in the
+    # right order or hadoop will complain. ie. -D options MUST come before
+    # others
+    #
    def execute_hadoop_workflow
-      # If no reducer_klass and no reduce_command, then skip the reduce phase
-      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
      # Input paths join by ','
      input_paths = @input_paths.join(',')
      #
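
The relocated #jobconf helper turns a simplified wukong flag into a hadoop -D option via the option's :description. A sketch (the --map_tasks value is invented; :map_tasks is one of the :jobconf => true settings defined in this file):

    # with --map_tasks=20 on the command line:
    jobconf(:map_tasks)     # => "-D mapred.map.tasks=20"
    # with the flag absent:
    jobconf(:reduce_tasks)  # => nil -- compacted away in hadoop_jobconf_options
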
@@ -49,14 +61,14 @@ module Wukong
      hadoop_commandline = [
        hadoop_runner,
        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+        hadoop_jobconf_options,
+        "-D mapred.job.name='#{job_name}'",
+        hadoop_other_args,
        "-mapper  '#{mapper_commandline}'",
        "-reducer '#{reducer_commandline}'",
        "-input   '#{input_paths}'",
        "-output  '#{output_path}'",
-        hadoop_jobconf_options,
-        "-jobconf mapred.job.name='#{job_name}'",
        hadoop_recycle_env,
-        hadoop_other_args,
      ].flatten.compact.join(" \t\\\n  ")
      Log.info "  Launching hadoop!"
      execute_command!(hadoop_commandline)
@@ -64,48 +76,40 @@ module Wukong
 
    def hadoop_jobconf_options
      jobconf_options = []
-      # The fields should hadoop treat as the keys
-      jobconf_options += [
-        jobconf(:key_field_separator),
-        jobconf(:sort_fields),
-      ]
+      # Fixup these options
+      options[:reuse_jvms]          = '-1'    if (options[:reuse_jvms] == true)
+      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
+      # If no reducer_klass and no reduce_command, then skip the reduce phase
+      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
      # Fields hadoop should use to distribute records to reducers
      unless options[:partition_fields].blank?
        jobconf_options += [
-          '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
-          jobconf(:output_field_separator),
          jobconf(:partition_fields),
+          jobconf(:output_field_separator),
        ]
      end
-      # Setting the number of mappers and reducers.
      jobconf_options += [
-        jobconf(:max_node_map_tasks),
-        jobconf(:max_node_reduce_tasks),
-        jobconf(:max_reduces_per_node),
-        jobconf(:max_reduces_per_cluster),
-        jobconf(:max_maps_per_node),
-        jobconf(:max_maps_per_cluster),
-        jobconf(:map_tasks),
-        jobconf(:reduce_tasks)
-      ]
+        :key_field_separator, :sort_fields,
+        :map_tasks, :reduce_tasks,
+        :max_node_map_tasks, :max_node_reduce_tasks,
+        :max_reduces_per_node, :max_reduces_per_cluster,
+        :max_maps_per_node, :max_maps_per_cluster,
+        :min_split_size,
+        :map_speculative,
+        :timeout,
+        :reuse_jvms, :respect_exit_status
+      ].map{|opt| jobconf(opt)}
      jobconf_options.flatten.compact
    end
 
-    # emit a -jobconf hadoop option if the simplified command line arg is present
-    # if not, the resulting nil will be elided later
-    def jobconf option
-      if options[option]
-        "-jobconf %s=%s" % [options.description_for(option), options[option]]
-      end
-    end
-
    def hadoop_other_args
      extra_str_args = [ options[:extra_args] ]
-      extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
-      options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
-      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-      extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
-      extra_str_args + extra_hsh_args
+      if Settings.split_on_xml_tag
+        extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
+      end
+      extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
+      extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
+      extra_str_args
    end
 
    def hadoop_recycle_env
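
With the new --split_on_xml_tag flag, hadoop_other_args emits a StreamXmlRecordReader stanza; for instance (tag name invented):

    # --split_on_xml_tag=page adds:
    #   -inputreader 'StreamXmlRecordReader,begin=<page>,end=</page>'
    # so everything between <page> and </page> is handed to the mapper as one record
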
@@ -135,42 +139,6 @@ module Wukong
    # Thanks to Todd Lipcon for directing me to that hack.
    #
 
-    # "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
-    # "HADOOP_IDENT_STRING" =>"hadoop",
-    # "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
-    # "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
-    # "HOME" =>"/var/run/hadoop-0.20",
-    # "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
-    # "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
-    # "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
-    # "USER" =>"hadoop",
-    #
-    # "dfs_block_size" =>"134217728",
-    # "map_input_start" =>"0",
-    # "map_input_length" =>"125726898",
-    # "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
-    # "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
-    # "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
-    # "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
-    # "mapred_output_compression_type" =>"BLOCK",
-    # "mapred_task_partition" =>"0",
-    # "mapred_tasktracker_map_tasks_maximum" =>"4",
-    # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
-    # "mapred_tip_id" =>"task_200910221152_0023_m_000000",
-    # "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
-    # "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
-    #
-    # "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
-    # "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
-    # "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
-    # "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
-    # "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
-    # "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
-    # "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
-    # "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
-    # "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
-    # "user_name" =>"flip",
-
    # HDFS pathname to the input file currently being processed.
    def input_file
      ENV['map_input_file']
@@ -211,3 +179,52 @@ module Wukong
    end
  end
 end
+
+# -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+# -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+# -D mapred.text.key.comparator.options=-k2,2nr\
+# -D mapred.text.key.partitioner.options=-k1,2\
+# -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+# -D stream.num.map.output.key.fields=\"$sortfields\"
+#
+# -D stream.map.output.field.separator=\"'/t'\"
+# -D map.output.key.field.separator=. \
+# -D mapred.data.field.separator=. \
+# -D map.output.key.value.fields.spec=6,5,1-3:0- \
+# -D reduce.output.key.value.fields.spec=0-2:5- \
+
+# "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
+# "HADOOP_IDENT_STRING" =>"hadoop",
+# "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
+# "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
+# "HOME" =>"/var/run/hadoop-0.20",
+# "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
+# "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
+# "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
+# "USER" =>"hadoop",
+#
+# "dfs_block_size" =>"134217728",
+# "map_input_start" =>"0",
+# "map_input_length" =>"125726898",
+# "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
+# "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
+# "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
+# "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
+# "mapred_output_compression_type" =>"BLOCK",
+# "mapred_task_partition" =>"0",
+# "mapred_tasktracker_map_tasks_maximum" =>"4",
+# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
+# "mapred_tip_id" =>"task_200910221152_0023_m_000000",
+# "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
+# "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
+#
+# "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
+# "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
+# "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
+# "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
+# "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
+# "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
+# "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
+# "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
+# "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
+# "user_name" =>"flip",