wukong 1.5.3 → 1.5.4

Files changed (43)
  1. data/CHANGELOG.textile +4 -0
  2. data/bin/hdp-bin +44 -0
  3. data/bin/hdp-ls +2 -1
  4. data/docpages/avro/performance.textile +36 -0
  5. data/examples/cassandra_streaming/avromapper.rb +85 -0
  6. data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
  7. data/examples/cassandra_streaming/cassandra.avpr +468 -0
  8. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
  9. data/examples/cassandra_streaming/catter.sh +45 -0
  10. data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
  11. data/examples/cassandra_streaming/client_schema.avpr +211 -0
  12. data/examples/cassandra_streaming/client_schema.textile +318 -0
  13. data/examples/cassandra_streaming/foofile.avr +0 -0
  14. data/examples/cassandra_streaming/pymap.sh +1 -0
  15. data/examples/cassandra_streaming/pyreduce.sh +1 -0
  16. data/examples/cassandra_streaming/smutation.avpr +188 -0
  17. data/examples/cassandra_streaming/streamer.sh +51 -0
  18. data/examples/cassandra_streaming/struct_loader.rb +24 -0
  19. data/examples/cassandra_streaming/tuning.textile +73 -0
  20. data/examples/emr/README-elastic_map_reduce.textile +26 -0
  21. data/examples/emr/dot_wukong_dir/credentials.json +7 -0
  22. data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
  23. data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
  24. data/examples/emr/elastic_mapreduce_example.rb +1 -0
  25. data/lib/wukong/encoding/asciize.rb +108 -0
  26. data/lib/wukong/extensions/date_time.rb +33 -7
  27. data/lib/wukong/extensions/emittable.rb +12 -25
  28. data/lib/wukong/extensions/hash_like.rb +13 -6
  29. data/lib/wukong/filename_pattern.rb +8 -7
  30. data/lib/wukong/schema.rb +47 -0
  31. data/lib/wukong/script.rb +7 -0
  32. data/lib/wukong/script/cassandra_loader_script.rb +40 -0
  33. data/lib/wukong/script/emr_command.rb +74 -43
  34. data/lib/wukong/script/hadoop_command.rb +89 -72
  35. data/lib/wukong/store.rb +2 -7
  36. data/lib/wukong/store/cassandra.rb +10 -0
  37. data/lib/wukong/store/cassandra/streaming.rb +75 -0
  38. data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
  39. data/lib/wukong/store/cassandra_model.rb +90 -0
  40. data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
  41. data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
  42. data/wukong.gemspec +32 -4
  43. metadata +33 -14
data/lib/wukong/extensions/emittable.rb
@@ -1,4 +1,3 @@
-
 Object.class_eval do
   def to_flat() [to_s] end
 end
@@ -54,29 +53,17 @@ Hash.class_eval do
   end
 end
 
-class Time
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class DateTime < Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
+class Integer
+  #
+  # Express boolean as 1 (true) or 0 (false). In contravention of typical ruby
+  # semantics (but in a way that is more robust for wukong-like batch
+  # processing), the number 0, the string '0', nil and false are all considered
+  # false. (This also makes the method idempotent: repeated calls give same result.)
+  #
+  def self.unbooleanize bool
+    case bool
+    when 0, '0', false, nil then 0
+    else 1
+    end
   end
 end
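A quick sketch of the new Integer.unbooleanize in use (illustrative values; the removed Time/Date/DateTime #to_flat methods presumably move to extensions/date_time.rb, also touched in this release):

    Integer.unbooleanize(true)   # => 1
    Integer.unbooleanize(0)      # => 0
    Integer.unbooleanize('0')    # => 0
    Integer.unbooleanize(nil)    # => 0
    Integer.unbooleanize(Integer.unbooleanize('0'))  # => 0 (idempotent)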
data/lib/wukong/extensions/hash_like.rb
@@ -103,16 +103,23 @@ module Wukong
     # otherwise they must be uniformly strings
     #
     def from_hash(hsh, has_symbol_keys=false)
-      keys = self.keys
-      keys = keys.map(&:to_sym) if has_symbol_keys
-      self.new *hsh.values_of(*keys)
+      extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
+      self.new *hsh.values_of(*extract_keys)
     end
     #
     # The last portion of the class in underscored form
-    # note memoization
+    # memoized
     #
-    def self.resource_name
-      @resource_name ||= self.to_s.gsub(%r{.*::}, '').underscore.to_sym
+    def resource_name
+      @resource_name ||= self.class_basename.underscore.to_sym
+    end
+    # The last portion of the class name
+    # memoized
+    #
+    # @example
+    #   This::That::TheOther.new.class_basename # => TheOther
+    def class_basename
+      @class_basename ||= self.to_s.gsub(%r{.*::}, '')
     end
   end
 
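Illustrative use of the reworked from_hash, assuming a struct that mixes in these hash_like helpers (MyRecord is hypothetical):

    MyRecord = Struct.new(:id, :name)   # plus the Wukong hash_like extensions
    MyRecord.from_hash({ :id => 1, :name => 'bob' }, true)
    # => #<struct MyRecord id=1, name="bob">
    MyRecord.from_hash({ 'id' => 1, 'name' => 'bob' })   # string keys by default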
data/lib/wukong/filename_pattern.rb
@@ -16,12 +16,12 @@ module Wukong
     # walk through pattern, replacing tokens (eg :time or :pid) with the
     # corresponding value.
     #
+    # Don't use ':' in a pattern except to introduce a token,
+    # and separate tokens with '-', '+', '/' or '.'
+    #
     def make token_vals={}
       token_vals = token_val_defaults.merge token_vals
       token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
-      # CHH_NOTE: The following is broken for patterns that need a ":" or
-      # patterns that need text following a token with no special chars in
-      # between.
       val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
       val
     end
@@ -39,7 +39,7 @@ module Wukong
       case token
       when :pid            then pid
      when :hostname       then hostname
-      when :handle        then token_vals[:handle]
+      when :handle         then token_vals[:handle]
       when :handle_prefix  then token_vals[:handle].to_s[0..5]
       when :timestamp      then token_vals[:timestamp]
       when :date           then token_vals[:timestamp][ 0..7]
@@ -56,7 +56,7 @@ module Wukong
 
     # Memoized: the hostname for the machine running this script.
     def hostname
-      @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
+      @hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
     end
     # Memoized: the Process ID for this invocation.
     def pid
@@ -64,9 +64,10 @@ module Wukong
     end
 
     # Characters deemed safe in a filename;
-    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/\;'
+    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
+    RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
     def self.sanitize str
-      str.gsub(%r{[^#{SAFE_CHARS}]+}, '-')
+      str.gsub(RE_SAFE_FILENAME, '-')
     end
 
   end
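With ';' dropped from SAFE_CHARS, runs of unsafe characters (now including semicolons) collapse to a single '-'; for example, assuming the method lives on FilenamePattern as the file path suggests:

    Wukong::FilenamePattern.sanitize('foo;bar baz.tsv')  # => "foo-bar-baz.tsv"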
data/lib/wukong/schema.rb
@@ -50,6 +50,35 @@ class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml)
 class << Json       ; def to_pig() 'chararray' end ; end if defined?(Json)
 class << Regex      ; def to_pig() 'chararray' end ; end if defined?(Regex)
 
+
+#
+# Basic types: Avro conversion
+#
+class << Integer    ; def to_avro() 'int'       end ; end
+class << Bignum     ; def to_avro() 'long'      end ; end
+class << Float      ; def to_avro() 'float'     end ; end
+class << Symbol     ; def to_avro() 'string'    end ; end
+class << Date       ; def to_avro() 'long'      end ; end
+class << Time       ; def to_avro() 'long'      end ; end
+class << DateTime   ; def to_avro() 'long'      end ; end
+class << String     ; def to_avro() 'string'    end ; end
+class << Text       ; def to_avro() 'string'    end ; end if defined?(Text)
+class << Blob       ; def to_avro() 'bytearray' end ; end if defined?(Blob)
+class << Boolean    ; def to_avro() 'bytearray' end ; end if defined?(Boolean)
+class String        ; def to_avro() self.to_s ; end ; end
+class Symbol        ; def to_avro() self.to_s ; end ; end
+
+class << BigDecimal ; def to_avro() 'long'      end ; end if defined?(BigDecimal)
+class << EpochTime  ; def to_avro() 'integer'   end ; end if defined?(EpochTime)
+class << FilePath   ; def to_avro() 'string'    end ; end if defined?(FilePath)
+class << Flag       ; def to_avro() 'string'    end ; end if defined?(Flag)
+class << IPAddress  ; def to_avro() 'string'    end ; end if defined?(IPAddress)
+class << URI        ; def to_avro() 'string'    end ; end if defined?(URI)
+class << Csv        ; def to_avro() 'string'    end ; end if defined?(Csv)
+class << Yaml       ; def to_avro() 'string'    end ; end if defined?(Yaml)
+class << Json       ; def to_avro() 'string'    end ; end if defined?(Json)
+class << Regex      ; def to_avro() 'string'    end ; end if defined?(Regex)
+
 module Wukong
   #
   # Export model's structure for loading and manipulating in other frameworks,
@@ -208,6 +237,24 @@ module Wukong
       str.join("\n")
     end
 
+
+
+
+    #
+    # Avro
+    #
+    def to_avro
+      require 'json' # yikes
+      h = {}
+      h[:name]   = self.name
+      h[:type]   = "record"
+      h[:fields] = []
+      members.zip(mtypes).each do |member, type|
+        h[:fields] << {:name => member.to_s, :type => type.to_avro}
+      end
+      h.to_json
+    end
+
   end
   # standard stanza for making methods appear on the class itself on include
   def self.included base
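For a struct whose members are [:id, :screen_name] with types [Integer, String], the new to_avro would emit JSON along these lines (class name TwitterUser is illustrative; actual output is unindented):

    { "name"   : "TwitterUser",
      "type"   : "record",
      "fields" : [ { "name" : "id",          "type" : "int"    },
                   { "name" : "screen_name", "type" : "string" } ] }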
data/lib/wukong/script.rb
@@ -145,6 +145,7 @@ module Wukong
     when 'map'              then mapper_klass.new(self.options).stream
     when 'reduce'           then reducer_klass.new(self.options).stream
     when 'local'            then execute_local_workflow
+    when 'cassandra'        then execute_hadoop_workflow
     when 'hadoop', 'mapred' then execute_hadoop_workflow
     when 'emr'
       require 'wukong/script/emr_command'
@@ -196,6 +197,12 @@ module Wukong
       "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
     end
 
+    # Wrapper for dangerous operations to catch errors
+    def safely action, &block
+      begin
+        block.call
+      rescue StandardError => e ; handle_error(action, e); end
+    end
 
     protected
 
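The safely wrapper assumes a handle_error(action, error) hook defined elsewhere; usage might look like:

    safely('copying script to the cloud') do
      S3Util.store(this_script_filename, mapper_s3_uri)
    end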
data/lib/wukong/script/cassandra_loader_script.rb (new file)
@@ -0,0 +1,40 @@
+Settings.define :cassandra_keyspace,   :required => true, :description => "The keyspace to bulk load"
+Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
+Settings.define :cassandra_home,       :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
+
+module Wukong
+  class CassandraScript < Wukong::Script
+    def hadoop_other_args *args
+      opts = super(*args)
+      opts << "-D stream.map.output=\'cassandra_avro_output\'"
+      opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
+      opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
+      opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
+      opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
+      opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
+      opts << "-D cassandra.thrift.port=\'9160\'"
+      # opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
+      # ORDER MATTERS
+      opts << "-libjars \'#{cassandra_jars}\'"
+      opts << "-file \'#{avro_schema}\'"
+      opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
+      opts
+    end
+
+    #
+    # Return paths to cassandra jars as a string
+    #
+    def cassandra_jars
+      jars = []
+      Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
+        jars << jar
+      end
+      jars.join(',')
+    end
+
+    def avro_schema
+      File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
+    end
+
+  end
+end
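Together with the new 'cassandra' run mode in script.rb above, a CassandraScript hands hadoop streaming extra args roughly like the following (keyspace and column family values illustrative):

    -D cassandra.output.keyspace='Twitter'
    -D cassandra.output.columnfamily='Users'
    -libjars '<comma-joined cassandra jars>'
    -file '/usr/local/share/cassandra/interface/avro/cassandra.avpr'
    -outputformat 'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat'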
data/lib/wukong/script/emr_command.rb
@@ -1,16 +1,26 @@
 require 'right_aws'
 require 'configliere/config_block'
-Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+#
+EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
+#
 Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
 Settings.define :access_key,           :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
 Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
 Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
-Settings.define :emr_root,             :description => 'S3 url to use as the base for Elastic MapReduce storage'
-Settings.define :key_pair_file,        :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
-Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
-Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
+Settings.define :emr_root,             :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
+Settings.define :emr_data_root,        :description => 'Optional '
+Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
+Settings.define :emr_extra_args,       :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
+Settings.define :alive,                :description => 'Whether to keep machine running after job invocation', :type => :boolean
+#
+Settings.define :keypair_file,         :description => 'AWS Key pair file', :type => :filename
+Settings.define :keypair,              :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
+Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
 Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
-Settings.define :jobflow
+Settings.define :jobflow,              :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
+#
+Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
+
 module Wukong
   #
   # EMR Options
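Since settings are now read from EMR_CONFIG_DIR, a minimal ~/.wukong/emr.yaml might look like this (values illustrative; keys match the defines above):

    emr_root:      s3://emr.yourmom.com/wukong
    emr_data_root: s3://emr.yourmom.com/data
    keypair_file:  ~/.wukong/keypairs/yourmom.pem
    instance_type: m1.small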
@@ -26,39 +36,46 @@ module Wukong
       Log.info "  Copying this script to the cloud."
       S3Util.store(this_script_filename, mapper_s3_uri)
       S3Util.store(this_script_filename, reducer_s3_uri)
-      S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+      S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
+    end
+
+    def copy_jars_to_cloud
+      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
+      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
+    end
+
+    def hadoop_options_for_emr_runner
+      [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
     end
 
     def execute_emr_runner
       command_args = []
-      command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
-      command_args += emr_credentials
       if Settings.jobflow
         command_args << Settings.dashed_flag_for(:jobflow)
       else
-        command_args << Settings.dashed_flag_for(:alive)
         command_args << "--create --name=#{job_name}"
-        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
+        command_args << Settings.dashed_flag_for(:alive)
+        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
+        command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
+        command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
       end
+      command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
+      command_args += emr_credentials
       command_args += [
-        "--bootstrap-action=#{bootstrap_s3_uri}",
         "--log-uri=#{log_s3_uri}",
         "--stream",
         "--mapper=#{mapper_s3_uri} ",
         "--reducer=#{reducer_s3_uri} ",
-        "--input=#{input_paths} --output=#{output_path}",
-        # to specify zero reducers:
-        # "--arg '-D mapred.reduce.tasks=0'"
+        "--input=#{input_paths.join(",")} --output=#{output_path}",
       ]
+      # eg to specify zero reducers:
+      #   Settings[:emr_extra_args] = "--arg '-D mapred.reduce.tasks=0'"
+      command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
+      command_args += hadoop_options_for_emr_runner
       Log.info 'Follow along at http://localhost:9000/job'
       execute_command!( File.expand_path(Settings.emr_runner), *command_args )
     end
 
-    def emr_ship_jars
-      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
-      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
-    end
-
     def emr_credentials
       command_args = []
       if Settings.emr_credentials_file
@@ -66,7 +83,6 @@ module Wukong
       else
         command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
       end
-      command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
       command_args
     end
 
@@ -75,58 +91,73 @@ module Wukong
       File.basename($0,'.rb')
     end
 
+    # Produces an s3 URI within the Wukong emr sandbox from a set of path
+    # segments
+    #
+    # @example
+    #   Settings.emr_root = 's3://emr.yourmom.com/wukong'
+    #   emr_s3_path('log', 'my_happy_job', 'run-97.log')
+    #   # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
+    #
+    def emr_s3_path *path_segs
+      File.join(Settings.emr_root, path_segs.flatten.compact)
+    end
+
     def mapper_s3_uri
-      emr_s3_path(job_handle+'-mapper.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
     end
     def reducer_s3_uri
-      emr_s3_path(job_handle+'-reducer.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
     end
     def log_s3_uri
-      emr_s3_path('log', job_handle)
+      emr_s3_path(job_handle, 'log', 'emr_jobs')
     end
     def bootstrap_s3_uri
-      emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
+      emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
     end
     def wukong_libs_s3_uri
-      emr_s3_path('bin', "wukong-libs.jar")
-    end
-
-    def emr_s3_path *path_segs
-      File.join(Settings.emr_root, path_segs.flatten.compact)
+      emr_s3_path(job_handle, 'code', "wukong-libs.jar")
     end
 
-    module ClassMethods
-
-      # Standard hack to create ClassMethods-on-include
-      def self.included base
-        base.class_eval do
-          extend ClassMethods
-        end
+    ABSOLUTE_URI = %r{^/|^\w+://}
+    #
+    # Walk through the input paths and the output path. Prepends
+    # Settings.emr_data_root to any that does NOT look like
+    # an absolute path ("/foo") or a URI ("s3://yourmom/data")
+    #
+    def fix_paths!
+      return if Settings.emr_data_root.blank?
+      unless input_paths.blank?
+        @input_paths = input_paths.map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
+      end
+      unless output_path.blank?
+        @output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
       end
     end
 
+    #
+    # Simple class to coordinate s3 operations
+    #
     class S3Util
       # class methods
       class << self
         def s3
           @s3 ||= RightAws::S3Interface.new(
             Settings.access_key, Settings.secret_access_key,
-            {:multi_thread => true, :logger => Log})
+            {:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
         end
-
         def bucket_and_path_from_uri uri
           uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
         end
-
         def store filename, uri
-          Log.debug "  #{filename} => #{uri}"
           dest_bucket, dest_key = bucket_and_path_from_uri(uri)
-          contents = File.open(filename)
+          Log.debug "  #{filename} => #{dest_bucket} / #{dest_key}"
+          contents = File.read(filename)
           s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
         end
-
       end
     end
+
   end
   Script.class_eval do
     include EmrCommand
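A sketch of how fix_paths! treats each kind of path, assuming Settings.emr_data_root is set as in the yaml example above:

    # with Settings.emr_data_root = 's3://emr.yourmom.com/data'
    'ratings/part-0000'    # => 's3://emr.yourmom.com/data/ratings/part-0000'
    '/abs/path/input'      # => unchanged (leading '/' matches ABSOLUTE_URI)
    's3://elsewhere/in'    # => unchanged (URI scheme matches ABSOLUTE_URI)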
data/lib/wukong/script/hadoop_command.rb
@@ -32,16 +32,28 @@ module Wukong
   Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
   Settings.define :max_maps_per_cluster,   :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
   Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-  Settings.define :min_input_split_size,   :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
+  Settings.define :min_split_size,         :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
   Settings.define :noempty,                :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
+  Settings.define :split_on_xml_tag,       :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
+
+  # emit a -jobconf hadoop option if the simplified command line arg is present
+  # if not, the resulting nil will be elided later
+  def jobconf option
+    if options[option]
+      # "-jobconf %s=%s" % [options.description_for(option), options[option]]
+      "-D %s=%s" % [options.description_for(option), options[option]]
+    end
+  end
 
   #
   # Assemble the hadoop command to execute
   # and launch the hadoop runner to execute the script across all tasktrackers
   #
+  # FIXME: Should add some simple logic to ensure that commands are in the
+  # right order or hadoop will complain. ie. -D options MUST come before
+  # others
+  #
   def execute_hadoop_workflow
-    # If no reducer_klass and no reduce_command, then skip the reduce phase
-    options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
     # Input paths join by ','
     input_paths = @input_paths.join(',')
     #
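The jobconf helper turns a defined option into a -D flag named by its description, so (assuming map_tasks is defined with description 'mapred.map.tasks' and set to 10):

    jobconf(:map_tasks)  # => "-D mapred.map.tasks=10"
    jobconf(:timeout)    # => nil when unset; nils are compacted out of the command line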
@@ -49,14 +61,14 @@ module Wukong
     hadoop_commandline = [
       hadoop_runner,
       "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+      hadoop_jobconf_options,
+      "-D mapred.job.name='#{job_name}'",
+      hadoop_other_args,
       "-mapper  '#{mapper_commandline}'",
       "-reducer '#{reducer_commandline}'",
       "-input   '#{input_paths}'",
       "-output  '#{output_path}'",
-      hadoop_jobconf_options,
-      "-jobconf mapred.job.name='#{job_name}'",
       hadoop_recycle_env,
-      hadoop_other_args,
     ].flatten.compact.join(" \t\\\n  ")
     Log.info "  Launching hadoop!"
     execute_command!(hadoop_commandline)
@@ -64,48 +76,40 @@ module Wukong
 
   def hadoop_jobconf_options
     jobconf_options = []
-    # The fields should hadoop treat as the keys
-    jobconf_options += [
-      jobconf(:key_field_separator),
-      jobconf(:sort_fields),
-    ]
+    # Fixup these options
+    options[:reuse_jvms]          = '-1'    if (options[:reuse_jvms] == true)
+    options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
+    # If no reducer_klass and no reduce_command, then skip the reduce phase
+    options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
     # Fields hadoop should use to distribute records to reducers
     unless options[:partition_fields].blank?
       jobconf_options += [
-        '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
-        jobconf(:output_field_separator),
         jobconf(:partition_fields),
+        jobconf(:output_field_separator),
       ]
     end
-    # Setting the number of mappers and reducers.
     jobconf_options += [
-      jobconf(:max_node_map_tasks),
-      jobconf(:max_node_reduce_tasks),
-      jobconf(:max_reduces_per_node),
-      jobconf(:max_reduces_per_cluster),
-      jobconf(:max_maps_per_node),
-      jobconf(:max_maps_per_cluster),
-      jobconf(:map_tasks),
-      jobconf(:reduce_tasks)
-    ]
+      :key_field_separator, :sort_fields,
+      :map_tasks, :reduce_tasks,
+      :max_node_map_tasks, :max_node_reduce_tasks,
+      :max_reduces_per_node, :max_reduces_per_cluster,
+      :max_maps_per_node, :max_maps_per_cluster,
+      :min_split_size,
+      :map_speculative,
+      :timeout,
+      :reuse_jvms, :respect_exit_status
+    ].map{|opt| jobconf(opt)}
     jobconf_options.flatten.compact
   end
 
-  # emit a -jobconf hadoop option if the simplified command line arg is present
-  # if not, the resulting nil will be elided later
-  def jobconf option
-    if options[option]
-      "-jobconf %s=%s" % [options.description_for(option), options[option]]
-    end
-  end
-
   def hadoop_other_args
     extra_str_args  = [ options[:extra_args] ]
-    extra_str_args += ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
-    options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
-    options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-    extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
-    extra_str_args + extra_hsh_args
+    if Settings.split_on_xml_tag
+      extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
+    end
+    extra_str_args << ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
+    extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
+    extra_str_args
   end
 
   def hadoop_recycle_env
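So invoking a script with --split_on_xml_tag=page (tag name illustrative) would append the streaming input reader:

    -inputreader 'StreamXmlRecordReader,begin=<page>,end=</page>'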
@@ -135,42 +139,6 @@ module Wukong
   # Thanks to Todd Lipcon for directing me to that hack.
   #
 
-  # "HADOOP_HOME"                             =>"/usr/lib/hadoop-0.20/bin/..",
-  # "HADOOP_IDENT_STRING"                     =>"hadoop",
-  # "HADOOP_LOGFILE"                          =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
-  # "HADOOP_LOG_DIR"                          =>"/usr/lib/hadoop-0.20/bin/../logs",
-  # "HOME"                                    =>"/var/run/hadoop-0.20",
-  # "JAVA_HOME"                               =>"/usr/lib/jvm/java-6-sun",
-  # "LD_LIBRARY_PATH"                         =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
-  # "PATH"                                    =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
-  # "USER"                                    =>"hadoop",
-  #
-  # "dfs_block_size"                          =>"134217728",
-  # "map_input_start"                         =>"0",
-  # "map_input_length"                        =>"125726898",
-  # "mapred_output_key_class"                 =>"org.apache.hadoop.io.Text",
-  # "mapred_output_value_class"               =>"org.apache.hadoop.io.Text",
-  # "mapred_output_format_class"              =>"org.apache.hadoop.mapred.TextOutputFormat",
-  # "mapred_output_compression_codec"         =>"org.apache.hadoop.io.compress.DefaultCodec",
-  # "mapred_output_compression_type"          =>"BLOCK",
-  # "mapred_task_partition"                   =>"0",
-  # "mapred_tasktracker_map_tasks_maximum"    =>"4",
-  # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
-  # "mapred_tip_id"                           =>"task_200910221152_0023_m_000000",
-  # "mapred_task_id"                          =>"attempt_200910221152_0023_m_000000_0",
-  # "mapred_job_tracker"                      =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
-  #
-  # "mapred_input_dir"                        =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
-  # "map_input_file"                          =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
-  # "mapred_working_dir"                      =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
-  # "mapred_work_output_dir"                  =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
-  # "mapred_output_dir"                       =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
-  # "mapred_temp_dir"                         =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
-  # "PWD"                                     =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
-  # "TMPDIR"                                  =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
-  # "stream_map_streamprocessor"              =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
-  # "user_name"                               =>"flip",
-
   # HDFS pathname to the input file currently being processed.
   def input_file
     ENV['map_input_file']
@@ -211,3 +179,52 @@ module Wukong
   end
 end
 end
+
+# -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+#   -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+#   -D mapred.text.key.comparator.options=-k2,2nr\
+#   -D mapred.text.key.partitioner.options=-k1,2\
+#   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+#   -D stream.num.map.output.key.fields=\"$sortfields\"
+#
+#   -D stream.map.output.field.separator=\"'/t'\"
+#   -D map.output.key.field.separator=. \
+#   -D mapred.data.field.separator=. \
+#   -D map.output.key.value.fields.spec=6,5,1-3:0- \
+#   -D reduce.output.key.value.fields.spec=0-2:5- \
+
+# "HADOOP_HOME"                             =>"/usr/lib/hadoop-0.20/bin/..",
+# "HADOOP_IDENT_STRING"                     =>"hadoop",
+# "HADOOP_LOGFILE"                          =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
+# "HADOOP_LOG_DIR"                          =>"/usr/lib/hadoop-0.20/bin/../logs",
+# "HOME"                                    =>"/var/run/hadoop-0.20",
+# "JAVA_HOME"                               =>"/usr/lib/jvm/java-6-sun",
+# "LD_LIBRARY_PATH"                         =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
+# "PATH"                                    =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
+# "USER"                                    =>"hadoop",
+#
+# "dfs_block_size"                          =>"134217728",
+# "map_input_start"                         =>"0",
+# "map_input_length"                        =>"125726898",
+# "mapred_output_key_class"                 =>"org.apache.hadoop.io.Text",
+# "mapred_output_value_class"               =>"org.apache.hadoop.io.Text",
+# "mapred_output_format_class"              =>"org.apache.hadoop.mapred.TextOutputFormat",
+# "mapred_output_compression_codec"         =>"org.apache.hadoop.io.compress.DefaultCodec",
+# "mapred_output_compression_type"          =>"BLOCK",
+# "mapred_task_partition"                   =>"0",
+# "mapred_tasktracker_map_tasks_maximum"    =>"4",
+# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
+# "mapred_tip_id"                           =>"task_200910221152_0023_m_000000",
+# "mapred_task_id"                          =>"attempt_200910221152_0023_m_000000_0",
+# "mapred_job_tracker"                      =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
+#
+# "mapred_input_dir"                        =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
+# "map_input_file"                          =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
+# "mapred_working_dir"                      =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
+# "mapred_work_output_dir"                  =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
+# "mapred_output_dir"                       =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
+# "mapred_temp_dir"                         =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
+# "PWD"                                     =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
+# "TMPDIR"                                  =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
+# "stream_map_streamprocessor"              =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
+# "user_name"                               =>"flip",