mobilize-hive 1.0.11 → 1.2

data/README.md CHANGED
@@ -146,7 +146,7 @@ Start
 script in the hql or source sheet and returns any output specified at the
 end. If the cmd or last query in source is a select statement, column headers will be
 returned as well.
- * hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
+ * hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, partitions:<partition_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
 which writes the source or query result to the selected hive table.
 * hive_path
 * should be of the form `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
@@ -156,8 +156,10 @@ Start
 * if the file ends in .*ql, it's treated the same as passing hql
 * otherwise it is treated as a tsv with the first row as column headers
 * target:
- * Partitions can optionally be added to the hive_path, as in `<hive_db>/<table_name>/<partition1>/<partition2>`.
+ * Should be a hive_path, as in `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
+ * partitions:
 * Due to Hive limitation, partition names CANNOT be reserved keywords when writing from tsv (gsheet or hdfs source)
+ * Partitions should be specified as a path, as in partitions:`<partition1>/<partition2>`.
 * schema:
 * optional. gsheet_path to column schema.
 * two columns: name, datatype
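To make the new split between `target` and `partitions` concrete, here is a minimal Ruby sketch of how a `partitions:` value becomes a partition list (the stage line is taken from the test fixtures later in this diff; the parsing mirrors the `part_array` derivation added to `Hive.write_by_stage_path`):

```ruby
# e.g. stage2: hive.write source:"stage1", target:"mobilize/hive_test_3", partitions:"date/product", drop:true
partitions_param = "date/product"

# dots and slashes are both accepted as separators
part_array = partitions_param.gsub(".", "/").split("/")
# => ["date", "product"]
```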
@@ -47,10 +47,69 @@ module Mobilize
 end
 end

+ def Hive.databases(cluster,user_name)
+ Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+ end
+
+ # converts a source path or target path to a dst in the context of handler and stage
+ def Hive.path_to_dst(path,stage_path)
+ has_handler = true if path.index("://")
+ s = Stage.where(:path=>stage_path).first
+ params = s.params
+ target_path = params['target']
+ cluster = params['cluster'] if Hadoop.clusters.include?(params['cluster'].to_s)
+ is_target = true if path == target_path
+ red_path = path.split("://").last
+ first_path_node = red_path.gsub(".","/").split("/").first
+ cluster ||= Hadoop.clusters.include?(first_path_node) ? first_path_node : Hadoop.default_cluster
+ user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+ #save some time on targets
+ databases = Hive.databases(cluster,user_name) unless is_target
+ #if user has a handler, is specifying a target,
+ #or their first path node is a cluster name
+ #or their first path node is actually a database
+ #assume it's a hive pointer
+ if is_target or
+ has_handler or
+ Hadoop.clusters.include?(first_path_node) or
+ databases.include?(first_path_node)
+ #make sure cluster is legit
+ hive_url = Hive.url_by_path(red_path,user_name,is_target)
+ return Dataset.find_or_create_by_url(hive_url)
+ end
+ #otherwise, use hdfs convention
+ return Ssh.path_to_dst(path,stage_path)
+ end
+
+ def Hive.url_by_path(path,user_name,is_target=false)
+ red_path = path.gsub(".","/")
+ cluster = red_path.split("/").first.to_s
+ if Hadoop.clusters.include?(cluster)
+ #cut node out of path
+ red_path = red_path.split("/")[1..-1].join("/")
+ else
+ cluster = Hadoop.default_cluster
+ end
+ db, table = red_path.split("/")[0..-1]
+ url = "hive://#{cluster}/#{db}/#{table}"
+ begin
+ #add table stats check only if not target
+ if is_target or (stat_response = Hive.table_stats(cluster, db, table, user_name))['stderr'].to_s.length == 0
+ return url
+ else
+ raise "Unable to find #{url} with error: #{stat_response['stderr']}"
+ end
+ rescue => exc
+ raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
+ end
+ end
+
 #get field names and partition datatypes and size of a hive table
- def Hive.table_stats(db,table,cluster,user)
- describe_sql = "use #{db};describe extended #{table}"
- describe_output = Hive.run(describe_sql,cluster,user)
+ def Hive.table_stats(cluster,db,table,user_name)
+ describe_sql = "use #{db};describe extended #{table};"
+ describe_response = Hive.run(cluster, describe_sql,user_name)
+ return describe_response if describe_response['stdout'].length==0
+ describe_output = describe_response['stdout']
 describe_output.split("location:").last.split(",").first
 #get location, fields, partitions
 result_hash = {}
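As a rough, self-contained illustration of the path handling above (the cluster names and default are invented stand-ins for `Hadoop.clusters` and `Hadoop.default_cluster`), `Hive.url_by_path` resolves a dotted or slashed path to a `hive://cluster/db/table` URL:

```ruby
# Minimal sketch of the resolution logic, under the stated assumptions
clusters = ["dev_cluster", "prod_cluster"]   # stand-in for Hadoop.clusters
default_cluster = "dev_cluster"              # stand-in for Hadoop.default_cluster

def hive_url(path, clusters, default_cluster)
  nodes = path.gsub(".", "/").split("/")
  # the leading node counts as a cluster only if it matches a configured cluster
  cluster = clusters.include?(nodes.first) ? nodes.shift : default_cluster
  db, table = nodes
  "hive://#{cluster}/#{db}/#{table}"
end

hive_url("mobilize.hive_test_1", clusters, default_cluster)
# => "hive://dev_cluster/mobilize/hive_test_1"
hive_url("prod_cluster/mobilize/hive_test_1", clusters, default_cluster)
# => "hive://prod_cluster/mobilize/hive_test_1"
```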
@@ -78,12 +137,12 @@ module Mobilize
 #assign field defs after removing partitions
 result_hash['field_defs'] = field_defs
 #get size
- result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
+ result_hash['size'] = Hadoop.run(cluster,"fs -dus #{result_hash['location']}",user_name)['stdout'].split("\t").last.strip.to_i
 return result_hash
 end

 #run a generic hive command, with the option of passing a file hash to be locally available
- def Hive.run(hql,cluster,user,file_hash=nil)
+ def Hive.run(cluster,hql,user_name,file_hash=nil)
 # no TempStatsStore
 hql = "set hive.stats.autogather=false;#{hql}"
 filename = hql.to_md5
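The table size now comes from the `stdout` of `hadoop fs -dus` on the table location; a small sketch of that parsing (the sample output line is made up):

```ruby
# `hadoop fs -dus <dir>` prints "<dir>\t<bytes>"; only the byte count is kept
dus_stdout = "hdfs://namenode/warehouse/mobilize.db/hive_test_1\t48329"  # hypothetical
size = dus_stdout.split("\t").last.strip.to_i
# => 48329
```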
@@ -93,22 +152,15 @@ module Mobilize
 #at hadoop read limit
 command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
 gateway_node = Hadoop.gateway_node(cluster)
- Ssh.run(gateway_node,command,user,file_hash)
+ Ssh.run(gateway_node,command,user_name,file_hash)
 end

 def Hive.run_by_stage_path(stage_path)
 s = Stage.where(:path=>stage_path).first
- u = s.job.runner.user
 params = s.params
- user = params['user']
 cluster = params['cluster'] || Hive.clusters.keys.first
- node = Hadoop.gateway_node(cluster)
- if user and !Ssh.sudoers(node).include?(u.name)
- raise "#{u.name} does not have su permissions for #{node}"
- elsif user.nil? and Ssh.su_all_users(node)
- user = u.name
- end
-
+ user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+ job_name = s.path.sub("Runner_","")
 #slot Hive worker if available
 slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
 return false unless slot_id
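With the argument reorder, `Hive.run(cluster, hql, user_name)` hands back the `Ssh.run` response rather than a bare string; judging by how the result is consumed elsewhere in this diff, the hash carries `stdout`, `stderr`, and `exit_code`. A hedged usage sketch (cluster, user, and output are hypothetical):

```ruby
result = Mobilize::Hive.run("dev_cluster", "show databases;", "etl_user")
if result['exit_code'] == 0
  puts result['stdout']   # query output, truncated at Hadoop.read_limit
else
  warn result['stderr']   # Hive client errors
end
```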
@@ -122,13 +174,8 @@ module Mobilize
 if params['hql']
 hql = params['hql']
 else
- #user has passed in a gsheet hql
- gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
- #return blank response if there are no slots available
- return nil unless gdrive_slot
- source_dst = s.source_dsts(gdrive_slot).first
- Gdrive.unslot_worker_by_path(stage_path)
- hql = source_dst.read(user)
+ source = s.sources.first
+ hql = source.read(user_name)
 end

 #check for select at end
@@ -137,55 +184,59 @@ module Mobilize
 #nil if no prior commands
 prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
 select_hql = hql_array.last
- output_table_hql = ["drop table if exists #{output_path}",
+ output_table_hql = ["set mapred.job.name=#{job_name};",
+ "drop table if exists #{output_path}",
 "create table #{output_path} as #{select_hql};"].join(";")
 full_hql = [prior_hql, output_table_hql].compact.join(";")
- Hive.run(full_hql, cluster, user)
- #already populated, make sure dataset exists
+ result = Hive.run(cluster,full_hql, user_name)
 Dataset.find_or_create_by_url(out_url)
 else
- out_string = Hive.run(hql, cluster, user)
- out_string = "result\n#{out_string}"
- Dataset.write_by_url(out_url,out_string,user)
+ result = Hive.run(cluster, hql, user_name)
+ Dataset.find_or_create_by_url(out_url)
+ Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
 end
 #unslot worker
 Hive.unslot_worker_by_path(stage_path)
- out_url
+ response = {}
+ response['out_url'] = out_url
+ response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+ response['signal'] = result['exit_code']
+ response
 end

- def Hive.schema_hash(schema_path,user,gdrive_slot)
+ def Hive.schema_hash(schema_path,user_name,gdrive_slot)
 if schema_path.index("/")
 #slashes mean sheets
- out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
+ out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user_name)
 else
- u = User.where(:name=>user).first
+ u = User.where(:name=>user_name).first
 #check sheets in runner
 r = u.runner
 runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
 out_tsv = if runner_sheet
- runner_sheet.read(user)
+ runner_sheet.read(user_name)
 else
 #check for gfile. will fail if there isn't one.
- Gfile.find_by_path(schema_path).read(user)
+ Gfile.find_by_path(schema_path).read(user_name)
 end
- #use Gridfs to cache gdrive results
- file_name = schema_path.split("/").last
- out_url = "gridfs://#{schema_path}/#{file_name}"
- Dataset.write_by_url(out_url,out_tsv,user)
- schema_tsv = Dataset.find_by_url(out_url).read(user)
- schema_hash = {}
- schema_tsv.tsv_to_hash_array.each do |ha|
- schema_hash[ha['name']] = ha['datatype']
- end
- schema_hash
 end
+ #use Gridfs to cache gdrive results
+ file_name = schema_path.split("/").last
+ out_url = "gridfs://#{schema_path}/#{file_name}"
+ Dataset.write_by_url(out_url,out_tsv,user_name)
+ schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+ schema_hash = {}
+ schema_tsv.tsv_to_hash_array.each do |ha|
+ schema_hash[ha['name']] = ha['datatype']
+ end
+ schema_hash
 end

- def Hive.path_params(cluster, path, user)
+ def Hive.path_params(cluster, path, user_name)
 db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
 #get existing table stats if any
 curr_stats = begin
- Hive.table_stats(db, table, cluster, user)
+ Hive.table_stats(cluster, db, table, user_name)
 rescue
 nil
 end
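`Hive.run_by_stage_path` now reports a small response hash instead of a bare out_url. A sketch of that contract as it appears above (the URLs are hypothetical; `err_url` is only set when stderr is non-empty):

```ruby
# Illustrative only: mirrors the response built at the end of run_by_stage_path
response = {
  'out_url' => "hive://dev_cluster/mobilize/some_output_table",          # hypothetical
  'err_url' => "gridfs://Runner_mobilize(test)/hive_test_1/stage3/err",  # hypothetical
  'signal'  => 0                                                         # Hive client exit code
}
puts(response['signal'] == 0 ? "stage succeeded" : "stage failed")
```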
@@ -195,27 +246,34 @@ module Mobilize
 "curr_stats"=>curr_stats}
 end

- def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
- target_params = Hive.path_params(cluster, target_path, user)
- target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
- target_partitions = target_params['partitions'].to_a
- target_table_stats = target_params['curr_stats']
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+ table_path = [db,table].join(".")
+ target_params = Hive.path_params(cluster, table_path, user_name)
+ table_stats = target_params['curr_stats']
+
+ source_hql_array = source_hql.split(";")
+ last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
+ #find the last select query -- it should be used for the temp table creation
+ last_select_hql = (source_hql_array[last_select_i..-1].join(";")+";")
+ #if there is anything prior to the last select, add it in prior to table creation
+ prior_hql = ((source_hql_array[0..(last_select_i-1)].join(";")+";") if last_select_i and last_select_i>=1).to_s

 #create temporary table so we can identify fields etc.
 temp_db = Hive.output_db(cluster)
- temp_table_name = (source_hql+target_path).to_md5
+ temp_table_name = (source_hql+table_path).to_md5
 temp_table_path = [temp_db,temp_table_name].join(".")
+ temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
 temp_drop_hql = "drop table if exists #{temp_table_path};"
- temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
- Hive.run(temp_create_hql,cluster,user)
+ temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
+ Hive.run(cluster,temp_create_hql,user_name)

- source_params = Hive.path_params(cluster, temp_table_path, user)
+ source_params = Hive.path_params(cluster, temp_table_path, user_name)
 source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
 source_table_stats = source_params['curr_stats']
 source_fields = source_table_stats['field_defs']

- if target_partitions.length == 0 and
- target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+ if part_array.length == 0 and
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
 #no partitions in either user params or the target table

 target_headers = source_fields.map{|f| f['name']}
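The new `hql_to_table` splits the incoming HQL so that only the final select feeds the temp table's `create table ... as`, while any earlier statements run ahead of it. A self-contained sketch of that split (the query text is invented):

```ruby
source_hql = "set hive.cli.print.header=true;use mobilize;select act_date, product from hive_test_1"
statements = source_hql.split(";")
last_select_i   = statements.rindex { |hql| hql.downcase.strip.start_with?("select") }
last_select_hql = statements[last_select_i..-1].join(";") + ";"
prior_hql       = (last_select_i && last_select_i >= 1) ? statements[0..(last_select_i - 1)].join(";") + ";" : ""
# prior_hql       => "set hive.cli.print.header=true;use mobilize;"
# last_select_hql => "select act_date, product from hive_test_1;"
```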
@@ -233,21 +291,27 @@ module Mobilize
 end.join(",")})"

 #always drop when no partitions
- target_drop_hql = "drop table if exists #{target_table_path};"
+ target_name_hql = "set mapred.job.name=#{job_name};"
+
+ target_drop_hql = "drop table if exists #{table_path};"

- target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"
+ target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

- target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"
+ target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

- target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join
+ target_full_hql = [target_name_hql,
+ target_drop_hql,
+ target_create_hql,
+ target_insert_hql,
+ temp_drop_hql].join

- Hive.run(target_full_hql, cluster, user)
+ Hive.run(cluster, target_full_hql, user_name)

- elsif target_partitions.length > 0 and
- target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+ elsif part_array.length > 0 and
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
 #partitions and no target table or same partitions in both target table and user params

- target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}
+ target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}

 field_defs = {}
 target_headers.each do |name|
@@ -260,7 +324,7 @@ module Mobilize
 end.join(",")})"

 part_defs = {}
- target_partitions.each do |name|
+ part_array.each do |name|
 datatype = schema_hash[name] || "string"
 part_defs[name] = datatype
 end
@@ -271,70 +335,70 @@ module Mobilize

 target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

- target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")
+ target_part_stmt = part_array.map{|h| "`#{h}`"}.join(",")

- target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
+ target_set_hql = ["set mapred.job.name=#{job_name};",
+ "set hive.exec.dynamic.partition.mode=nonstrict;",
 "set hive.exec.max.dynamic.partitions.pernode=1000;",
 "set hive.exec.dynamic.partition=true;",
 "set hive.exec.max.created.files = 200000;",
 "set hive.max.created.files = 200000;"].join

- if drop or target_table_stats.nil?
- target_drop_hql = "drop table if exists #{target_table_path};"
+ if drop or table_stats.nil?
+ target_drop_hql = "drop table if exists #{table_path};"
 target_create_hql = target_drop_hql +
- "create table if not exists #{target_table_path} #{field_def_stmt} " +
+ "create table if not exists #{table_path} #{field_def_stmt} " +
 "partitioned by #{part_def_stmt};"

 else
- target_db,target_table = target_table_path.split(".")
 #get all the permutations of possible partitions
 part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
- part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
+ part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
 #having gotten the permutations, ensure they are dropped
 part_hash_array = part_perm_tsv.tsv_to_hash_array
 part_drop_hql = part_hash_array.map do |h|
 part_drop_stmt = h.map do |name,value|
 part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
 end.join(",")
- "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
+ "use #{db};alter table #{table} drop if exists partition (#{part_drop_stmt});"
 end.join
 target_create_hql = part_drop_hql
 end

- target_insert_hql = "insert overwrite table #{target_table_path} " +
+ target_insert_hql = "insert overwrite table #{table_path} " +
 "partition (#{target_part_stmt}) " +
 "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

 target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

- Hive.run(target_full_hql, cluster, user)
+ Hive.run(cluster, target_full_hql, user_name)
 else
 error_msg = "Incompatible partition specs"
 raise error_msg
 end
- return target_path
+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+ return url
 end

 #turn a tsv into a hive table.
 #Accepts options to drop existing target if any
 #also schema with column datatype overrides
- def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
+ def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
 source_headers = source_tsv.tsv_header_array

- target_params = Hive.path_params(cluster, target_path, user)
- target_db,target_table = ['db','table'].map{|k| target_params[k]}
- target_table_path = [target_db,target_table].join(".")
- target_partitions = target_params['partitions'].to_a
- target_table_stats = target_params['curr_stats']
+ table_path = [db,table].join(".")
+ target_params = Hive.path_params(cluster, table_path, user_name)
+ table_stats = target_params['curr_stats']

 schema_hash ||= {}

- if target_partitions.length == 0 and
- target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+ if part_array.length == 0 and
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
 #no partitions in either user params or the target table
 #or drop and start fresh

 #one file only, strip headers, replace tab with ctrl-a for hive
+ #get rid of freaking carriage return characters
 source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
 source_tsv_filename = "000000_0"
 file_hash = {source_tsv_filename=>source_rows}
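For a partitioned target, the generated script enables dynamic partitioning and then rebuilds the table (or just the affected partitions) before the overwrite insert. A rough sketch of the assembled statements, with invented table, temp-table, and partition names:

```ruby
# Illustrative HQL assembly only; names are made up
db, table  = "mobilize", "hive_test_3"
parts      = ["`date`", "`product`"]
temp_table = "mobilize_temp.tmp_hive_test_3"   # hypothetical temp table

set_hql    = "set mapred.job.name=Runner_mobilize(test)/hive_test_3;" \
             "set hive.exec.dynamic.partition.mode=nonstrict;" \
             "set hive.exec.dynamic.partition=true;"
create_hql = "create table if not exists #{db}.#{table} (`category` string,`value` string) " \
             "partitioned by (`date` string,`product` string);"
insert_hql = "insert overwrite table #{db}.#{table} partition (#{parts.join(",")}) " \
             "select `category`,`value`,#{parts.join(",")} from #{temp_table};"

puts [set_hql, create_hql, insert_hql].join
```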
@@ -345,52 +409,52 @@ module Mobilize
 end.ie{|fs| "(#{fs.join(",")})"}

 #for single insert, use drop table and create table always
- target_drop_hql = "drop table if exists #{target_table_path}"
+ target_drop_hql = "drop table if exists #{table_path}"

- target_create_hql = "create table #{target_table_path} #{field_defs}"
+ target_create_hql = "create table #{table_path} #{field_defs}"

 #load source data
- target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"
+ target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{table_path};"

 target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

- Hive.run(target_full_hql, cluster, user, file_hash)
+ Hive.run(cluster, target_full_hql, user_name, file_hash)

- elsif target_partitions.length > 0 and
- target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+ elsif part_array.length > 0 and
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
 #partitions and no target table
 #or same partitions in both target table and user params
 #or drop and start fresh

- target_headers = source_headers.reject{|h| target_partitions.include?(h)}
+ target_headers = source_headers.reject{|h| part_array.include?(h)}

 field_defs = "(#{target_headers.map do |name|
 datatype = schema_hash[name] || "string"
 "`#{name}` #{datatype}"
 end.join(",")})"

- partition_defs = "(#{target_partitions.map do |name|
+ partition_defs = "(#{part_array.map do |name|
 datatype = schema_hash[name] || "string"
 "#{name} #{datatype}"
 end.join(",")})"

- target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""
+ target_drop_hql = drop ? "drop table if exists #{table_path};" : ""

 target_create_hql = target_drop_hql +
- "create table if not exists #{target_table_path} #{field_defs} " +
+ "create table if not exists #{table_path} #{field_defs} " +
 "partitioned by #{partition_defs}"

 #create target table early if not here
- Hive.run(target_create_hql, cluster, user)
+ Hive.run(cluster, target_create_hql, user_name)

- target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)
+ table_stats = Hive.table_stats(cluster, db, table, user_name)

 #create data hash from source hash array
 data_hash = {}
 source_hash_array = source_tsv.tsv_to_hash_array
 source_hash_array.each do |ha|
- tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
- tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
+ tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+ tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
 if data_hash[tpmk]
 data_hash[tpmk] += "\n#{tpmv}"
 else
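When loading a tsv, column definitions come from the header row and default to `string` unless the optional schema sheet overrides the datatype. A small sketch (header and schema values invented):

```ruby
source_tsv  = "act_date\tproduct\tvalue\n2013-03-01\twidget\t9\n"   # hypothetical
schema_hash = { "value" => "int" }                                  # from the optional schema: param

headers    = source_tsv.split("\n").first.split("\t")
field_defs = "(#{headers.map { |name| "`#{name}` #{schema_hash[name] || 'string'}" }.join(",")})"
# => "(`act_date` string,`product` string,`value` int)"
```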
@@ -399,61 +463,62 @@ module Mobilize
 end

 #go through completed data hash and write each key value to the table in question
+ target_part_hql = ""
 data_hash.each do |tpmk,tpmv|
 base_filename = "000000_0"
 part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
 part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
 part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
- hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
- hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
- hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}"
+ hdfs_dir = "#{table_stats['location']}/#{part_dir}"
+ #source the partitions from a parallel load folder since filenames are all named the same
+ hdfs_source_url = "#{table_stats['location']}/part_load/#{part_dir}/#{base_filename}"
+ hdfs_target_url = hdfs_dir
 #load partition into source path
- puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
- Hdfs.write(hdfs_source_path,tpmv,user)
+ puts "Writing to #{hdfs_source_url} for #{user_name} at #{Time.now.utc}"
+ Hdfs.write(cluster,hdfs_source_url,tpmv,user_name)
 #let Hive know where the partition is
- target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
- target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
- target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
- puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
- Hive.run(target_part_hql, cluster, user)
+ target_add_part_hql = "use #{db};alter table #{table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_url}'"
+ target_insert_part_hql = "load data inpath '#{hdfs_source_url}' overwrite into table #{table} partition (#{part_stmt});"
+ target_part_hql += [target_add_part_hql,target_insert_part_hql].join(";")
+ end
+ #run actual partition adds all at once
+ if target_part_hql.length>0
+ puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
+ Hive.run(cluster, target_part_hql, user_name)
 end
 else
 error_msg = "Incompatible partition specs: " +
- "target table:#{target_table_stats['partitions'].to_s}, " +
- "user_params:#{target_partitions.to_s}"
+ "target table:#{table_stats['partitions'].to_s}, " +
+ "user_params:#{part_array.to_s}"
 raise error_msg
 end
- return target_path
+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+ return url
 end

 def Hive.write_by_stage_path(stage_path)
 s = Stage.where(:path=>stage_path).first
- u = s.job.runner.user
 params = s.params
- user = params['user']
- cluster = params['cluster'] || Hive.clusters.keys.first
+ source = s.sources.first
+ target = s.target
+ cluster, db, table = target.url.split("://").last.split("/")
+ #update stage with the node so we can use it
+ user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+ job_name = s.path.sub("Runner_","")

 #slot Hive worker if available
 slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
 return false unless slot_id

- node = Hadoop.gateway_node(cluster)
- if user and !Ssh.sudoers(node).include?(u.name)
- raise "#{u.name} does not have su permissions for #{node}"
- elsif user.nil? and Ssh.su_all_users(node)
- user = u.name
- end
-
- #determine path for target
- target_path = params['target']
-
- gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
- #return blank response if there are no slots available
- return nil unless gdrive_slot
- source_dst = s.source_dsts(gdrive_slot).first
- schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+ schema_hash = if params['schema']
+ gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+ #return blank response if there are no slots available
+ return nil unless gdrive_slot
+ Hive.schema_hash(params['schema'],user_name,gdrive_slot)
+ else
+ {}
+ end
 Gdrive.unslot_worker_by_path(stage_path)
-
 #drop target before create/insert?
 drop = params['drop']

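The partitioned tsv load groups rows by their partition values, writes each group under a parallel `part_load` folder, and then registers all partitions in a single Hive call. A self-contained sketch of the grouping step (rows are invented):

```ruby
part_array = ["date", "product"]
rows = [
  { "date" => "2013-03-01", "product" => "widget", "value" => "9" },
  { "date" => "2013-03-01", "product" => "widget", "value" => "3" },
  { "date" => "2013-03-02", "product" => "gadget", "value" => "7" }
]

data_hash = {}
rows.each do |ha|
  key   = part_array.map { |pn| "#{pn}=#{ha[pn]}" }.join("/")            # e.g. "date=2013-03-01/product=widget"
  value = ha.reject { |k, _| part_array.include?(k) }.values.join("\001")
  data_hash[key] = [data_hash[key], value].compact.join("\n")
end
# each key becomes a partition directory; each value becomes that partition's 000000_0 file
```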
@@ -461,64 +526,77 @@ module Mobilize
 source_tsv,source_hql = [nil]*2
 if params['hql']
 source_hql = params['hql']
- elsif source_dst
- if source_dst.handler == 'hive'
+ elsif source
+ if source.handler == 'hive'
 #source table
- cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
+ cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
 source_hql = "select * from #{source_path};"
- elsif ['gridfs','hdfs'].include?(source_dst.handler)
- if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
- source_hql = source_dst.read(user)
+ elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+ if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+ source_hql = source.read(user_name)
 else
 #tsv from sheet
- source_tsv = source_dst.read(user)
+ source_tsv = source.read(user_name)
 end
 end
 end

- out_string = if source_hql
- Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
- elsif source_tsv
- Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
- else
- raise "Unable to determine source tsv or source hql"
- end
-
+ part_array = if params['partitions']
+ params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
+ elsif params['target']
+ #take the end parts of the target, that are not the cluster, db, table
+ target_array = params['target'].gsub(".","/").split("/")
+ [cluster,db,table].each do |term|
+ target_array = target_array[1..-1] if target_array.first == term
+ end
+ target_array
+ else
+ []
+ end

+ result = begin
+ url = if source_hql
+ Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
+ elsif source_tsv
+ Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+ else
+ raise "Unable to determine source tsv or source hql"
+ end
+ {'stdout'=>url,'exit_code'=>0}
+ rescue => exc
+ {'stderr'=>exc.to_s, 'exit_code'=>500}
+ end

 #unslot worker and write result
 Hive.unslot_worker_by_path(stage_path)

- #output table stores stage output
- out_string = "result\n#{out_string}"
- output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
- out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
- Dataset.write_by_url(out_url,out_string,user)
- out_url
+ response = {}
+ response['out_url'] = Dataset.write_by_url("gridfs://#{s.path}/out",result['stdout'].to_s,Gdrive.owner_name) if result['stdout'].to_s.length>0
+ response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+ response['signal'] = result['exit_code']
+ response
 end

- def Hive.read_by_dataset_path(dst_path,user)
- cluster,source_path = dst_path.split("/").ie do |sp|
- if sp.length == 2
- [Hive.clusters.first.first,sp.join(".")]
- else
- [sp.first, sp[1..-1].join(".")]
- end
- end
- hql = "set hive.cli.print.header=true;select * from #{source_path};"
- Hive.run(hql,cluster,user)
+ def Hive.read_by_dataset_path(dst_path,user_name,*args)
+ cluster, db, table = dst_path.split("/")
+ source_path = [db,table].join(".")
+ job_name = "read #{cluster}/#{db}/#{table}"
+ set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name};"
+ select_hql = "select * from #{source_path};"
+ hql = [set_hql,select_hql].join
+ response = Hive.run(cluster, hql,user_name)
+ if response['exit_code']==0
+ return response['stdout']
+ else
+ raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+ end
 end

- def Hive.write_by_dataset_path(dst_path,source_tsv,user)
- cluster,target_path = dst_path.split("/").ie do |sp|
- if sp.length == 2
- [Hive.clusters.first.first,sp.join(".")]
- else
- [sp.first, sp[1..-1].join(".")]
- end
- end
+ def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
+ cluster,db,table = dst_path.split("/")
+ part_array = []
 drop = true
- Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
+ Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop)
 end
 end

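Dataset paths now always carry the cluster, so reads and writes can split them directly. A hedged sketch of the HQL that `Hive.read_by_dataset_path` issues for a hypothetical path:

```ruby
dst_path = "dev_cluster/mobilize/hive_test_1"   # hypothetical hive dataset path, minus the scheme
cluster, db, table = dst_path.split("/")
hql = "set hive.cli.print.header=true;" \
      "set mapred.job.name=read #{cluster}/#{db}/#{table};" \
      "select * from #{db}.#{table};"
# the handler runs this via Hive.run(cluster, hql, user_name) and returns stdout when exit_code is 0
```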
@@ -1,5 +1,5 @@
 module Mobilize
 module Hive
- VERSION = "1.0.11"
+ VERSION = "1.2"
 end
 end
@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
 gem.name = "mobilize-hive"
 gem.version = Mobilize::Hive::VERSION
 gem.authors = ["Cassio Paes-Leme"]
- gem.email = ["cpaesleme@ngmoco.com"]
+ gem.email = ["cpaesleme@dena.com"]
 gem.description = %q{Adds hive read, write, and run support to mobilize-hdfs}
 gem.summary = %q{Adds hive read, write, and run support to mobilize-hdfs}
 gem.homepage = "http://github.com/dena/mobilize-hive"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
 gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
 gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
 gem.require_paths = ["lib"]
- gem.add_runtime_dependency "mobilize-hdfs","1.0.10"
+ gem.add_runtime_dependency "mobilize-hdfs","1.2"
 end
@@ -3,7 +3,7 @@
 active: true
 trigger: once
 status: ""
- stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
+ stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
 source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
 stage2: hive.run source:"hive_test_1.hql"
 stage3: hive.run hql:"show databases;"
@@ -21,6 +21,6 @@
 trigger: after hive_test_2
 status: ""
 stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
- stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
- stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
+ stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
+ stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
 stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
@@ -52,9 +52,9 @@ describe "Mobilize" do
 hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
 [hive_3_target_sheet].each{|s| s.delete if s}

- puts "job row added, force enqueued requestor, wait 1000s"
+ puts "job row added, force enqueued requestor, wait for stages"
 r.enqueue!
- sleep 1000
+ wait_for_stages(1200)

 puts "jobtracker posted data to test sheet"
 hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
@@ -63,9 +63,34 @@ describe "Mobilize" do
 hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)

 assert hive_1_stage_2_target_sheet.read(u.name).length == 219
- assert hive_1_stage_3_target_sheet.read(u.name).length == 325
+ assert hive_1_stage_3_target_sheet.read(u.name).length > 3
 assert hive_2_target_sheet.read(u.name).length == 599
 assert hive_3_target_sheet.read(u.name).length == 347
 end

+ def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
+ time = 0
+ time_since_stage = 0
+ #check for 10 min
+ while time < time_limit and time_since_stage < stage_limit
+ sleep wait_length
+ job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
+ if job_classes.include?("Mobilize::Stage")
+ time_since_stage = 0
+ puts "saw stage at #{time.to_s} seconds"
+ else
+ time_since_stage += wait_length
+ puts "#{time_since_stage.to_s} seconds since stage seen"
+ end
+ time += wait_length
+ puts "total wait time #{time.to_s} seconds"
+ end
+
+ if time >= time_limit
+ raise "Timed out before stage completion"
+ end
+ end
+
+
+
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
- version: 1.0.11
+ version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2013-03-05 00:00:00.000000000 Z
+ date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
 requirements:
 - - '='
 - !ruby/object:Gem::Version
- version: 1.0.10
+ version: '1.2'
 type: :runtime
 prerelease: false
 version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
 requirements:
 - - '='
 - !ruby/object:Gem::Version
- version: 1.0.10
+ version: '1.2'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
- - cpaesleme@ngmoco.com
+ - cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []