mobilize-hive 1.0.11 → 1.2

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
data/README.md CHANGED
@@ -146,7 +146,7 @@ Start
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
   returned as well.
-* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
+* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, partitions:<partition_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
   which writes the source or query result to the selected hive table.
   * hive_path
     * should be of the form `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
@@ -156,8 +156,10 @@ Start
     * if the file ends in .*ql, it's treated the same as passing hql
     * otherwise it is treated as a tsv with the first row as column headers
   * target:
-    * Partitions can optionally be added to the hive_path, as in `<hive_db>/<table_name>/<partition1>/<partition2>`.
+    * Should be a hive_path, as in `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
+  * partitions:
     * Due to Hive limitation, partition names CANNOT be reserved keywords when writing from tsv (gsheet or hdfs source)
+    * Partitions should be specified as a path, as in partitions:`<partition1>/<partition2>`.
   * schema:
     * optional. gsheet_path to column schema.
     * two columns: name, datatype
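For a concrete example of the new split between target and partitions, the updated test job further down in this diff writes a partitioned table like so:

    stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
      source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"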
@@ -47,10 +47,69 @@ module Mobilize
       end
     end
 
+    def Hive.databases(cluster,user_name)
+      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
+    # converts a source path or target path to a dst in the context of handler and stage
+    def Hive.path_to_dst(path,stage_path)
+      has_handler = true if path.index("://")
+      s = Stage.where(:path=>stage_path).first
+      params = s.params
+      target_path = params['target']
+      cluster = params['cluster'] if Hadoop.clusters.include?(params['cluster'].to_s)
+      is_target = true if path == target_path
+      red_path = path.split("://").last
+      first_path_node = red_path.gsub(".","/").split("/").first
+      cluster ||= Hadoop.clusters.include?(first_path_node) ? first_path_node : Hadoop.default_cluster
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      #save some time on targets
+      databases = Hive.databases(cluster,user_name) unless is_target
+      #is user has a handler, is specifying a target,
+      #or their first path node is a cluster name
+      #or their first path node is actually a database
+      #assume it's a hive pointer
+      if is_target or
+         has_handler or
+         Hadoop.clusters.include?(first_path_node) or
+         databases.include?(first_path_node)
+        #make sure cluster is legit
+        hive_url = Hive.url_by_path(red_path,user_name,is_target)
+        return Dataset.find_or_create_by_url(hive_url)
+      end
+      #otherwise, use hdfs convention
+      return Ssh.path_to_dst(path,stage_path)
+    end
+
+    def Hive.url_by_path(path,user_name,is_target=false)
+      red_path = path.gsub(".","/")
+      cluster = red_path.split("/").first.to_s
+      if Hadoop.clusters.include?(cluster)
+        #cut node out of path
+        red_path = red_path.split("/")[1..-1].join("/")
+      else
+        cluster = Hadoop.default_cluster
+      end
+      db, table = red_path.split("/")[0..-1]
+      url = "hive://#{cluster}/#{db}/#{table}"
+      begin
+        #add table stats check only if not target
+        if is_target or Hive.table_stats(cluster, db, table, user_name)['stderr'].to_s.length == 0
+          return url
+        else
+          raise "Unable to find #{url} with error: #{stat_response['stderr']}"
+        end
+      rescue => exc
+        raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
+      end
+    end
+
     #get field names and partition datatypes and size of a hive table
-    def Hive.table_stats(db,table,cluster,user)
-      describe_sql = "use #{db};describe extended #{table}"
-      describe_output = Hive.run(describe_sql,cluster,user)
+    def Hive.table_stats(cluster,db,table,user_name)
+      describe_sql = "use #{db};describe extended #{table};"
+      describe_response = Hive.run(cluster, describe_sql,user_name)
+      return describe_response if describe_response['stdout'].length==0
+      describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
       result_hash = {}
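As a rough sketch of what the new Hive.url_by_path helper resolves (the cluster, database, table, and user names below are hypothetical, and the table is assumed to exist so the table_stats check passes):

    # an explicit, configured cluster prefix is honored; dots and slashes are interchangeable
    Hive.url_by_path("dev_cluster/mobilize/hive_test_1", "etl_user")
    #=> "hive://dev_cluster/mobilize/hive_test_1"

    # with no recognized cluster prefix, the path falls back to Hadoop.default_cluster
    Hive.url_by_path("mobilize.hive_test_1", "etl_user")
    #=> "hive://#{Hadoop.default_cluster}/mobilize/hive_test_1"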
@@ -78,12 +137,12 @@ module Mobilize
       #assign field defs after removing partitions
       result_hash['field_defs'] = field_defs
       #get size
-      result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
+      result_hash['size'] = Hadoop.run(cluster,"fs -dus #{result_hash['location']}",user_name)['stdout'].split("\t").last.strip.to_i
       return result_hash
     end
 
     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(hql,cluster,user,file_hash=nil)
+    def Hive.run(cluster,hql,user_name,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
@@ -93,22 +152,15 @@ module Mobilize
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-      Ssh.run(gateway_node,command,user,file_hash)
+      Ssh.run(gateway_node,command,user_name,file_hash)
     end
 
     def Hive.run_by_stage_path(stage_path)
       s = Stage.where(:path=>stage_path).first
-      u = s.job.runner.user
       params = s.params
-      user = params['user']
       cluster = params['cluster'] || Hive.clusters.keys.first
-      node = Hadoop.gateway_node(cluster)
-      if user and !Ssh.sudoers(node).include?(u.name)
-        raise "#{u.name} does not have su permissions for #{node}"
-      elsif user.nil? and Ssh.su_all_users(node)
-        user = u.name
-      end
-
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      job_name = s.path.sub("Runner_","")
       #slot Hive worker if available
       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
       return false unless slot_id
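A minimal sketch of the reordered Hive.run calling convention (cluster first, user_name in place of user); judging by how callers index the result elsewhere in this diff, it now returns a response hash rather than a raw stdout string. The cluster and user names here are made up:

    response = Hive.run("dev_cluster", "show databases;", "etl_user")
    response['stdout']    # command output, piped through head -c Hadoop.read_limit
    response['stderr']    # error text, if any
    response['exit_code'] # 0 on success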
@@ -122,13 +174,8 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-        #user has passed in a gsheet hql
-        gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-        #return blank response if there are no slots available
-        return nil unless gdrive_slot
-        source_dst = s.source_dsts(gdrive_slot).first
-        Gdrive.unslot_worker_by_path(stage_path)
-        hql = source_dst.read(user)
+        source = s.sources.first
+        hql = source.read(user_name)
       end
 
       #check for select at end
@@ -137,55 +184,59 @@ module Mobilize
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
-        output_table_hql = ["drop table if exists #{output_path}",
+        output_table_hql = ["set mapred.job.name=#{job_name};",
+            "drop table if exists #{output_path}",
             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        Hive.run(full_hql, cluster, user)
-        #already populated, make sure dataset exists
+        result = Hive.run(cluster,full_hql, user_name)
         Dataset.find_or_create_by_url(out_url)
       else
-        out_string = Hive.run(hql, cluster, user)
-        out_string = "result\n#{out_string}"
-        Dataset.write_by_url(out_url,out_string,user)
+        result = Hive.run(cluster, hql, user_name)
+        Dataset.find_or_create_by_url(out_url)
+        Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
       #unslot worker
       Hive.unslot_worker_by_path(stage_path)
-      out_url
+      response = {}
+      response['out_url'] = out_url
+      response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+      response['signal'] = result['exit_code']
+      response
     end
 
-    def Hive.schema_hash(schema_path,user,gdrive_slot)
+    def Hive.schema_hash(schema_path,user_name,gdrive_slot)
       if schema_path.index("/")
         #slashes mean sheets
-        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
+        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user_name)
       else
-        u = User.where(:name=>user).first
+        u = User.where(:name=>user_name).first
         #check sheets in runner
         r = u.runner
         runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
         out_tsv = if runner_sheet
-          runner_sheet.read(user)
+          runner_sheet.read(user_name)
         else
           #check for gfile. will fail if there isn't one.
-          Gfile.find_by_path(schema_path).read(user)
+          Gfile.find_by_path(schema_path).read(user_name)
         end
-        #use Gridfs to cache gdrive results
-        file_name = schema_path.split("/").last
-        out_url = "gridfs://#{schema_path}/#{file_name}"
-        Dataset.write_by_url(out_url,out_tsv,user)
-        schema_tsv = Dataset.find_by_url(out_url).read(user)
-        schema_hash = {}
-        schema_tsv.tsv_to_hash_array.each do |ha|
-          schema_hash[ha['name']] = ha['datatype']
-        end
-        schema_hash
       end
+      #use Gridfs to cache gdrive results
+      file_name = schema_path.split("/").last
+      out_url = "gridfs://#{schema_path}/#{file_name}"
+      Dataset.write_by_url(out_url,out_tsv,user_name)
+      schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+      schema_hash = {}
+      schema_tsv.tsv_to_hash_array.each do |ha|
+        schema_hash[ha['name']] = ha['datatype']
+      end
+      schema_hash
     end
 
-    def Hive.path_params(cluster, path, user)
+    def Hive.path_params(cluster, path, user_name)
       db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
       #get existing table stats if any
       curr_stats = begin
-        Hive.table_stats(db, table, cluster, user)
+        Hive.table_stats(cluster, db, table, user_name)
       rescue
         nil
       end
@@ -195,27 +246,34 @@ module Mobilize
        "curr_stats"=>curr_stats}
     end
 
-    def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
-      target_params = Hive.path_params(cluster, target_path, user)
-      target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
-      target_partitions = target_params['partitions'].to_a
-      target_table_stats = target_params['curr_stats']
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+      table_path = [db,table].join(".")
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']
+
+      source_hql_array = source_hql.split(";")
+      last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
+      #find the last select query -- it should be used for the temp table creation
+      last_select_hql = (source_hql_array[last_select_i..-1].join(";")+";")
+      #if there is anything prior to the last select, add it in prior to table creation
+      prior_hql = ((source_hql_array[0..(last_select_i-1)].join(";")+";") if last_select_i and last_select_i>=1).to_s
 
       #create temporary table so we can identify fields etc.
       temp_db = Hive.output_db(cluster)
-      temp_table_name = (source_hql+target_path).to_md5
+      temp_table_name = (source_hql+table_path).to_md5
       temp_table_path = [temp_db,temp_table_name].join(".")
+      temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
-      temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
-      Hive.run(temp_create_hql,cluster,user)
+      temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
+      Hive.run(cluster,temp_create_hql,user_name)
 
-      source_params = Hive.path_params(cluster, temp_table_path, user)
+      source_params = Hive.path_params(cluster, temp_table_path, user_name)
       source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
       source_table_stats = source_params['curr_stats']
       source_fields = source_table_stats['field_defs']
 
-      if target_partitions.length == 0 and
-        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+      if part_array.length == 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table
 
        target_headers = source_fields.map{|f| f['name']}
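To illustrate the new prior/last-select split in Hive.hql_to_table, a hypothetical multi-statement source_hql would be divided as follows (traced from the added lines above, not taken from the package):

    source_hql = "use mobilize;set hive.exec.compress.output=true;select * from hive_test_1"
    # last_select_hql #=> "select * from hive_test_1;"                        (feeds the temp table create)
    # prior_hql       #=> "use mobilize;set hive.exec.compress.output=true;"  (prepended before the create)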
@@ -233,21 +291,27 @@ module Mobilize
        end.join(",")})"
 
        #always drop when no partititons
-        target_drop_hql = "drop table if exists #{target_table_path};"
+        target_name_hql = "set mapred.job.name=#{job_name};"
+
+        target_drop_hql = "drop table if exists #{table_path};"
 
-        target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"
+        target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"
 
-        target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"
+        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"
 
-        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join
+        target_full_hql = [target_name_hql,
+            target_drop_hql,
+            target_create_hql,
+            target_insert_hql,
+            temp_drop_hql].join
 
-        Hive.run(target_full_hql, cluster, user)
+        Hive.run(cluster, target_full_hql, user_name)
 
-      elsif target_partitions.length > 0 and
-        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+      elsif part_array.length > 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
        #partitions and no target table or same partitions in both target table and user params
 
-        target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}
+        target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
 
        field_defs = {}
        target_headers.each do |name|
@@ -260,7 +324,7 @@ module Mobilize
        end.join(",")})"
 
        part_defs = {}
-        target_partitions.each do |name|
+        part_array.each do |name|
          datatype = schema_hash[name] || "string"
          part_defs[name] = datatype
        end
@@ -271,70 +335,70 @@ module Mobilize
 
        target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
 
-        target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")
+        target_part_stmt = part_array.map{|h| "`#{h}`"}.join(",")
 
-        target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
+        target_set_hql = ["set mapred.job.name=#{job_name};",
+            "set hive.exec.dynamic.partition.mode=nonstrict;",
            "set hive.exec.max.dynamic.partitions.pernode=1000;",
            "set hive.exec.dynamic.partition=true;",
            "set hive.exec.max.created.files = 200000;",
            "set hive.max.created.files = 200000;"].join
 
-        if drop or target_table_stats.nil?
-          target_drop_hql = "drop table if exists #{target_table_path};"
+        if drop or table_stats.nil?
+          target_drop_hql = "drop table if exists #{table_path};"
          target_create_hql = target_drop_hql +
-              "create table if not exists #{target_table_path} #{field_def_stmt} " +
+              "create table if not exists #{table_path} #{field_def_stmt} " +
              "partitioned by #{part_def_stmt};"
 
        else
-          target_db,target_table = target_table_path.split(".")
          #get all the permutations of possible partititons
          part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
-          part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
+          part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
          #having gotten the permutations, ensure they are dropped
          part_hash_array = part_perm_tsv.tsv_to_hash_array
          part_drop_hql = part_hash_array.map do |h|
            part_drop_stmt = h.map do |name,value|
              part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
            end.join(",")
-            "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
+            "use #{db};alter table #{table} drop if exists partition (#{part_drop_stmt});"
          end.join
          target_create_hql = part_drop_hql
        end
 
-        target_insert_hql = "insert overwrite table #{target_table_path} " +
+        target_insert_hql = "insert overwrite table #{table_path} " +
            "partition (#{target_part_stmt}) " +
            "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
 
        target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
 
-        Hive.run(target_full_hql, cluster, user)
+        Hive.run(cluster, target_full_hql, user_name)
      else
        error_msg = "Incompatible partition specs"
        raise error_msg
      end
-      return target_path
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+      return url
    end
 
    #turn a tsv into a hive table.
    #Accepts options to drop existing target if any
    #also schema with column datatype overrides
-    def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
+    def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
      source_headers = source_tsv.tsv_header_array
 
-      target_params = Hive.path_params(cluster, target_path, user)
-      target_db,target_table = ['db','table'].map{|k| target_params[k]}
-      target_table_path = [target_db,target_table].join(".")
-      target_partitions = target_params['partitions'].to_a
-      target_table_stats = target_params['curr_stats']
+      table_path = [db,table].join(".")
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']
 
      schema_hash ||= {}
 
-      if target_partitions.length == 0 and
-        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
+      if part_array.length == 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table
        #or drop and start fresh
 
        #one file only, strip headers, replace tab with ctrl-a for hive
+        #get rid of freaking carriage return characters
        source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
        source_tsv_filename = "000000_0"
        file_hash = {source_tsv_filename=>source_rows}
@@ -345,52 +409,52 @@ module Mobilize
        end.ie{|fs| "(#{fs.join(",")})"}
 
        #for single insert, use drop table and create table always
-        target_drop_hql = "drop table if exists #{target_table_path}"
+        target_drop_hql = "drop table if exists #{table_path}"
 
-        target_create_hql = "create table #{target_table_path} #{field_defs}"
+        target_create_hql = "create table #{table_path} #{field_defs}"
 
        #load source data
-        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"
+        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{table_path};"
 
        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
 
-        Hive.run(target_full_hql, cluster, user, file_hash)
+        Hive.run(cluster, target_full_hql, user_name, file_hash)
 
-      elsif target_partitions.length > 0 and
-        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
+      elsif part_array.length > 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
        #partitions and no target table
        #or same partitions in both target table and user params
        #or drop and start fresh
 
-        target_headers = source_headers.reject{|h| target_partitions.include?(h)}
+        target_headers = source_headers.reject{|h| part_array.include?(h)}
 
        field_defs = "(#{target_headers.map do |name|
          datatype = schema_hash[name] || "string"
          "`#{name}` #{datatype}"
        end.join(",")})"
 
-        partition_defs = "(#{target_partitions.map do |name|
+        partition_defs = "(#{part_array.map do |name|
          datatype = schema_hash[name] || "string"
          "#{name} #{datatype}"
        end.join(",")})"
 
-        target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""
+        target_drop_hql = drop ? "drop table if exists #{table_path};" : ""
 
        target_create_hql = target_drop_hql +
-            "create table if not exists #{target_table_path} #{field_defs} " +
+            "create table if not exists #{table_path} #{field_defs} " +
            "partitioned by #{partition_defs}"
 
        #create target table early if not here
-        Hive.run(target_create_hql, cluster, user)
+        Hive.run(cluster, target_create_hql, user_name)
 
-        target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)
+        table_stats = Hive.table_stats(cluster, db, table, user_name)
 
        #create data hash from source hash array
        data_hash = {}
        source_hash_array = source_tsv.tsv_to_hash_array
        source_hash_array.each do |ha|
-          tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
-          tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
+          tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+          tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
          if data_hash[tpmk]
            data_hash[tpmk] += "\n#{tpmv}"
          else
@@ -399,61 +463,62 @@ module Mobilize
        end
 
        #go through completed data hash and write each key value to the table in question
+        target_part_hql = ""
        data_hash.each do |tpmk,tpmv|
          base_filename = "000000_0"
          part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
          part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
          part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
-          hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
-          hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
-          hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}"
+          hdfs_dir = "#{table_stats['location']}/#{part_dir}"
+          #source the partitions from a parallel load folder since filenames are all named the same
+          hdfs_source_url = "#{table_stats['location']}/part_load/#{part_dir}/#{base_filename}"
+          hdfs_target_url = hdfs_dir
          #load partition into source path
-          puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
-          Hdfs.write(hdfs_source_path,tpmv,user)
+          puts "Writing to #{hdfs_source_url} for #{user_name} at #{Time.now.utc}"
+          Hdfs.write(cluster,hdfs_source_url,tpmv,user_name)
          #let Hive know where the partition is
-          target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
-          target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
-          target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
-          puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
-          Hive.run(target_part_hql, cluster, user)
+          target_add_part_hql = "use #{db};alter table #{table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_url}'"
+          target_insert_part_hql = "load data inpath '#{hdfs_source_url}' overwrite into table #{table} partition (#{part_stmt});"
+          target_part_hql += [target_add_part_hql,target_insert_part_hql].join(";")
+        end
+        #run actual partition adds all at once
+        if target_part_hql.length>0
+          puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
+          Hive.run(cluster, target_part_hql, user_name)
        end
      else
        error_msg = "Incompatible partition specs: " +
-            "target table:#{target_table_stats['partitions'].to_s}, " +
-            "user_params:#{target_partitions.to_s}"
+            "target table:#{table_stats['partitions'].to_s}, " +
+            "user_params:#{part_array.to_s}"
        raise error_msg
      end
-      return target_path
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+      return url
    end
 
    def Hive.write_by_stage_path(stage_path)
      s = Stage.where(:path=>stage_path).first
-      u = s.job.runner.user
      params = s.params
-      user = params['user']
-      cluster = params['cluster'] || Hive.clusters.keys.first
+      source = s.sources.first
+      target = s.target
+      cluster, db, table = target.url.split("://").last.split("/")
+      #update stage with the node so we can use it
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      job_name = s.path.sub("Runner_","")
 
      #slot Hive worker if available
      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
      return false unless slot_id
 
-      node = Hadoop.gateway_node(cluster)
-      if user and !Ssh.sudoers(node).include?(u.name)
-        raise "#{u.name} does not have su permissions for #{node}"
-      elsif user.nil? and Ssh.su_all_users(node)
-        user = u.name
-      end
-
-      #determine path for target
-      target_path = params['target']
-
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
-      source_dst = s.source_dsts(gdrive_slot).first
-      schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+      schema_hash = if params['schema']
+        gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+        #return blank response if there are no slots available
+        return nil unless gdrive_slot
+        Hive.schema_hash(params['schema'],user_name,gdrive_slot)
+      else
+        {}
+      end
      Gdrive.unslot_worker_by_path(stage_path)
-
      #drop target before create/insert?
      drop = params['drop']
 
@@ -461,64 +526,77 @@ module Mobilize
      source_tsv,source_hql = [nil]*2
      if params['hql']
        source_hql = params['hql']
-      elsif source_dst
-        if source_dst.handler == 'hive'
+      elsif source
+        if source.handler == 'hive'
          #source table
-          cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
+          cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
          source_hql = "select * from #{source_path};"
-        elsif ['gridfs','hdfs'].include?(source_dst.handler)
-          if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
-            source_hql = source_dst.read(user)
+        elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+          if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+            source_hql = source.read(user_name)
          else
            #tsv from sheet
-            source_tsv = source_dst.read(user)
+            source_tsv = source.read(user_name)
          end
        end
      end
 
-      out_string = if source_hql
-        Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
-      elsif source_tsv
-        Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
-      else
-        raise "Unable to determine source tsv or source hql"
-      end
-
+      part_array = if params['partitions']
+        params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
+      elsif params['target']
+        #take the end parts of the target, that are not the cluster, db, table
+        target_array = params['target'].gsub(".","/").split("/")
+        [cluster,db,table].each do |term|
+          target_array = target_array[1..-1] if target_array.first == term
+        end
+        target_array
+      else
+        []
+      end
 
+      result = begin
+        url = if source_hql
+          Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
+        elsif source_tsv
+          Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+        else
+          raise "Unable to determine source tsv or source hql"
+        end
+        {'stdout'=>url,'exit_code'=>0}
+      rescue => exc
+        {'stderr'=>exc.to_s, 'exit_code'=>500}
+      end
 
      #unslot worker and write result
      Hive.unslot_worker_by_path(stage_path)
 
-      #output table stores stage output
-      out_string = "result\n#{out_string}"
-      output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
-      out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
-      Dataset.write_by_url(out_url,out_string,user)
-      out_url
+      response = {}
+      response['out_url'] = Dataset.write_by_url("gridfs://#{s.path}/out",result['stdout'].to_s,Gdrive.owner_name) if result['stdout'].to_s.length>0
+      response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+      response['signal'] = result['exit_code']
+      response
    end
 
-    def Hive.read_by_dataset_path(dst_path,user)
-      cluster,source_path = dst_path.split("/").ie do |sp|
-        if sp.length == 2
-          [Hive.clusters.first.first,sp.join(".")]
-        else
-          [sp.first, sp[1..-1].join(".")]
-        end
-      end
-      hql = "set hive.cli.print.header=true;select * from #{source_path};"
-      Hive.run(hql,cluster,user)
+    def Hive.read_by_dataset_path(dst_path,user_name,*args)
+      cluster, db, table = dst_path.split("/")
+      source_path = [db,table].join(".")
+      job_name = "read #{cluster}/#{db}/#{table}"
+      set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name};"
+      select_hql = "select * from #{source_path};"
+      hql = [set_hql,select_hql].join
+      response = Hive.run(cluster, hql,user_name)
+      if response['exit_code']==0
+        return response['stdout']
+      else
+        raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+      end
    end
 
-    def Hive.write_by_dataset_path(dst_path,source_tsv,user)
-      cluster,target_path = dst_path.split("/").ie do |sp|
-        if sp.length == 2
-          [Hive.clusters.first.first,sp.join(".")]
-        else
-          [sp.first, sp[1..-1].join(".")]
-        end
-      end
+    def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
+      cluster,db,table = dst_path.split("/")
+      part_array = []
      drop = true
-      Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
+      Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop)
    end
  end
 
@@ -1,5 +1,5 @@
 module Mobilize
   module Hive
-    VERSION = "1.0.11"
+    VERSION = "1.2"
   end
 end
@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
   gem.name = "mobilize-hive"
   gem.version = Mobilize::Hive::VERSION
   gem.authors = ["Cassio Paes-Leme"]
-  gem.email = ["cpaesleme@ngmoco.com"]
+  gem.email = ["cpaesleme@dena.com"]
   gem.description = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.summary = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.homepage = "http://github.com/dena/mobilize-hive"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.0.10"
+  gem.add_runtime_dependency "mobilize-hdfs","1.2"
 end
@@ -3,7 +3,7 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
+  stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
     source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
   stage2: hive.run source:"hive_test_1.hql"
   stage3: hive.run hql:"show databases;"
@@ -21,6 +21,6 @@
   trigger: after hive_test_2
   status: ""
   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
-  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
-  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
+  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
+  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
@@ -52,9 +52,9 @@ describe "Mobilize" do
52
52
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
53
53
  [hive_3_target_sheet].each{|s| s.delete if s}
54
54
 
55
- puts "job row added, force enqueued requestor, wait 1000s"
55
+ puts "job row added, force enqueued requestor, wait for stages"
56
56
  r.enqueue!
57
- sleep 1000
57
+ wait_for_stages(1200)
58
58
 
59
59
  puts "jobtracker posted data to test sheet"
60
60
  hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
@@ -63,9 +63,34 @@ describe "Mobilize" do
63
63
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
64
64
 
65
65
  assert hive_1_stage_2_target_sheet.read(u.name).length == 219
66
- assert hive_1_stage_3_target_sheet.read(u.name).length == 325
66
+ assert hive_1_stage_3_target_sheet.read(u.name).length > 3
67
67
  assert hive_2_target_sheet.read(u.name).length == 599
68
68
  assert hive_3_target_sheet.read(u.name).length == 347
69
69
  end
70
70
 
71
+ def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
72
+ time = 0
73
+ time_since_stage = 0
74
+ #check for 10 min
75
+ while time < time_limit and time_since_stage < stage_limit
76
+ sleep wait_length
77
+ job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
78
+ if job_classes.include?("Mobilize::Stage")
79
+ time_since_stage = 0
80
+ puts "saw stage at #{time.to_s} seconds"
81
+ else
82
+ time_since_stage += wait_length
83
+ puts "#{time_since_stage.to_s} seconds since stage seen"
84
+ end
85
+ time += wait_length
86
+ puts "total wait time #{time.to_s} seconds"
87
+ end
88
+
89
+ if time >= time_limit
90
+ raise "Timed out before stage completion"
91
+ end
92
+ end
93
+
94
+
95
+
71
96
  end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: 1.0.11
+  version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-05 00:00:00.000000000 Z
+date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.0.10
+        version: '1.2'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-        version: 1.0.10
+        version: '1.2'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
-- cpaesleme@ngmoco.com
+- cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []