mobilize-hive 1.2 → 1.3

data/README.md CHANGED
@@ -142,6 +142,17 @@ Start
  * cluster and user are optional for all of the below.
  * cluster defaults to the first cluster listed;
  * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
+ * params are also optional for all of the below. They replace tokens in source HQL.
+ * params are passed as YML or JSON, as in:
+ * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
+ * this example replaces every '@'-prefixed key in all source HQLs with its value.
+ * the '@' prefix keeps plain occurrences of "date" and "unit" in the HQL
+ from being replaced; write `@date` and `@unit` in your actual HQL
+ wherever you want those tokens substituted.
+ * in addition, the following params are substituted automatically:
+ * `$utc_date` - replaced with the YYYY-MM-DD date, UTC
+ * `$utc_time` - replaced with the HH:MM time, UTC
+ * any occurrence of these tokens in HQL is replaced at runtime (see the sketch below).
  * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
  script in the hql or source sheet and returns any output specified at the
  end. If the cmd or last query in source is a select statement, column headers will be
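To make the substitution rules above concrete, here is a minimal Ruby sketch (illustrative only, not part of the gem; the HQL string and param values are invented):

```ruby
# '@'-prefixed keys come from the user-supplied params hash;
# '$'-prefixed tokens ($utc_date, $utc_time) are added automatically.
hql = "select * from sales where date = '@date' and unit = '@unit';"
params = { 'date' => '2013-03-01', 'unit' => 'widgets' }

params.each { |k, v| hql = hql.gsub("@#{k}", v) }
puts hql
# => select * from sales where date = '2013-03-01' and unit = 'widgets';
```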
lib/mobilize-hive/handlers/hive.rb CHANGED
@@ -1,58 +1,9 @@
  module Mobilize
  module Hive
- def Hive.config
- Base.config('hive')
- end
-
- def Hive.exec_path(cluster)
- Hive.clusters[cluster]['exec_path']
- end
-
- def Hive.output_db(cluster)
- Hive.clusters[cluster]['output_db']
- end
-
- def Hive.output_db_user(cluster)
- output_db_node = Hadoop.gateway_node(cluster)
- output_db_user = Ssh.host(output_db_node)['user']
- output_db_user
- end
-
- def Hive.clusters
- Hive.config['clusters']
- end
-
- def Hive.slot_ids(cluster)
- (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
- end
-
- def Hive.slot_worker_by_cluster_and_path(cluster,path)
- working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
- Hive.slot_ids(cluster).each do |slot_id|
- unless working_slots.include?(slot_id)
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
- return slot_id
- end
- end
- #return false if none are available
- return false
- end
-
- def Hive.unslot_worker_by_path(path)
- begin
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
- return true
- rescue
- return false
- end
- end
-
- def Hive.databases(cluster,user_name)
- Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
- end
-
+ #adds convenience methods
+ require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
  # converts a source path or target path to a dst in the context of handler and stage
- def Hive.path_to_dst(path,stage_path)
+ def Hive.path_to_dst(path,stage_path,gdrive_slot)
  has_handler = true if path.index("://")
  s = Stage.where(:path=>stage_path).first
  params = s.params
@@ -78,7 +29,7 @@ module Mobilize
  return Dataset.find_or_create_by_url(hive_url)
  end
  #otherwise, use hdfs convention
- return Ssh.path_to_dst(path,stage_path)
+ return Ssh.path_to_dst(path,stage_path,gdrive_slot)
  end

  def Hive.url_by_path(path,user_name,is_target=false)
@@ -108,7 +59,7 @@ module Mobilize
  def Hive.table_stats(cluster,db,table,user_name)
  describe_sql = "use #{db};describe extended #{table};"
  describe_response = Hive.run(cluster, describe_sql,user_name)
- return describe_response if describe_response['stdout'].length==0
+ return nil if describe_response['stdout'].length==0
  describe_output = describe_response['stdout']
  describe_output.split("location:").last.split(",").first
  #get location, fields, partitions
@@ -142,20 +93,43 @@ module Mobilize
  end

  #run a generic hive command, with the option of passing a file hash to be locally available
- def Hive.run(cluster,hql,user_name,file_hash=nil)
+ def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
  # no TempStatsStore
  hql = "set hive.stats.autogather=false;#{hql}"
  filename = hql.to_md5
  file_hash||= {}
  file_hash[filename] = hql
+ #add in default params
+ params ||= {}
+ params = params.merge(Hive.default_params)
+ #replace any params in the file_hash and command
+ params.each do |k,v|
+ file_hash.each do |name,data|
+ if k.starts_with?("$")
+ data.gsub!(k,v)
+ else
+ data.gsub!("@#{k}",v)
+ end
+ end
+ end
  #silent mode so we don't have logs in stderr; clip output
  #at hadoop read limit
  command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
  gateway_node = Hadoop.gateway_node(cluster)
- Ssh.run(gateway_node,command,user_name,file_hash)
+ response = Ssh.run(gateway_node,command,user_name,file_hash)
+ #override exit code 0 when stdout is blank and
+ #stderr contains FAILED or KILLED
+ if response['stdout'].to_s.length == 0 and
+ response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
+ response['exit_code'] = 500
+ end
+ return response
  end

  def Hive.run_by_stage_path(stage_path)
+ gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+ #return blank response if there are no slots available
+ return nil unless gdrive_slot
  s = Stage.where(:path=>stage_path).first
  params = s.params
  cluster = params['cluster'] || Hive.clusters.keys.first
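The hunk above gives `Hive.run` two new behaviors: param tokens are substituted into every staged file (`$` keys match verbatim, other keys match with an `@` prefix), and a zero exit code is overridden when stdout is empty but stderr reports FAILED or KILLED. A self-contained sketch of both, using stdlib methods in place of the gem's `starts_with?`/`ie` helpers:

```ruby
# Sketch only; mirrors the logic of the Hive.run changes above.
def substitute_params!(file_hash, params)
  params.each do |k, v|
    file_hash.each_value do |data|
      if k.start_with?("$")
        data.gsub!(k, v)       # automatic tokens like $utc_date match as-is
      else
        data.gsub!("@#{k}", v) # user params only match with the '@' prefix
      end
    end
  end
end

def override_exit_code!(response)
  # hive can exit 0 even when the job failed; trust stderr instead
  if response['stdout'].to_s.empty? &&
     response['stderr'].to_s =~ /FAILED|KILLED/
    response['exit_code'] = 500
  end
  response
end
```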
@@ -174,13 +148,16 @@ module Mobilize
  if params['hql']
  hql = params['hql']
  else
- source = s.sources.first
- hql = source.read(user_name)
+ source = s.sources(gdrive_slot).first
+ hql = source.read(user_name,gdrive_slot)
  end

+ Gdrive.unslot_worker_by_path(stage_path)
+
  #check for select at end
  hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
- if hql_array.last.downcase.starts_with?("select")
+ last_statement = hql_array.last.downcase.split("\n").reject{|l| l.starts_with?("-- ")}.first
+ if last_statement.to_s.starts_with?("select")
  #nil if no prior commands
  prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
  select_hql = hql_array.last
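The new last-statement check above skips `-- ` comment lines before testing whether the script ends in a select. A standalone sketch of that logic (standard Ruby in place of `starts_with?`):

```ruby
# Does the final HQL statement start with "select",
# ignoring "-- " comment lines at the top of it?
def trailing_select?(hql)
  statements = hql.split(";").map(&:strip).reject(&:empty?)
  last = statements.last.to_s.downcase
  first_code_line = last.split("\n").reject { |l| l.start_with?("-- ") }.first
  first_code_line.to_s.start_with?("select")
end

trailing_select?("use db;\n-- final read\nselect * from t") # => true
trailing_select?("create table t as select 1")              # => false
```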
@@ -188,10 +165,10 @@ module Mobilize
  "drop table if exists #{output_path}",
  "create table #{output_path} as #{select_hql};"].join(";")
  full_hql = [prior_hql, output_table_hql].compact.join(";")
- result = Hive.run(cluster,full_hql, user_name)
+ result = Hive.run(cluster,full_hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  else
- result = Hive.run(cluster, hql, user_name)
+ result = Hive.run(cluster, hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
  end
@@ -224,7 +201,7 @@ module Mobilize
  file_name = schema_path.split("/").last
  out_url = "gridfs://#{schema_path}/#{file_name}"
  Dataset.write_by_url(out_url,out_tsv,user_name)
- schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+ schema_tsv = Dataset.find_by_url(out_url).read(user_name,gdrive_slot)
  schema_hash = {}
  schema_tsv.tsv_to_hash_array.each do |ha|
  schema_hash[ha['name']] = ha['datatype']
@@ -232,24 +209,10 @@ module Mobilize
  schema_hash
  end

- def Hive.path_params(cluster, path, user_name)
- db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
- #get existing table stats if any
- curr_stats = begin
- Hive.table_stats(cluster, db, table, user_name)
- rescue
- nil
- end
- {"db"=>db,
- "table"=>table,
- "partitions"=>partitions,
- "curr_stats"=>curr_stats}
- end
-
- def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
  table_path = [db,table].join(".")
- target_params = Hive.path_params(cluster, table_path, user_name)
- table_stats = target_params['curr_stats']
+ table_stats = Hive.table_stats(cluster, db, table, user_name)
+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")

  source_hql_array = source_hql.split(";")
  last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
@@ -265,11 +228,10 @@ module Mobilize
  temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
  temp_drop_hql = "drop table if exists #{temp_table_path};"
  temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
- Hive.run(cluster,temp_create_hql,user_name)
+ response = Hive.run(cluster,temp_create_hql,user_name,params)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

- source_params = Hive.path_params(cluster, temp_table_path, user_name)
- source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
- source_table_stats = source_params['curr_stats']
+ source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
  source_fields = source_table_stats['field_defs']

  if part_array.length == 0 and
@@ -297,7 +259,7 @@

  target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

- target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"
+ target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{temp_table_path};"

  target_full_hql = [target_name_hql,
  target_drop_hql,
@@ -305,10 +267,12 @@
  target_insert_hql,
  temp_drop_hql].join

- Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)
+
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  elsif part_array.length > 0 and
- table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
  #partitions and no target table or same partitions in both target table and user params

  target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
@@ -352,10 +316,20 @@

  else
  #get all the permutations of possible partitions
- part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
- part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
+ part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
+ part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
+ part_perm_hql = part_set_hql + part_select_hql
+ response = Hive.run(cluster, part_perm_hql, user_name, params)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ part_perm_tsv = response['stdout']
  #having gotten the permutations, ensure they are dropped
  part_hash_array = part_perm_tsv.tsv_to_hash_array
+ #make sure there is data
+ if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
+ #blank result set, return url
+ return url
+ end
+
  part_drop_hql = part_hash_array.map do |h|
  part_drop_stmt = h.map do |name,value|
  part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
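The guard added in this hunk returns early when the distinct-partition query yields no usable rows. A sketch of the same check (the hash keys are invented for illustration):

```ruby
# Treat a missing first row, or a first row containing nils,
# as "no data to partition" and stop before emitting drop/insert HQL.
def blank_partition_result?(part_hash_array)
  first = part_hash_array.first
  first.nil? || first.values.include?(nil)
end

blank_partition_result?([])                               # => true
blank_partition_result?([{ 'act_date' => nil }])          # => true
blank_partition_result?([{ 'act_date' => '2013-01-01' }]) # => false
```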
@@ -367,16 +341,16 @@

  target_insert_hql = "insert overwrite table #{table_path} " +
  "partition (#{target_part_stmt}) " +
- "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
+ "select #{target_field_stmt},#{target_part_stmt} from #{temp_table_path};"

  target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

- Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  else
  error_msg = "Incompatible partition specs"
  raise error_msg
  end
- url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
  return url
  end

@@ -384,14 +358,21 @@ module Mobilize
  #Accepts options to drop existing target if any
  #also schema with column datatype overrides
  def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
+ return nil if source_tsv.strip.length==0
+ if source_tsv.index("\r\n")
+ source_tsv = source_tsv.gsub("\r\n","\n")
+ elsif source_tsv.index("\r")
+ source_tsv = source_tsv.gsub("\r","\n")
+ end
  source_headers = source_tsv.tsv_header_array

  table_path = [db,table].join(".")
- target_params = Hive.path_params(cluster, table_path, user_name)
- table_stats = target_params['curr_stats']
+ table_stats = Hive.table_stats(cluster, db, table, user_name)

  schema_hash ||= {}

+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+
  if part_array.length == 0 and
  table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
  #no partitions in either user params or the target table
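`tsv_to_table` now bails out on empty input and normalizes Windows (`\r\n`) and bare-`\r` line endings before reading headers, so the later `\n` splits see every row. Equivalent standalone Ruby:

```ruby
# Normalize line endings; return nil for effectively empty input.
def normalize_tsv(source_tsv)
  return nil if source_tsv.strip.empty?
  if source_tsv.include?("\r\n")
    source_tsv.gsub("\r\n", "\n")
  elsif source_tsv.include?("\r")
    source_tsv.gsub("\r", "\n")
  else
    source_tsv
  end
end

normalize_tsv("a\tb\r\n1\t2\r\n") # => "a\tb\n1\t2\n"
```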
@@ -418,10 +399,11 @@

  target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

- Hive.run(cluster, target_full_hql, user_name, file_hash)
+ response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  elsif part_array.length > 0 and
- table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
  #partitions and no target table
  #or same partitions in both target table and user params
  #or drop and start fresh
@@ -445,13 +427,17 @@
  "partitioned by #{partition_defs}"

  #create target table early if not here
- Hive.run(cluster, target_create_hql, user_name)
+ response = Hive.run(cluster, target_create_hql, user_name)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+
+ #return url (operation complete) if there's no data
+ source_hash_array = source_tsv.tsv_to_hash_array
+ return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0

  table_stats = Hive.table_stats(cluster, db, table, user_name)

  #create data hash from source hash array
  data_hash = {}
- source_hash_array = source_tsv.tsv_to_hash_array
  source_hash_array.each do |ha|
  tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
  tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
@@ -484,7 +470,8 @@
  #run actual partition adds all at once
  if target_part_hql.length>0
  puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
- Hive.run(cluster, target_part_hql, user_name)
+ response = Hive.run(cluster, target_part_hql, user_name)
+ raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  end
  else
  error_msg = "Incompatible partition specs: " +
@@ -492,33 +479,31 @@
  "user_params:#{part_array.to_s}"
  raise error_msg
  end
- url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+
  return url
  end

  def Hive.write_by_stage_path(stage_path)
+ gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+ #return blank response if there are no slots available
+ return nil unless gdrive_slot
  s = Stage.where(:path=>stage_path).first
  params = s.params
- source = s.sources.first
+ source = s.sources(gdrive_slot).first
  target = s.target
  cluster, db, table = target.url.split("://").last.split("/")
- #update stage with the node so we can use it
- user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
- job_name = s.path.sub("Runner_","")
-
  #slot Hive worker if available
  slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
  return false unless slot_id
+ #update stage with the node so we can use it
+ user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+ job_name = s.path.sub("Runner_","")

  schema_hash = if params['schema']
- gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
- #return blank response if there are no slots available
- return nil unless gdrive_slot
  Hive.schema_hash(params['schema'],user_name,gdrive_slot)
  else
  {}
  end
- Gdrive.unslot_worker_by_path(stage_path)
  #drop target before create/insert?
  drop = params['drop']

@@ -531,16 +516,17 @@
  #source table
  cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
  source_hql = "select * from #{source_path};"
- elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+ elsif ['gsheet','gfile','gridfs','hdfs'].include?(source.handler)
  if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
- source_hql = source.read(user_name)
+ source_hql = source.read(user_name,gdrive_slot)
  else
- #tsv from sheet
- source_tsv = source.read(user_name)
+ #tsv from sheet or file
+ source_tsv = source.read(user_name,gdrive_slot)
  end
  end
  end

+ Gdrive.unslot_worker_by_path(stage_path)
  part_array = if params['partitions']
  params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
  elsif params['target']
@@ -559,12 +545,14 @@
  Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
  elsif source_tsv
  Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+ elsif source
+ #null sheet
  else
  raise "Unable to determine source tsv or source hql"
  end
  {'stdout'=>url,'exit_code'=>0}
  rescue => exc
- {'stderr'=>exc.to_s, 'exit_code'=>500}
+ {'stderr'=>"#{exc.to_s}\n#{exc.backtrace.join("\n")}", 'exit_code'=>500}
  end

  #unslot worker and write result
@@ -585,11 +573,8 @@ module Mobilize
  select_hql = "select * from #{source_path};"
  hql = [set_hql,select_hql].join
  response = Hive.run(cluster, hql,user_name)
- if response['exit_code']==0
- return response['stdout']
- else
- raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
- end
+ raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}" if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ return response['stdout']
  end

  def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
lib/mobilize-hive/helpers/hive_helper.rb ADDED
@@ -0,0 +1,63 @@
+ module Mobilize
+ module Hive
+ def self.config
+ Base.config('hive')
+ end
+
+ def self.exec_path(cluster)
+ self.clusters[cluster]['exec_path']
+ end
+
+ def self.output_db(cluster)
+ self.clusters[cluster]['output_db']
+ end
+
+ def self.output_db_user(cluster)
+ output_db_node = Hadoop.gateway_node(cluster)
+ output_db_user = Ssh.host(output_db_node)['user']
+ output_db_user
+ end
+
+ def self.clusters
+ self.config['clusters']
+ end
+
+ def self.slot_ids(cluster)
+ (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+ end
+
+ def self.slot_worker_by_cluster_and_path(cluster,path)
+ working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+ self.slot_ids(cluster).each do |slot_id|
+ unless working_slots.include?(slot_id)
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+ return slot_id
+ end
+ end
+ #return false if none are available
+ return false
+ end
+
+ def self.unslot_worker_by_path(path)
+ begin
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+ return true
+ rescue
+ return false
+ end
+ end
+
+ def self.databases(cluster,user_name)
+ self.run(cluster,"show databases",user_name)['stdout'].split("\n")
+ end
+
+ def self.default_params
+ time = Time.now.utc
+ {
+ '$utc_date'=>time.strftime("%Y-%m-%d"),
+ '$utc_time'=>time.strftime("%H:%M"),
+ }
+ end
+ end
+ end
+
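For reference, the new `Hive.default_params` produces the automatic tokens substituted into every run; the values below are illustrative (they depend on the clock):

```ruby
Mobilize::Hive.default_params
# => {"$utc_date"=>"2013-04-18", "$utc_time"=>"14:30"}
```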
lib/mobilize-hive/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Mobilize
  module Hive
- VERSION = "1.2"
+ VERSION = "1.3"
  end
  end
mobilize-hive.gemspec CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
- gem.add_runtime_dependency "mobilize-hdfs","1.2"
+ gem.add_runtime_dependency "mobilize-hdfs","1.3"
  end
@@ -20,7 +20,15 @@
  active: true
  trigger: after hive_test_2
  status: ""
- stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
+ stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
+ - name: hive_test_4
+ active: true
+ trigger: after hive_test_3
+ status: ""
+ stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
+ stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
+ stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
+ stage4: gsheet.write source:stage3, target:"hive_test_4.out"
test/mobilize-hive_test.rb CHANGED
@@ -25,6 +25,18 @@ describe "Mobilize" do
  hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
  hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)

+ #create blank sheet
+ hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
+ [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
+ hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
+
+ #create sheet w just headers
+ hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
+ [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
+ hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
+ hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
+ hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
+
  hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
  [hive_1_schema_sheet].each {|s| s.delete if s}
  hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -51,21 +63,25 @@ describe "Mobilize" do
  [hive_2_target_sheet].each{|s| s.delete if s}
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
  [hive_3_target_sheet].each{|s| s.delete if s}
+ hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
+ [hive_4_target_sheet].each{|s| s.delete if s}

  puts "job row added, force enqueued requestor, wait for stages"
  r.enqueue!
- wait_for_stages(1200)
+ wait_for_stages(2100)

  puts "jobtracker posted data to test sheet"
  hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
  hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
  hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
+ hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)

  assert hive_1_stage_2_target_sheet.read(u.name).length == 219
  assert hive_1_stage_3_target_sheet.read(u.name).length > 3
  assert hive_2_target_sheet.read(u.name).length == 599
  assert hive_3_target_sheet.read(u.name).length == 347
+ assert hive_4_target_sheet.read(u.name).length == 432
  end

  def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: mobilize-hive
  version: !ruby/object:Gem::Version
- version: '1.2'
+ version: '1.3'
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-03-21 00:00:00.000000000 Z
+ date: 2013-04-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: '1.2'
+ version: '1.3'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: '1.2'
+ version: '1.3'
  description: Adds hive read, write, and run support to mobilize-hdfs
  email:
  - cpaesleme@dena.com
@@ -41,6 +41,7 @@ files:
  - Rakefile
  - lib/mobilize-hive.rb
  - lib/mobilize-hive/handlers/hive.rb
+ - lib/mobilize-hive/helpers/hive_helper.rb
  - lib/mobilize-hive/tasks.rb
  - lib/mobilize-hive/version.rb
  - lib/samples/hive.yml
@@ -72,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.24
+ rubygems_version: 1.8.25
  signing_key:
  specification_version: 3
  summary: Adds hive read, write, and run support to mobilize-hdfs
@@ -84,4 +85,3 @@ test_files:
  - test/mobilize-hive_test.rb
  - test/redis-test.conf
  - test/test_helper.rb
- has_rdoc: