mobilize-hive 1.3 → 1.21

data/README.md CHANGED
@@ -142,17 +142,6 @@ Start
  * cluster and user are optional for all of the below.
  * cluster defaults to the first cluster listed;
  * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
- * params are also optional for all of the below. They replace HQL in sources.
- * params are passed as a YML or JSON, as in:
- * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
- * this example replaces all the keys, preceded by '@' in all source hqls with the value.
- * The preceding '@' is used to keep from replacing instances
- of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
- if you'd like to replace those tokens.
- * in addition, the following params are substituted automatically:
- * `$utc_date` - replaced with YYYY-MM-DD date, UTC
- * `$utc_time` - replaced with HH:MM time, UTC
- * any occurrence of these values in HQL will be replaced at runtime.
  * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
  script in the hql or source sheet and returns any output specified at the
  end. If the cmd or last query in source is a select statement, column headers will be
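
For reference, a minimal sketch of the `hive.run` behavior described above, called directly from Ruby rather than as a job stage; the cluster name, user, and query below are assumed values for illustration, not part of this release:

```ruby
require 'mobilize-hive'

# Assumed cluster/user names; Hive.run executes the HQL on the cluster's
# gateway node over SSH and returns a hash with 'stdout', 'stderr', 'exit_code'.
response = Mobilize::Hive.run(
  "test_cluster",                                   # a cluster key from hive.yml (assumed)
  "select product, category, value from mobilize.hive_test_1;",
  "mobilize_user"
)
puts response['stdout'] if response['exit_code'] == 0
```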
lib/mobilize-hive/handlers/hive.rb CHANGED
@@ -1,9 +1,58 @@
  module Mobilize
  module Hive
- #adds convenience methods
- require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
+ def Hive.config
+ Base.config('hive')
+ end
+
+ def Hive.exec_path(cluster)
+ Hive.clusters[cluster]['exec_path']
+ end
+
+ def Hive.output_db(cluster)
+ Hive.clusters[cluster]['output_db']
+ end
+
+ def Hive.output_db_user(cluster)
+ output_db_node = Hadoop.gateway_node(cluster)
+ output_db_user = Ssh.host(output_db_node)['user']
+ output_db_user
+ end
+
+ def Hive.clusters
+ Hive.config['clusters']
+ end
+
+ def Hive.slot_ids(cluster)
+ (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+ end
+
+ def Hive.slot_worker_by_cluster_and_path(cluster,path)
+ working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+ Hive.slot_ids(cluster).each do |slot_id|
+ unless working_slots.include?(slot_id)
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+ return slot_id
+ end
+ end
+ #return false if none are available
+ return false
+ end
+
+ def Hive.unslot_worker_by_path(path)
+ begin
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+ return true
+ rescue
+ return false
+ end
+ end
+
+ def Hive.databases(cluster,user_name)
+ Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+ end
+
  # converts a source path or target path to a dst in the context of handler and stage
- def Hive.path_to_dst(path,stage_path,gdrive_slot)
+ def Hive.path_to_dst(path,stage_path)
  has_handler = true if path.index("://")
  s = Stage.where(:path=>stage_path).first
  params = s.params
@@ -29,7 +78,7 @@ module Mobilize
  return Dataset.find_or_create_by_url(hive_url)
  end
  #otherwise, use hdfs convention
- return Ssh.path_to_dst(path,stage_path,gdrive_slot)
+ return Ssh.path_to_dst(path,stage_path)
  end

  def Hive.url_by_path(path,user_name,is_target=false)
@@ -59,7 +108,7 @@ module Mobilize
  def Hive.table_stats(cluster,db,table,user_name)
  describe_sql = "use #{db};describe extended #{table};"
  describe_response = Hive.run(cluster, describe_sql,user_name)
- return nil if describe_response['stdout'].length==0
+ return describe_response if describe_response['stdout'].length==0
  describe_output = describe_response['stdout']
  describe_output.split("location:").last.split(",").first
  #get location, fields, partitions
@@ -93,43 +142,20 @@ module Mobilize
  end

  #run a generic hive command, with the option of passing a file hash to be locally available
- def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
+ def Hive.run(cluster,hql,user_name,file_hash=nil)
  # no TempStatsStore
  hql = "set hive.stats.autogather=false;#{hql}"
  filename = hql.to_md5
  file_hash||= {}
  file_hash[filename] = hql
- #add in default params
- params ||= {}
- params = params.merge(Hive.default_params)
- #replace any params in the file_hash and command
- params.each do |k,v|
- file_hash.each do |name,data|
- if k.starts_with?("$")
- data.gsub!(k,v)
- else
- data.gsub!("@#{k}",v)
- end
- end
- end
  #silent mode so we don't have logs in stderr; clip output
  #at hadoop read limit
  command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
  gateway_node = Hadoop.gateway_node(cluster)
- response = Ssh.run(gateway_node,command,user_name,file_hash)
- #override exit code 0 when stdout is blank and
- #stderror contains FAILED or KILLED
- if response['stdout'].to_s.length == 0 and
- response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
- response['exit_code'] = 500
- end
- return response
+ Ssh.run(gateway_node,command,user_name,file_hash)
  end

  def Hive.run_by_stage_path(stage_path)
- gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
- #return blank response if there are no slots available
- return nil unless gdrive_slot
  s = Stage.where(:path=>stage_path).first
  params = s.params
  cluster = params['cluster'] || Hive.clusters.keys.first
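
A quick sketch of the slimmed-down `Hive.run` above with the optional `file_hash`: the method writes the HQL to an md5-named file, adds it to `file_hash`, and passes everything to `Ssh.run` on the cluster's gateway node, so extra entries are staged alongside the script. The file name, cluster, and user here are assumptions:

```ruby
# Hypothetical example; "dates.txt" is staged next to the generated HQL file.
file_hash = { "dates.txt" => "2013-03-01\n2013-03-02\n" }
response  = Mobilize::Hive.run(
  "test_cluster",
  "select * from mobilize.hive_test_1;",
  "mobilize_user",
  file_hash
)
puts response['stdout']
```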
@@ -148,16 +174,13 @@ module Mobilize
  if params['hql']
  hql = params['hql']
  else
- source = s.sources(gdrive_slot).first
- hql = source.read(user_name,gdrive_slot)
+ source = s.sources.first
+ hql = source.read(user_name)
  end

- Gdrive.unslot_worker_by_path(stage_path)
-
  #check for select at end
  hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
- last_statement = hql_array.last.downcase.split("\n").reject{|l| l.starts_with?("-- ")}.first
- if last_statement.to_s.starts_with?("select")
+ if hql_array.last.downcase.starts_with?("select")
  #nil if no prior commands
  prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
  select_hql = hql_array.last
@@ -165,10 +188,10 @@ module Mobilize
  "drop table if exists #{output_path}",
  "create table #{output_path} as #{select_hql};"].join(";")
  full_hql = [prior_hql, output_table_hql].compact.join(";")
- result = Hive.run(cluster,full_hql, user_name,params['params'])
+ result = Hive.run(cluster,full_hql, user_name)
  Dataset.find_or_create_by_url(out_url)
  else
- result = Hive.run(cluster, hql, user_name,params['params'])
+ result = Hive.run(cluster, hql, user_name)
  Dataset.find_or_create_by_url(out_url)
  Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
  end
@@ -201,7 +224,7 @@ module Mobilize
  file_name = schema_path.split("/").last
  out_url = "gridfs://#{schema_path}/#{file_name}"
  Dataset.write_by_url(out_url,out_tsv,user_name)
- schema_tsv = Dataset.find_by_url(out_url).read(user_name,gdrive_slot)
+ schema_tsv = Dataset.find_by_url(out_url).read(user_name)
  schema_hash = {}
  schema_tsv.tsv_to_hash_array.each do |ha|
  schema_hash[ha['name']] = ha['datatype']
@@ -209,10 +232,24 @@ module Mobilize
  schema_hash
  end

- def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
+ def Hive.path_params(cluster, path, user_name)
+ db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+ #get existing table stats if any
+ curr_stats = begin
+ Hive.table_stats(cluster, db, table, user_name)
+ rescue
+ nil
+ end
+ {"db"=>db,
+ "table"=>table,
+ "partitions"=>partitions,
+ "curr_stats"=>curr_stats}
+ end
+
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
  table_path = [db,table].join(".")
- table_stats = Hive.table_stats(cluster, db, table, user_name)
- url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+ target_params = Hive.path_params(cluster, table_path, user_name)
+ table_stats = target_params['curr_stats']

  source_hql_array = source_hql.split(";")
  last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
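
The new `Hive.path_params` helper above splits a dotted or slashed table path into db, table, and partitions, and bundles in the current table stats (nil when the table does not exist yet); `hql_to_table` and `tsv_to_table` now read `curr_stats` from it instead of calling `Hive.table_stats` directly. A sketch with assumed cluster, path, and user names:

```ruby
# "mobilize.hive_test_1.act_date" and "mobilize/hive_test_1/act_date" parse identically.
params = Mobilize::Hive.path_params("test_cluster", "mobilize/hive_test_1/act_date", "mobilize_user")
params['db']          # => "mobilize"
params['table']       # => "hive_test_1"
params['partitions']  # => ["act_date"]
params['curr_stats']  # => Hive.table_stats hash, or nil if the table is missing
```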
@@ -228,10 +265,11 @@ module Mobilize
  temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
  temp_drop_hql = "drop table if exists #{temp_table_path};"
  temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
- response = Hive.run(cluster,temp_create_hql,user_name,params)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ Hive.run(cluster,temp_create_hql,user_name)

- source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
+ source_params = Hive.path_params(cluster, temp_table_path, user_name)
+ source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+ source_table_stats = source_params['curr_stats']
  source_fields = source_table_stats['field_defs']

  if part_array.length == 0 and
@@ -259,7 +297,7 @@ module Mobilize

  target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

- target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{temp_table_path};"
+ target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

  target_full_hql = [target_name_hql,
  target_drop_hql,
@@ -267,12 +305,10 @@ module Mobilize
  target_insert_hql,
  temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name, params)
-
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ Hive.run(cluster, target_full_hql, user_name)

  elsif part_array.length > 0 and
- table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
  #partitions and no target table or same partitions in both target table and user params

  target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
@@ -316,20 +352,10 @@ module Mobilize

  else
  #get all the permutations of possible partititons
- part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
- part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
- part_perm_hql = part_set_hql + part_select_hql
- response = Hive.run(cluster, part_perm_hql, user_name, params)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
- part_perm_tsv = response['stdout']
+ part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+ part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
  #having gotten the permutations, ensure they are dropped
  part_hash_array = part_perm_tsv.tsv_to_hash_array
- #make sure there is data
- if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
- #blank result set, return url
- return url
- end
-
  part_drop_hql = part_hash_array.map do |h|
  part_drop_stmt = h.map do |name,value|
  part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
@@ -341,16 +367,16 @@ module Mobilize

  target_insert_hql = "insert overwrite table #{table_path} " +
  "partition (#{target_part_stmt}) " +
- "select #{target_field_stmt},#{target_part_stmt} from #{temp_table_path};"
+ "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

  target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name, params)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ Hive.run(cluster, target_full_hql, user_name)
  else
  error_msg = "Incompatible partition specs"
  raise error_msg
  end
+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
  return url
  end

@@ -358,21 +384,14 @@ module Mobilize
  #Accepts options to drop existing target if any
  #also schema with column datatype overrides
  def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
- return nil if source_tsv.strip.length==0
- if source_tsv.index("\r\n")
- source_tsv = source_tsv.gsub("\r\n","\n")
- elsif source_tsv.index("\r")
- source_tsv = source_tsv.gsub("\r","\n")
- end
  source_headers = source_tsv.tsv_header_array

  table_path = [db,table].join(".")
- table_stats = Hive.table_stats(cluster, db, table, user_name)
+ target_params = Hive.path_params(cluster, table_path, user_name)
+ table_stats = target_params['curr_stats']

  schema_hash ||= {}

- url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
-
  if part_array.length == 0 and
  table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
  #no partitions in either user params or the target table
@@ -399,11 +418,10 @@ module Mobilize

  target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

- response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ Hive.run(cluster, target_full_hql, user_name, file_hash)

  elsif part_array.length > 0 and
- table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
+ table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
  #partitions and no target table
  #or same partitions in both target table and user params
  #or drop and start fresh
@@ -427,17 +445,13 @@ module Mobilize
  "partitioned by #{partition_defs}"

  #create target table early if not here
- response = Hive.run(cluster, target_create_hql, user_name)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-
- #return url (operation complete) if there's no data
- source_hash_array = source_tsv.tsv_to_hash_array
- return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0
+ Hive.run(cluster, target_create_hql, user_name)

  table_stats = Hive.table_stats(cluster, db, table, user_name)

  #create data hash from source hash array
  data_hash = {}
+ source_hash_array = source_tsv.tsv_to_hash_array
  source_hash_array.each do |ha|
  tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
  tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
@@ -470,8 +484,7 @@ module Mobilize
  #run actual partition adds all at once
  if target_part_hql.length>0
  puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
- response = Hive.run(cluster, target_part_hql, user_name)
- raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+ Hive.run(cluster, target_part_hql, user_name)
  end
  else
  error_msg = "Incompatible partition specs: " +
@@ -479,31 +492,33 @@
  "user_params:#{part_array.to_s}"
  raise error_msg
  end
-
+ url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
  return url
  end

  def Hive.write_by_stage_path(stage_path)
- gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
- #return blank response if there are no slots available
- return nil unless gdrive_slot
  s = Stage.where(:path=>stage_path).first
  params = s.params
- source = s.sources(gdrive_slot).first
+ source = s.sources.first
  target = s.target
  cluster, db, table = target.url.split("://").last.split("/")
- #slot Hive worker if available
- slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
- return false unless slot_id
  #update stage with the node so we can use it
  user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
  job_name = s.path.sub("Runner_","")

+ #slot Hive worker if available
+ slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+ return false unless slot_id
+
  schema_hash = if params['schema']
+ gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+ #return blank response if there are no slots available
+ return nil unless gdrive_slot
  Hive.schema_hash(params['schema'],user_name,gdrive_slot)
  else
  {}
  end
+ Gdrive.unslot_worker_by_path(stage_path)
  #drop target before create/insert?
  drop = params['drop']

@@ -516,17 +531,16 @@ module Mobilize
  #source table
  cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
  source_hql = "select * from #{source_path};"
- elsif ['gsheet','gfile','gridfs','hdfs'].include?(source.handler)
+ elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
  if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
- source_hql = source.read(user_name,gdrive_slot)
+ source_hql = source.read(user_name)
  else
- #tsv from sheet or file
- source_tsv = source.read(user_name,gdrive_slot)
+ #tsv from sheet
+ source_tsv = source.read(user_name)
  end
  end
  end

- Gdrive.unslot_worker_by_path(stage_path)
  part_array = if params['partitions']
  params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
  elsif params['target']
@@ -545,14 +559,12 @@ module Mobilize
  Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
  elsif source_tsv
  Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
- elsif source
- #null sheet
  else
  raise "Unable to determine source tsv or source hql"
  end
  {'stdout'=>url,'exit_code'=>0}
  rescue => exc
- {'stderr'=>"#{exc.to_s}\n#{exc.backtrace.join("\n")}", 'exit_code'=>500}
+ {'stderr'=>exc.to_s, 'exit_code'=>500}
  end

  #unslot worker and write result
@@ -573,8 +585,11 @@ module Mobilize
  select_hql = "select * from #{source_path};"
  hql = [set_hql,select_hql].join
  response = Hive.run(cluster, hql,user_name)
- raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}" if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
- return response['stdout']
+ if response['exit_code']==0
+ return response['stdout']
+ else
+ raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+ end
  end

  def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
lib/mobilize-hive/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Mobilize
  module Hive
- VERSION = "1.3"
+ VERSION = "1.21"
  end
  end
mobilize-hive.gemspec CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
- gem.add_runtime_dependency "mobilize-hdfs","1.3"
+ gem.add_runtime_dependency "mobilize-hdfs","1.21"
  end
@@ -20,15 +20,7 @@
  active: true
  trigger: after hive_test_2
  status: ""
- stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
+ stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
- - name: hive_test_4
- active: true
- trigger: after hive_test_3
- status: ""
- stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
- stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
- stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
- stage4: gsheet.write source:stage3, target:"hive_test_4.out"
@@ -25,18 +25,6 @@ describe "Mobilize" do
  hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
  hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)

- #create blank sheet
- hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
- [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
- hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-
- #create sheet w just headers
- hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
- [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
- hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
- hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
- hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
-
  hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
  [hive_1_schema_sheet].each {|s| s.delete if s}
  hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -63,25 +51,21 @@ describe "Mobilize" do
  [hive_2_target_sheet].each{|s| s.delete if s}
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
  [hive_3_target_sheet].each{|s| s.delete if s}
- hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
- [hive_4_target_sheet].each{|s| s.delete if s}

  puts "job row added, force enqueued requestor, wait for stages"
  r.enqueue!
- wait_for_stages(2100)
+ wait_for_stages(1200)

  puts "jobtracker posted data to test sheet"
  hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
  hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
  hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
  hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
- hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)

  assert hive_1_stage_2_target_sheet.read(u.name).length == 219
  assert hive_1_stage_3_target_sheet.read(u.name).length > 3
  assert hive_2_target_sheet.read(u.name).length == 599
  assert hive_3_target_sheet.read(u.name).length == 347
- assert hive_4_target_sheet.read(u.name).length == 432
  end

  def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: mobilize-hive
  version: !ruby/object:Gem::Version
- version: '1.3'
+ version: '1.21'
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-04-18 00:00:00.000000000 Z
+ date: 2013-03-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '1.21'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '1.21'
  description: Adds hive read, write, and run support to mobilize-hdfs
  email:
  - cpaesleme@dena.com
@@ -41,7 +41,6 @@ files:
  - Rakefile
  - lib/mobilize-hive.rb
  - lib/mobilize-hive/handlers/hive.rb
- - lib/mobilize-hive/helpers/hive_helper.rb
  - lib/mobilize-hive/tasks.rb
  - lib/mobilize-hive/version.rb
  - lib/samples/hive.yml
@@ -65,12 +64,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ segments:
+ - 0
+ hash: -4590609456874633429
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ segments:
+ - 0
+ hash: -4590609456874633429
  requirements: []
  rubyforge_project:
  rubygems_version: 1.8.25
lib/mobilize-hive/helpers/hive_helper.rb DELETED
@@ -1,63 +0,0 @@
- module Mobilize
- module Hive
- def self.config
- Base.config('hive')
- end
-
- def self.exec_path(cluster)
- self.clusters[cluster]['exec_path']
- end
-
- def self.output_db(cluster)
- self.clusters[cluster]['output_db']
- end
-
- def self.output_db_user(cluster)
- output_db_node = Hadoop.gateway_node(cluster)
- output_db_user = Ssh.host(output_db_node)['user']
- output_db_user
- end
-
- def self.clusters
- self.config['clusters']
- end
-
- def self.slot_ids(cluster)
- (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
- end
-
- def self.slot_worker_by_cluster_and_path(cluster,path)
- working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
- self.slot_ids(cluster).each do |slot_id|
- unless working_slots.include?(slot_id)
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
- return slot_id
- end
- end
- #return false if none are available
- return false
- end
-
- def self.unslot_worker_by_path(path)
- begin
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
- return true
- rescue
- return false
- end
- end
-
- def self.databases(cluster,user_name)
- self.run(cluster,"show databases",user_name)['stdout'].split("\n")
- end
-
- def self.default_params
- time = Time.now.utc
- {
- '$utc_date'=>time.strftime("%Y-%m-%d"),
- '$utc_time'=>time.strftime("%H:%M"),
- }
- end
- end
- end
-