mobilize-hive 1.3 → 1.21
- data/README.md +0 -11
- data/lib/mobilize-hive/handlers/hive.rb +118 -103
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +1 -1
- data/test/hive_job_rows.yml +1 -9
- data/test/mobilize-hive_test.rb +1 -17
- metadata +10 -5
- data/lib/mobilize-hive/helpers/hive_helper.rb +0 -63
data/README.md
CHANGED
@@ -142,17 +142,6 @@ Start
   * cluster and user are optional for all of the below.
   * cluster defaults to the first cluster listed;
   * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
-  * params are also optional for all of the below. They replace HQL in sources.
-    * params are passed as a YML or JSON, as in:
-      * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
-      * this example replaces all the keys, preceded by '@' in all source hqls with the value.
-      * The preceding '@' is used to keep from replacing instances
-        of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
-        if you'd like to replace those tokens.
-    * in addition, the following params are substituted automatically:
-      * `$utc_date` - replaced with YYYY-MM-DD date, UTC
-      * `$utc_time` - replaced with HH:MM time, UTC
-      * any occurrence of these values in HQL will be replaced at runtime.
   * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
     script in the hql or source sheet and returns any output specified at the
     end. If the cmd or last query in source is a select statement, column headers will be
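For a concrete example of the surviving `hive.run` syntax, the gem's own test fixture (data/test/hive_job_rows.yml, shown further down in this diff) defines a job stage like:

```
stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
```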
data/lib/mobilize-hive/handlers/hive.rb
CHANGED
@@ -1,9 +1,58 @@
 module Mobilize
   module Hive
-
-
+    def Hive.config
+      Base.config('hive')
+    end
+
+    def Hive.exec_path(cluster)
+      Hive.clusters[cluster]['exec_path']
+    end
+
+    def Hive.output_db(cluster)
+      Hive.clusters[cluster]['output_db']
+    end
+
+    def Hive.output_db_user(cluster)
+      output_db_node = Hadoop.gateway_node(cluster)
+      output_db_user = Ssh.host(output_db_node)['user']
+      output_db_user
+    end
+
+    def Hive.clusters
+      Hive.config['clusters']
+    end
+
+    def Hive.slot_ids(cluster)
+      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+    end
+
+    def Hive.slot_worker_by_cluster_and_path(cluster,path)
+      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+      Hive.slot_ids(cluster).each do |slot_id|
+        unless working_slots.include?(slot_id)
+          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+          return slot_id
+        end
+      end
+      #return false if none are available
+      return false
+    end
+
+    def Hive.unslot_worker_by_path(path)
+      begin
+        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def Hive.databases(cluster,user_name)
+      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
     # converts a source path or target path to a dst in the context of handler and stage
-    def Hive.path_to_dst(path,stage_path
+    def Hive.path_to_dst(path,stage_path)
       has_handler = true if path.index("://")
       s = Stage.where(:path=>stage_path).first
       params = s.params
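The slot helpers added above (folded in from the deleted hive_helper.rb) cap concurrent Hive work per cluster by tagging Resque worker args with a `hive_slot`. A minimal usage sketch; the cluster key, stage path, and user name are hypothetical placeholders, and real callers such as `Hive.write_by_stage_path` supply their own:

```ruby
cluster    = "default"                      # assumed cluster key from the hive config
stage_path = "my_requestor/my_job/stage1"   # hypothetical stage path

# claim one of the cluster's max_slots ids if any is free; false means all are taken
slot_id = Mobilize::Hive.slot_worker_by_cluster_and_path(cluster, stage_path)
if slot_id
  begin
    Mobilize::Hive.run(cluster, "show databases;", "etl_user")
  ensure
    # release the slot whether or not the query succeeded
    Mobilize::Hive.unslot_worker_by_path(stage_path)
  end
end
```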
@@ -29,7 +78,7 @@ module Mobilize
         return Dataset.find_or_create_by_url(hive_url)
       end
       #otherwise, use hdfs convention
-      return Ssh.path_to_dst(path,stage_path
+      return Ssh.path_to_dst(path,stage_path)
     end

     def Hive.url_by_path(path,user_name,is_target=false)
@@ -59,7 +108,7 @@ module Mobilize
     def Hive.table_stats(cluster,db,table,user_name)
       describe_sql = "use #{db};describe extended #{table};"
       describe_response = Hive.run(cluster, describe_sql,user_name)
-      return
+      return describe_response if describe_response['stdout'].length==0
       describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
@@ -93,43 +142,20 @@ module Mobilize
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,
+    def Hive.run(cluster,hql,user_name,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
       file_hash||= {}
       file_hash[filename] = hql
-      #add in default params
-      params ||= {}
-      params = params.merge(Hive.default_params)
-      #replace any params in the file_hash and command
-      params.each do |k,v|
-        file_hash.each do |name,data|
-          if k.starts_with?("$")
-            data.gsub!(k,v)
-          else
-            data.gsub!("@#{k}",v)
-          end
-        end
-      end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-
-      #override exit code 0 when stdout is blank and
-      #stderror contains FAILED or KILLED
-      if response['stdout'].to_s.length == 0 and
-         response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
-        response['exit_code'] = 500
-      end
-      return response
+      Ssh.run(gateway_node,command,user_name,file_hash)
     end

     def Hive.run_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
       cluster = params['cluster'] || Hive.clusters.keys.first
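With the in-process param substitution gone, `Hive.run` now just writes the HQL to a file named by its md5, ships it to the cluster's gateway node via `Ssh.run`, and returns that response hash. A hedged usage sketch; the cluster and user names are hypothetical:

```ruby
response = Mobilize::Hive.run("default", "show tables in mobilize;", "etl_user")

# the Ssh.run response hash carries stdout/stderr/exit_code; callers such as
# Hive.read_by_dataset_path now check exit_code themselves
if response['exit_code'] == 0
  puts response['stdout']
else
  warn response['stderr']
end

# the optional fourth argument stages extra files alongside the generated HQL file, e.g.
# Mobilize::Hive.run("default", hql, "etl_user", {"lookup.tsv" => lookup_tsv})
```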
@@ -148,16 +174,13 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-        source = s.sources
-        hql = source.read(user_name
+        source = s.sources.first
+        hql = source.read(user_name)
       end

-      Gdrive.unslot_worker_by_path(stage_path)
-
       #check for select at end
       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
-
-      if last_statement.to_s.starts_with?("select")
+      if hql_array.last.downcase.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
@@ -165,10 +188,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name
+        result = Hive.run(cluster,full_hql, user_name)
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name
+        result = Hive.run(cluster, hql, user_name)
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -201,7 +224,7 @@ module Mobilize
       file_name = schema_path.split("/").last
       out_url = "gridfs://#{schema_path}/#{file_name}"
       Dataset.write_by_url(out_url,out_tsv,user_name)
-      schema_tsv = Dataset.find_by_url(out_url).read(user_name
+      schema_tsv = Dataset.find_by_url(out_url).read(user_name)
       schema_hash = {}
       schema_tsv.tsv_to_hash_array.each do |ha|
         schema_hash[ha['name']] = ha['datatype']
@@ -209,10 +232,24 @@ module Mobilize
       schema_hash
     end

-    def Hive.
+    def Hive.path_params(cluster, path, user_name)
+      db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+      #get existing table stats if any
+      curr_stats = begin
+                     Hive.table_stats(cluster, db, table, user_name)
+                   rescue
+                     nil
+                   end
+      {"db"=>db,
+       "table"=>table,
+       "partitions"=>partitions,
+       "curr_stats"=>curr_stats}
+    end
+
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
       table_path = [db,table].join(".")
-
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       source_hql_array = source_hql.split(";")
       last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
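The new `Hive.path_params` helper centralizes the db/table/partition parsing plus the `table_stats` lookup that both `hql_to_table` and `tsv_to_table` need. A sketch of the hash it returns; the cluster, table, and user names are hypothetical, and actually running it requires a configured cluster:

```ruby
params = Mobilize::Hive.path_params("default", "mobilize.hive_test_1", "etl_user")

params["db"]          #=> "mobilize"
params["table"]       #=> "hive_test_1"
params["partitions"]  #=> []  (anything after db/table in a "db/table/part1/..." path)
params["curr_stats"]  #=> Hive.table_stats output, or nil if the table does not exist yet
```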
@@ -228,10 +265,11 @@ module Mobilize
         temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
         temp_drop_hql = "drop table if exists #{temp_table_path};"
         temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster,temp_create_hql,user_name)

-
+        source_params = Hive.path_params(cluster, temp_table_path, user_name)
+        source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+        source_table_stats = source_params['curr_stats']
         source_fields = source_table_stats['field_defs']

         if part_array.length == 0 and
@@ -259,7 +297,7 @@ module Mobilize

           target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

-          target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{
+          target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

           target_full_hql = [target_name_hql,
                              target_drop_hql,
@@ -267,12 +305,10 @@ module Mobilize
                              target_insert_hql,
                              temp_drop_hql].join

-
-
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+          Hive.run(cluster, target_full_hql, user_name)

         elsif part_array.length > 0 and
-          table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+          table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
           #partitions and no target table or same partitions in both target table and user params

           target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
@@ -316,20 +352,10 @@ module Mobilize

           else
             #get all the permutations of possible partititons
-
-
-            part_perm_hql = part_set_hql + part_select_hql
-            response = Hive.run(cluster, part_perm_hql, user_name, params)
-            raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-            part_perm_tsv = response['stdout']
+            part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+            part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
             #having gotten the permutations, ensure they are dropped
             part_hash_array = part_perm_tsv.tsv_to_hash_array
-            #make sure there is data
-            if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
-              #blank result set, return url
-              return url
-            end
-
             part_drop_hql = part_hash_array.map do |h|
               part_drop_stmt = h.map do |name,value|
                 part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
@@ -341,16 +367,16 @@ module Mobilize

           target_insert_hql = "insert overwrite table #{table_path} " +
                               "partition (#{target_part_stmt}) " +
-                              "select #{target_field_stmt},#{target_part_stmt} from #{
+                              "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

           target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

-
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+          Hive.run(cluster, target_full_hql, user_name)
         else
           error_msg = "Incompatible partition specs"
           raise error_msg
         end
+        url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
         return url
       end

@@ -358,21 +384,14 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
-      return nil if source_tsv.strip.length==0
-      if source_tsv.index("\r\n")
-        source_tsv = source_tsv.gsub("\r\n","\n")
-      elsif source_tsv.index("\r")
-        source_tsv = source_tsv.gsub("\r","\n")
-      end
       source_headers = source_tsv.tsv_header_array

       table_path = [db,table].join(".")
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       schema_hash ||= {}

-      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
-
       if part_array.length == 0 and
          table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table
@@ -399,11 +418,10 @@ module Mobilize

         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name, file_hash)

       elsif part_array.length > 0 and
-        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table
         #or same partitions in both target table and user params
         #or drop and start fresh
@@ -427,17 +445,13 @@ module Mobilize
                            "partitioned by #{partition_defs}"

         #create target table early if not here
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-
-        #return url (operation complete) if there's no data
-        source_hash_array = source_tsv.tsv_to_hash_array
-        return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0
+        Hive.run(cluster, target_create_hql, user_name)

         table_stats = Hive.table_stats(cluster, db, table, user_name)

         #create data hash from source hash array
         data_hash = {}
+        source_hash_array = source_tsv.tsv_to_hash_array
         source_hash_array.each do |ha|
           tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
           tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
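A worked example of the grouping step above in `tsv_to_table`: each source row's partition columns become a `name=value/...` key, and the remaining columns become a ctrl-A delimited data line. The row values here are hypothetical, modeled on the test fixtures:

```ruby
part_array = ["date", "product"]
ha = {"date" => "2013-03-01", "product" => "widget", "category" => "a", "value" => "1"}

tpmk = part_array.map { |pn| "#{pn}=#{ha[pn]}" }.join("/")
#=> "date=2013-03-01/product=widget"

tpmv = ha.reject { |k, _v| part_array.include?(k) }.values.join("\001")
#=> the non-partition values "a" and "1" joined by the \001 field separator
```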
@@ -470,8 +484,7 @@ module Mobilize
         #run actual partition adds all at once
         if target_part_hql.length>0
           puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
-
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+          Hive.run(cluster, target_part_hql, user_name)
         end
       else
         error_msg = "Incompatible partition specs: " +
@@ -479,31 +492,33 @@ module Mobilize
                     "user_params:#{part_array.to_s}"
         raise error_msg
       end
-
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
       return url
     end

     def Hive.write_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
-      source = s.sources
+      source = s.sources.first
       target = s.target
       cluster, db, table = target.url.split("://").last.split("/")
-      #slot Hive worker if available
-      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
-      return false unless slot_id
       #update stage with the node so we can use it
       user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
       job_name = s.path.sub("Runner_","")

+      #slot Hive worker if available
+      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+      return false unless slot_id
+
       schema_hash = if params['schema']
+                      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                      #return blank response if there are no slots available
+                      return nil unless gdrive_slot
                       Hive.schema_hash(params['schema'],user_name,gdrive_slot)
                     else
                       {}
                     end
+      Gdrive.unslot_worker_by_path(stage_path)
       #drop target before create/insert?
       drop = params['drop']

@@ -516,17 +531,16 @@ module Mobilize
           #source table
           cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
           source_hql = "select * from #{source_path};"
-        elsif ['gsheet','
+        elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
           if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
-            source_hql = source.read(user_name
+            source_hql = source.read(user_name)
           else
-            #tsv from sheet
-            source_tsv = source.read(user_name
+            #tsv from sheet
+            source_tsv = source.read(user_name)
           end
         end
       end

-      Gdrive.unslot_worker_by_path(stage_path)
       part_array = if params['partitions']
                      params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
                    elsif params['target']
@@ -545,14 +559,12 @@ module Mobilize
              Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
            elsif source_tsv
              Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
-           elsif source
-             #null sheet
            else
              raise "Unable to determine source tsv or source hql"
            end
       {'stdout'=>url,'exit_code'=>0}
     rescue => exc
-      {'stderr'=>
+      {'stderr'=>exc.to_s, 'exit_code'=>500}
     end

     #unslot worker and write result
@@ -573,8 +585,11 @@ module Mobilize
       select_hql = "select * from #{source_path};"
       hql = [set_hql,select_hql].join
       response = Hive.run(cluster, hql,user_name)
-
-
+      if response['exit_code']==0
+        return response['stdout']
+      else
+        raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+      end
     end

     def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
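`Hive.read_by_dataset_path` now raises on a nonzero exit code instead of returning whatever landed on stdout, so failed reads surface to the caller. A hedged sketch of the new behavior; the URL and user are hypothetical, and the Dataset-level read is assumed to route through this handler method:

```ruby
begin
  tsv = Mobilize::Dataset.find_by_url("hive://default/mobilize/hive_test_1").read("etl_user")
  puts tsv.length
rescue => e
  # message is of the form "Unable to read hive://... with error: <hive stderr>"
  warn e.message
end
```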
data/mobilize-hive.gemspec
CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.21"
 end
data/test/hive_job_rows.yml
CHANGED
@@ -20,15 +20,7 @@
   active: true
   trigger: after hive_test_2
   status: ""
-  stage1: hive.run hql:"select
+  stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
-- name: hive_test_4
-  active: true
-  trigger: after hive_test_3
-  status: ""
-  stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
-  stage4: gsheet.write source:stage3, target:"hive_test_4.out"
data/test/mobilize-hive_test.rb
CHANGED
@@ -25,18 +25,6 @@ describe "Mobilize" do
     hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
     hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)

-    #create blank sheet
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-    [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-
-    #create sheet w just headers
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
-    hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
-
     hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
     [hive_1_schema_sheet].each {|s| s.delete if s}
     hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -63,25 +51,21 @@ describe "Mobilize" do
     [hive_2_target_sheet].each{|s| s.delete if s}
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
-    [hive_4_target_sheet].each{|s| s.delete if s}

     puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-    wait_for_stages(
+    wait_for_stages(1200)

     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
     hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
     hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)

     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
     assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
-    assert hive_4_target_sheet.read(u.name).length == 432
   end

   def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.21'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-03-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -41,7 +41,6 @@ files:
 - Rakefile
 - lib/mobilize-hive.rb
 - lib/mobilize-hive/handlers/hive.rb
-- lib/mobilize-hive/helpers/hive_helper.rb
 - lib/mobilize-hive/tasks.rb
 - lib/mobilize-hive/version.rb
 - lib/samples/hive.yml
@@ -65,12 +64,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
data/lib/mobilize-hive/helpers/hive_helper.rb
REMOVED
@@ -1,63 +0,0 @@
-module Mobilize
-  module Hive
-    def self.config
-      Base.config('hive')
-    end
-
-    def self.exec_path(cluster)
-      self.clusters[cluster]['exec_path']
-    end
-
-    def self.output_db(cluster)
-      self.clusters[cluster]['output_db']
-    end
-
-    def self.output_db_user(cluster)
-      output_db_node = Hadoop.gateway_node(cluster)
-      output_db_user = Ssh.host(output_db_node)['user']
-      output_db_user
-    end
-
-    def self.clusters
-      self.config['clusters']
-    end
-
-    def self.slot_ids(cluster)
-      (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
-    end
-
-    def self.slot_worker_by_cluster_and_path(cluster,path)
-      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
-      self.slot_ids(cluster).each do |slot_id|
-        unless working_slots.include?(slot_id)
-          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
-          return slot_id
-        end
-      end
-      #return false if none are available
-      return false
-    end
-
-    def self.unslot_worker_by_path(path)
-      begin
-        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
-        return true
-      rescue
-        return false
-      end
-    end
-
-    def self.databases(cluster,user_name)
-      self.run(cluster,"show databases",user_name)['stdout'].split("\n")
-    end
-
-    def self.default_params
-      time = Time.now.utc
-      {
-        '$utc_date'=>time.strftime("%Y-%m-%d"),
-        '$utc_time'=>time.strftime("%H:%M"),
-      }
-    end
-  end
-end
-