mobilize-hive 1.3 → 1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -11
- data/lib/mobilize-hive/handlers/hive.rb +118 -103
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +1 -1
- data/test/hive_job_rows.yml +1 -9
- data/test/mobilize-hive_test.rb +1 -17
- metadata +10 -5
- data/lib/mobilize-hive/helpers/hive_helper.rb +0 -63
data/README.md
CHANGED
@@ -142,17 +142,6 @@ Start
   * cluster and user are optional for all of the below.
     * cluster defaults to the first cluster listed;
     * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
-  * params are also optional for all of the below. They replace HQL in sources.
-    * params are passed as a YML or JSON, as in:
-      * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
-      * this example replaces all the keys, preceded by '@' in all source hqls with the value.
-        * The preceding '@' is used to keep from replacing instances
-          of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
-          if you'd like to replace those tokens.
-    * in addition, the following params are substituted automatically:
-      * `$utc_date` - replaced with YYYY-MM-DD date, UTC
-      * `$utc_time` - replaced with HH:MM time, UTC
-      * any occurrence of these values in HQL will be replaced at runtime.
 * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
data/lib/mobilize-hive/handlers/hive.rb
CHANGED
@@ -1,9 +1,58 @@
 module Mobilize
   module Hive
-
-
+    def Hive.config
+      Base.config('hive')
+    end
+
+    def Hive.exec_path(cluster)
+      Hive.clusters[cluster]['exec_path']
+    end
+
+    def Hive.output_db(cluster)
+      Hive.clusters[cluster]['output_db']
+    end
+
+    def Hive.output_db_user(cluster)
+      output_db_node = Hadoop.gateway_node(cluster)
+      output_db_user = Ssh.host(output_db_node)['user']
+      output_db_user
+    end
+
+    def Hive.clusters
+      Hive.config['clusters']
+    end
+
+    def Hive.slot_ids(cluster)
+      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+    end
+
+    def Hive.slot_worker_by_cluster_and_path(cluster,path)
+      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+      Hive.slot_ids(cluster).each do |slot_id|
+        unless working_slots.include?(slot_id)
+          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+          return slot_id
+        end
+      end
+      #return false if none are available
+      return false
+    end
+
+    def Hive.unslot_worker_by_path(path)
+      begin
+        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def Hive.databases(cluster,user_name)
+      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
     # converts a source path or target path to a dst in the context of handler and stage
-    def Hive.path_to_dst(path,stage_path
+    def Hive.path_to_dst(path,stage_path)
       has_handler = true if path.index("://")
       s = Stage.where(:path=>stage_path).first
       params = s.params
@@ -29,7 +78,7 @@ module Mobilize
         return Dataset.find_or_create_by_url(hive_url)
       end
       #otherwise, use hdfs convention
-      return Ssh.path_to_dst(path,stage_path
+      return Ssh.path_to_dst(path,stage_path)
     end

     def Hive.url_by_path(path,user_name,is_target=false)
@@ -59,7 +108,7 @@ module Mobilize
     def Hive.table_stats(cluster,db,table,user_name)
       describe_sql = "use #{db};describe extended #{table};"
       describe_response = Hive.run(cluster, describe_sql,user_name)
-      return
+      return describe_response if describe_response['stdout'].length==0
       describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
@@ -93,43 +142,20 @@ module Mobilize
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,
+    def Hive.run(cluster,hql,user_name,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
       file_hash||= {}
       file_hash[filename] = hql
-      #add in default params
-      params ||= {}
-      params = params.merge(Hive.default_params)
-      #replace any params in the file_hash and command
-      params.each do |k,v|
-        file_hash.each do |name,data|
-          if k.starts_with?("$")
-            data.gsub!(k,v)
-          else
-            data.gsub!("@#{k}",v)
-          end
-        end
-      end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-
-      #override exit code 0 when stdout is blank and
-      #stderror contains FAILED or KILLED
-      if response['stdout'].to_s.length == 0 and
-         response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
-        response['exit_code'] = 500
-      end
-      return response
+      Ssh.run(gateway_node,command,user_name,file_hash)
     end

     def Hive.run_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
       cluster = params['cluster'] || Hive.clusters.keys.first
@@ -148,16 +174,13 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-        source = s.sources
-        hql = source.read(user_name
+        source = s.sources.first
+        hql = source.read(user_name)
       end

-      Gdrive.unslot_worker_by_path(stage_path)
-
       #check for select at end
       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
-
-      if last_statement.to_s.starts_with?("select")
+      if hql_array.last.downcase.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
@@ -165,10 +188,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name
+        result = Hive.run(cluster,full_hql, user_name)
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name
+        result = Hive.run(cluster, hql, user_name)
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -201,7 +224,7 @@ module Mobilize
       file_name = schema_path.split("/").last
       out_url = "gridfs://#{schema_path}/#{file_name}"
       Dataset.write_by_url(out_url,out_tsv,user_name)
-      schema_tsv = Dataset.find_by_url(out_url).read(user_name
+      schema_tsv = Dataset.find_by_url(out_url).read(user_name)
       schema_hash = {}
       schema_tsv.tsv_to_hash_array.each do |ha|
         schema_hash[ha['name']] = ha['datatype']
@@ -209,10 +232,24 @@ module Mobilize
       schema_hash
     end

-    def Hive.
+    def Hive.path_params(cluster, path, user_name)
+      db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+      #get existing table stats if any
+      curr_stats = begin
+        Hive.table_stats(cluster, db, table, user_name)
+      rescue
+        nil
+      end
+      {"db"=>db,
+       "table"=>table,
+       "partitions"=>partitions,
+       "curr_stats"=>curr_stats}
+    end
+
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
       table_path = [db,table].join(".")
-
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       source_hql_array = source_hql.split(";")
       last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
@@ -228,10 +265,11 @@ module Mobilize
       temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
       temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-
-      raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+      Hive.run(cluster,temp_create_hql,user_name)

-
+      source_params = Hive.path_params(cluster, temp_table_path, user_name)
+      source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+      source_table_stats = source_params['curr_stats']
       source_fields = source_table_stats['field_defs']

       if part_array.length == 0 and
@@ -259,7 +297,7 @@ module Mobilize

         target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

-        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{
+        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

         target_full_hql = [target_name_hql,
                            target_drop_hql,
@@ -267,12 +305,10 @@ module Mobilize
                            target_insert_hql,
                            temp_drop_hql].join

-
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name)

       elsif part_array.length > 0 and
-            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table or same partitions in both target table and user params

         target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
@@ -316,20 +352,10 @@ module Mobilize

         else
           #get all the permutations of possible partititons
-
-
-          part_perm_hql = part_set_hql + part_select_hql
-          response = Hive.run(cluster, part_perm_hql, user_name, params)
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-          part_perm_tsv = response['stdout']
+          part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+          part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
           #having gotten the permutations, ensure they are dropped
           part_hash_array = part_perm_tsv.tsv_to_hash_array
-          #make sure there is data
-          if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
-            #blank result set, return url
-            return url
-          end
-
           part_drop_hql = part_hash_array.map do |h|
             part_drop_stmt = h.map do |name,value|
               part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
@@ -341,16 +367,16 @@ module Mobilize

         target_insert_hql = "insert overwrite table #{table_path} " +
                             "partition (#{target_part_stmt}) " +
-                            "select #{target_field_stmt},#{target_part_stmt} from #{
+                            "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

         target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name)

       else
         error_msg = "Incompatible partition specs"
         raise error_msg
       end
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
       return url
     end

@@ -358,21 +384,14 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
-      return nil if source_tsv.strip.length==0
-      if source_tsv.index("\r\n")
-        source_tsv = source_tsv.gsub("\r\n","\n")
-      elsif source_tsv.index("\r")
-        source_tsv = source_tsv.gsub("\r","\n")
-      end
       source_headers = source_tsv.tsv_header_array

       table_path = [db,table].join(".")
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       schema_hash ||= {}

-      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
-
       if part_array.length == 0 and
          table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table
@@ -399,11 +418,10 @@ module Mobilize

         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name, file_hash)

       elsif part_array.length > 0 and
-            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table
         #or same partitions in both target table and user params
         #or drop and start fresh
@@ -427,17 +445,13 @@ module Mobilize
                                "partitioned by #{partition_defs}"

         #create target table early if not here
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-
-        #return url (operation complete) if there's no data
-        source_hash_array = source_tsv.tsv_to_hash_array
-        return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0
+        Hive.run(cluster, target_create_hql, user_name)

         table_stats = Hive.table_stats(cluster, db, table, user_name)

         #create data hash from source hash array
         data_hash = {}
+        source_hash_array = source_tsv.tsv_to_hash_array
         source_hash_array.each do |ha|
           tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
           tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
@@ -470,8 +484,7 @@ module Mobilize
         #run actual partition adds all at once
         if target_part_hql.length>0
           puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
-
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+          Hive.run(cluster, target_part_hql, user_name)
         end
       else
         error_msg = "Incompatible partition specs: " +
@@ -479,31 +492,33 @@ module Mobilize
                     "user_params:#{part_array.to_s}"
         raise error_msg
       end
-
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
       return url
     end

     def Hive.write_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
-      source = s.sources
+      source = s.sources.first
       target = s.target
       cluster, db, table = target.url.split("://").last.split("/")
-      #slot Hive worker if available
-      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
-      return false unless slot_id
       #update stage with the node so we can use it
       user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
       job_name = s.path.sub("Runner_","")

+      #slot Hive worker if available
+      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+      return false unless slot_id
+
       schema_hash = if params['schema']
+                      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                      #return blank response if there are no slots available
+                      return nil unless gdrive_slot
                       Hive.schema_hash(params['schema'],user_name,gdrive_slot)
                     else
                       {}
                     end
+      Gdrive.unslot_worker_by_path(stage_path)
       #drop target before create/insert?
       drop = params['drop']

@@ -516,17 +531,16 @@ module Mobilize
         #source table
         cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
         source_hql = "select * from #{source_path};"
-      elsif ['gsheet','
+      elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
         if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
-          source_hql = source.read(user_name
+          source_hql = source.read(user_name)
         else
-          #tsv from sheet
-          source_tsv = source.read(user_name
+          #tsv from sheet
+          source_tsv = source.read(user_name)
         end
       end
       end

-      Gdrive.unslot_worker_by_path(stage_path)
       part_array = if params['partitions']
                      params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
                    elsif params['target']
@@ -545,14 +559,12 @@ module Mobilize
               Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
             elsif source_tsv
               Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
-            elsif source
-              #null sheet
             else
               raise "Unable to determine source tsv or source hql"
             end
        {'stdout'=>url,'exit_code'=>0}
      rescue => exc
-       {'stderr'=>
+       {'stderr'=>exc.to_s, 'exit_code'=>500}
      end

      #unslot worker and write result
@@ -573,8 +585,11 @@ module Mobilize
       select_hql = "select * from #{source_path};"
       hql = [set_hql,select_hql].join
       response = Hive.run(cluster, hql,user_name)
-
-
+      if response['exit_code']==0
+        return response['stdout']
+      else
+        raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+      end
     end

     def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
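With the changes above, `Hive.run` no longer post-processes results; it returns whatever `Ssh.run` returns, and callers such as `Hive.read_by_dataset_path` check the exit code themselves. A minimal sketch of that calling pattern, assuming (as the read path above implies) a response hash with 'stdout', 'stderr', and 'exit_code' keys; `runner` is a hypothetical stand-in for `Mobilize::Hive.run`:

```ruby
# Sketch of the caller-side error handling used in 1.21: run the HQL,
# then raise with stderr if the exit code is nonzero.
def read_table(runner, cluster, table_path, user_name)
  response = runner.call(cluster, "select * from #{table_path};", user_name)
  raise "Unable to read hive://#{table_path}: #{response['stderr']}" unless response['exit_code'] == 0
  response['stdout']
end

# Stubbed runner for illustration only.
stub = lambda do |_cluster, _hql, _user|
  { 'stdout' => "a\tb\n1\t2\n", 'stderr' => '', 'exit_code' => 0 }
end
puts read_table(stub, 'dev', 'mobilize.hive_test_1', 'mobilize')
```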
data/mobilize-hive.gemspec
CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.21"
 end
data/test/hive_job_rows.yml
CHANGED
@@ -20,15 +20,7 @@
   active: true
   trigger: after hive_test_2
   status: ""
-  stage1: hive.run hql:"select
+  stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
-- name: hive_test_4
-  active: true
-  trigger: after hive_test_3
-  status: ""
-  stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
-  stage4: gsheet.write source:stage3, target:"hive_test_4.out"
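The `partitions:"date/product"` and `partitions:"act_date"` values in these job rows are what `Hive.write_by_stage_path` flattens into a partition-name array before calling `hql_to_table`/`tsv_to_table` (see the `part_array = ...` hunk above). A minimal sketch of that conversion; `Array()` stands in here for the `to_a` coercion the gem applies to the raw param:

```ruby
# Sketch: "date/product" (or "date.product") becomes ["date", "product"].
def part_array_from_param(partitions_param)
  Array(partitions_param).map { |p| p.gsub(".", "/").split("/") }.flatten
end

p part_array_from_param("date/product")  # => ["date", "product"]
p part_array_from_param("act_date")      # => ["act_date"]
```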
data/test/mobilize-hive_test.rb
CHANGED
@@ -25,18 +25,6 @@ describe "Mobilize" do
     hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
     hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)

-    #create blank sheet
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-    [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-
-    #create sheet w just headers
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
-    hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
-
     hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
     [hive_1_schema_sheet].each {|s| s.delete if s}
     hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -63,25 +51,21 @@ describe "Mobilize" do
     [hive_2_target_sheet].each{|s| s.delete if s}
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
-    [hive_4_target_sheet].each{|s| s.delete if s}

     puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-    wait_for_stages(
+    wait_for_stages(1200)

     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
     hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
     hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)

     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
     assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
-    assert hive_4_target_sheet.read(u.name).length == 432
   end

   def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.21'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-03-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -41,7 +41,6 @@ files:
 - Rakefile
 - lib/mobilize-hive.rb
 - lib/mobilize-hive/handlers/hive.rb
-- lib/mobilize-hive/helpers/hive_helper.rb
 - lib/mobilize-hive/tasks.rb
 - lib/mobilize-hive/version.rb
 - lib/samples/hive.yml
@@ -65,12 +64,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
data/lib/mobilize-hive/helpers/hive_helper.rb
DELETED
@@ -1,63 +0,0 @@
-module Mobilize
-  module Hive
-    def self.config
-      Base.config('hive')
-    end
-
-    def self.exec_path(cluster)
-      self.clusters[cluster]['exec_path']
-    end
-
-    def self.output_db(cluster)
-      self.clusters[cluster]['output_db']
-    end
-
-    def self.output_db_user(cluster)
-      output_db_node = Hadoop.gateway_node(cluster)
-      output_db_user = Ssh.host(output_db_node)['user']
-      output_db_user
-    end
-
-    def self.clusters
-      self.config['clusters']
-    end
-
-    def self.slot_ids(cluster)
-      (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
-    end
-
-    def self.slot_worker_by_cluster_and_path(cluster,path)
-      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
-      self.slot_ids(cluster).each do |slot_id|
-        unless working_slots.include?(slot_id)
-          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
-          return slot_id
-        end
-      end
-      #return false if none are available
-      return false
-    end
-
-    def self.unslot_worker_by_path(path)
-      begin
-        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
-        return true
-      rescue
-        return false
-      end
-    end
-
-    def self.databases(cluster,user_name)
-      self.run(cluster,"show databases",user_name)['stdout'].split("\n")
-    end
-
-    def self.default_params
-      time = Time.now.utc
-      {
-        '$utc_date'=>time.strftime("%Y-%m-%d"),
-        '$utc_time'=>time.strftime("%H:%M"),
-      }
-    end
-  end
-end
-