mobilize-hive 1.2 → 1.3
- data/README.md +11 -0
- data/lib/mobilize-hive/handlers/hive.rb +103 -118
- data/lib/mobilize-hive/helpers/hive_helper.rb +63 -0
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +1 -1
- data/test/hive_job_rows.yml +9 -1
- data/test/mobilize-hive_test.rb +17 -1
- metadata +6 -6
data/README.md CHANGED
@@ -142,6 +142,17 @@ Start
   * cluster and user are optional for all of the below.
     * cluster defaults to the first cluster listed;
     * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
+  * params are also optional for all of the below. They replace tokens in the HQL in sources.
+    * params are passed as YML or JSON, as in:
+      * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
+      * this example replaces all keys preceded by '@' in all source HQLs with the corresponding value.
+      * the preceding '@' keeps the substitution from replacing other instances
+        of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
+        if you'd like those tokens replaced.
+  * in addition, the following params are substituted automatically:
+    * `$utc_date` - replaced with the YYYY-MM-DD date, UTC
+    * `$utc_time` - replaced with the HH:MM time, UTC
+    * any occurrence of these values in HQL will be replaced at runtime.
 * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
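To make the params substitution documented above concrete, here is a minimal Ruby sketch of its effect; the table name, column names, and param values are invented for illustration and this is not the gem's own substitution code:

```ruby
# Hypothetical illustration of the README's params option (not the gem's internals).
hql    = "select * from sales where sale_date = '@date' and unit = '@unit';"
params = { 'date' => '2013-03-01', 'unit' => 'widgets' }

# '@'-prefixed tokens are swapped for their param values; bare words are left alone.
resolved = params.inject(hql) { |q, (k, v)| q.gsub("@#{k}", v) }

puts resolved
# => select * from sales where sale_date = '2013-03-01' and unit = 'widgets';
```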
data/lib/mobilize-hive/handlers/hive.rb CHANGED
@@ -1,58 +1,9 @@
 module Mobilize
   module Hive
-    def Hive.config
-      Base.config('hive')
-    end
-
-    def Hive.exec_path(cluster)
-      Hive.clusters[cluster]['exec_path']
-    end
-
-    def Hive.output_db(cluster)
-      Hive.clusters[cluster]['output_db']
-    end
-
-    def Hive.output_db_user(cluster)
-      output_db_node = Hadoop.gateway_node(cluster)
-      output_db_user = Ssh.host(output_db_node)['user']
-      output_db_user
-    end
-
-    def Hive.clusters
-      Hive.config['clusters']
-    end
-
-    def Hive.slot_ids(cluster)
-      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
-    end
-
-    def Hive.slot_worker_by_cluster_and_path(cluster,path)
-      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
-      Hive.slot_ids(cluster).each do |slot_id|
-        unless working_slots.include?(slot_id)
-          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
-          return slot_id
-        end
-      end
-      #return false if none are available
-      return false
-    end
-
-    def Hive.unslot_worker_by_path(path)
-      begin
-        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
-        return true
-      rescue
-        return false
-      end
-    end
-
-    def Hive.databases(cluster,user_name)
-      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
-    end
-
+    #adds convenience methods
+    require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
     # converts a source path or target path to a dst in the context of handler and stage
-    def Hive.path_to_dst(path,stage_path)
+    def Hive.path_to_dst(path,stage_path,gdrive_slot)
       has_handler = true if path.index("://")
       s = Stage.where(:path=>stage_path).first
       params = s.params
@@ -78,7 +29,7 @@ module Mobilize
         return Dataset.find_or_create_by_url(hive_url)
       end
       #otherwise, use hdfs convention
-      return Ssh.path_to_dst(path,stage_path)
+      return Ssh.path_to_dst(path,stage_path,gdrive_slot)
     end
 
     def Hive.url_by_path(path,user_name,is_target=false)
@@ -108,7 +59,7 @@ module Mobilize
     def Hive.table_stats(cluster,db,table,user_name)
       describe_sql = "use #{db};describe extended #{table};"
       describe_response = Hive.run(cluster, describe_sql,user_name)
-      return
+      return nil if describe_response['stdout'].length==0
       describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
@@ -142,20 +93,43 @@ module Mobilize
     end
 
     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,file_hash=nil)
+    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
       file_hash||= {}
       file_hash[filename] = hql
+      #add in default params
+      params ||= {}
+      params = params.merge(Hive.default_params)
+      #replace any params in the file_hash and command
+      params.each do |k,v|
+        file_hash.each do |name,data|
+          if k.starts_with?("$")
+            data.gsub!(k,v)
+          else
+            data.gsub!("@#{k}",v)
+          end
+        end
+      end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-      Ssh.run(gateway_node,command,user_name,file_hash)
+      response = Ssh.run(gateway_node,command,user_name,file_hash)
+      #override exit code 0 when stdout is blank and
+      #stderror contains FAILED or KILLED
+      if response['stdout'].to_s.length == 0 and
+        response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
+        response['exit_code'] = 500
+      end
+      return response
     end
 
     def Hive.run_by_stage_path(stage_path)
+      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+      #return blank response if there are no slots available
+      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
       cluster = params['cluster'] || Hive.clusters.keys.first
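The new `Hive.run` above does two things: it substitutes params into the staged HQL file, and it forces a failing exit code when Hive produces no stdout but reports FAILED or KILLED. The following is a hedged, standalone Ruby sketch of that logic using plain hashes in place of the gem's Ssh/Resque plumbing; the variable names and sample query are illustrative only:

```ruby
# Sketch of the substitution + exit-code override added to Hive.run above.
default_params = { '$utc_date' => Time.now.utc.strftime("%Y-%m-%d") }
params         = { 'date' => '2013-03-01' }.merge(default_params)

file_hash = { 'query.hql' => "select * from t where d='@date' and run='$utc_date';" }

# '$'-prefixed params are replaced literally; everything else is matched as '@key'.
params.each do |k, v|
  file_hash.each do |_name, data|
    k.start_with?("$") ? data.gsub!(k, v) : data.gsub!("@#{k}", v)
  end
end

# Treat a blank stdout plus FAILED/KILLED in stderr as a failure (exit code 500).
response = { 'stdout' => '', 'stderr' => 'FAILED: SemanticException ...', 'exit_code' => 0 }
if response['stdout'].to_s.empty? && response['stderr'].to_s =~ /FAILED|KILLED/
  response['exit_code'] = 500
end
```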
@@ -174,13 +148,16 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-        source = s.sources.first
-        hql = source.read(user_name)
+        source = s.sources(gdrive_slot).first
+        hql = source.read(user_name,gdrive_slot)
       end
 
+      Gdrive.unslot_worker_by_path(stage_path)
+
       #check for select at end
       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
-      if hql_array.last.downcase.starts_with?("select")
+      last_statement = hql_array.last.downcase.split("\n").reject{|l| l.starts_with?("-- ")}.first
+      if last_statement.to_s.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
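The change above makes the trailing-select check tolerant of `-- ` comment lines at the top of the final statement. A hedged standalone sketch of the same detection, with an invented HQL string:

```ruby
# Sketch of the trailing-select detection shown above; the hql string is made up.
hql = "set mapred.job.name=demo;\n-- output query\nselect product, sum(value) from sales group by product"

hql_array = hql.split(";").map { |hc| hc.strip }.reject { |hc| hc.empty? }

# Drop leading "-- " comment lines so a commented header doesn't hide the select.
last_statement = hql_array.last.downcase.split("\n").reject { |l| l.start_with?("-- ") }.first

puts last_statement.to_s.start_with?("select")  # => true
```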
@@ -188,10 +165,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name)
+        result = Hive.run(cluster,full_hql, user_name,params['params'])
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name)
+        result = Hive.run(cluster, hql, user_name,params['params'])
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -224,7 +201,7 @@ module Mobilize
         file_name = schema_path.split("/").last
         out_url = "gridfs://#{schema_path}/#{file_name}"
         Dataset.write_by_url(out_url,out_tsv,user_name)
-        schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+        schema_tsv = Dataset.find_by_url(out_url).read(user_name,gdrive_slot)
         schema_hash = {}
         schema_tsv.tsv_to_hash_array.each do |ha|
           schema_hash[ha['name']] = ha['datatype']
|
|
232
209
|
schema_hash
|
233
210
|
end
|
234
211
|
|
235
|
-
def Hive.
|
236
|
-
db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
|
237
|
-
#get existing table stats if any
|
238
|
-
curr_stats = begin
|
239
|
-
Hive.table_stats(cluster, db, table, user_name)
|
240
|
-
rescue
|
241
|
-
nil
|
242
|
-
end
|
243
|
-
{"db"=>db,
|
244
|
-
"table"=>table,
|
245
|
-
"partitions"=>partitions,
|
246
|
-
"curr_stats"=>curr_stats}
|
247
|
-
end
|
248
|
-
|
249
|
-
def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
|
212
|
+
def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
|
250
213
|
table_path = [db,table].join(".")
|
251
|
-
|
252
|
-
|
214
|
+
table_stats = Hive.table_stats(cluster, db, table, user_name)
|
215
|
+
url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
|
253
216
|
|
254
217
|
source_hql_array = source_hql.split(";")
|
255
218
|
last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
|
@@ -265,11 +228,10 @@ module Mobilize
|
|
265
228
|
temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
|
266
229
|
temp_drop_hql = "drop table if exists #{temp_table_path};"
|
267
230
|
temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
|
268
|
-
Hive.run(cluster,temp_create_hql,user_name)
|
231
|
+
response = Hive.run(cluster,temp_create_hql,user_name,params)
|
232
|
+
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
269
233
|
|
270
|
-
|
271
|
-
source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
|
272
|
-
source_table_stats = source_params['curr_stats']
|
234
|
+
source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
|
273
235
|
source_fields = source_table_stats['field_defs']
|
274
236
|
|
275
237
|
if part_array.length == 0 and
|
@@ -297,7 +259,7 @@ module Mobilize
 
         target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"
 
-        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{
+        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{temp_table_path};"
 
         target_full_hql = [target_name_hql,
                            target_drop_hql,
@@ -305,10 +267,12 @@ module Mobilize
                            target_insert_hql,
                            temp_drop_hql].join
 
-        Hive.run(cluster, target_full_hql, user_name)
+        response = Hive.run(cluster, target_full_hql, user_name, params)
+
+        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
 
       elsif part_array.length > 0 and
-        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
         #partitions and no target table or same partitions in both target table and user params
 
         target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
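The switch from `== part_array` to `.sort == part_array.sort` above makes the partition comparison order-insensitive. A small sketch with hypothetical values shows why that matters:

```ruby
# Why the .sort was added above: the partition-spec comparison should not depend on order.
existing_partitions  = ['product', 'date']  # as reported by the target table
requested_partitions = ['date', 'product']  # as given in user params

puts existing_partitions == requested_partitions            # => false (order-sensitive)
puts existing_partitions.sort == requested_partitions.sort  # => true  (order-insensitive)
```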
@@ -352,10 +316,20 @@ module Mobilize
 
       else
         #get all the permutations of possible partititons
-
-
+        part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
+        part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
+        part_perm_hql = part_set_hql + part_select_hql
+        response = Hive.run(cluster, part_perm_hql, user_name, params)
+        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        part_perm_tsv = response['stdout']
         #having gotten the permutations, ensure they are dropped
         part_hash_array = part_perm_tsv.tsv_to_hash_array
+        #make sure there is data
+        if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
+          #blank result set, return url
+          return url
+        end
+
         part_drop_hql = part_hash_array.map do |h|
           part_drop_stmt = h.map do |name,value|
             part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
|
|
367
341
|
|
368
342
|
target_insert_hql = "insert overwrite table #{table_path} " +
|
369
343
|
"partition (#{target_part_stmt}) " +
|
370
|
-
"select #{target_field_stmt},#{target_part_stmt} from #{
|
344
|
+
"select #{target_field_stmt},#{target_part_stmt} from #{temp_table_path};"
|
371
345
|
|
372
346
|
target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
|
373
347
|
|
374
|
-
Hive.run(cluster, target_full_hql, user_name)
|
348
|
+
response = Hive.run(cluster, target_full_hql, user_name, params)
|
349
|
+
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
375
350
|
else
|
376
351
|
error_msg = "Incompatible partition specs"
|
377
352
|
raise error_msg
|
378
353
|
end
|
379
|
-
url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
|
380
354
|
return url
|
381
355
|
end
|
382
356
|
|
@@ -384,14 +358,21 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
+      return nil if source_tsv.strip.length==0
+      if source_tsv.index("\r\n")
+        source_tsv = source_tsv.gsub("\r\n","\n")
+      elsif source_tsv.index("\r")
+        source_tsv = source_tsv.gsub("\r","\n")
+      end
       source_headers = source_tsv.tsv_header_array
 
       table_path = [db,table].join(".")
-
-      table_stats = target_params['curr_stats']
+      table_stats = Hive.table_stats(cluster, db, table, user_name)
 
       schema_hash ||= {}
 
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+
       if part_array.length == 0 and
         table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table
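The guard added at the top of `Hive.tsv_to_table` above rejects blank input and normalizes line endings before the TSV is parsed. A hedged standalone sketch of the same normalization, using an invented sample TSV:

```ruby
# Sketch of the line-ending normalization added above; the sample TSV is made up.
source_tsv = "name\tvalue\r\nwidget\t1\r\ngadget\t2\r\n"

# A completely blank TSV would be rejected earlier (the method returns nil for it).
# Windows (\r\n) and old-Mac (\r) endings are both collapsed to plain \n.
if source_tsv.index("\r\n")
  source_tsv = source_tsv.gsub("\r\n", "\n")
elsif source_tsv.index("\r")
  source_tsv = source_tsv.gsub("\r", "\n")
end

puts source_tsv.split("\n").length  # => 3 (header row plus two data rows)
```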
@@ -418,10 +399,11 @@ module Mobilize
 
         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
 
-        Hive.run(cluster, target_full_hql, user_name, file_hash)
+        response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
+        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
 
       elsif part_array.length > 0 and
-        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}.sort == part_array.sort}
         #partitions and no target table
         #or same partitions in both target table and user params
         #or drop and start fresh
|
|
445
427
|
"partitioned by #{partition_defs}"
|
446
428
|
|
447
429
|
#create target table early if not here
|
448
|
-
Hive.run(cluster, target_create_hql, user_name)
|
430
|
+
response = Hive.run(cluster, target_create_hql, user_name)
|
431
|
+
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
432
|
+
|
433
|
+
#return url (operation complete) if there's no data
|
434
|
+
source_hash_array = source_tsv.tsv_to_hash_array
|
435
|
+
return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0
|
449
436
|
|
450
437
|
table_stats = Hive.table_stats(cluster, db, table, user_name)
|
451
438
|
|
452
439
|
#create data hash from source hash array
|
453
440
|
data_hash = {}
|
454
|
-
source_hash_array = source_tsv.tsv_to_hash_array
|
455
441
|
source_hash_array.each do |ha|
|
456
442
|
tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
|
457
443
|
tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
|
@@ -484,7 +470,8 @@ module Mobilize
         #run actual partition adds all at once
         if target_part_hql.length>0
           puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
-          Hive.run(cluster, target_part_hql, user_name)
+          response = Hive.run(cluster, target_part_hql, user_name)
+          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
         end
       else
         error_msg = "Incompatible partition specs: " +
@@ -492,33 +479,31 @@ module Mobilize
                     "user_params:#{part_array.to_s}"
         raise error_msg
       end
-
+
       return url
     end
 
     def Hive.write_by_stage_path(stage_path)
+      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+      #return blank response if there are no slots available
+      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
-      source = s.sources.first
+      source = s.sources(gdrive_slot).first
       target = s.target
       cluster, db, table = target.url.split("://").last.split("/")
-      #update stage with the node so we can use it
-      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
-      job_name = s.path.sub("Runner_","")
-
       #slot Hive worker if available
       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
       return false unless slot_id
+      #update stage with the node so we can use it
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      job_name = s.path.sub("Runner_","")
 
       schema_hash = if params['schema']
-                      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-                      #return blank response if there are no slots available
-                      return nil unless gdrive_slot
                       Hive.schema_hash(params['schema'],user_name,gdrive_slot)
                     else
                       {}
                     end
-      Gdrive.unslot_worker_by_path(stage_path)
       #drop target before create/insert?
       drop = params['drop']
 
@@ -531,16 +516,17 @@ module Mobilize
           #source table
           cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
           source_hql = "select * from #{source_path};"
-        elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+        elsif ['gsheet','gfile','gridfs','hdfs'].include?(source.handler)
           if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
-            source_hql = source.read(user_name)
+            source_hql = source.read(user_name,gdrive_slot)
           else
-            #tsv from sheet
-            source_tsv = source.read(user_name)
+            #tsv from sheet or file
+            source_tsv = source.read(user_name,gdrive_slot)
           end
         end
       end
 
+      Gdrive.unslot_worker_by_path(stage_path)
       part_array = if params['partitions']
                      params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
                    elsif params['target']
@@ -559,12 +545,14 @@ module Mobilize
               Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
             elsif source_tsv
               Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+            elsif source
+              #null sheet
             else
               raise "Unable to determine source tsv or source hql"
             end
       {'stdout'=>url,'exit_code'=>0}
     rescue => exc
-      {'stderr'=>exc.to_s, 'exit_code'=>500}
+      {'stderr'=>"#{exc.to_s}\n#{exc.backtrace.join("\n")}", 'exit_code'=>500}
     end
 
     #unslot worker and write result
|
|
585
573
|
select_hql = "select * from #{source_path};"
|
586
574
|
hql = [set_hql,select_hql].join
|
587
575
|
response = Hive.run(cluster, hql,user_name)
|
588
|
-
if response['
|
589
|
-
|
590
|
-
else
|
591
|
-
raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
|
592
|
-
end
|
576
|
+
raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}" if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
577
|
+
return response['stdout']
|
593
578
|
end
|
594
579
|
|
595
580
|
def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
|
data/lib/mobilize-hive/helpers/hive_helper.rb ADDED
@@ -0,0 +1,63 @@
+module Mobilize
+  module Hive
+    def self.config
+      Base.config('hive')
+    end
+
+    def self.exec_path(cluster)
+      self.clusters[cluster]['exec_path']
+    end
+
+    def self.output_db(cluster)
+      self.clusters[cluster]['output_db']
+    end
+
+    def self.output_db_user(cluster)
+      output_db_node = Hadoop.gateway_node(cluster)
+      output_db_user = Ssh.host(output_db_node)['user']
+      output_db_user
+    end
+
+    def self.clusters
+      self.config['clusters']
+    end
+
+    def self.slot_ids(cluster)
+      (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+    end
+
+    def self.slot_worker_by_cluster_and_path(cluster,path)
+      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+      self.slot_ids(cluster).each do |slot_id|
+        unless working_slots.include?(slot_id)
+          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+          return slot_id
+        end
+      end
+      #return false if none are available
+      return false
+    end
+
+    def self.unslot_worker_by_path(path)
+      begin
+        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def self.databases(cluster,user_name)
+      self.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
+    def self.default_params
+      time = Time.now.utc
+      {
+        '$utc_date'=>time.strftime("%Y-%m-%d"),
+        '$utc_time'=>time.strftime("%H:%M"),
+      }
+    end
+  end
+end
+
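The `default_params` helper added above is what feeds the automatic `$utc_date`/`$utc_time` substitution documented in the README. A hedged sketch of how its output would be applied to an HQL string; the query and the `resolved` variable are illustrative, not the gem's own call path:

```ruby
# Standalone illustration of default_params-style substitution.
time = Time.now.utc
default_params = {
  '$utc_date' => time.strftime("%Y-%m-%d"),
  '$utc_time' => time.strftime("%H:%M"),
}

hql = "select '$utc_date $utc_time' as run_at, count(*) from mobilize.hive_test_1;"

# '$'-prefixed keys are replaced literally wherever they occur in the HQL.
resolved = default_params.inject(hql) { |q, (k, v)| q.gsub(k, v) }
puts resolved
# e.g. select '2013-04-18 12:30' as run_at, count(*) from mobilize.hive_test_1;
```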
data/mobilize-hive.gemspec CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.3"
 end
data/test/hive_job_rows.yml CHANGED
@@ -20,7 +20,15 @@
   active: true
   trigger: after hive_test_2
   status: ""
-  stage1: hive.run hql:"select
+  stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
+- name: hive_test_4
+  active: true
+  trigger: after hive_test_3
+  status: ""
+  stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
+  stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
+  stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
+  stage4: gsheet.write source:stage3, target:"hive_test_4.out"
data/test/mobilize-hive_test.rb CHANGED
@@ -25,6 +25,18 @@ describe "Mobilize" do
     hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
     hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)
 
+    #create blank sheet
+    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
+    [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
+    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
+
+    #create sheet w just headers
+    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
+    [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
+    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
+    hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
+    hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
+
     hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
     [hive_1_schema_sheet].each {|s| s.delete if s}
     hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -51,21 +63,25 @@ describe "Mobilize" do
     [hive_2_target_sheet].each{|s| s.delete if s}
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
+    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
+    [hive_4_target_sheet].each{|s| s.delete if s}
 
     puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-    wait_for_stages(
+    wait_for_stages(2100)
 
     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
    hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
     hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
+    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
 
     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
     assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
+    assert hive_4_target_sheet.read(u.name).length == 432
   end
 
   def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.3'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-04-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
    - - '='
      - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.3'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -41,6 +41,7 @@ files:
 - Rakefile
 - lib/mobilize-hive.rb
 - lib/mobilize-hive/handlers/hive.rb
+- lib/mobilize-hive/helpers/hive_helper.rb
 - lib/mobilize-hive/tasks.rb
 - lib/mobilize-hive/version.rb
 - lib/samples/hive.yml
@@ -72,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
 summary: Adds hive read, write, and run support to mobilize-hdfs
@@ -84,4 +85,3 @@ test_files:
 - test/mobilize-hive_test.rb
 - test/redis-test.conf
 - test/test_helper.rb
-has_rdoc: