mobilize-hive 1.361 → 1.363
@@ -93,7 +93,7 @@ module Mobilize
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
+    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil,stage_path=nil)
       preps = Hive.prepends.map do |p|
         prefix = "set "
         suffix = ";"
@@ -103,7 +103,7 @@ module Mobilize
         prep_out
       end.join
       hql = "#{preps}#{hql}"
-      filename = hql
+      filename = "hql"
       file_hash||= {}
       file_hash[filename] = hql
       params ||= {}
@@ -121,9 +121,9 @@ module Mobilize
       end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
-      command = "#{Hive.exec_path(cluster)} -
+      command = "#{Hive.exec_path(cluster)} -f #{filename}"
       gateway_node = Hadoop.gateway_node(cluster)
-      response = Ssh.run(gateway_node,command,user_name,file_hash)
+      response = Ssh.run(gateway_node,command,user_name,stage_path,file_hash)
       #override exit code 0 when stdout is blank and
       #stderror contains FAILED or KILLED
       if response['stdout'].to_s.length == 0 and
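Taken together, these hunks change how the HQL reaches the cluster: the statement is now written into file_hash under the key "hql", shipped to the gateway node, and executed via -f, while the new stage_path argument is threaded through to Ssh.run. A minimal sketch of the new call shape, with invented cluster, user, and stage names:

  # hypothetical invocation of the 1.363 signature; all names are made up
  result = Mobilize::Hive.run(
    "dev",                               # cluster alias
    "show databases;",                   # hql; Hive.run stores this in file_hash["hql"]
    "etl_user",                          # user_name for the ssh session
    nil,                                 # params (optional)
    nil,                                 # file_hash; defaults to {} inside Hive.run
    "Runner_mobilize(test)/hive1.stage3" # stage_path, now forwarded to Ssh.run
  )
  puts result['stdout']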
@@ -147,7 +147,7 @@ module Mobilize
       return false unless slot_id

       #output table stores stage output
-      output_db,output_table = [Hive.output_db(cluster),
+      output_db,output_table = [Hive.output_db(cluster),job_name.downcase.alphanunderscore]
       output_path = [output_db,output_table].join(".")
       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"

@@ -164,6 +164,7 @@ module Mobilize
       #check for select at end
       hql_array = hql.split("\n").reject{|l| l.starts_with?("--") or l.strip.length==0}.join("\n").split(";").map{|h| h.strip}
       last_statement = hql_array.last
+      file_hash = nil
       if last_statement.to_s.downcase.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
@@ -172,10 +173,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name,params['params'])
+        result = Hive.run(cluster,full_hql, user_name,params['params'],file_hash,stage_path)
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name,params['params'])
+        result = Hive.run(cluster, hql, user_name,params['params'],file_hash,stage_path)
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -210,7 +211,8 @@ module Mobilize
       schema_hash
     end

-    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop=false, schema_hash=nil, run_params=nil)
+      job_name = stage_path.sub("Runner_","")
       table_path = [db,table].join(".")
       table_stats = Hive.table_stats(cluster, db, table, user_name)
       url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
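Hive.hql_to_table now derives its working names from the stage path rather than taking them as loose arguments. For illustration only (the path is invented; alphanunderscore is the String helper from mobilize-base that strips everything except letters, digits, and underscores):

  stage_path = "Runner_mobilize(test)/hive1.stage2"   # hypothetical
  job_name   = stage_path.sub("Runner_","")           # => "mobilize(test)/hive1.stage2"
  job_name.downcase.alphanunderscore                  # => "mobilizetesthive1stage2"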
@@ -226,12 +228,13 @@ module Mobilize

       #create temporary table so we can identify fields etc.
       temp_db = Hive.output_db(cluster)
-      temp_table_name =
+      temp_table_name = "temp_#{job_name.downcase.alphanunderscore}"
       temp_table_path = [temp_db,temp_table_name].join(".")
       temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
       temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-
+      file_hash = nil
+      response = Hive.run(cluster,temp_create_hql,user_name,run_params,file_hash,stage_path)
       raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

       source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -361,17 +364,17 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, table_path, user_name, source_tsv)
-      #nil if only header row, or no header row
-      if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
-        puts "no data in source_tsv for #{cluster}/#{table_path}"
-        return nil
-      end
       #get rid of freaking carriage return characters
       if source_tsv.index("\r\n")
         source_tsv = source_tsv.gsub("\r\n","\n")
       elsif source_tsv.index("\r")
         source_tsv = source_tsv.gsub("\r","\n")
       end
+      #nil if only header row, or no header row
+      if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
+        puts "no data in source_tsv for #{cluster}/#{table_path}"
+        return nil
+      end
       source_headers = source_tsv.tsv_header_array

       #one file only, strip headers, replace tab with ctrl-a for hive
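The reorder here is a real fix, not cosmetic: the emptiness check splits on "\n", so a TSV using bare carriage returns used to look like a single header-only row and was rejected before normalization could run. A quick sketch with invented data:

  source_tsv = "date\tvalue\r2013-01-01\t42\r2013-01-02\t7"
  source_tsv.strip.split("\n").length                  # => 1; old order saw "header only" and bailed
  source_tsv.gsub("\r","\n").strip.split("\n").length  # => 3; new order normalizes first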
@@ -404,6 +407,7 @@ module Mobilize
       #return blank response if there are no slots available
       return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
+      job_name = s.path.sub("Runner_","")
       params = s.params
       source = s.sources(gdrive_slot).first
       target = s.target
@@ -413,7 +417,6 @@ module Mobilize
       return false unless slot_id
       #update stage with the node so we can use it
       user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
-      job_name = s.path.sub("Runner_","")

       schema_hash = if params['schema']
         Hive.schema_hash(params['schema'],stage_path,user_name,gdrive_slot)
@@ -460,15 +463,15 @@ module Mobilize
       url = if source_hql
         #include any params (or nil) at the end
         run_params = params['params']
-        Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+        Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path,drop, schema_hash,run_params)
       elsif source_tsv
         #first write tsv to temp table
-        temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{
+        temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{job_name.downcase.alphanunderscore}"
         has_data = Hive.tsv_to_table(cluster, temp_table_path, user_name, source_tsv)
         if has_data
           #then do the regular insert, with source hql being select * from temp table
           source_hql = "select * from #{temp_table_path}"
-          Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+          Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop, schema_hash)
         else
           nil
         end
data/mobilize-hive.gemspec CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.363"
 end
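Note the exact '=' pin: mobilize-hive 1.363 requires the matching mobilize-hdfs 1.363 release, and the same constraint appears twice more in the gem metadata below.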
@@ -3,32 +3,32 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive1", partitions:"act_date", drop:true,
+  stage1: hive.write keep_logs:true, retries:3, target:"mobilize/hive1", partitions:"act_date", drop:true,
           source:"Runner_mobilize(test)/hive1.in", schema:"hive1.schema"
-  stage2: hive.run source:"hive1.sql"
-  stage3: hive.run hql:"show databases;"
+  stage2: hive.run keep_logs:true, retries:3, source:"hive1.sql"
+  stage3: hive.run keep_logs:true, retries:3, hql:"show databases;"
   stage4: gsheet.write source:"stage2", target:"hive1_stage2.out"
   stage5: gsheet.write source:"stage3", target:"hive1_stage3.out"
 - name: hive2
   active: true
   trigger: after hive1
   status: ""
-  stage1: hive.write source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
-  stage2: hive.run hql:"select * from mobilize.hive2;"
+  stage1: hive.write keep_logs:true, retries:3, source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
+  stage2: hive.run keep_logs:true, retries:3, hql:"select * from mobilize.hive2;"
   stage3: gsheet.write source:"stage2", target:"hive2.out"
 - name: hive3
   active: true
   trigger: after hive2
   status: ""
-  stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
-  stage2: hive.write source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
-  stage3: hive.write hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
+  stage1: hive.run keep_logs:true, retries:3, hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
+  stage2: hive.write keep_logs:true, retries:3, source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
+  stage3: hive.write keep_logs:true, retries:3, hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive3", target:"hive3.out"
 - name: hive4
   active: true
   trigger: after hive3
   status: ""
-  stage1: hive.write source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
-  stage2: hive.write source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
-  stage3: hive.run hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
+  stage1: hive.write keep_logs:true, retries:3, source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
+  stage2: hive.write keep_logs:true, retries:3, source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
+  stage3: hive.run keep_logs:true, retries:3, hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
   stage4: gsheet.write source:stage3, target:"hive4.out"
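Every hive.run and hive.write stage in the fixture now passes keep_logs:true, retries:3. These read as stage-level retry and log-retention parameters, presumably supported by the mobilize-base line this release depends on, so the integration jobs exercise them on each hive stage.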
@@ -18,7 +18,7 @@ describe "Mobilize" do
     end

     puts "add/update jobs"
-    u.jobs.each{|j| j.delete}
+    u.jobs.each{|j| j.stages.each{|s| s.delete}; j.delete}
     jobs_fixture_name = "integration_jobs"
     jobs_target_url = "gsheet://#{r.title}/jobs"
     TestHelper.write_fixture(jobs_fixture_name, jobs_target_url, 'update')
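The teardown now removes each job's stages before the job itself. Since stages are stored as their own records looked up by path (see Stage.where(:path=>stage_path) above), deleting only the job would leave orphaned stage documents behind between test runs.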
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.363'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.363'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.363'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -71,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 441677985208896084
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 441677985208896084
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25