mobilize-hive 1.298 → 1.299
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/lib/mobilize-hive/handlers/hive.rb +24 -60
- data/lib/mobilize-hive/helpers/hive_helper.rb +55 -0
- data/lib/mobilize-hive/version.rb +1 -1
- data/test/hive_job_rows.yml +1 -1
- metadata +4 -3
data/README.md
CHANGED
@@ -142,6 +142,17 @@ Start
|
|
142
142
|
* cluster and user are optional for all of the below.
|
143
143
|
* cluster defaults to the first cluster listed;
|
144
144
|
* user is treated the same way as in [mobilize-ssh][mobilize-ssh].
|
145
|
+
* params are also optional for all of the below. They replace HQL in sources.
|
146
|
+
* params are passed as a YML or JSON, as in:
|
147
|
+
* `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
|
148
|
+
* this example replaces all instances of the keys, preceded by '@', in all source HQLs with the corresponding values.
|
149
|
+
* The preceding '@' is used to keep from replacing instances
|
150
|
+
of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
|
151
|
+
if you'd like to replace those tokens.
|
152
|
+
* in addition, the following params are substituted automatically:
|
153
|
+
* `$utc_date` - replaced with YYYY-MM-DD date, UTC
|
154
|
+
* `$utc_time` - replaced with HH:MM time, UTC
|
155
|
+
* any occurrence of these values in HQL will be replaced at runtime.
|
145
156
|
* hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
|
146
157
|
script in the hql or source sheet and returns any output specified at the
|
147
158
|
end. If the cmd or last query in source is a select statement, column headers will be
|
@@ -1,56 +1,7 @@
|
|
1
1
|
module Mobilize
|
2
2
|
module Hive
|
3
|
-
|
4
|
-
|
5
|
-
end
|
6
|
-
|
7
|
-
def Hive.exec_path(cluster)
|
8
|
-
Hive.clusters[cluster]['exec_path']
|
9
|
-
end
|
10
|
-
|
11
|
-
def Hive.output_db(cluster)
|
12
|
-
Hive.clusters[cluster]['output_db']
|
13
|
-
end
|
14
|
-
|
15
|
-
def Hive.output_db_user(cluster)
|
16
|
-
output_db_node = Hadoop.gateway_node(cluster)
|
17
|
-
output_db_user = Ssh.host(output_db_node)['user']
|
18
|
-
output_db_user
|
19
|
-
end
|
20
|
-
|
21
|
-
def Hive.clusters
|
22
|
-
Hive.config['clusters']
|
23
|
-
end
|
24
|
-
|
25
|
-
def Hive.slot_ids(cluster)
|
26
|
-
(1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
|
27
|
-
end
|
28
|
-
|
29
|
-
def Hive.slot_worker_by_cluster_and_path(cluster,path)
|
30
|
-
working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
|
31
|
-
Hive.slot_ids(cluster).each do |slot_id|
|
32
|
-
unless working_slots.include?(slot_id)
|
33
|
-
Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
|
34
|
-
return slot_id
|
35
|
-
end
|
36
|
-
end
|
37
|
-
#return false if none are available
|
38
|
-
return false
|
39
|
-
end
|
40
|
-
|
41
|
-
def Hive.unslot_worker_by_path(path)
|
42
|
-
begin
|
43
|
-
Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
|
44
|
-
return true
|
45
|
-
rescue
|
46
|
-
return false
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
def Hive.databases(cluster,user_name)
|
51
|
-
Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
|
52
|
-
end
|
53
|
-
|
3
|
+
#adds convenience methods
|
4
|
+
require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
|
54
5
|
# converts a source path or target path to a dst in the context of handler and stage
|
55
6
|
def Hive.path_to_dst(path,stage_path,gdrive_slot)
|
56
7
|
has_handler = true if path.index("://")
|
@@ -142,12 +93,25 @@ module Mobilize
|
|
142
93
|
end
|
143
94
|
|
144
95
|
#run a generic hive command, with the option of passing a file hash to be locally available
|
145
|
-
def Hive.run(cluster,hql,user_name,file_hash=nil)
|
96
|
+
def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
|
146
97
|
# no TempStatsStore
|
147
98
|
hql = "set hive.stats.autogather=false;#{hql}"
|
148
99
|
filename = hql.to_md5
|
149
100
|
file_hash||= {}
|
150
101
|
file_hash[filename] = hql
|
102
|
+
#add in default params
|
103
|
+
params ||= {}
|
104
|
+
params.merge(Ssh.default_params)
|
105
|
+
#replace any params in the file_hash and command
|
106
|
+
params.each do |k,v|
|
107
|
+
file_hash.each do |name,data|
|
108
|
+
if k.starts_with?("$")
|
109
|
+
data.gsub!(k,v)
|
110
|
+
else
|
111
|
+
data.gsub!("@#{k}",v)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
151
115
|
#silent mode so we don't have logs in stderr; clip output
|
152
116
|
#at hadoop read limit
|
153
117
|
command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
|
@@ -201,10 +165,10 @@ module Mobilize
|
|
201
165
|
"drop table if exists #{output_path}",
|
202
166
|
"create table #{output_path} as #{select_hql};"].join(";")
|
203
167
|
full_hql = [prior_hql, output_table_hql].compact.join(";")
|
204
|
-
result = Hive.run(cluster,full_hql, user_name)
|
168
|
+
result = Hive.run(cluster,full_hql, user_name,params['params'])
|
205
169
|
Dataset.find_or_create_by_url(out_url)
|
206
170
|
else
|
207
|
-
result = Hive.run(cluster, hql, user_name)
|
171
|
+
result = Hive.run(cluster, hql, user_name,params['params'])
|
208
172
|
Dataset.find_or_create_by_url(out_url)
|
209
173
|
Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
|
210
174
|
end
|
@@ -245,7 +209,7 @@ module Mobilize
|
|
245
209
|
schema_hash
|
246
210
|
end
|
247
211
|
|
248
|
-
def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
|
212
|
+
def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
|
249
213
|
table_path = [db,table].join(".")
|
250
214
|
table_stats = Hive.table_stats(cluster, db, table, user_name)
|
251
215
|
url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
|
@@ -264,7 +228,7 @@ module Mobilize
|
|
264
228
|
temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
|
265
229
|
temp_drop_hql = "drop table if exists #{temp_table_path};"
|
266
230
|
temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
|
267
|
-
response = Hive.run(cluster,temp_create_hql,user_name)
|
231
|
+
response = Hive.run(cluster,temp_create_hql,user_name,params)
|
268
232
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
269
233
|
|
270
234
|
source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
|
@@ -303,7 +267,7 @@ module Mobilize
|
|
303
267
|
target_insert_hql,
|
304
268
|
temp_drop_hql].join
|
305
269
|
|
306
|
-
response = Hive.run(cluster, target_full_hql, user_name)
|
270
|
+
response = Hive.run(cluster, target_full_hql, user_name, params)
|
307
271
|
|
308
272
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
309
273
|
|
@@ -355,7 +319,7 @@ module Mobilize
|
|
355
319
|
part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
|
356
320
|
part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
|
357
321
|
part_perm_hql = part_set_hql + part_select_hql
|
358
|
-
response = Hive.run(cluster, part_perm_hql, user_name)
|
322
|
+
response = Hive.run(cluster, part_perm_hql, user_name, params)
|
359
323
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
360
324
|
part_perm_tsv = response['stdout']
|
361
325
|
#having gotten the permutations, ensure they are dropped
|
@@ -381,7 +345,7 @@ module Mobilize
|
|
381
345
|
|
382
346
|
target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
|
383
347
|
|
384
|
-
response = Hive.run(cluster, target_full_hql, user_name)
|
348
|
+
response = Hive.run(cluster, target_full_hql, user_name, params)
|
385
349
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
386
350
|
else
|
387
351
|
error_msg = "Incompatible partition specs"
|
@@ -435,7 +399,7 @@ module Mobilize
|
|
435
399
|
|
436
400
|
target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
|
437
401
|
|
438
|
-
response = Hive.run(cluster, target_full_hql, user_name, file_hash)
|
402
|
+
response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
|
439
403
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
440
404
|
|
441
405
|
elsif part_array.length > 0 and
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Mobilize
|
2
|
+
module Hive
|
3
|
+
def self.config
|
4
|
+
Base.config('hive')
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.exec_path(cluster)
|
8
|
+
self.clusters[cluster]['exec_path']
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.output_db(cluster)
|
12
|
+
self.clusters[cluster]['output_db']
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.output_db_user(cluster)
|
16
|
+
output_db_node = Hadoop.gateway_node(cluster)
|
17
|
+
output_db_user = Ssh.host(output_db_node)['user']
|
18
|
+
output_db_user
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.clusters
|
22
|
+
self.config['clusters']
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.slot_ids(cluster)
|
26
|
+
(1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.slot_worker_by_cluster_and_path(cluster,path)
|
30
|
+
working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
|
31
|
+
self.slot_ids(cluster).each do |slot_id|
|
32
|
+
unless working_slots.include?(slot_id)
|
33
|
+
Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
|
34
|
+
return slot_id
|
35
|
+
end
|
36
|
+
end
|
37
|
+
#return false if none are available
|
38
|
+
return false
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.unslot_worker_by_path(path)
|
42
|
+
begin
|
43
|
+
Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
|
44
|
+
return true
|
45
|
+
rescue
|
46
|
+
return false
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def self.databases(cluster,user_name)
|
51
|
+
self.run(cluster,"show databases",user_name)['stdout'].split("\n")
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
data/test/hive_job_rows.yml
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
active: true
|
21
21
|
trigger: after hive_test_2
|
22
22
|
status: ""
|
23
|
-
stage1: hive.run hql:"select
|
23
|
+
stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
|
24
24
|
stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
|
25
25
|
stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
|
26
26
|
stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mobilize-hive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.298'
|
4
|
+
version: '1.299'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -41,6 +41,7 @@ files:
|
|
41
41
|
- Rakefile
|
42
42
|
- lib/mobilize-hive.rb
|
43
43
|
- lib/mobilize-hive/handlers/hive.rb
|
44
|
+
- lib/mobilize-hive/helpers/hive_helper.rb
|
44
45
|
- lib/mobilize-hive/tasks.rb
|
45
46
|
- lib/mobilize-hive/version.rb
|
46
47
|
- lib/samples/hive.yml
|
@@ -66,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
66
67
|
version: '0'
|
67
68
|
segments:
|
68
69
|
- 0
|
69
|
-
hash:
|
70
|
+
hash: -3388772007190329704
|
70
71
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
72
|
none: false
|
72
73
|
requirements:
|
@@ -75,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
76
|
version: '0'
|
76
77
|
segments:
|
77
78
|
- 0
|
78
|
-
hash:
|
79
|
+
hash: -3388772007190329704
|
79
80
|
requirements: []
|
80
81
|
rubyforge_project:
|
81
82
|
rubygems_version: 1.8.25
|