mobilize-hive 1.298 → 1.299
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +11 -0
- data/lib/mobilize-hive/handlers/hive.rb +24 -60
- data/lib/mobilize-hive/helpers/hive_helper.rb +55 -0
- data/lib/mobilize-hive/version.rb +1 -1
- data/test/hive_job_rows.yml +1 -1
- metadata +4 -3
data/README.md
CHANGED
@@ -142,6 +142,17 @@ Start
 * cluster and user are optional for all of the below.
 * cluster defaults to the first cluster listed;
 * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
+* params are also optional for all of the below. They replace HQL in sources.
+  * params are passed as a YML or JSON, as in:
+    * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
+  * this example replaces all the keys, preceded by '@' in all source hqls with the value.
+    * The preceding '@' is used to keep from replacing instances
+of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
+if you'd like to replace those tokens.
+  * in addition, the following params are substituted automatically:
+    * `$utc_date` - replaced with YYYY-MM-DD date, UTC
+    * `$utc_time` - replaced with HH:MM time, UTC
+  * any occurrence of these values in HQL will be replaced at runtime.
 * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
 script in the hql or source sheet and returns any output specified at the
 end. If the cmd or last query in source is a select statement, column headers will be
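To make the replacement rules concrete, here is a minimal Ruby sketch of the substitution the README describes; the hql text and param values are hypothetical:

```ruby
# Minimal sketch of '@'-token and '$'-param substitution (hypothetical values).
hql    = "select * from widgets where `date` = '@date' and unit = '@unit' " +
         "and loaded_at < '$utc_date'"
params = {'date' => '2013-03-01', 'unit' => 'widgets', '$utc_date' => '2013-03-02'}
params.each do |k, v|
  if k.start_with?("$")     # auto-substituted params are matched literally
    hql.gsub!(k, v)
  else                      # user params are matched with a preceding '@'
    hql.gsub!("@#{k}", v)
  end
end
puts hql
# => select * from widgets where `date` = '2013-03-01' and unit = 'widgets' and loaded_at < '2013-03-02'
```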
data/lib/mobilize-hive/handlers/hive.rb
CHANGED
@@ -1,56 +1,7 @@
 module Mobilize
   module Hive
-    def Hive.config
-      Base.config('hive')
-    end
-
-    def Hive.exec_path(cluster)
-      Hive.clusters[cluster]['exec_path']
-    end
-
-    def Hive.output_db(cluster)
-      Hive.clusters[cluster]['output_db']
-    end
-
-    def Hive.output_db_user(cluster)
-      output_db_node = Hadoop.gateway_node(cluster)
-      output_db_user = Ssh.host(output_db_node)['user']
-      output_db_user
-    end
-
-    def Hive.clusters
-      Hive.config['clusters']
-    end
-
-    def Hive.slot_ids(cluster)
-      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
-    end
-
-    def Hive.slot_worker_by_cluster_and_path(cluster,path)
-      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
-      Hive.slot_ids(cluster).each do |slot_id|
-        unless working_slots.include?(slot_id)
-          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
-          return slot_id
-        end
-      end
-      #return false if none are available
-      return false
-    end
-
-    def Hive.unslot_worker_by_path(path)
-      begin
-        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
-        return true
-      rescue
-        return false
-      end
-    end
-
-    def Hive.databases(cluster,user_name)
-      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
-    end
-
+    #adds convenience methods
+    require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
     # converts a source path or target path to a dst in the context of handler and stage
     def Hive.path_to_dst(path,stage_path,gdrive_slot)
       has_handler = true if path.index("://")
@@ -142,12 +93,25 @@ module Mobilize
     end
 
     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,file_hash=nil)
+    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
       file_hash||= {}
       file_hash[filename] = hql
+      #add in default params
+      params ||= {}
+      params.merge(Ssh.default_params)
+      #replace any params in the file_hash and command
+      params.each do |k,v|
+        file_hash.each do |name,data|
+          if k.starts_with?("$")
+            data.gsub!(k,v)
+          else
+            data.gsub!("@#{k}",v)
+          end
+        end
+      end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
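Note that the new `params` argument sits ahead of `file_hash`, so positional callers must shift by one. A hedged sketch of the calling conventions; cluster name, user, and file contents below are hypothetical:

```ruby
# Signature in 1.299: Hive.run(cluster, hql, user_name, params=nil, file_hash=nil)
Hive.run('dev', hql, 'etl_user')                                   # no params, as before
Hive.run('dev', hql, 'etl_user', {'date' => '2013-01-01'})        # '@date' tokens replaced
Hive.run('dev', hql, 'etl_user', nil, {'lookup.tsv' => tsv_data}) # file_hash is now fifth
```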
@@ -201,10 +165,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
        full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name)
+        result = Hive.run(cluster,full_hql, user_name,params['params'])
        Dataset.find_or_create_by_url(out_url)
      else
-        result = Hive.run(cluster, hql, user_name)
+        result = Hive.run(cluster, hql, user_name,params['params'])
        Dataset.find_or_create_by_url(out_url)
        Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
      end
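For context, `params` here appears to be the stage's parsed argument hash, so a user-supplied `params:` key reaches `Hive.run` as `params['params']`. A sketch with hypothetical values:

```ruby
# Hypothetical stage arguments as parsed from a job spec:
stage_params = {'hql'    => "select * from db.t where `date` = '@date';",
                'params' => {'date' => '2013-01-01'}}
# The handler forwards only the user-supplied sub-hash into Hive.run:
result = Hive.run(cluster, stage_params['hql'], user_name, stage_params['params'])
```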
@@ -245,7 +209,7 @@ module Mobilize
      schema_hash
    end
 
-    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
      table_path = [db,table].join(".")
      table_stats = Hive.table_stats(cluster, db, table, user_name)
      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
@@ -264,7 +228,7 @@ module Mobilize
        temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
        temp_drop_hql = "drop table if exists #{temp_table_path};"
        temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-        response = Hive.run(cluster,temp_create_hql,user_name)
+        response = Hive.run(cluster,temp_create_hql,user_name,params)
        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
 
        source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -303,7 +267,7 @@ module Mobilize
                           target_insert_hql,
                           temp_drop_hql].join
 
-        response = Hive.run(cluster, target_full_hql, user_name)
+        response = Hive.run(cluster, target_full_hql, user_name, params)
 
        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
 
@@ -355,7 +319,7 @@ module Mobilize
          part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
          part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
          part_perm_hql = part_set_hql + part_select_hql
-          response = Hive.run(cluster, part_perm_hql, user_name)
+          response = Hive.run(cluster, part_perm_hql, user_name, params)
          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
          part_perm_tsv = response['stdout']
          #having gotten the permutations, ensure they are dropped
@@ -381,7 +345,7 @@ module Mobilize
 
          target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
 
-          response = Hive.run(cluster, target_full_hql, user_name)
+          response = Hive.run(cluster, target_full_hql, user_name, params)
          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
        else
          error_msg = "Incompatible partition specs"
|
|
435
399
|
|
436
400
|
target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
|
437
401
|
|
438
|
-
response = Hive.run(cluster, target_full_hql, user_name, file_hash)
|
402
|
+
response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
|
439
403
|
raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
|
440
404
|
|
441
405
|
elsif part_array.length > 0 and
|
data/lib/mobilize-hive/helpers/hive_helper.rb
ADDED
@@ -0,0 +1,55 @@
+module Mobilize
+  module Hive
+    def self.config
+      Base.config('hive')
+    end
+
+    def self.exec_path(cluster)
+      self.clusters[cluster]['exec_path']
+    end
+
+    def self.output_db(cluster)
+      self.clusters[cluster]['output_db']
+    end
+
+    def self.output_db_user(cluster)
+      output_db_node = Hadoop.gateway_node(cluster)
+      output_db_user = Ssh.host(output_db_node)['user']
+      output_db_user
+    end
+
+    def self.clusters
+      self.config['clusters']
+    end
+
+    def self.slot_ids(cluster)
+      (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+    end
+
+    def self.slot_worker_by_cluster_and_path(cluster,path)
+      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+      self.slot_ids(cluster).each do |slot_id|
+        unless working_slots.include?(slot_id)
+          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+          return slot_id
+        end
+      end
+      #return false if none are available
+      return false
+    end
+
+    def self.unslot_worker_by_path(path)
+      begin
+        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def self.databases(cluster,user_name)
+      self.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+  end
+end
+
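As quick orientation to the extracted helpers, a sketch of the slot methods, assuming a hypothetical cluster config with `max_slots: 2`:

```ruby
# Assuming Hive.config['clusters'] == {'dev' => {'max_slots' => 2}} (hypothetical):
Mobilize::Hive.slot_ids('dev')
# => ["dev_1", "dev_2"]

# slot_worker_by_cluster_and_path('dev', stage_path) claims the first slot id not
# already held by a running Resque job, or returns false when both are taken;
# unslot_worker_by_path(stage_path) clears the slot when the stage finishes.
```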
data/test/hive_job_rows.yml
CHANGED
@@ -20,7 +20,7 @@
   active: true
   trigger: after hive_test_2
   status: ""
-  stage1: hive.run hql:"select
+  stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.298'
+  version: '1.299'
 prerelease:
 platform: ruby
 authors:
@@ -41,6 +41,7 @@ files:
 - Rakefile
 - lib/mobilize-hive.rb
 - lib/mobilize-hive/handlers/hive.rb
+- lib/mobilize-hive/helpers/hive_helper.rb
 - lib/mobilize-hive/tasks.rb
 - lib/mobilize-hive/version.rb
 - lib/samples/hive.yml
@@ -66,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
   segments:
   - 0
-  hash:
+  hash: -3388772007190329704
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -75,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
   segments:
   - 0
-  hash:
+  hash: -3388772007190329704
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25