mobilize-hive 1.298 → 1.299

data/README.md CHANGED
@@ -142,6 +142,17 @@ Start
  * cluster and user are optional for all of the below.
  * cluster defaults to the first cluster listed;
  * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
+ * params are also optional for all of the below. They replace tokens in source HQL.
+ * params are passed as YML or JSON, as in:
+ * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
+ * this example replaces every key prefixed with '@' in all source HQL with the corresponding value.
+ * the preceding '@' keeps plain occurrences of "date" and "unit"
+ in the HQL from being replaced; use `@date` and `@unit` in your actual HQL
+ wherever you want those tokens substituted.
+ * in addition, the following params are substituted automatically:
+ * `$utc_date` - replaced with YYYY-MM-DD date, UTC
+ * `$utc_time` - replaced with HH:MM time, UTC
+ * any occurrence of these values in HQL will be replaced at runtime.
  * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
  script in the hql or source sheet and returns any output specified at the
  end. If the cmd or last query in source is a select statement, column headers will be
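As a quick illustration of the substitution rules described in the README hunk above, here is a minimal sketch in plain Ruby; the `hql` string and the param values are hypothetical, and the snippet mirrors the documented behavior rather than quoting the gem's internals.

```ruby
# Minimal sketch (hypothetical HQL and params): '@'-prefixed keys come from
# user-supplied params; '$utc_date'/'$utc_time' are filled in automatically.
hql = "select * from mobilize.sales " \
      "where sale_date = '@date' and unit = '@unit' " \
      "and loaded_before = '$utc_date $utc_time';"

params = { 'date' => '2013-03-01', 'unit' => 'widgets' }

# user params are matched with a '@' prefix, so bare words like "date" survive
params.each { |k, v| hql = hql.gsub("@#{k}", v) }

# automatic params are matched literally, '$' prefix included
now = Time.now.utc
hql = hql.gsub('$utc_date', now.strftime('%Y-%m-%d'))
hql = hql.gsub('$utc_time', now.strftime('%H:%M'))

puts hql
# select * from mobilize.sales where sale_date = '2013-03-01'
# and unit = 'widgets' and loaded_before = '2013-03-01 12:34';
# (the timestamp depends on when it runs)
```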
data/lib/mobilize-hive/handlers/hive.rb CHANGED
@@ -1,56 +1,7 @@
  module Mobilize
  module Hive
- def Hive.config
- Base.config('hive')
- end
-
- def Hive.exec_path(cluster)
- Hive.clusters[cluster]['exec_path']
- end
-
- def Hive.output_db(cluster)
- Hive.clusters[cluster]['output_db']
- end
-
- def Hive.output_db_user(cluster)
- output_db_node = Hadoop.gateway_node(cluster)
- output_db_user = Ssh.host(output_db_node)['user']
- output_db_user
- end
-
- def Hive.clusters
- Hive.config['clusters']
- end
-
- def Hive.slot_ids(cluster)
- (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
- end
-
- def Hive.slot_worker_by_cluster_and_path(cluster,path)
- working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
- Hive.slot_ids(cluster).each do |slot_id|
- unless working_slots.include?(slot_id)
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
- return slot_id
- end
- end
- #return false if none are available
- return false
- end
-
- def Hive.unslot_worker_by_path(path)
- begin
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
- return true
- rescue
- return false
- end
- end
-
- def Hive.databases(cluster,user_name)
- Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
- end
-
+ #adds convenience methods
+ require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
  # converts a source path or target path to a dst in the context of handler and stage
  def Hive.path_to_dst(path,stage_path,gdrive_slot)
  has_handler = true if path.index("://")
@@ -142,12 +93,25 @@ module Mobilize
  end

  #run a generic hive command, with the option of passing a file hash to be locally available
- def Hive.run(cluster,hql,user_name,file_hash=nil)
+ def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
  # no TempStatsStore
  hql = "set hive.stats.autogather=false;#{hql}"
  filename = hql.to_md5
  file_hash||= {}
  file_hash[filename] = hql
+ #add in default params
+ params ||= {}
+ params.merge!(Ssh.default_params)
+ #replace any params in the file_hash and command
+ params.each do |k,v|
+ file_hash.each do |name,data|
+ if k.start_with?("$")
+ data.gsub!(k,v)
+ else
+ data.gsub!("@#{k}",v)
+ end
+ end
+ end
  #silent mode so we don't have logs in stderr; clip output
  #at hadoop read limit
  command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
@@ -201,10 +165,10 @@ module Mobilize
  "drop table if exists #{output_path}",
  "create table #{output_path} as #{select_hql};"].join(";")
  full_hql = [prior_hql, output_table_hql].compact.join(";")
- result = Hive.run(cluster,full_hql, user_name)
+ result = Hive.run(cluster,full_hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  else
- result = Hive.run(cluster, hql, user_name)
+ result = Hive.run(cluster, hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
  end
@@ -245,7 +209,7 @@ module Mobilize
  schema_hash
  end

- def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
  table_path = [db,table].join(".")
  table_stats = Hive.table_stats(cluster, db, table, user_name)
  url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
@@ -264,7 +228,7 @@ module Mobilize
  temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
  temp_drop_hql = "drop table if exists #{temp_table_path};"
  temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
- response = Hive.run(cluster,temp_create_hql,user_name)
+ response = Hive.run(cluster,temp_create_hql,user_name,params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -303,7 +267,7 @@ module Mobilize
  target_insert_hql,
  temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)

  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

@@ -355,7 +319,7 @@ module Mobilize
  part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
  part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
  part_perm_hql = part_set_hql + part_select_hql
- response = Hive.run(cluster, part_perm_hql, user_name)
+ response = Hive.run(cluster, part_perm_hql, user_name, params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  part_perm_tsv = response['stdout']
  #having gotten the permutations, ensure they are dropped
@@ -381,7 +345,7 @@ module Mobilize

  target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  else
  error_msg = "Incompatible partition specs"
@@ -435,7 +399,7 @@ module Mobilize

  target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

- response = Hive.run(cluster, target_full_hql, user_name, file_hash)
+ response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  elsif part_array.length > 0 and
data/lib/mobilize-hive/helpers/hive_helper.rb ADDED
@@ -0,0 +1,55 @@
+ module Mobilize
+ module Hive
+ def self.config
+ Base.config('hive')
+ end
+
+ def self.exec_path(cluster)
+ self.clusters[cluster]['exec_path']
+ end
+
+ def self.output_db(cluster)
+ self.clusters[cluster]['output_db']
+ end
+
+ def self.output_db_user(cluster)
+ output_db_node = Hadoop.gateway_node(cluster)
+ output_db_user = Ssh.host(output_db_node)['user']
+ output_db_user
+ end
+
+ def self.clusters
+ self.config['clusters']
+ end
+
+ def self.slot_ids(cluster)
+ (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+ end
+
+ def self.slot_worker_by_cluster_and_path(cluster,path)
+ working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+ self.slot_ids(cluster).each do |slot_id|
+ unless working_slots.include?(slot_id)
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+ return slot_id
+ end
+ end
+ #return false if none are available
+ return false
+ end
+
+ def self.unslot_worker_by_path(path)
+ begin
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+ return true
+ rescue
+ return false
+ end
+ end
+
+ def self.databases(cluster,user_name)
+ self.run(cluster,"show databases",user_name)['stdout'].split("\n")
+ end
+ end
+ end
+
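The helper methods move over verbatim from hive.rb. As a quick grounding of the `slot_ids` definition above, a cluster configured with `max_slots: 3` (a hypothetical config value) would enumerate:

```ruby
Mobilize::Hive.slot_ids('dev')
# => ["dev_1", "dev_2", "dev_3"]
```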
data/lib/mobilize-hive/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Mobilize
  module Hive
- VERSION = "1.298"
+ VERSION = "1.299"
  end
  end
data/test/hive_job_rows.yml CHANGED
@@ -20,7 +20,7 @@
  active: true
  trigger: after hive_test_2
  status: ""
- stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
+ stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: mobilize-hive
  version: !ruby/object:Gem::Version
- version: '1.298'
+ version: '1.299'
  prerelease:
  platform: ruby
  authors:
@@ -41,6 +41,7 @@ files:
  - Rakefile
  - lib/mobilize-hive.rb
  - lib/mobilize-hive/handlers/hive.rb
+ - lib/mobilize-hive/helpers/hive_helper.rb
  - lib/mobilize-hive/tasks.rb
  - lib/mobilize-hive/version.rb
  - lib/samples/hive.yml
@@ -66,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: 1394133607903248824
+ hash: -3388772007190329704
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
@@ -75,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: 1394133607903248824
+ hash: -3388772007190329704
  requirements: []
  rubyforge_project:
  rubygems_version: 1.8.25