mobilize-hive 1.361 → 1.363

Sign up to get free protection for your applications and to get access to all the features.
@@ -93,7 +93,7 @@ module Mobilize
93
93
  end
94
94
 
95
95
  #run a generic hive command, with the option of passing a file hash to be locally available
96
- def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
96
+ def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil,stage_path=nil)
97
97
  preps = Hive.prepends.map do |p|
98
98
  prefix = "set "
99
99
  suffix = ";"
@@ -103,7 +103,7 @@ module Mobilize
103
103
  prep_out
104
104
  end.join
105
105
  hql = "#{preps}#{hql}"
106
- filename = hql.to_md5
106
+ filename = "hql"
107
107
  file_hash||= {}
108
108
  file_hash[filename] = hql
109
109
  params ||= {}
@@ -121,9 +121,9 @@ module Mobilize
121
121
  end
122
122
  #silent mode so we don't have logs in stderr; clip output
123
123
  #at hadoop read limit
124
- command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
124
+ command = "#{Hive.exec_path(cluster)} -f #{filename}"
125
125
  gateway_node = Hadoop.gateway_node(cluster)
126
- response = Ssh.run(gateway_node,command,user_name,file_hash)
126
+ response = Ssh.run(gateway_node,command,user_name,stage_path,file_hash)
127
127
  #override exit code 0 when stdout is blank and
128
128
  #stderror contains FAILED or KILLED
129
129
  if response['stdout'].to_s.length == 0 and
@@ -147,7 +147,7 @@ module Mobilize
147
147
  return false unless slot_id
148
148
 
149
149
  #output table stores stage output
150
- output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
150
+ output_db,output_table = [Hive.output_db(cluster),job_name.downcase.alphanunderscore]
151
151
  output_path = [output_db,output_table].join(".")
152
152
  out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
153
153
 
@@ -164,6 +164,7 @@ module Mobilize
164
164
  #check for select at end
165
165
  hql_array = hql.split("\n").reject{|l| l.starts_with?("--") or l.strip.length==0}.join("\n").split(";").map{|h| h.strip}
166
166
  last_statement = hql_array.last
167
+ file_hash = nil
167
168
  if last_statement.to_s.downcase.starts_with?("select")
168
169
  #nil if no prior commands
169
170
  prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
@@ -172,10 +173,10 @@ module Mobilize
172
173
  "drop table if exists #{output_path}",
173
174
  "create table #{output_path} as #{select_hql};"].join(";")
174
175
  full_hql = [prior_hql, output_table_hql].compact.join(";")
175
- result = Hive.run(cluster,full_hql, user_name,params['params'])
176
+ result = Hive.run(cluster,full_hql, user_name,params['params'],file_hash,stage_path)
176
177
  Dataset.find_or_create_by_url(out_url)
177
178
  else
178
- result = Hive.run(cluster, hql, user_name,params['params'])
179
+ result = Hive.run(cluster, hql, user_name,params['params'],file_hash,stage_path)
179
180
  Dataset.find_or_create_by_url(out_url)
180
181
  Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
181
182
  end
@@ -210,7 +211,8 @@ module Mobilize
210
211
  schema_hash
211
212
  end
212
213
 
213
- def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, run_params=nil)
214
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop=false, schema_hash=nil, run_params=nil)
215
+ job_name = stage_path.sub("Runner_","")
214
216
  table_path = [db,table].join(".")
215
217
  table_stats = Hive.table_stats(cluster, db, table, user_name)
216
218
  url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
@@ -226,12 +228,13 @@ module Mobilize
226
228
 
227
229
  #create temporary table so we can identify fields etc.
228
230
  temp_db = Hive.output_db(cluster)
229
- temp_table_name = (source_hql+table_path).to_md5
231
+ temp_table_name = "temp_#{job_name.downcase.alphanunderscore}"
230
232
  temp_table_path = [temp_db,temp_table_name].join(".")
231
233
  temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
232
234
  temp_drop_hql = "drop table if exists #{temp_table_path};"
233
235
  temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
234
- response = Hive.run(cluster,temp_create_hql,user_name,run_params)
236
+ file_hash = nil
237
+ response = Hive.run(cluster,temp_create_hql,user_name,run_params,file_hash,stage_path)
235
238
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
236
239
 
237
240
  source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -361,17 +364,17 @@ module Mobilize
361
364
  #Accepts options to drop existing target if any
362
365
  #also schema with column datatype overrides
363
366
  def Hive.tsv_to_table(cluster, table_path, user_name, source_tsv)
364
- #nil if only header row, or no header row
365
- if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
366
- puts "no data in source_tsv for #{cluster}/#{table_path}"
367
- return nil
368
- end
369
367
  #get rid of freaking carriage return characters
370
368
  if source_tsv.index("\r\n")
371
369
  source_tsv = source_tsv.gsub("\r\n","\n")
372
370
  elsif source_tsv.index("\r")
373
371
  source_tsv = source_tsv.gsub("\r","\n")
374
372
  end
373
+ #nil if only header row, or no header row
374
+ if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
375
+ puts "no data in source_tsv for #{cluster}/#{table_path}"
376
+ return nil
377
+ end
375
378
  source_headers = source_tsv.tsv_header_array
376
379
 
377
380
  #one file only, strip headers, replace tab with ctrl-a for hive
@@ -404,6 +407,7 @@ module Mobilize
404
407
  #return blank response if there are no slots available
405
408
  return nil unless gdrive_slot
406
409
  s = Stage.where(:path=>stage_path).first
410
+ job_name = s.path.sub("Runner_","")
407
411
  params = s.params
408
412
  source = s.sources(gdrive_slot).first
409
413
  target = s.target
@@ -413,7 +417,6 @@ module Mobilize
413
417
  return false unless slot_id
414
418
  #update stage with the node so we can use it
415
419
  user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
416
- job_name = s.path.sub("Runner_","")
417
420
 
418
421
  schema_hash = if params['schema']
419
422
  Hive.schema_hash(params['schema'],stage_path,user_name,gdrive_slot)
@@ -460,15 +463,15 @@ module Mobilize
460
463
  url = if source_hql
461
464
  #include any params (or nil) at the end
462
465
  run_params = params['params']
463
- Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash,run_params)
466
+ Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path,drop, schema_hash,run_params)
464
467
  elsif source_tsv
465
468
  #first write tsv to temp table
466
- temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{stage_path.gridsafe}"
469
+ temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{job_name.downcase.alphanunderscore}"
467
470
  has_data = Hive.tsv_to_table(cluster, temp_table_path, user_name, source_tsv)
468
471
  if has_data
469
472
  #then do the regular insert, with source hql being select * from temp table
470
473
  source_hql = "select * from #{temp_table_path}"
471
- Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
474
+ Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop, schema_hash)
472
475
  else
473
476
  nil
474
477
  end
@@ -1,5 +1,5 @@
1
1
  module Mobilize
2
2
  module Hive
3
- VERSION = "1.361"
3
+ VERSION = "1.363"
4
4
  end
5
5
  end
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
16
16
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
- gem.add_runtime_dependency "mobilize-hdfs","1.361"
19
+ gem.add_runtime_dependency "mobilize-hdfs","1.363"
20
20
  end
@@ -3,32 +3,32 @@
3
3
  active: true
4
4
  trigger: once
5
5
  status: ""
6
- stage1: hive.write target:"mobilize/hive1", partitions:"act_date", drop:true,
6
+ stage1: hive.write keep_logs:true, retries:3, target:"mobilize/hive1", partitions:"act_date", drop:true,
7
7
  source:"Runner_mobilize(test)/hive1.in", schema:"hive1.schema"
8
- stage2: hive.run source:"hive1.sql"
9
- stage3: hive.run hql:"show databases;"
8
+ stage2: hive.run keep_logs:true, retries:3, source:"hive1.sql"
9
+ stage3: hive.run keep_logs:true, retries:3, hql:"show databases;"
10
10
  stage4: gsheet.write source:"stage2", target:"hive1_stage2.out"
11
11
  stage5: gsheet.write source:"stage3", target:"hive1_stage3.out"
12
12
  - name: hive2
13
13
  active: true
14
14
  trigger: after hive1
15
15
  status: ""
16
- stage1: hive.write source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
17
- stage2: hive.run hql:"select * from mobilize.hive2;"
16
+ stage1: hive.write keep_logs:true, retries:3, source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
17
+ stage2: hive.run keep_logs:true, retries:3, hql:"select * from mobilize.hive2;"
18
18
  stage3: gsheet.write source:"stage2", target:"hive2.out"
19
19
  - name: hive3
20
20
  active: true
21
21
  trigger: after hive2
22
22
  status: ""
23
- stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
24
- stage2: hive.write source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
25
- stage3: hive.write hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
23
+ stage1: hive.run keep_logs:true, retries:3, hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
24
+ stage2: hive.write keep_logs:true, retries:3, source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
25
+ stage3: hive.write keep_logs:true, retries:3, hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
26
26
  stage4: gsheet.write source:"hive://mobilize/hive3", target:"hive3.out"
27
27
  - name: hive4
28
28
  active: true
29
29
  trigger: after hive3
30
30
  status: ""
31
- stage1: hive.write source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
32
- stage2: hive.write source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
33
- stage3: hive.run hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
31
+ stage1: hive.write keep_logs:true, retries:3, source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
32
+ stage2: hive.write keep_logs:true, retries:3, source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
33
+ stage3: hive.run keep_logs:true, retries:3, hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
34
34
  stage4: gsheet.write source:stage3, target:"hive4.out"
@@ -18,7 +18,7 @@ describe "Mobilize" do
18
18
  end
19
19
 
20
20
  puts "add/update jobs"
21
- u.jobs.each{|j| j.delete}
21
+ u.jobs.each{|j| j.stages.each{|s| s.delete}; j.delete}
22
22
  jobs_fixture_name = "integration_jobs"
23
23
  jobs_target_url = "gsheet://#{r.title}/jobs"
24
24
  TestHelper.write_fixture(jobs_fixture_name, jobs_target_url, 'update')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mobilize-hive
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.361'
4
+ version: '1.363'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-31 00:00:00.000000000 Z
12
+ date: 2013-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - '='
20
20
  - !ruby/object:Gem::Version
21
- version: '1.361'
21
+ version: '1.363'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - '='
28
28
  - !ruby/object:Gem::Version
29
- version: '1.361'
29
+ version: '1.363'
30
30
  description: Adds hive read, write, and run support to mobilize-hdfs
31
31
  email:
32
32
  - cpaesleme@dena.com
@@ -71,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
71
71
  version: '0'
72
72
  segments:
73
73
  - 0
74
- hash: -2758952284012723637
74
+ hash: 441677985208896084
75
75
  required_rubygems_version: !ruby/object:Gem::Requirement
76
76
  none: false
77
77
  requirements:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
80
  version: '0'
81
81
  segments:
82
82
  - 0
83
- hash: -2758952284012723637
83
+ hash: 441677985208896084
84
84
  requirements: []
85
85
  rubyforge_project:
86
86
  rubygems_version: 1.8.25