mobilize-hive 1.361 → 1.363
@@ -93,7 +93,7 @@ module Mobilize
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
+    def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil,stage_path=nil)
       preps = Hive.prepends.map do |p|
         prefix = "set "
         suffix = ";"
@@ -103,7 +103,7 @@ module Mobilize
         prep_out
       end.join
       hql = "#{preps}#{hql}"
-      filename = hql
+      filename = "hql"
       file_hash||= {}
       file_hash[filename] = hql
       params ||= {}
@@ -121,9 +121,9 @@ module Mobilize
       end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
-      command = "#{Hive.exec_path(cluster)} -
+      command = "#{Hive.exec_path(cluster)} -f #{filename}"
       gateway_node = Hadoop.gateway_node(cluster)
-      response = Ssh.run(gateway_node,command,user_name,file_hash)
+      response = Ssh.run(gateway_node,command,user_name,stage_path,file_hash)
       #override exit code 0 when stdout is blank and
       #stderror contains FAILED or KILLED
       if response['stdout'].to_s.length == 0 and
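Taken together, these hunks change how the HQL reaches the cluster: the statement is now written into file_hash under the key "hql", shipped to the gateway node, and executed via -f, while the new stage_path argument is threaded through to Ssh.run. A minimal sketch of the new call shape, with invented cluster, user, and stage names:

  # hypothetical invocation of the 1.363 signature; all names are made up
  result = Mobilize::Hive.run(
    "dev",                               # cluster alias
    "show databases;",                   # hql; Hive.run stores this in file_hash["hql"]
    "etl_user",                          # user_name for the ssh session
    nil,                                 # params (optional)
    nil,                                 # file_hash; defaults to {} inside Hive.run
    "Runner_mobilize(test)/hive1.stage3" # stage_path, now forwarded to Ssh.run
  )
  puts result['stdout']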
@@ -147,7 +147,7 @@ module Mobilize
       return false unless slot_id

       #output table stores stage output
-      output_db,output_table = [Hive.output_db(cluster),
+      output_db,output_table = [Hive.output_db(cluster),job_name.downcase.alphanunderscore]
       output_path = [output_db,output_table].join(".")
       out_url = "hive://#{cluster}/#{output_db}/#{output_table}"

@@ -164,6 +164,7 @@ module Mobilize
       #check for select at end
       hql_array = hql.split("\n").reject{|l| l.starts_with?("--") or l.strip.length==0}.join("\n").split(";").map{|h| h.strip}
       last_statement = hql_array.last
+      file_hash = nil
       if last_statement.to_s.downcase.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
@@ -172,10 +173,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name,params['params'])
+        result = Hive.run(cluster,full_hql, user_name,params['params'],file_hash,stage_path)
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name,params['params'])
+        result = Hive.run(cluster, hql, user_name,params['params'],file_hash,stage_path)
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -210,7 +211,8 @@ module Mobilize
       schema_hash
     end

-    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop=false, schema_hash=nil, run_params=nil)
+      job_name = stage_path.sub("Runner_","")
       table_path = [db,table].join(".")
       table_stats = Hive.table_stats(cluster, db, table, user_name)
       url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
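Hive.hql_to_table now derives its working names from the stage path rather than taking them as loose arguments. For illustration only (the path is invented; alphanunderscore is the String helper from mobilize-base that strips everything except letters, digits, and underscores):

  stage_path = "Runner_mobilize(test)/hive1.stage2"   # hypothetical
  job_name   = stage_path.sub("Runner_","")           # => "mobilize(test)/hive1.stage2"
  job_name.downcase.alphanunderscore                  # => "mobilizetesthive1stage2"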
@@ -226,12 +228,13 @@ module Mobilize

       #create temporary table so we can identify fields etc.
       temp_db = Hive.output_db(cluster)
-      temp_table_name =
+      temp_table_name = "temp_#{job_name.downcase.alphanunderscore}"
       temp_table_path = [temp_db,temp_table_name].join(".")
       temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
       temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-
+      file_hash = nil
+      response = Hive.run(cluster,temp_create_hql,user_name,run_params,file_hash,stage_path)
       raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

       source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -361,17 +364,17 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, table_path, user_name, source_tsv)
-      #nil if only header row, or no header row
-      if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
-        puts "no data in source_tsv for #{cluster}/#{table_path}"
-        return nil
-      end
       #get rid of freaking carriage return characters
       if source_tsv.index("\r\n")
         source_tsv = source_tsv.gsub("\r\n","\n")
       elsif source_tsv.index("\r")
         source_tsv = source_tsv.gsub("\r","\n")
       end
+      #nil if only header row, or no header row
+      if source_tsv.strip.length==0 or source_tsv.strip.split("\n").length<=1
+        puts "no data in source_tsv for #{cluster}/#{table_path}"
+        return nil
+      end
       source_headers = source_tsv.tsv_header_array

       #one file only, strip headers, replace tab with ctrl-a for hive
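The reorder here is a real fix, not cosmetic: the emptiness check splits on "\n", so a TSV using bare carriage returns used to look like a single header-only row and was rejected before normalization could run. A quick sketch with invented data:

  source_tsv = "date\tvalue\r2013-01-01\t42\r2013-01-02\t7"
  source_tsv.strip.split("\n").length                  # => 1; old order saw "header only" and bailed
  source_tsv.gsub("\r","\n").strip.split("\n").length  # => 3; new order normalizes first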
@@ -404,6 +407,7 @@ module Mobilize
       #return blank response if there are no slots available
       return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
+      job_name = s.path.sub("Runner_","")
       params = s.params
       source = s.sources(gdrive_slot).first
       target = s.target
@@ -413,7 +417,6 @@ module Mobilize
       return false unless slot_id
       #update stage with the node so we can use it
       user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
-      job_name = s.path.sub("Runner_","")

       schema_hash = if params['schema']
         Hive.schema_hash(params['schema'],stage_path,user_name,gdrive_slot)
@@ -460,15 +463,15 @@ module Mobilize
       url = if source_hql
         #include any params (or nil) at the end
         run_params = params['params']
-        Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+        Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path,drop, schema_hash,run_params)
       elsif source_tsv
         #first write tsv to temp table
-        temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{
+        temp_table_path = "#{Hive.output_db(cluster)}.temptsv_#{job_name.downcase.alphanunderscore}"
         has_data = Hive.tsv_to_table(cluster, temp_table_path, user_name, source_tsv)
         if has_data
           #then do the regular insert, with source hql being select * from temp table
           source_hql = "select * from #{temp_table_path}"
-          Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name,
+          Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, stage_path, drop, schema_hash)
         else
           nil
         end
data/mobilize-hive.gemspec CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.363"
 end
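Note the exact '=' pin: mobilize-hive 1.363 requires the matching mobilize-hdfs 1.363 release, and the same constraint appears twice more in the gem metadata below.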
@@ -3,32 +3,32 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive1", partitions:"act_date", drop:true,
+  stage1: hive.write keep_logs:true, retries:3, target:"mobilize/hive1", partitions:"act_date", drop:true,
           source:"Runner_mobilize(test)/hive1.in", schema:"hive1.schema"
-  stage2: hive.run source:"hive1.sql"
-  stage3: hive.run hql:"show databases;"
+  stage2: hive.run keep_logs:true, retries:3, source:"hive1.sql"
+  stage3: hive.run keep_logs:true, retries:3, hql:"show databases;"
   stage4: gsheet.write source:"stage2", target:"hive1_stage2.out"
   stage5: gsheet.write source:"stage3", target:"hive1_stage3.out"
 - name: hive2
   active: true
   trigger: after hive1
   status: ""
-  stage1: hive.write source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
-  stage2: hive.run hql:"select * from mobilize.hive2;"
+  stage1: hive.write keep_logs:true, retries:3, source:"hdfs://user/mobilize/test/hdfs1.out", target:"mobilize.hive2", drop:true
+  stage2: hive.run keep_logs:true, retries:3, hql:"select * from mobilize.hive2;"
   stage3: gsheet.write source:"stage2", target:"hive2.out"
 - name: hive3
   active: true
   trigger: after hive2
   status: ""
-  stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
-  stage2: hive.write source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
-  stage3: hive.write hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
+  stage1: hive.run keep_logs:true, retries:3, hql:"select '@date' as `date`,product,category,value from mobilize.hive1;", params:{'date':'2013-01-01'}
+  stage2: hive.write keep_logs:true, retries:3, source:"stage1",target:"mobilize/hive3", partitions:"date/product", drop:true
+  stage3: hive.write keep_logs:true, retries:3, hql:"select * from mobilize.hive3;",target:"mobilize/hive3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive3", target:"hive3.out"
 - name: hive4
   active: true
   trigger: after hive3
   status: ""
-  stage1: hive.write source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
-  stage2: hive.write source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
-  stage3: hive.run hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
+  stage1: hive.write keep_logs:true, retries:3, source:"hive4_stage1.in", target:"mobilize/hive1", partitions:"act_date"
+  stage2: hive.write keep_logs:true, retries:3, source:"hive4_stage2.in", target:"mobilize/hive1", partitions:"act_date"
+  stage3: hive.run keep_logs:true, retries:3, hql:"select '@date $utc_time' as `date_time`,product,category,value from mobilize.hive1;", params:{'date':'$utc_date'}
   stage4: gsheet.write source:stage3, target:"hive4.out"
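Every hive.run and hive.write stage in the fixture now passes keep_logs:true, retries:3. These read as stage-level retry and log-retention parameters, presumably supported by the mobilize-base line this release depends on, so the integration jobs exercise them on each hive stage.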
@@ -18,7 +18,7 @@ describe "Mobilize" do
     end

     puts "add/update jobs"
-    u.jobs.each{|j| j.delete}
+    u.jobs.each{|j| j.stages.each{|s| s.delete}; j.delete}
     jobs_fixture_name = "integration_jobs"
     jobs_target_url = "gsheet://#{r.title}/jobs"
     TestHelper.write_fixture(jobs_fixture_name, jobs_target_url, 'update')
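The teardown now removes each job's stages before the job itself. Since stages are stored as their own records looked up by path (see Stage.where(:path=>stage_path) above), deleting only the job would leave orphaned stage documents behind between test runs.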
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.363'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.363'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.363'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -71,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 441677985208896084
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 441677985208896084
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25