mobilize-hive 1.298 → 1.299

data/README.md CHANGED
@@ -142,6 +142,17 @@ Start
  * cluster and user are optional for all of the below.
  * cluster defaults to the first cluster listed;
  * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
+ * params are also optional for all of the below. They replace tokens in source HQL.
+ * params are passed as YML or JSON, as in:
+ * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
+ * this example replaces every key prefixed with '@' in all source HQL with the corresponding value.
+ * the preceding '@' keeps plain occurrences of "date" and "unit"
+ in the HQL from being replaced; use `@date` and `@unit` in your actual HQL
+ wherever you want those tokens substituted.
+ * in addition, the following params are substituted automatically:
+ * `$utc_date` - replaced with YYYY-MM-DD date, UTC
+ * `$utc_time` - replaced with HH:MM time, UTC
+ * any occurrence of these values in HQL will be replaced at runtime.
  * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
  script in the hql or source sheet and returns any output specified at the
  end. If the cmd or last query in source is a select statement, column headers will be
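As a quick illustration of the substitution rules described in the README hunk above, here is a minimal sketch in plain Ruby; the `hql` string and the param values are hypothetical, and the snippet mirrors the documented behavior rather than quoting the gem's internals.

```ruby
# Minimal sketch (hypothetical HQL and params): '@'-prefixed keys come from
# user-supplied params; '$utc_date'/'$utc_time' are filled in automatically.
hql = "select * from mobilize.sales " \
      "where sale_date = '@date' and unit = '@unit' " \
      "and loaded_before = '$utc_date $utc_time';"

params = { 'date' => '2013-03-01', 'unit' => 'widgets' }

# user params are matched with a '@' prefix, so bare words like "date" survive
params.each { |k, v| hql = hql.gsub("@#{k}", v) }

# automatic params are matched literally, '$' prefix included
now = Time.now.utc
hql = hql.gsub('$utc_date', now.strftime('%Y-%m-%d'))
hql = hql.gsub('$utc_time', now.strftime('%H:%M'))

puts hql
# select * from mobilize.sales where sale_date = '2013-03-01'
# and unit = 'widgets' and loaded_before = '2013-03-01 12:34';
# (the timestamp depends on when it runs)
```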
data/lib/mobilize-hive/handlers/hive.rb CHANGED
@@ -1,56 +1,7 @@
  module Mobilize
  module Hive
- def Hive.config
- Base.config('hive')
- end
-
- def Hive.exec_path(cluster)
- Hive.clusters[cluster]['exec_path']
- end
-
- def Hive.output_db(cluster)
- Hive.clusters[cluster]['output_db']
- end
-
- def Hive.output_db_user(cluster)
- output_db_node = Hadoop.gateway_node(cluster)
- output_db_user = Ssh.host(output_db_node)['user']
- output_db_user
- end
-
- def Hive.clusters
- Hive.config['clusters']
- end
-
- def Hive.slot_ids(cluster)
- (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
- end
-
- def Hive.slot_worker_by_cluster_and_path(cluster,path)
- working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
- Hive.slot_ids(cluster).each do |slot_id|
- unless working_slots.include?(slot_id)
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
- return slot_id
- end
- end
- #return false if none are available
- return false
- end
-
- def Hive.unslot_worker_by_path(path)
- begin
- Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
- return true
- rescue
- return false
- end
- end
-
- def Hive.databases(cluster,user_name)
- Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
- end
-
+ #adds convenience methods
+ require "#{File.dirname(__FILE__)}/../helpers/hive_helper"
  # converts a source path or target path to a dst in the context of handler and stage
  def Hive.path_to_dst(path,stage_path,gdrive_slot)
  has_handler = true if path.index("://")
@@ -142,12 +93,25 @@ module Mobilize
  end

  #run a generic hive command, with the option of passing a file hash to be locally available
- def Hive.run(cluster,hql,user_name,file_hash=nil)
+ def Hive.run(cluster,hql,user_name,params=nil,file_hash=nil)
  # no TempStatsStore
  hql = "set hive.stats.autogather=false;#{hql}"
  filename = hql.to_md5
  file_hash||= {}
  file_hash[filename] = hql
+ #add in default params
+ params ||= {}
+ params.merge!(Ssh.default_params)
+ #replace any params in the file_hash and command
+ params.each do |k,v|
+ file_hash.each do |name,data|
+ if k.start_with?("$")
+ data.gsub!(k,v)
+ else
+ data.gsub!("@#{k}",v)
+ end
+ end
+ end
  #silent mode so we don't have logs in stderr; clip output
  #at hadoop read limit
  command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
@@ -201,10 +165,10 @@ module Mobilize
  "drop table if exists #{output_path}",
  "create table #{output_path} as #{select_hql};"].join(";")
  full_hql = [prior_hql, output_table_hql].compact.join(";")
- result = Hive.run(cluster,full_hql, user_name)
+ result = Hive.run(cluster,full_hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  else
- result = Hive.run(cluster, hql, user_name)
+ result = Hive.run(cluster, hql, user_name,params['params'])
  Dataset.find_or_create_by_url(out_url)
  Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
  end
@@ -245,7 +209,7 @@ module Mobilize
  schema_hash
  end

- def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+ def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil, params=nil)
  table_path = [db,table].join(".")
  table_stats = Hive.table_stats(cluster, db, table, user_name)
  url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
@@ -264,7 +228,7 @@ module Mobilize
  temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
  temp_drop_hql = "drop table if exists #{temp_table_path};"
  temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
- response = Hive.run(cluster,temp_create_hql,user_name)
+ response = Hive.run(cluster,temp_create_hql,user_name,params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  source_table_stats = Hive.table_stats(cluster,temp_db,temp_table_name,user_name)
@@ -303,7 +267,7 @@ module Mobilize
  target_insert_hql,
  temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)

  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

@@ -355,7 +319,7 @@ module Mobilize
  part_set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name} (permutations);"
  part_select_hql = "select distinct #{target_part_stmt} from #{temp_table_path};"
  part_perm_hql = part_set_hql + part_select_hql
- response = Hive.run(cluster, part_perm_hql, user_name)
+ response = Hive.run(cluster, part_perm_hql, user_name, params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  part_perm_tsv = response['stdout']
  #having gotten the permutations, ensure they are dropped
@@ -381,7 +345,7 @@ module Mobilize

  target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

- response = Hive.run(cluster, target_full_hql, user_name)
+ response = Hive.run(cluster, target_full_hql, user_name, params)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
  else
  error_msg = "Incompatible partition specs"
@@ -435,7 +399,7 @@ module Mobilize

  target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

- response = Hive.run(cluster, target_full_hql, user_name, file_hash)
+ response = Hive.run(cluster, target_full_hql, user_name, nil, file_hash)
  raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}

  elsif part_array.length > 0 and
data/lib/mobilize-hive/helpers/hive_helper.rb ADDED
@@ -0,0 +1,55 @@
+ module Mobilize
+ module Hive
+ def self.config
+ Base.config('hive')
+ end
+
+ def self.exec_path(cluster)
+ self.clusters[cluster]['exec_path']
+ end
+
+ def self.output_db(cluster)
+ self.clusters[cluster]['output_db']
+ end
+
+ def self.output_db_user(cluster)
+ output_db_node = Hadoop.gateway_node(cluster)
+ output_db_user = Ssh.host(output_db_node)['user']
+ output_db_user
+ end
+
+ def self.clusters
+ self.config['clusters']
+ end
+
+ def self.slot_ids(cluster)
+ (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+ end
+
+ def self.slot_worker_by_cluster_and_path(cluster,path)
+ working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+ self.slot_ids(cluster).each do |slot_id|
+ unless working_slots.include?(slot_id)
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+ return slot_id
+ end
+ end
+ #return false if none are available
+ return false
+ end
+
+ def self.unslot_worker_by_path(path)
+ begin
+ Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+ return true
+ rescue
+ return false
+ end
+ end
+
+ def self.databases(cluster,user_name)
+ self.run(cluster,"show databases",user_name)['stdout'].split("\n")
+ end
+ end
+ end
+
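The helper methods move over verbatim from hive.rb. As a quick grounding of the `slot_ids` definition above, a cluster configured with `max_slots: 3` (a hypothetical config value) would enumerate:

```ruby
Mobilize::Hive.slot_ids('dev')
# => ["dev_1", "dev_2", "dev_3"]
```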
data/lib/mobilize-hive/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Mobilize
  module Hive
- VERSION = "1.298"
+ VERSION = "1.299"
  end
  end
data/test/hive_job_rows.yml CHANGED
@@ -20,7 +20,7 @@
  active: true
  trigger: after hive_test_2
  status: ""
- stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
+ stage1: hive.run hql:"select '@date' as `date`,product,category,value from mobilize.hive_test_1;", params:{'date':'2013-01-01'}
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: mobilize-hive
  version: !ruby/object:Gem::Version
- version: '1.298'
+ version: '1.299'
  prerelease:
  platform: ruby
  authors:
@@ -41,6 +41,7 @@ files:
  - Rakefile
  - lib/mobilize-hive.rb
  - lib/mobilize-hive/handlers/hive.rb
+ - lib/mobilize-hive/helpers/hive_helper.rb
  - lib/mobilize-hive/tasks.rb
  - lib/mobilize-hive/version.rb
  - lib/samples/hive.yml
@@ -66,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: 1394133607903248824
+ hash: -3388772007190329704
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
@@ -75,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: 1394133607903248824
+ hash: -3388772007190329704
  requirements: []
  rubyforge_project:
  rubygems_version: 1.8.25