mobilize-hive 1.3 → 1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -11
- data/lib/mobilize-hive/handlers/hive.rb +118 -103
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +1 -1
- data/test/hive_job_rows.yml +1 -9
- data/test/mobilize-hive_test.rb +1 -17
- metadata +10 -5
- data/lib/mobilize-hive/helpers/hive_helper.rb +0 -63
data/README.md
CHANGED
@@ -142,17 +142,6 @@ Start
   * cluster and user are optional for all of the below.
     * cluster defaults to the first cluster listed;
     * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
-  * params are also optional for all of the below. They replace HQL in sources.
-    * params are passed as a YML or JSON, as in:
-      * `hive.run source:<source_path>, params:{'date': '2013-03-01', 'unit': 'widgets'}`
-      * this example replaces all the keys, preceded by '@' in all source hqls with the value.
-        * The preceding '@' is used to keep from replacing instances
-          of "date" and "unit" in the HQL; you should have `@date` and `@unit` in your actual HQL
-          if you'd like to replace those tokens.
-    * in addition, the following params are substituted automatically:
-      * `$utc_date` - replaced with YYYY-MM-DD date, UTC
-      * `$utc_time` - replaced with HH:MM time, UTC
-      * any occurrence of these values in HQL will be replaced at runtime.
 * hive.run `hql:<hql> || source:<gsheet_path>, user:<user>, cluster:<cluster>`, which executes the
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
data/lib/mobilize-hive/handlers/hive.rb
CHANGED
@@ -1,9 +1,58 @@
 module Mobilize
   module Hive
-
-
+    def Hive.config
+      Base.config('hive')
+    end
+
+    def Hive.exec_path(cluster)
+      Hive.clusters[cluster]['exec_path']
+    end
+
+    def Hive.output_db(cluster)
+      Hive.clusters[cluster]['output_db']
+    end
+
+    def Hive.output_db_user(cluster)
+      output_db_node = Hadoop.gateway_node(cluster)
+      output_db_user = Ssh.host(output_db_node)['user']
+      output_db_user
+    end
+
+    def Hive.clusters
+      Hive.config['clusters']
+    end
+
+    def Hive.slot_ids(cluster)
+      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
+    end
+
+    def Hive.slot_worker_by_cluster_and_path(cluster,path)
+      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
+      Hive.slot_ids(cluster).each do |slot_id|
+        unless working_slots.include?(slot_id)
+          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
+          return slot_id
+        end
+      end
+      #return false if none are available
+      return false
+    end
+
+    def Hive.unslot_worker_by_path(path)
+      begin
+        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
+        return true
+      rescue
+        return false
+      end
+    end
+
+    def Hive.databases(cluster,user_name)
+      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
     # converts a source path or target path to a dst in the context of handler and stage
-    def Hive.path_to_dst(path,stage_path
+    def Hive.path_to_dst(path,stage_path)
       has_handler = true if path.index("://")
       s = Stage.where(:path=>stage_path).first
       params = s.params
@@ -29,7 +78,7 @@ module Mobilize
         return Dataset.find_or_create_by_url(hive_url)
       end
       #otherwise, use hdfs convention
-      return Ssh.path_to_dst(path,stage_path
+      return Ssh.path_to_dst(path,stage_path)
     end

     def Hive.url_by_path(path,user_name,is_target=false)
@@ -59,7 +108,7 @@ module Mobilize
     def Hive.table_stats(cluster,db,table,user_name)
       describe_sql = "use #{db};describe extended #{table};"
       describe_response = Hive.run(cluster, describe_sql,user_name)
-      return
+      return describe_response if describe_response['stdout'].length==0
       describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
@@ -93,43 +142,20 @@ module Mobilize
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(cluster,hql,user_name,
+    def Hive.run(cluster,hql,user_name,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
       file_hash||= {}
       file_hash[filename] = hql
-      #add in default params
-      params ||= {}
-      params = params.merge(Hive.default_params)
-      #replace any params in the file_hash and command
-      params.each do |k,v|
-        file_hash.each do |name,data|
-          if k.starts_with?("$")
-            data.gsub!(k,v)
-          else
-            data.gsub!("@#{k}",v)
-          end
-        end
-      end
       #silent mode so we don't have logs in stderr; clip output
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-
-      #override exit code 0 when stdout is blank and
-      #stderror contains FAILED or KILLED
-      if response['stdout'].to_s.length == 0 and
-         response['stderr'].to_s.ie{|se| se.index("FAILED") or se.index("KILLED")}
-        response['exit_code'] = 500
-      end
-      return response
+      Ssh.run(gateway_node,command,user_name,file_hash)
     end

     def Hive.run_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
       cluster = params['cluster'] || Hive.clusters.keys.first
@@ -148,16 +174,13 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-        source = s.sources
-        hql = source.read(user_name
+        source = s.sources.first
+        hql = source.read(user_name)
       end

-      Gdrive.unslot_worker_by_path(stage_path)
-
       #check for select at end
       hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
-
-      if last_statement.to_s.starts_with?("select")
+      if hql_array.last.downcase.starts_with?("select")
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
@@ -165,10 +188,10 @@ module Mobilize
                             "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        result = Hive.run(cluster,full_hql, user_name
+        result = Hive.run(cluster,full_hql, user_name)
         Dataset.find_or_create_by_url(out_url)
       else
-        result = Hive.run(cluster, hql, user_name
+        result = Hive.run(cluster, hql, user_name)
         Dataset.find_or_create_by_url(out_url)
         Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
@@ -201,7 +224,7 @@ module Mobilize
       file_name = schema_path.split("/").last
       out_url = "gridfs://#{schema_path}/#{file_name}"
       Dataset.write_by_url(out_url,out_tsv,user_name)
-      schema_tsv = Dataset.find_by_url(out_url).read(user_name
+      schema_tsv = Dataset.find_by_url(out_url).read(user_name)
       schema_hash = {}
       schema_tsv.tsv_to_hash_array.each do |ha|
         schema_hash[ha['name']] = ha['datatype']
@@ -209,10 +232,24 @@ module Mobilize
       schema_hash
     end

-    def Hive.
+    def Hive.path_params(cluster, path, user_name)
+      db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
+      #get existing table stats if any
+      curr_stats = begin
+        Hive.table_stats(cluster, db, table, user_name)
+      rescue
+        nil
+      end
+      {"db"=>db,
+       "table"=>table,
+       "partitions"=>partitions,
+       "curr_stats"=>curr_stats}
+    end
+
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
       table_path = [db,table].join(".")
-
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       source_hql_array = source_hql.split(";")
       last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
@@ -228,10 +265,11 @@ module Mobilize
       temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
       temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
-
-      raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+      Hive.run(cluster,temp_create_hql,user_name)

-
+      source_params = Hive.path_params(cluster, temp_table_path, user_name)
+      source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
+      source_table_stats = source_params['curr_stats']
       source_fields = source_table_stats['field_defs']

       if part_array.length == 0 and
@@ -259,7 +297,7 @@ module Mobilize

         target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

-        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{
+        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

         target_full_hql = [target_name_hql,
                            target_drop_hql,
@@ -267,12 +305,10 @@ module Mobilize
                            target_insert_hql,
                            temp_drop_hql].join

-
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name)

       elsif part_array.length > 0 and
-            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table or same partitions in both target table and user params

         target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}
@@ -316,20 +352,10 @@ module Mobilize

         else
           #get all the permutations of possible partititons
-
-
-          part_perm_hql = part_set_hql + part_select_hql
-          response = Hive.run(cluster, part_perm_hql, user_name, params)
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-          part_perm_tsv = response['stdout']
+          part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
+          part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
           #having gotten the permutations, ensure they are dropped
           part_hash_array = part_perm_tsv.tsv_to_hash_array
-          #make sure there is data
-          if part_hash_array.first.nil? or part_hash_array.first.values.include?(nil)
-            #blank result set, return url
-            return url
-          end
-
           part_drop_hql = part_hash_array.map do |h|
             part_drop_stmt = h.map do |name,value|
               part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
@@ -341,16 +367,16 @@ module Mobilize

         target_insert_hql = "insert overwrite table #{table_path} " +
                             "partition (#{target_part_stmt}) " +
-                            "select #{target_field_stmt},#{target_part_stmt} from #{
+                            "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

         target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name)

       else
         error_msg = "Incompatible partition specs"
         raise error_msg
       end
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
       return url
     end

@@ -358,21 +384,14 @@ module Mobilize
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
     def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
-      return nil if source_tsv.strip.length==0
-      if source_tsv.index("\r\n")
-        source_tsv = source_tsv.gsub("\r\n","\n")
-      elsif source_tsv.index("\r")
-        source_tsv = source_tsv.gsub("\r","\n")
-      end
       source_headers = source_tsv.tsv_header_array

       table_path = [db,table].join(".")
-
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       schema_hash ||= {}

-      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
-
       if part_array.length == 0 and
          table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table
@@ -399,11 +418,10 @@ module Mobilize

         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+        Hive.run(cluster, target_full_hql, user_name, file_hash)

       elsif part_array.length > 0 and
-            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']}
+            table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table
         #or same partitions in both target table and user params
         #or drop and start fresh
@@ -427,17 +445,13 @@ module Mobilize
                                "partitioned by #{partition_defs}"

         #create target table early if not here
-
-        raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
-
-        #return url (operation complete) if there's no data
-        source_hash_array = source_tsv.tsv_to_hash_array
-        return url if source_hash_array.length==1 and source_hash_array.first.values.compact.length==0
+        Hive.run(cluster, target_create_hql, user_name)

         table_stats = Hive.table_stats(cluster, db, table, user_name)

         #create data hash from source hash array
         data_hash = {}
+        source_hash_array = source_tsv.tsv_to_hash_array
         source_hash_array.each do |ha|
           tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
           tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
@@ -470,8 +484,7 @@ module Mobilize
         #run actual partition adds all at once
         if target_part_hql.length>0
           puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
-
-          raise response['stderr'] if response['stderr'].to_s.ie{|s| s.index("FAILED") or s.index("KILLED")}
+          Hive.run(cluster, target_part_hql, user_name)
         end
       else
         error_msg = "Incompatible partition specs: " +
@@ -479,31 +492,33 @@ module Mobilize
                     "user_params:#{part_array.to_s}"
         raise error_msg
       end
-
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
       return url
     end

     def Hive.write_by_stage_path(stage_path)
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
       s = Stage.where(:path=>stage_path).first
       params = s.params
-      source = s.sources
+      source = s.sources.first
       target = s.target
       cluster, db, table = target.url.split("://").last.split("/")
-      #slot Hive worker if available
-      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
-      return false unless slot_id
       #update stage with the node so we can use it
       user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
       job_name = s.path.sub("Runner_","")

+      #slot Hive worker if available
+      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
+      return false unless slot_id
+
       schema_hash = if params['schema']
+                      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                      #return blank response if there are no slots available
+                      return nil unless gdrive_slot
                       Hive.schema_hash(params['schema'],user_name,gdrive_slot)
                     else
                       {}
                     end
+      Gdrive.unslot_worker_by_path(stage_path)
       #drop target before create/insert?
       drop = params['drop']

@@ -516,17 +531,16 @@ module Mobilize
         #source table
         cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
         source_hql = "select * from #{source_path};"
-      elsif ['gsheet','
+      elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
         if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
-          source_hql = source.read(user_name
+          source_hql = source.read(user_name)
         else
-          #tsv from sheet
-          source_tsv = source.read(user_name
+          #tsv from sheet
+          source_tsv = source.read(user_name)
         end
       end
       end

-      Gdrive.unslot_worker_by_path(stage_path)
       part_array = if params['partitions']
                      params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
                    elsif params['target']
@@ -545,14 +559,12 @@ module Mobilize
               Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
             elsif source_tsv
               Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
-            elsif source
-              #null sheet
             else
               raise "Unable to determine source tsv or source hql"
             end
        {'stdout'=>url,'exit_code'=>0}
      rescue => exc
-       {'stderr'=>
+       {'stderr'=>exc.to_s, 'exit_code'=>500}
      end

      #unslot worker and write result
@@ -573,8 +585,11 @@ module Mobilize
       select_hql = "select * from #{source_path};"
       hql = [set_hql,select_hql].join
       response = Hive.run(cluster, hql,user_name)
-
-
+      if response['exit_code']==0
+        return response['stdout']
+      else
+        raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+      end
     end

     def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
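With the changes above, `Hive.run` no longer post-processes results; it returns whatever `Ssh.run` returns, and callers such as `Hive.read_by_dataset_path` check the exit code themselves. A minimal sketch of that calling pattern, assuming (as the read path above implies) a response hash with 'stdout', 'stderr', and 'exit_code' keys; `runner` is a hypothetical stand-in for `Mobilize::Hive.run`:

```ruby
# Sketch of the caller-side error handling used in 1.21: run the HQL,
# then raise with stderr if the exit code is nonzero.
def read_table(runner, cluster, table_path, user_name)
  response = runner.call(cluster, "select * from #{table_path};", user_name)
  raise "Unable to read hive://#{table_path}: #{response['stderr']}" unless response['exit_code'] == 0
  response['stdout']
end

# Stubbed runner for illustration only.
stub = lambda do |_cluster, _hql, _user|
  { 'stdout' => "a\tb\n1\t2\n", 'stderr' => '', 'exit_code' => 0 }
end
puts read_table(stub, 'dev', 'mobilize.hive_test_1', 'mobilize')
```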
data/mobilize-hive.gemspec
CHANGED
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.21"
 end
data/test/hive_job_rows.yml
CHANGED
@@ -20,15 +20,7 @@
   active: true
   trigger: after hive_test_2
   status: ""
-  stage1: hive.run hql:"select
+  stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
   stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
   stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
-- name: hive_test_4
-  active: true
-  trigger: after hive_test_3
-  status: ""
-  stage1: hive.write source:"hive_test_4_stage_1.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage2: hive.write source:"hive_test_4_stage_2.in", target:"mobilize/hive_test_1", partitions:"act_date"
-  stage3: hive.run hql:"select '$utc_date $utc_time' as `date_time`,product,category,value from mobilize.hive_test_1;"
-  stage4: gsheet.write source:stage3, target:"hive_test_4.out"
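The `partitions:"date/product"` and `partitions:"act_date"` values in these job rows are what `Hive.write_by_stage_path` flattens into a partition-name array before calling `hql_to_table`/`tsv_to_table` (see the `part_array = ...` hunk above). A minimal sketch of that conversion; `Array()` stands in here for the `to_a` coercion the gem applies to the raw param:

```ruby
# Sketch: "date/product" (or "date.product") becomes ["date", "product"].
def part_array_from_param(partitions_param)
  Array(partitions_param).map { |p| p.gsub(".", "/").split("/") }.flatten
end

p part_array_from_param("date/product")  # => ["date", "product"]
p part_array_from_param("act_date")      # => ["act_date"]
```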
data/test/mobilize-hive_test.rb
CHANGED
@@ -25,18 +25,6 @@ describe "Mobilize" do
     hive_1_in_tsv = YAML.load_file("#{Mobilize::Base.root}/test/hive_test_1_in.yml").hash_array_to_tsv
     hive_1_in_sheet.write(hive_1_in_tsv,Mobilize::Gdrive.owner_name)

-    #create blank sheet
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-    [hive_4_stage_1_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_1_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_1.in",gdrive_slot)
-
-    #create sheet w just headers
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    [hive_4_stage_2_in_sheet].each {|s| s.delete if s}
-    hive_4_stage_2_in_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4_stage_2.in",gdrive_slot)
-    hive_4_stage_2_in_sheet_header = hive_1_in_tsv.tsv_header_array.join("\t")
-    hive_4_stage_2_in_sheet.write(hive_4_stage_2_in_sheet_header,Mobilize::Gdrive.owner_name)
-
     hive_1_schema_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
     [hive_1_schema_sheet].each {|s| s.delete if s}
     hive_1_schema_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1.schema",gdrive_slot)
@@ -63,25 +51,21 @@ describe "Mobilize" do
     [hive_2_target_sheet].each{|s| s.delete if s}
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)
-    [hive_4_target_sheet].each{|s| s.delete if s}

     puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-    wait_for_stages(
+    wait_for_stages(1200)

     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
     hive_1_stage_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_3.out",gdrive_slot)
     hive_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_2.out",gdrive_slot)
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
-    hive_4_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_4.out",gdrive_slot)

     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
     assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
-    assert hive_4_target_sheet.read(u.name).length == 432
   end

   def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.21'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-03-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.21'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
 - cpaesleme@dena.com
@@ -41,7 +41,6 @@ files:
 - Rakefile
 - lib/mobilize-hive.rb
 - lib/mobilize-hive/handlers/hive.rb
-- lib/mobilize-hive/helpers/hive_helper.rb
 - lib/mobilize-hive/tasks.rb
 - lib/mobilize-hive/version.rb
 - lib/samples/hive.yml
@@ -65,12 +64,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
+      segments:
+      - 0
+      hash: -4590609456874633429
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
data/lib/mobilize-hive/helpers/hive_helper.rb
DELETED
@@ -1,63 +0,0 @@
-module Mobilize
-  module Hive
-    def self.config
-      Base.config('hive')
-    end
-
-    def self.exec_path(cluster)
-      self.clusters[cluster]['exec_path']
-    end
-
-    def self.output_db(cluster)
-      self.clusters[cluster]['output_db']
-    end
-
-    def self.output_db_user(cluster)
-      output_db_node = Hadoop.gateway_node(cluster)
-      output_db_user = Ssh.host(output_db_node)['user']
-      output_db_user
-    end
-
-    def self.clusters
-      self.config['clusters']
-    end
-
-    def self.slot_ids(cluster)
-      (1..self.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
-    end
-
-    def self.slot_worker_by_cluster_and_path(cluster,path)
-      working_slots = Mobilize::Resque.jobs.map{|j| begin j['args'][1]['hive_slot'];rescue;nil;end}.compact.uniq
-      self.slot_ids(cluster).each do |slot_id|
-        unless working_slots.include?(slot_id)
-          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
-          return slot_id
-        end
-      end
-      #return false if none are available
-      return false
-    end
-
-    def self.unslot_worker_by_path(path)
-      begin
-        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
-        return true
-      rescue
-        return false
-      end
-    end
-
-    def self.databases(cluster,user_name)
-      self.run(cluster,"show databases",user_name)['stdout'].split("\n")
-    end
-
-    def self.default_params
-      time = Time.now.utc
-      {
-        '$utc_date'=>time.strftime("%Y-%m-%d"),
-        '$utc_time'=>time.strftime("%H:%M"),
-      }
-    end
-  end
-end
-