mobilize-hive 1.0.11 → 1.2
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries; it is provided for informational purposes only.
- data/README.md +4 -2
- data/lib/mobilize-hive/handlers/hive.rb +249 -171
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +2 -2
- data/test/hive_job_rows.yml +3 -3
- data/test/mobilize-hive_test.rb +28 -3
- metadata +5 -5
data/README.md
CHANGED

@@ -146,7 +146,7 @@ Start
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
   returned as well.
-* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
+* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, partitions:<partition_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
   which writes the source or query result to the selected hive table.
   * hive_path
     * should be of the form `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
@@ -156,8 +156,10 @@ Start
     * if the file ends in .*ql, it's treated the same as passing hql
     * otherwise it is treated as a tsv with the first row as column headers
   * target:
-    *
+    * Should be a hive_path, as in `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
+  * partitions:
     * Due to Hive limitation, partition names CANNOT be reserved keywords when writing from tsv (gsheet or hdfs source)
+    * Partitions should be specified as a path, as in partitions:`<partition1>/<partition2>`.
   * schema:
     * optional. gsheet_path to column schema.
     * two columns: name, datatype
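For illustration only, a hive.write stage that exercises the new target, partitions, and drop parameters together (copied from the test fixture data/test/hive_job_rows.yml later in this diff) looks like:

    hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true, source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"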
data/lib/mobilize-hive/handlers/hive.rb
CHANGED

@@ -47,10 +47,69 @@ module Mobilize
     end
   end
 
+  def Hive.databases(cluster,user_name)
+    Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+  end
+
+  # converts a source path or target path to a dst in the context of handler and stage
+  def Hive.path_to_dst(path,stage_path)
+    has_handler = true if path.index("://")
+    s = Stage.where(:path=>stage_path).first
+    params = s.params
+    target_path = params['target']
+    cluster = params['cluster'] if Hadoop.clusters.include?(params['cluster'].to_s)
+    is_target = true if path == target_path
+    red_path = path.split("://").last
+    first_path_node = red_path.gsub(".","/").split("/").first
+    cluster ||= Hadoop.clusters.include?(first_path_node) ? first_path_node : Hadoop.default_cluster
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    #save some time on targets
+    databases = Hive.databases(cluster,user_name) unless is_target
+    #is user has a handler, is specifying a target,
+    #or their first path node is a cluster name
+    #or their first path node is actually a database
+    #assume it's a hive pointer
+    if is_target or
+      has_handler or
+      Hadoop.clusters.include?(first_path_node) or
+      databases.include?(first_path_node)
+      #make sure cluster is legit
+      hive_url = Hive.url_by_path(red_path,user_name,is_target)
+      return Dataset.find_or_create_by_url(hive_url)
+    end
+    #otherwise, use hdfs convention
+    return Ssh.path_to_dst(path,stage_path)
+  end
+
+  def Hive.url_by_path(path,user_name,is_target=false)
+    red_path = path.gsub(".","/")
+    cluster = red_path.split("/").first.to_s
+    if Hadoop.clusters.include?(cluster)
+      #cut node out of path
+      red_path = red_path.split("/")[1..-1].join("/")
+    else
+      cluster = Hadoop.default_cluster
+    end
+    db, table = red_path.split("/")[0..-1]
+    url = "hive://#{cluster}/#{db}/#{table}"
+    begin
+      #add table stats check only if not target
+      if is_target or Hive.table_stats(cluster, db, table, user_name)['stderr'].to_s.length == 0
+        return url
+      else
+        raise "Unable to find #{url} with error: #{stat_response['stderr']}"
+      end
+    rescue => exc
+      raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
+    end
+  end
+
   #get field names and partition datatypes and size of a hive table
-  def Hive.table_stats(db,table,
-    describe_sql = "use #{db};describe extended #{table}"
-
+  def Hive.table_stats(cluster,db,table,user_name)
+    describe_sql = "use #{db};describe extended #{table};"
+    describe_response = Hive.run(cluster, describe_sql,user_name)
+    return describe_response if describe_response['stdout'].length==0
+    describe_output = describe_response['stdout']
     describe_output.split("location:").last.split(",").first
     #get location, fields, partitions
     result_hash = {}
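The new methods above thread the cluster and the resolved user name through every call. A minimal sketch of how the 1.2 signatures compose, assuming a configured Mobilize deployment; the cluster name "dev_cluster" and user "etl_user" below are hypothetical placeholders:

    # "dev_cluster" and "etl_user" are hypothetical placeholder values
    # list the databases visible on the cluster
    dbs = Mobilize::Hive.databases("dev_cluster", "etl_user")
    # field defs, partitions, location, and size for one table
    stats = Mobilize::Hive.table_stats("dev_cluster", "mobilize", "hive_test_1", "etl_user")
    # run arbitrary HQL; the result is a hash with 'stdout', 'stderr', and 'exit_code'
    result = Mobilize::Hive.run("dev_cluster", "show tables;", "etl_user")
    puts result['stdout'] if result['exit_code'] == 0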
@@ -78,12 +137,12 @@ module Mobilize
     #assign field defs after removing partitions
     result_hash['field_defs'] = field_defs
     #get size
-    result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",
+    result_hash['size'] = Hadoop.run(cluster,"fs -dus #{result_hash['location']}",user_name)['stdout'].split("\t").last.strip.to_i
     return result_hash
   end
 
   #run a generic hive command, with the option of passing a file hash to be locally available
-  def Hive.run(hql,
+  def Hive.run(cluster,hql,user_name,file_hash=nil)
     # no TempStatsStore
     hql = "set hive.stats.autogather=false;#{hql}"
     filename = hql.to_md5
@@ -93,22 +152,15 @@ module Mobilize
     #at hadoop read limit
     command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
     gateway_node = Hadoop.gateway_node(cluster)
-    Ssh.run(gateway_node,command,
+    Ssh.run(gateway_node,command,user_name,file_hash)
   end
 
   def Hive.run_by_stage_path(stage_path)
     s = Stage.where(:path=>stage_path).first
-    u = s.job.runner.user
     params = s.params
-    user = params['user']
     cluster = params['cluster'] || Hive.clusters.keys.first
-
-
-      raise "#{u.name} does not have su permissions for #{node}"
-    elsif user.nil? and Ssh.su_all_users(node)
-      user = u.name
-    end
-
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    job_name = s.path.sub("Runner_","")
     #slot Hive worker if available
     slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
     return false unless slot_id
@@ -122,13 +174,8 @@ module Mobilize
     if params['hql']
       hql = params['hql']
     else
-
-
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
-      source_dst = s.source_dsts(gdrive_slot).first
-      Gdrive.unslot_worker_by_path(stage_path)
-      hql = source_dst.read(user)
+      source = s.sources.first
+      hql = source.read(user_name)
     end
 
     #check for select at end
@@ -137,55 +184,59 @@ module Mobilize
     #nil if no prior commands
     prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
     select_hql = hql_array.last
-    output_table_hql = ["
+    output_table_hql = ["set mapred.job.name=#{job_name};",
+                        "drop table if exists #{output_path}",
                         "create table #{output_path} as #{select_hql};"].join(";")
     full_hql = [prior_hql, output_table_hql].compact.join(";")
-    Hive.run(full_hql,
-    #already populated, make sure dataset exists
+    result = Hive.run(cluster,full_hql, user_name)
     Dataset.find_or_create_by_url(out_url)
   else
-
-
-    Dataset.write_by_url(out_url,
+    result = Hive.run(cluster, hql, user_name)
+    Dataset.find_or_create_by_url(out_url)
+    Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
   end
   #unslot worker
   Hive.unslot_worker_by_path(stage_path)
-
+  response = {}
+  response['out_url'] = out_url
+  response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+  response['signal'] = result['exit_code']
+  response
   end
 
-  def Hive.schema_hash(schema_path,
+  def Hive.schema_hash(schema_path,user_name,gdrive_slot)
     if schema_path.index("/")
       #slashes mean sheets
-      out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(
+      out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user_name)
     else
-      u = User.where(:name=>
+      u = User.where(:name=>user_name).first
       #check sheets in runner
       r = u.runner
       runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
       out_tsv = if runner_sheet
-                  runner_sheet.read(
+                  runner_sheet.read(user_name)
                 else
                   #check for gfile. will fail if there isn't one.
-                  Gfile.find_by_path(schema_path).read(
+                  Gfile.find_by_path(schema_path).read(user_name)
                 end
-      #use Gridfs to cache gdrive results
-      file_name = schema_path.split("/").last
-      out_url = "gridfs://#{schema_path}/#{file_name}"
-      Dataset.write_by_url(out_url,out_tsv,user)
-      schema_tsv = Dataset.find_by_url(out_url).read(user)
-      schema_hash = {}
-      schema_tsv.tsv_to_hash_array.each do |ha|
-        schema_hash[ha['name']] = ha['datatype']
-      end
-      schema_hash
     end
+    #use Gridfs to cache gdrive results
+    file_name = schema_path.split("/").last
+    out_url = "gridfs://#{schema_path}/#{file_name}"
+    Dataset.write_by_url(out_url,out_tsv,user_name)
+    schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+    schema_hash = {}
+    schema_tsv.tsv_to_hash_array.each do |ha|
+      schema_hash[ha['name']] = ha['datatype']
+    end
+    schema_hash
   end
 
-  def Hive.path_params(cluster, path,
+  def Hive.path_params(cluster, path, user_name)
     db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
     #get existing table stats if any
     curr_stats = begin
-      Hive.table_stats(db, table,
+      Hive.table_stats(cluster, db, table, user_name)
     rescue
       nil
     end
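Note the new return contract in the hunk above: a hive.run stage now reports a structured response instead of raw output. A rough illustration of how a caller might consume it; the stage path below is a hypothetical placeholder:

    # "Runner_mobilize(test)/jobs/hive_test_1/stage2" is a hypothetical stage path
    response = Mobilize::Hive.run_by_stage_path("Runner_mobilize(test)/jobs/hive_test_1/stage2")
    if response && response['signal'] == 0
      puts "output dataset: #{response['out_url']}"
    else
      puts "stage failed, error dataset: #{response['err_url']}"
    end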
@@ -195,27 +246,34 @@ module Mobilize
              "curr_stats"=>curr_stats}
   end
 
-  def Hive.hql_to_table(cluster, source_hql,
-
-
-
-
+  def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+    table_path = [db,table].join(".")
+    target_params = Hive.path_params(cluster, table_path, user_name)
+    table_stats = target_params['curr_stats']
+
+    source_hql_array = source_hql.split(";")
+    last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
+    #find the last select query -- it should be used for the temp table creation
+    last_select_hql = (source_hql_array[last_select_i..-1].join(";")+";")
+    #if there is anything prior to the last select, add it in prior to table creation
+    prior_hql = ((source_hql_array[0..(last_select_i-1)].join(";")+";") if last_select_i and last_select_i>=1).to_s
 
     #create temporary table so we can identify fields etc.
     temp_db = Hive.output_db(cluster)
-    temp_table_name = (source_hql+
+    temp_table_name = (source_hql+table_path).to_md5
     temp_table_path = [temp_db,temp_table_name].join(".")
+    temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
     temp_drop_hql = "drop table if exists #{temp_table_path};"
-    temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{
-    Hive.run(temp_create_hql,
+    temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
+    Hive.run(cluster,temp_create_hql,user_name)
 
-    source_params = Hive.path_params(cluster, temp_table_path,
+    source_params = Hive.path_params(cluster, temp_table_path, user_name)
     source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
     source_table_stats = source_params['curr_stats']
     source_fields = source_table_stats['field_defs']
 
-    if
-
+    if part_array.length == 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
       #no partitions in either user params or the target table
 
       target_headers = source_fields.map{|f| f['name']}
@@ -233,21 +291,27 @@ module Mobilize
       end.join(",")})"
 
       #always drop when no partititons
-
+      target_name_hql = "set mapred.job.name=#{job_name};"
+
+      target_drop_hql = "drop table if exists #{table_path};"
 
-      target_create_hql = "create table if not exists #{
+      target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"
 
-      target_insert_hql = "insert overwrite table #{
+      target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"
 
-      target_full_hql = [
+      target_full_hql = [target_name_hql,
+                         target_drop_hql,
+                         target_create_hql,
+                         target_insert_hql,
+                         temp_drop_hql].join
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name)
 
-    elsif
-
+    elsif part_array.length > 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
       #partitions and no target table or same partitions in both target table and user params
 
       field_defs = {}
       target_headers.each do |name|
@@ -260,7 +324,7 @@ module Mobilize
       end.join(",")})"
 
       part_defs = {}
-
+      part_array.each do |name|
         datatype = schema_hash[name] || "string"
         part_defs[name] = datatype
       end
@@ -271,70 +335,70 @@ module Mobilize
 
       target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
 
-      target_part_stmt =
+      target_part_stmt = part_array.map{|h| "`#{h}`"}.join(",")
 
-      target_set_hql = ["set
+      target_set_hql = ["set mapred.job.name=#{job_name};",
+                        "set hive.exec.dynamic.partition.mode=nonstrict;",
                         "set hive.exec.max.dynamic.partitions.pernode=1000;",
                         "set hive.exec.dynamic.partition=true;",
                         "set hive.exec.max.created.files = 200000;",
                         "set hive.max.created.files = 200000;"].join
 
-      if drop or
-        target_drop_hql = "drop table if exists #{
+      if drop or table_stats.nil?
+        target_drop_hql = "drop table if exists #{table_path};"
         target_create_hql = target_drop_hql +
-                            "create table if not exists #{
+                            "create table if not exists #{table_path} #{field_def_stmt} " +
                             "partitioned by #{part_def_stmt};"
 
       else
-        target_db,target_table = target_table_path.split(".")
         #get all the permutations of possible partititons
         part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
-        part_perm_tsv = Hive.run(
+        part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
         #having gotten the permutations, ensure they are dropped
         part_hash_array = part_perm_tsv.tsv_to_hash_array
         part_drop_hql = part_hash_array.map do |h|
           part_drop_stmt = h.map do |name,value|
             part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
           end.join(",")
-          "use #{
+          "use #{db};alter table #{table} drop if exists partition (#{part_drop_stmt});"
         end.join
         target_create_hql = part_drop_hql
       end
 
-      target_insert_hql = "insert overwrite table #{
+      target_insert_hql = "insert overwrite table #{table_path} " +
                           "partition (#{target_part_stmt}) " +
                           "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
 
       target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name)
     else
       error_msg = "Incompatible partition specs"
       raise error_msg
     end
-
+    url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+    return url
   end
 
   #turn a tsv into a hive table.
   #Accepts options to drop existing target if any
   #also schema with column datatype overrides
-  def Hive.tsv_to_table(cluster,
+  def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
     source_headers = source_tsv.tsv_header_array
 
-
-
-
-    target_partitions = target_params['partitions'].to_a
-    target_table_stats = target_params['curr_stats']
+    table_path = [db,table].join(".")
+    target_params = Hive.path_params(cluster, table_path, user_name)
+    table_stats = target_params['curr_stats']
 
     schema_hash ||= {}
 
-    if
-
+    if part_array.length == 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
       #no partitions in either user params or the target table
       #or drop and start fresh
 
       #one file only, strip headers, replace tab with ctrl-a for hive
+      #get rid of freaking carriage return characters
       source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
       source_tsv_filename = "000000_0"
       file_hash = {source_tsv_filename=>source_rows}
@@ -345,52 +409,52 @@ module Mobilize
       end.ie{|fs| "(#{fs.join(",")})"}
 
       #for single insert, use drop table and create table always
-      target_drop_hql = "drop table if exists #{
+      target_drop_hql = "drop table if exists #{table_path}"
 
-      target_create_hql = "create table #{
+      target_create_hql = "create table #{table_path} #{field_defs}"
 
       #load source data
-      target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{
+      target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{table_path};"
 
       target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name, file_hash)
 
-    elsif
-
+    elsif part_array.length > 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
       #partitions and no target table
       #or same partitions in both target table and user params
       #or drop and start fresh
 
-      target_headers = source_headers.reject{|h|
+      target_headers = source_headers.reject{|h| part_array.include?(h)}
 
       field_defs = "(#{target_headers.map do |name|
         datatype = schema_hash[name] || "string"
         "`#{name}` #{datatype}"
       end.join(",")})"
 
-      partition_defs = "(#{
+      partition_defs = "(#{part_array.map do |name|
         datatype = schema_hash[name] || "string"
         "#{name} #{datatype}"
       end.join(",")})"
 
-      target_drop_hql = drop ? "drop table if exists #{
+      target_drop_hql = drop ? "drop table if exists #{table_path};" : ""
 
       target_create_hql = target_drop_hql +
-                          "create table if not exists #{
+                          "create table if not exists #{table_path} #{field_defs} " +
                           "partitioned by #{partition_defs}"
 
       #create target table early if not here
-      Hive.run(
+      Hive.run(cluster, target_create_hql, user_name)
 
-
+      table_stats = Hive.table_stats(cluster, db, table, user_name)
 
       #create data hash from source hash array
       data_hash = {}
       source_hash_array = source_tsv.tsv_to_hash_array
       source_hash_array.each do |ha|
-        tpmk =
-        tpmv = ha.reject{|k,v|
+        tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+        tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
        if data_hash[tpmk]
          data_hash[tpmk] += "\n#{tpmv}"
        else
@@ -399,61 +463,62 @@ module Mobilize
       end
 
       #go through completed data hash and write each key value to the table in question
+      target_part_hql = ""
       data_hash.each do |tpmk,tpmv|
         base_filename = "000000_0"
         part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
         part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
         part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
-        hdfs_dir = "#{
-
-
+        hdfs_dir = "#{table_stats['location']}/#{part_dir}"
+        #source the partitions from a parallel load folder since filenames are all named the same
+        hdfs_source_url = "#{table_stats['location']}/part_load/#{part_dir}/#{base_filename}"
+        hdfs_target_url = hdfs_dir
         #load partition into source path
-        puts "Writing to #{
-        Hdfs.write(
+        puts "Writing to #{hdfs_source_url} for #{user_name} at #{Time.now.utc}"
+        Hdfs.write(cluster,hdfs_source_url,tpmv,user_name)
         #let Hive know where the partition is
-        target_add_part_hql = "use #{
-        target_insert_part_hql
-        target_part_hql
-
-
+        target_add_part_hql = "use #{db};alter table #{table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_url}'"
+        target_insert_part_hql = "load data inpath '#{hdfs_source_url}' overwrite into table #{table} partition (#{part_stmt});"
+        target_part_hql += [target_add_part_hql,target_insert_part_hql].join(";")
+      end
+      #run actual partition adds all at once
+      if target_part_hql.length>0
+        puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
+        Hive.run(cluster, target_part_hql, user_name)
       end
     else
       error_msg = "Incompatible partition specs: " +
-                  "target table:#{
-                  "user_params:#{
+                  "target table:#{table_stats['partitions'].to_s}, " +
+                  "user_params:#{part_array.to_s}"
       raise error_msg
     end
-
+    url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+    return url
   end
 
   def Hive.write_by_stage_path(stage_path)
     s = Stage.where(:path=>stage_path).first
-    u = s.job.runner.user
     params = s.params
-
-
+    source = s.sources.first
+    target = s.target
+    cluster, db, table = target.url.split("://").last.split("/")
+    #update stage with the node so we can use it
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    job_name = s.path.sub("Runner_","")
 
     #slot Hive worker if available
     slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
     return false unless slot_id
 
-
-
-
-
-
-
-
-
-    target_path = params['target']
-
-    gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-    #return blank response if there are no slots available
-    return nil unless gdrive_slot
-    source_dst = s.source_dsts(gdrive_slot).first
-    schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+    schema_hash = if params['schema']
+                    gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                    #return blank response if there are no slots available
+                    return nil unless gdrive_slot
+                    Hive.schema_hash(params['schema'],user_name,gdrive_slot)
+                  else
+                    {}
+                  end
     Gdrive.unslot_worker_by_path(stage_path)
-
     #drop target before create/insert?
     drop = params['drop']
 
@@ -461,64 +526,77 @@ module Mobilize
     source_tsv,source_hql = [nil]*2
     if params['hql']
       source_hql = params['hql']
-    elsif
-      if
+    elsif source
+      if source.handler == 'hive'
         #source table
-        cluster,source_path =
+        cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
         source_hql = "select * from #{source_path};"
-      elsif ['gridfs','hdfs'].include?(
-        if
-          source_hql =
+      elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+        if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+          source_hql = source.read(user_name)
         else
           #tsv from sheet
-          source_tsv =
+          source_tsv = source.read(user_name)
         end
       end
     end
 
-
-
-
-
-
-
-
-
+    part_array = if params['partitions']
+                   params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
+                 elsif params['target']
+                   #take the end parts of the target, that are not the cluster, db, table
+                   target_array = params['target'].gsub(".","/").split("/")
+                   [cluster,db,table].each do |term|
+                     target_array = target_array[1..-1] if target_array.first == term
+                   end
+                   target_array
+                 else
+                   []
+                 end
 
+    result = begin
+               url = if source_hql
+                       Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
+                     elsif source_tsv
+                       Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+                     else
+                       raise "Unable to determine source tsv or source hql"
+                     end
+               {'stdout'=>url,'exit_code'=>0}
+             rescue => exc
+               {'stderr'=>exc.to_s, 'exit_code'=>500}
+             end
 
     #unslot worker and write result
     Hive.unslot_worker_by_path(stage_path)
 
-
-
-
-
-
-    out_url
+    response = {}
+    response['out_url'] = Dataset.write_by_url("gridfs://#{s.path}/out",result['stdout'].to_s,Gdrive.owner_name) if result['stdout'].to_s.length>0
+    response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+    response['signal'] = result['exit_code']
+    response
   end
 
-  def Hive.read_by_dataset_path(dst_path,
-    cluster,
-
-
-
-
-
-
-
+  def Hive.read_by_dataset_path(dst_path,user_name,*args)
+    cluster, db, table = dst_path.split("/")
+    source_path = [db,table].join(".")
+    job_name = "read #{cluster}/#{db}/#{table}"
+    set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name};"
+    select_hql = "select * from #{source_path};"
+    hql = [set_hql,select_hql].join
+    response = Hive.run(cluster, hql,user_name)
+    if response['exit_code']==0
+      return response['stdout']
+    else
+      raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+    end
   end
 
-  def Hive.write_by_dataset_path(dst_path,source_tsv,
-    cluster,
-
-      [Hive.clusters.first.first,sp.join(".")]
-    else
-      [sp.first, sp[1..-1].join(".")]
-    end
-  end
+  def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
+    cluster,db,table = dst_path.split("/")
+    part_array = []
     drop = true
-    Hive.tsv_to_table(cluster,
+    Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop)
   end
 end
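Taken together, the dataset-path API now addresses tables as cluster/db/table and returns hive:// urls. A hedged round-trip sketch of the 1.2 signatures; the cluster, database, table, and user below are hypothetical placeholders:

    # hypothetical cluster/db/table and user
    dst_path = "dev_cluster/mobilize/hive_test_1"
    tsv = "act_date\tproduct\tvalue\n2013-03-21\twidget\t1"
    # drops and recreates the table, then loads the tsv into it
    Mobilize::Hive.write_by_dataset_path(dst_path, tsv, "etl_user")
    # reads it back as a headered tsv (select * with hive.cli.print.header=true)
    puts Mobilize::Hive.read_by_dataset_path(dst_path, "etl_user")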
data/mobilize-hive.gemspec
CHANGED

@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
   gem.name          = "mobilize-hive"
   gem.version       = Mobilize::Hive::VERSION
   gem.authors       = ["Cassio Paes-Leme"]
-  gem.email         = ["cpaesleme@
+  gem.email         = ["cpaesleme@dena.com"]
   gem.description   = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.summary       = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.homepage      = "http://github.com/dena/mobilize-hive"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.2"
 end
data/test/hive_job_rows.yml
CHANGED

@@ -3,7 +3,7 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive_test_1
+  stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
           source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
   stage2: hive.run source:"hive_test_1.hql"
   stage3: hive.run hql:"show databases;"
@@ -21,6 +21,6 @@
   trigger: after hive_test_2
   status: ""
   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
-  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3
-  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3
+  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
+  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
data/test/mobilize-hive_test.rb
CHANGED

@@ -52,9 +52,9 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
 
-    puts "job row added, force enqueued requestor, wait
+    puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-
+    wait_for_stages(1200)
 
     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
@@ -63,9 +63,34 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
 
     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
-    assert hive_1_stage_3_target_sheet.read(u.name).length
+    assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
   end
 
+  def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
+    time = 0
+    time_since_stage = 0
+    #check for 10 min
+    while time < time_limit and time_since_stage < stage_limit
+      sleep wait_length
+      job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
+      if job_classes.include?("Mobilize::Stage")
+        time_since_stage = 0
+        puts "saw stage at #{time.to_s} seconds"
+      else
+        time_since_stage += wait_length
+        puts "#{time_since_stage.to_s} seconds since stage seen"
+      end
+      time += wait_length
+      puts "total wait time #{time.to_s} seconds"
+    end
+
+    if time >= time_limit
+      raise "Timed out before stage completion"
+    end
+  end
+
+
+
 end
metadata
CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-
+date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-      version: 1.
+      version: '1.2'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-      version: 1.
+      version: '1.2'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
-- cpaesleme@
+- cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []