RubyGems - sequel-impala - Versions diffs - 1.0.0 → 1.0.1 - Mend

sequel-impala 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/CHANGELOG +16 -0
data/LICENSE +2 -1
data/README.md +45 -0
data/lib/rbhive.rb +8 -0
data/lib/rbhive/connection.rb +150 -0
data/lib/rbhive/explain_result.rb +46 -0
data/lib/rbhive/result_set.rb +37 -0
data/lib/rbhive/schema_definition.rb +86 -0
data/lib/rbhive/t_c_l_i_connection.rb +464 -0
data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
data/lib/rbhive/table_schema.rb +122 -0
data/lib/rbhive/version.rb +3 -0
data/lib/sequel/adapters/impala.rb +13 -1
data/lib/sequel/adapters/rbhive.rb +174 -0
data/lib/sequel/adapters/shared/impala.rb +11 -3
data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
data/lib/thrift/facebook_service.rb +700 -0
data/lib/thrift/fb303_constants.rb +9 -0
data/lib/thrift/fb303_types.rb +19 -0
data/lib/thrift/hive_metastore_constants.rb +41 -0
data/lib/thrift/hive_metastore_types.rb +630 -0
data/lib/thrift/hive_service_constants.rb +13 -0
data/lib/thrift/hive_service_types.rb +72 -0
data/lib/thrift/queryplan_constants.rb +13 -0
data/lib/thrift/queryplan_types.rb +261 -0
data/lib/thrift/sasl_client_transport.rb +161 -0
data/lib/thrift/serde_constants.rb +92 -0
data/lib/thrift/serde_types.rb +7 -0
data/lib/thrift/t_c_l_i_service.rb +1054 -0
data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
data/lib/thrift/thrift_hive.rb +508 -0
data/lib/thrift/thrift_hive_metastore.rb +3856 -0
data/spec/impala_test.rb +6 -1
metadata +53 -25
data/README.rdoc +0 -39

data/lib/rbhive/t_c_l_i_result_set.rb ADDED

@@ -0,0 +1,3 @@
+module RBHive
+  class TCLIResultSet < ResultSet; end
+end

data/lib/rbhive/t_c_l_i_schema_definition.rb ADDED

@@ -0,0 +1,87 @@
+require 'json'
+module RBHive
+  class TCLISchemaDefinition
+    attr_reader :schema
+    NAN = Float::NAN rescue 0.0/0.0
+    INFINITY = Float::INFINITY rescue 1.0/0.0
+    TYPES = {
+      :boolean  => :to_s,
+      :string   => :to_s,
+      :float    => :to_f,
+      :double   => :to_f,
+      :int      => :to_i,
+      :bigint   => :to_i,
+      :smallint => :to_i,
+      :tinyint  => :to_i,
+    }
+    def initialize(schema, example_row)
+      @schema = schema
+      @example_row = example_row ? example_row.colVals : []
+    end
+    def column_names
+      @column_names ||= begin
+        schema_names = @schema.columns.map {|c| c.columnName }
+        # In rare cases Hive can return two identical column names
+        # consider SELECT a.foo, b.foo...
+        # in this case you get two columns called foo with no disambiguation.
+        # as a (far from ideal) solution we detect this edge case and rename them
+        # a.foo => foo1, b.foo => foo2
+        # otherwise we will trample one of the columns during Hash mapping.
+        s = Hash.new(0)
+        schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
+        schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
+        schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
+        # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
+        # For now we will call them :_p1, :_p2, etc. to avoid collisions.
+        offset = 0
+        while schema_names.length < @example_row.length
+          schema_names.push(:"_p#{offset+=1}")
+        end
+        schema_names
+      end
+    end
+    def column_type_map
+      @column_type_map ||= column_names.inject({}) do |hsh, c|
+        definition = @schema.columns.find {|s| s.columnName.to_sym == c }
+        # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
+        type = TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase rescue nil
+        hsh[c] = definition && type ? type.to_sym : :string
+        hsh
+      end
+    end
+    def coerce_row(row)
+      column_names.zip(row.colVals.map(&:get_value).map(&:value)).inject({}) do |hsh, (column_name, value)|
+        hsh[column_name] = coerce_column(column_name, value)
+        hsh
+      end
+    end
+    def coerce_column(column_name, value)
+      type = column_type_map[column_name]
+      return INFINITY if (type != :string && value == "Infinity")
+      return NAN if (type != :string && value == "NaN")
+      return coerce_complex_value(value) if type.to_s =~ /^array/
+      conversion_method = TYPES[type]
+      conversion_method ? value.send(conversion_method) : value
+    end
+    def coerce_row_to_array(row)
+      column_names.map { |n| row[n] }
+    end
+    def coerce_complex_value(value)
+      return nil if value.nil?
+      return nil if value.length == 0
+      return nil if value == 'null'
+      JSON.parse(value)
+    end
+  end
+end

data/lib/rbhive/table_schema.rb ADDED

@@ -0,0 +1,122 @@
+module RBHive
+  class TableSchema
+    attr_accessor :name
+    attr_reader :columns, :partitions
+    def initialize(name, comment=nil, options={}, &blk)
+      @name, @comment = name, comment
+      @location = options[:location] || nil
+      @field_sep = options[:field_sep] || "\t"
+      @line_sep = options[:line_sep] || "\n"
+      @collection_sep = options[:collection_sep] || "|"
+      @stored_as = options[:stored_as] || :textfile
+      @columns = []
+      @partitions = []
+      @serde_name = nil
+      @serde_properties = {}
+      instance_eval(&blk) if blk
+    end
+    def column(name, type, comment=nil)
+      @columns << Column.new(name, type, comment)
+    end
+    def partition(name, type, comment=nil)
+      @partitions << Column.new(name, type, comment)
+    end
+    def serde(name, properties={})
+      @serde_name = name
+      @serde_properties = properties
+    end
+    def create_table_statement()
+      %[CREATE #{external}TABLE #{table_statement}
+  ROW FORMAT #{row_format_statement}
+  STORED AS #{stored_as}
+  #{location}]
+    end
+    def stored_as
+      @stored_as.to_s.upcase
+    end
+    def row_format_statement
+      if @serde_name
+        serde_statement
+      else
+        delimited_statement
+      end
+    end
+    def delimited_statement
+      %(DELIMITED
+  FIELDS TERMINATED BY '#{@field_sep}'
+  COLLECTION ITEMS TERMINATED BY '#{@collection_sep}'
+  LINES TERMINATED BY '#{@line_sep}')
+    end
+    def serde_statement
+      %(SERDE '#{@serde_name}'\n#{serde_properties_statement})
+    end
+    def serde_properties_statement
+      return '' unless @serde_properties.any?
+      kvs = @serde_properties.map { |k,v| %("#{k}" = "#{v}") }.join(",\n")
+      %(WITH SERDEPROPERTIES (#{kvs}))
+    end
+    def replace_columns_statement
+      alter_columns_statement("REPLACE")
+    end
+    def add_columns_statement
+      alter_columns_statement("ADD")
+    end
+    def to_s
+      table_statement
+    end
+    private
+    def external
+      @location.nil? ? '' : 'EXTERNAL '
+    end
+    def table_statement
+      comment_string = (@comment.nil? ? '' : " COMMENT '#{@comment}'")
+      %[`#{@name}` #{column_statement}#{comment_string}\n#{partition_statement}]
+    end
+    def location
+      @location.nil? ? '' : "LOCATION '#{@location}'"
+    end
+    def alter_columns_statement(add_or_replace)
+      %[ALTER TABLE `#{name}` #{add_or_replace} COLUMNS #{column_statement}]
+    end
+    def column_statement
+      cols = @columns.join(",\n")
+      "(\n#{cols}\n)"
+    end
+    def partition_statement
+      return "" if @partitions.nil? || @partitions.empty?
+      cols = @partitions.join(",\n")
+      "PARTITIONED BY (\n#{cols}\n)"
+    end
+    class Column
+      attr_reader :name, :type, :comment
+      def initialize(name, type, comment=nil)
+        @name, @type, @comment = name, type, comment
+      end
+      def to_s
+        comment_string = @comment.nil? ? '' : " COMMENT '#{@comment}'"
+        "`#{@name}` #{@type.to_s.upcase}#{comment_string}"
+      end
+    end
+  end
+end

data/lib/rbhive/version.rb ADDED

@@ -0,0 +1,3 @@
+module RBHive
+  VERSION = '1.0.3.pre'
+end

data/lib/sequel/adapters/impala.rb CHANGED

@@ -10,9 +10,15 @@ module Sequel
       ImpalaExceptions = [
         ::Impala::Error,
         ::Impala::Protocol::Beeswax::BeeswaxException,
+        ::Thrift::TransportException,
         IOError
       ].freeze
+      DisconnectExceptions = [
+        ::Thrift::TransportException,
+        IOError
+      ]
       set_adapter_scheme :impala
       # Connect to the Impala server.  Currently, only the :host and :port options
@@ -28,6 +34,7 @@ module Sequel
       def disconnect_connection(c)
         c.close
+      rescue *DisconnectExceptions
       end
       def execute(sql, opts=OPTS)
@@ -54,7 +61,12 @@ module Sequel
       # in most cases that results in an unusable connection, so treat it as a
       # disconnect error so Sequel will reconnect.
       def disconnect_error?(exception, opts)
-        exception.is_a?(IOError) || super
+        case exception
+        when *DisconnectExceptions
+          true
+        else
+          super
+        end
       end
       # Use DESCRIBE to get the column names and types for the table.

data/lib/sequel/adapters/rbhive.rb ADDED

@@ -0,0 +1,174 @@
+require 'rbhive'
+require 'sequel/adapters/shared/impala'
+module Sequel
+  module Rbhive
+    # Sequel database adapter that talks to the server through
+    # RBHive::TCLIConnection.
+    class Database < Sequel::Database
+      include Impala::DatabaseMethods
+      # Logger stand-in that discards every message; used when no
+      # :hive_logger option is supplied.
+      NullLogger = Object.new
+      def NullLogger.info(str)
+        nil
+      end
+      # Per-type conversion procs, indexed by the numeric type id that
+      # get_column_info reports for each column. A nil entry leaves the
+      # value untouched.
+      #
+      # The array position must equal the type id. Id 14 has no type name in
+      # the list this table was derived from, but it still needs a slot,
+      # otherwise every later proc sits one position too low.
+      # NOTE(review): 14 is presumably TTypeId USER_DEFINED_TYPE - confirm
+      # against t_c_l_i_service_types.rb.
+      CONVERSION_PROCS = [
+        nil,  #  0 => %q"BOOLEAN",
+        nil,  #  1 => %q"TINYINT",
+        nil,  #  2 => %q"SMALLINT",
+        nil,  #  3 => %q"INT",
+        nil,  #  4 => %q"BIGINT",
+        nil,  #  5 => %q"FLOAT",
+        nil,  #  6 => %q"DOUBLE",
+        nil,  #  7 => %q"STRING",
+        nil,  #  8 => %q"TIMESTAMP",
+        nil,  #  9 => %q"BINARY",
+        nil,  #  10 => %q"ARRAY",
+        nil,  #  11 => %q"MAP",
+        nil,  #  12 => %q"STRUCT",
+        nil,  #  13 => %q"UNIONTYPE",
+        nil,  #  14 => (no type name; placeholder keeps the indexes aligned)
+        # Kernel#BigDecimal works on old and new bigdecimal versions,
+        # whereas BigDecimal.new has been removed from current ones.
+        lambda{|v| BigDecimal(v)},  #  15 => %q"DECIMAL",
+        nil,  #  16 => %q"NULL",
+        # Date.new needs integers; only the leading YYYY-MM-DD part is used.
+        lambda{|v| Date.new(*v[0...10].split('-').map(&:to_i))},  #  17 => %q"DATE",
+        nil,  #  18 => %q"VARCHAR",
+        nil,  #  19 => %q"CHAR",
+      ]
+      # Per-database copy of CONVERSION_PROCS (see adapter_initialize).
+      attr_reader :conversion_procs
+      # Exception classes that #execute converts into Sequel database errors.
+      RbhiveExceptions = [
+        RBHive::TCLIConnectionError,
+        ::Thrift::TransportException,
+        IOError
+      ].freeze
+      # Exception classes that indicate the connection itself is unusable.
+      DisconnectExceptions = [
+        ::Thrift::TransportException,
+        IOError
+      ].freeze
+      set_adapter_scheme :rbhive
+      # Open a connection and a session on it.  :host and :port default to
+      # 'localhost' and 21050, :hive_version defaults to 12, and :hive_logger
+      # defaults to a logger that discards messages.  The complete option
+      # hash is also passed through to RBHive::TCLIConnection.
+      def connect(server)
+        opts = server_opts(server)
+        opts[:hive_version] ||= 12
+        conn = RBHive::TCLIConnection.new(opts[:host]||'localhost', opts[:port]||21050, opts, opts[:hive_logger] || NullLogger)
+        conn.open
+        conn.open_session
+        conn
+      end
+      def database_error_classes
+        RbhiveExceptions
+      end
+      # Close the session (if any) and the connection.  Transport errors are
+      # deliberately ignored: the connection is being discarded anyway.
+      def disconnect_connection(connection)
+        connection.close_session if connection.session
+        connection.close
+      rescue *DisconnectExceptions
+      end
+      # Run +sql+ on a pooled connection.  When a block is given it receives
+      # the connection and the raw result.  Always returns nil.
+      def execute(sql, opts=OPTS)
+        synchronize(opts[:server]) do |c|
+          begin
+            r = log_yield(sql){c.execute(sql)}
+            yield(c, r) if block_given?
+            nil
+          rescue *RbhiveExceptions => e
+            raise_error(e)
+          end
+        end
+      end
+      private
+      # Give each database its own proc table and route TIMESTAMP (id 8)
+      # through Sequel's application timestamp conversion.
+      def adapter_initialize
+        @conversion_procs = CONVERSION_PROCS.dup
+        @conversion_procs[8] = method(:to_application_timestamp)
+      end
+      def connection_execute_method
+        :execute
+      end
+      # Impala raises IOError if it detects a problem on the connection, and
+      # in most cases that results in an unusable connection, so treat it as a
+      # disconnect error so Sequel will reconnect.
+      def disconnect_error?(exception, opts)
+        case exception
+        when *DisconnectExceptions
+          true
+        else
+          super
+        end
+      end
+      # Use DESCRIBE to get the column names and types for the table.
+      def schema_parse_table(table_name, opts)
+        m = output_identifier_meth(opts[:dataset])
+        table = if opts[:schema]
+          Sequel.qualify(opts[:schema], table_name)
+        else
+          Sequel.identifier(table_name)
+        end
+        describe(table, opts).map do |row|
+          row[:db_type] = row[:type]
+          row[:type] = schema_column_type(row[:db_type])
+          row[:default] = nil
+          row[:primary_key] = false
+          [m.call(row.delete(:name)), row]
+        end
+      end
+    end
+    class Dataset < Sequel::Dataset
+      include Impala::DatasetMethods
+      Database::DatasetClass = self
+      APOS = "'".freeze
+      STRING_ESCAPES = {
+        "\\" => "\\\\".freeze,
+        "'" => "\\'".freeze,
+        "\n" => "\\n".freeze,
+        "\r" => "\\r".freeze,
+        "\0" => "\\0".freeze,
+        "\b" => "\\b".freeze,
+        "\04" => "\\Z".freeze,
+       # Impala is supposed to support this, but using it
+       # breaks things to the point of returning bad data.
+       # If you don't do this, the tabs in the input
+       # get converted to spaces, but that's better than the
+       # alternative.
+       # "\t" => "\\t".freeze,
+      }.freeze
+      STRING_ESCAPE_RE = /(#{Regexp.union(STRING_ESCAPES.keys)})/
+      def fetch_rows(sql)
+        execute(sql) do |conn, result|
+          op_handle = result.operationHandle
+          columns, type_nums = conn.get_column_info(op_handle)
+          @columns = columns.map!{|c| output_identifier(c)}
+          conversion_procs = db.conversion_procs
+          convertors = conversion_procs.values_at(*type_nums)
+          #cursor.typecast_map['timestamp'] = db.method(:to_application_timestamp)
+          conn.yield_hash_rows(op_handle, columns, convertors) do |row|
+            yield row
+          end
+        end
+      end
+      private
+      def literal_string_append(sql, s)
+        sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE){|m| STRING_ESCAPES[m]} << APOS
+      end
+    end
+  end
+end

data/lib/sequel/adapters/shared/impala.rb CHANGED

@@ -98,12 +98,12 @@ module Sequel
           if schema = search_path_table_schemas[table]
             Sequel.qualify(schema, table)
           else
-            table
+            Sequel.identifier(table)
           end
         when SQL::Identifier
           implicit_qualify(table.value.to_s)
         when SQL::AliasedExpression
-          SQL::AliasedExpression.new(implicit_qualify(table), v.alias)
+          SQL::AliasedExpression.new(implicit_qualify(table.expression), table.alias)
         else
           table
         end
@@ -229,7 +229,7 @@ module Sequel
       def create_table_sql(name, generator, options)
         sql = super
-        sql << create_table_parameters_sql(options)
+        sql += create_table_parameters_sql(options)
         sql
       end
@@ -500,6 +500,10 @@ module Sequel
         true
       end
+      def supports_cte_in_subqueries?
+        true
+      end
       # Impala doesn't support derived column lists when aliasing
       # tables.
       def supports_derived_column_lists?
@@ -618,6 +622,10 @@ module Sequel
         sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE, STRING_ESCAPE_REPLACE) << APOS
       end
+      def multi_insert_sql_strategy
+        :values
+      end
      # Impala doesn't support escaping of identifiers, so you can't use backtick in
       # an identifier name.
       def quoted_identifier_append(sql, name)