sequel-impala 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/LICENSE +2 -1
  4. data/README.md +45 -0
  5. data/lib/rbhive.rb +8 -0
  6. data/lib/rbhive/connection.rb +150 -0
  7. data/lib/rbhive/explain_result.rb +46 -0
  8. data/lib/rbhive/result_set.rb +37 -0
  9. data/lib/rbhive/schema_definition.rb +86 -0
  10. data/lib/rbhive/t_c_l_i_connection.rb +464 -0
  11. data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
  12. data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
  13. data/lib/rbhive/table_schema.rb +122 -0
  14. data/lib/rbhive/version.rb +3 -0
  15. data/lib/sequel/adapters/impala.rb +13 -1
  16. data/lib/sequel/adapters/rbhive.rb +174 -0
  17. data/lib/sequel/adapters/shared/impala.rb +11 -3
  18. data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
  19. data/lib/thrift/facebook_service.rb +700 -0
  20. data/lib/thrift/fb303_constants.rb +9 -0
  21. data/lib/thrift/fb303_types.rb +19 -0
  22. data/lib/thrift/hive_metastore_constants.rb +41 -0
  23. data/lib/thrift/hive_metastore_types.rb +630 -0
  24. data/lib/thrift/hive_service_constants.rb +13 -0
  25. data/lib/thrift/hive_service_types.rb +72 -0
  26. data/lib/thrift/queryplan_constants.rb +13 -0
  27. data/lib/thrift/queryplan_types.rb +261 -0
  28. data/lib/thrift/sasl_client_transport.rb +161 -0
  29. data/lib/thrift/serde_constants.rb +92 -0
  30. data/lib/thrift/serde_types.rb +7 -0
  31. data/lib/thrift/t_c_l_i_service.rb +1054 -0
  32. data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
  33. data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
  34. data/lib/thrift/thrift_hive.rb +508 -0
  35. data/lib/thrift/thrift_hive_metastore.rb +3856 -0
  36. data/spec/impala_test.rb +6 -1
  37. metadata +53 -25
  38. data/README.rdoc +0 -39
@@ -0,0 +1,3 @@
1
module RBHive
  # ResultSet subclass for the TCLI code path. It adds no behavior of its
  # own; it only gives those results a distinct class.
  class TCLIResultSet < ResultSet
  end
end
@@ -0,0 +1,87 @@
1
+ require 'json'
2
+
3
module RBHive
  # Describes the columns of a TCLI result: derives unique column names,
  # works out a type for each column and coerces raw row values to Ruby
  # objects according to that type.
  class TCLISchemaDefinition
    attr_reader :schema

    NAN = Float::NAN rescue 0.0/0.0
    INFINITY = Float::INFINITY rescue 1.0/0.0

    # Conversion method sent to a raw value, keyed by lower-cased type name.
    # Types that are not listed are returned unconverted.
    TYPES = {
      :boolean => :to_s,
      :string => :to_s,
      :float => :to_f,
      :double => :to_f,
      :int => :to_i,
      :bigint => :to_i,
      :smallint => :to_i,
      :tinyint => :to_i,
    }

    # schema      - object whose #columns each respond to #columnName and
    #               #typeDesc.
    # example_row - a row responding to #colVals, or nil. Its length is used
    #               to detect values that have no entry in the schema.
    def initialize(schema, example_row)
      @schema = schema
      @example_row = example_row ? example_row.colVals : []
    end

    # Returns the column names as symbols, in schema order (memoized).
    def column_names
      @column_names ||= begin
        schema_names = @schema.columns.map { |c| c.columnName }

        # In rare cases Hive can return two identical column names,
        # consider SELECT a.foo, b.foo...
        # In this case you get two columns called foo with no disambiguation.
        # As a (far from ideal) solution we detect this edge case and rename
        # them a.foo => foo_1, b.foo => foo_2, otherwise we would trample one
        # of the columns during Hash mapping.
        s = Hash.new(0)
        # Pass 1: tag the second and later occurrences with their ordinal.
        schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
        # Pass 2: the first occurrence of a duplicated name is still untagged.
        schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
        schema_names.map! { |c| c.gsub('---|---', '_').to_sym }

        # Hive doesn't return schema data for partitions on SELECT * queries.
        # For now we call them :_p1, :_p2, etc. to avoid collisions.
        offset = 0
        while schema_names.length < @example_row.length
          schema_names.push(:"_p#{offset+=1}")
        end
        schema_names
      end
    end

    # Returns a Hash of column name => type symbol (memoized).
    #
    # Names and schema columns are matched by position, not by name, so
    # columns that #column_names renamed to disambiguate duplicates keep their
    # real type instead of falling back to :string. Names beyond the end of
    # the schema (partition placeholders) are treated as :string.
    def column_type_map
      @column_type_map ||= begin
        definitions = @schema.columns.to_a
        column_names.each_with_index.each_with_object({}) do |(name, index), map|
          map[name] = column_type_for(definitions[index])
        end
      end
    end

    # Converts a row (responding to #colVals) into a Hash of
    # column name => coerced value.
    def coerce_row(row)
      raw_values = row.colVals.map(&:get_value).map(&:value)
      column_names.zip(raw_values).each_with_object({}) do |(column_name, value), hsh|
        hsh[column_name] = coerce_column(column_name, value)
      end
    end

    # Coerces a single raw value according to the type of column_name.
    def coerce_column(column_name, value)
      type = column_type_map[column_name]
      return INFINITY if (type != :string && value == "Infinity")
      return NAN if (type != :string && value == "NaN")
      return coerce_complex_value(value) if type.to_s =~ /^array/
      conversion_method = TYPES[type]
      conversion_method ? value.send(conversion_method) : value
    end

    # Returns the values of a row Hash as an Array ordered like #column_names.
    def coerce_row_to_array(row)
      column_names.map { |n| row[n] }
    end

    # Parses a JSON-encoded value; nil, an empty value and the string 'null'
    # all yield nil.
    def coerce_complex_value(value)
      return nil if value.nil?
      return nil if value.length == 0
      return nil if value == 'null'
      JSON.parse(value)
    end

    private

    # Resolves the type symbol for one schema column definition. Falls back
    # to :string when there is no definition or when the type name cannot be
    # resolved for any reason.
    def column_type_for(definition)
      return :string if definition.nil?
      type_name = TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase
      type_name ? type_name.to_sym : :string
    rescue StandardError
      :string
    end
  end
end
@@ -0,0 +1,122 @@
1
module RBHive
  # Collects a table name, columns, partitions and storage options and
  # renders them as Hive DDL strings (CREATE TABLE / ALTER TABLE ... COLUMNS).
  class TableSchema
    attr_accessor :name
    attr_reader :columns, :partitions

    # name    - table name.
    # comment - optional table comment.
    # options - :location, :field_sep, :line_sep, :collection_sep, :stored_as.
    # blk     - optional block evaluated against the new instance, so it can
    #           call #column, #partition and #serde directly.
    def initialize(name, comment=nil, options={}, &blk)
      @name = name
      @comment = comment
      @location = options[:location] || nil
      @field_sep = options[:field_sep] || "\t"
      @line_sep = options[:line_sep] || "\n"
      @collection_sep = options[:collection_sep] || "|"
      @stored_as = options[:stored_as] || :textfile
      @columns = []
      @partitions = []
      @serde_name = nil
      @serde_properties = {}
      instance_eval(&blk) if blk
    end

    # Appends a regular column.
    def column(name, type, comment=nil)
      @columns << Column.new(name, type, comment)
    end

    # Appends a partition column.
    def partition(name, type, comment=nil)
      @partitions << Column.new(name, type, comment)
    end

    # Selects a SerDe row format (used instead of the delimited format).
    def serde(name, properties={})
      @serde_name = name
      @serde_properties = properties
    end

    # Full CREATE TABLE statement; EXTERNAL and LOCATION appear only when a
    # location was given.
    def create_table_statement()
      [
        "CREATE #{external}TABLE #{table_statement}",
        "ROW FORMAT #{row_format_statement}",
        "STORED AS #{stored_as}",
        location
      ].join("\n")
    end

    # Storage format as an upper-cased string.
    def stored_as
      @stored_as.to_s.upcase
    end

    # ROW FORMAT body: the SerDe clause when a SerDe was set, otherwise the
    # delimited clause.
    def row_format_statement
      @serde_name ? serde_statement : delimited_statement
    end

    def delimited_statement
      [
        "DELIMITED",
        "FIELDS TERMINATED BY '#{@field_sep}'",
        "COLLECTION ITEMS TERMINATED BY '#{@collection_sep}'",
        "LINES TERMINATED BY '#{@line_sep}'"
      ].join("\n")
    end

    def serde_statement
      "SERDE '#{@serde_name}'\n#{serde_properties_statement}"
    end

    # WITH SERDEPROPERTIES clause, or an empty string when no properties are set.
    def serde_properties_statement
      return '' unless @serde_properties.any?
      pairs = @serde_properties.map { |key, value| "\"#{key}\" = \"#{value}\"" }
      "WITH SERDEPROPERTIES (#{pairs.join(",\n")})"
    end

    def replace_columns_statement
      alter_columns_statement("REPLACE")
    end

    def add_columns_statement
      alter_columns_statement("ADD")
    end

    def to_s
      table_statement
    end

    private

    def external
      @location.nil? ? '' : 'EXTERNAL '
    end

    # Quoted table name, column list, optional comment and partition clause.
    def table_statement
      comment_clause = @comment.nil? ? '' : " COMMENT '#{@comment}'"
      "`#{@name}` #{column_statement}#{comment_clause}\n#{partition_statement}"
    end

    def location
      @location.nil? ? '' : "LOCATION '#{@location}'"
    end

    def alter_columns_statement(add_or_replace)
      "ALTER TABLE `#{name}` #{add_or_replace} COLUMNS #{column_statement}"
    end

    # Parenthesised column list, one column per line.
    def column_statement
      "(\n" + @columns.map(&:to_s).join(",\n") + "\n)"
    end

    def partition_statement
      return "" if @partitions.nil? || @partitions.empty?
      "PARTITIONED BY (\n" + @partitions.map(&:to_s).join(",\n") + "\n)"
    end

    # A single column definition; #to_s renders it as DDL.
    class Column
      attr_reader :name, :type, :comment

      def initialize(name, type, comment=nil)
        @name = name
        @type = type
        @comment = comment
      end

      def to_s
        comment_clause = @comment.nil? ? '' : " COMMENT '#{@comment}'"
        "`#{@name}` #{@type.to_s.upcase}#{comment_clause}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module RBHive
  # Version string of the RBHive code in this file's namespace.
  VERSION = '1.0.3.pre'
end
@@ -10,9 +10,15 @@ module Sequel
10
10
  ImpalaExceptions = [
11
11
  ::Impala::Error,
12
12
  ::Impala::Protocol::Beeswax::BeeswaxException,
13
+ ::Thrift::TransportException,
13
14
  IOError
14
15
  ].freeze
15
16
 
17
+ DisconnectExceptions = [
18
+ ::Thrift::TransportException,
19
+ IOError
20
+ ]
21
+
16
22
  set_adapter_scheme :impala
17
23
 
18
24
  # Connect to the Impala server. Currently, only the :host and :port options
@@ -28,6 +34,7 @@ module Sequel
28
34
 
29
35
  def disconnect_connection(c)
30
36
  c.close
37
+ rescue *DisconnectExceptions
31
38
  end
32
39
 
33
40
  def execute(sql, opts=OPTS)
@@ -54,7 +61,12 @@ module Sequel
54
61
  # in most cases that results in an unusable connection, so treat it as a
55
62
  # disconnect error so Sequel will reconnect.
56
63
  def disconnect_error?(exception, opts)
57
- exception.is_a?(IOError) || super
64
+ case exception
65
+ when *DisconnectExceptions
66
+ true
67
+ else
68
+ super
69
+ end
58
70
  end
59
71
 
60
72
  # Use DESCRIBE to get the column names and types for the table.
@@ -0,0 +1,174 @@
1
+ require 'rbhive'
2
+ require 'sequel/adapters/shared/impala'
3
+
4
+ module Sequel
5
+ module Rbhive
6
class Database < Sequel::Database
  include Impala::DatabaseMethods

  # Logger stand-in used when no :hive_logger option is given; #info
  # discards its argument.
  NullLogger = Object.new
  def NullLogger.info(str)
    nil
  end

  # Conversion procs indexed by numeric column type id; nil means the value
  # is used as returned. The array index must equal the type id, so every id
  # needs a slot, including 14, otherwise the DECIMAL and DATE procs end up
  # under the wrong ids.
  CONVERSION_PROCS = [
    nil, # 0 => %q"BOOLEAN",
    nil, # 1 => %q"TINYINT",
    nil, # 2 => %q"SMALLINT",
    nil, # 3 => %q"INT",
    nil, # 4 => %q"BIGINT",
    nil, # 5 => %q"FLOAT",
    nil, # 6 => %q"DOUBLE",
    nil, # 7 => %q"STRING",
    nil, # 8 => %q"TIMESTAMP", (replaced per instance in adapter_initialize)
    nil, # 9 => %q"BINARY",
    nil, # 10 => %q"ARRAY",
    nil, # 11 => %q"MAP",
    nil, # 12 => %q"STRUCT",
    nil, # 13 => %q"UNIONTYPE",
    nil, # 14 => %q"USER_DEFINED",
    lambda{|v| BigDecimal(v)}, # 15 => %q"DECIMAL",
    nil, # 16 => %q"NULL",
    # Date.new needs integers, so the "YYYY-MM-DD" parts are converted first.
    lambda{|v| Date.new(*v[0...10].split('-').map(&:to_i))}, # 17 => %q"DATE",
    nil, # 18 => %q"VARCHAR",
    nil, # 19 => %q"CHAR",
  ]

  attr_reader :conversion_procs

  # Exception classes reported through database_error_classes and converted
  # with raise_error in #execute.
  RbhiveExceptions = [
    RBHive::TCLIConnectionError,
    ::Thrift::TransportException,
    IOError
  ].freeze

  # Exception classes that mean the connection itself is no longer usable.
  DisconnectExceptions = [
    ::Thrift::TransportException,
    IOError
  ].freeze

  set_adapter_scheme :rbhive

  # Open a TCLI connection and session. :host defaults to 'localhost',
  # :port to 21050 and :hive_version to 12; the full option hash is passed on
  # to RBHive::TCLIConnection. :hive_logger may supply a logger; otherwise
  # NullLogger is used.
  def connect(server)
    opts = server_opts(server)
    opts[:hive_version] ||= 12
    conn = RBHive::TCLIConnection.new(opts[:host]||'localhost', opts[:port]||21050, opts, opts[:hive_logger] || NullLogger)
    conn.open
    conn.open_session
    conn
  end

  def database_error_classes
    RbhiveExceptions
  end

  # Close the session (if one is open) and then the connection. The
  # connection is closed even when closing the session fails, so the
  # transport is not left open. Errors in DisconnectExceptions are ignored.
  def disconnect_connection(connection)
    begin
      connection.close_session if connection.session
    ensure
      connection.close
    end
  rescue *DisconnectExceptions
    nil
  end

  # Run sql on a connection for opts[:server]. When a block is given it is
  # called with the connection and the execute result. Always returns nil.
  # Errors in RbhiveExceptions are passed to raise_error.
  def execute(sql, opts=OPTS)
    synchronize(opts[:server]) do |c|
      begin
        r = log_yield(sql){c.execute(sql)}
        yield(c, r) if block_given?
        nil
      rescue *RbhiveExceptions => e
        raise_error(e)
      end
    end
  end

  private

  # Give each Database its own copy of the conversion procs, with TIMESTAMP
  # (type id 8) handled by to_application_timestamp.
  def adapter_initialize
    @conversion_procs = CONVERSION_PROCS.dup
    @conversion_procs[8] = method(:to_application_timestamp)
  end

  def connection_execute_method
    :execute
  end

  # IOError and Thrift transport errors are treated as disconnect errors so
  # Sequel will reconnect; anything else is left to the superclass.
  def disconnect_error?(exception, opts)
    case exception
    when *DisconnectExceptions
      true
    else
      super
    end
  end

  # Use DESCRIBE to get the column names and types for the table.
  def schema_parse_table(table_name, opts)
    m = output_identifier_meth(opts[:dataset])

    table = if opts[:schema]
      Sequel.qualify(opts[:schema], table_name)
    else
      Sequel.identifier(table_name)
    end

    describe(table, opts).map do |row|
      row[:db_type] = row[:type]
      row[:type] = schema_column_type(row[:db_type])
      row[:default] = nil
      row[:primary_key] = false
      [m.call(row.delete(:name)), row]
    end
  end
end
128
+
129
class Dataset < Sequel::Dataset
  include Impala::DatasetMethods

  Database::DatasetClass = self

  APOS = "'".freeze

  # Characters that are replaced by a backslash sequence inside string
  # literals.
  # NOTE(review): "\04" is octal 4 (EOT), while \Z conventionally denotes
  # Ctrl-Z ("\x1A") -- confirm that this mapping is intended.
  STRING_ESCAPES = {
    "\\" => "\\\\".freeze,
    "'" => "\\'".freeze,
    "\n" => "\\n".freeze,
    "\r" => "\\r".freeze,
    "\0" => "\\0".freeze,
    "\b" => "\\b".freeze,
    "\04" => "\\Z".freeze,
    # Impala is supposed to support this, but using it
    # breaks things to the point of returning bad data.
    # If you don't do this, the tabs in the input
    # get converted to spaces, but that's better than the
    # alternative.
    # "\t" => "\\t".freeze,
  }.freeze
  STRING_ESCAPE_RE = /(#{Regexp.union(STRING_ESCAPES.keys)})/

  # Execute sql and yield each result row. Column names are run through
  # output_identifier and stored in @columns; each column's conversion proc
  # is picked from the database's conversion_procs by its type number.
  def fetch_rows(sql)
    execute(sql) do |conn, result|
      handle = result.operationHandle
      names, type_ids = conn.get_column_info(handle)
      names.map! { |name| output_identifier(name) }
      @columns = names
      procs = db.conversion_procs.values_at(*type_ids)
      conn.yield_hash_rows(handle, names, procs) { |row| yield row }
    end
  end

  private

  # Append s to sql as a single-quoted literal, replacing every character
  # listed in STRING_ESCAPES with its escape sequence.
  def literal_string_append(sql, s)
    escaped = s.to_s.gsub(STRING_ESCAPE_RE, STRING_ESCAPES)
    sql << APOS << escaped << APOS
  end
end
172
+ end
173
+ end
174
+
@@ -98,12 +98,12 @@ module Sequel
98
98
  if schema = search_path_table_schemas[table]
99
99
  Sequel.qualify(schema, table)
100
100
  else
101
- table
101
+ Sequel.identifier(table)
102
102
  end
103
103
  when SQL::Identifier
104
104
  implicit_qualify(table.value.to_s)
105
105
  when SQL::AliasedExpression
106
- SQL::AliasedExpression.new(implicit_qualify(table), v.alias)
106
+ SQL::AliasedExpression.new(implicit_qualify(table.expression), table.alias)
107
107
  else
108
108
  table
109
109
  end
@@ -229,7 +229,7 @@ module Sequel
229
229
 
230
230
  def create_table_sql(name, generator, options)
231
231
  sql = super
232
- sql << create_table_parameters_sql(options)
232
+ sql += create_table_parameters_sql(options)
233
233
  sql
234
234
  end
235
235
 
@@ -500,6 +500,10 @@ module Sequel
500
500
  true
501
501
  end
502
502
 
503
# Reports that common table expressions are supported in subqueries.
def supports_cte_in_subqueries?
  true
end
506
+
503
507
  # Impala doesn't support derived column lists when aliasing
504
508
  # tables.
505
509
  def supports_derived_column_lists?
@@ -618,6 +622,10 @@ module Sequel
618
622
  sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE, STRING_ESCAPE_REPLACE) << APOS
619
623
  end
620
624
 
625
# Selects the :values strategy for multi-row inserts.
def multi_insert_sql_strategy
  :values
end
628
+
621
629
  # Impala doesn't support escaping of identifiers, so you can't use backtick in
622
630
  # an identifier name.
623
631
  def quoted_identifier_append(sql, name)