sequel-impala 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/LICENSE +2 -1
- data/README.md +45 -0
- data/lib/rbhive.rb +8 -0
- data/lib/rbhive/connection.rb +150 -0
- data/lib/rbhive/explain_result.rb +46 -0
- data/lib/rbhive/result_set.rb +37 -0
- data/lib/rbhive/schema_definition.rb +86 -0
- data/lib/rbhive/t_c_l_i_connection.rb +464 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/table_schema.rb +122 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/sequel/adapters/impala.rb +13 -1
- data/lib/sequel/adapters/rbhive.rb +174 -0
- data/lib/sequel/adapters/shared/impala.rb +11 -3
- data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
- data/lib/thrift/facebook_service.rb +700 -0
- data/lib/thrift/fb303_constants.rb +9 -0
- data/lib/thrift/fb303_types.rb +19 -0
- data/lib/thrift/hive_metastore_constants.rb +41 -0
- data/lib/thrift/hive_metastore_types.rb +630 -0
- data/lib/thrift/hive_service_constants.rb +13 -0
- data/lib/thrift/hive_service_types.rb +72 -0
- data/lib/thrift/queryplan_constants.rb +13 -0
- data/lib/thrift/queryplan_types.rb +261 -0
- data/lib/thrift/sasl_client_transport.rb +161 -0
- data/lib/thrift/serde_constants.rb +92 -0
- data/lib/thrift/serde_types.rb +7 -0
- data/lib/thrift/t_c_l_i_service.rb +1054 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
- data/lib/thrift/thrift_hive.rb +508 -0
- data/lib/thrift/thrift_hive_metastore.rb +3856 -0
- data/spec/impala_test.rb +6 -1
- metadata +53 -25
- data/README.rdoc +0 -39
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module RBHive
  # Maps the schema of a Thrift TCLI result set onto Ruby column names and
  # types, and coerces raw row values into Ruby objects.
  class TCLISchemaDefinition
    attr_reader :schema

    # Fallbacks for Rubies that do not define Float::NAN / Float::INFINITY.
    NAN = Float::NAN rescue 0.0/0.0
    INFINITY = Float::INFINITY rescue 1.0/0.0

    # Column type => method sent to the raw value to coerce it.
    TYPES = {
      :boolean => :to_s,
      :string => :to_s,
      :float => :to_f,
      :double => :to_f,
      :int => :to_i,
      :bigint => :to_i,
      :smallint => :to_i,
      :tinyint => :to_i,
    }

    # schema      - object responding to #columns; each column responds to
    #               #columnName and #typeDesc.
    # example_row - a result row responding to #colVals, or nil. Only its
    #               length is used, to detect columns missing from the schema.
    def initialize(schema, example_row)
      @schema = schema
      @example_row = example_row ? example_row.colVals : []
    end

    # Returns the column names as an Array of Symbols (memoized).
    #
    # In rare cases Hive can return two identical column names, e.g.
    # SELECT a.foo, b.foo... yields two columns called foo with no
    # disambiguation. As a (far from ideal) solution every occurrence of a
    # duplicated name gets a positional suffix (foo_1, foo_2, ...) so that no
    # column is trampled during Hash mapping.
    def column_names
      @column_names ||= begin
        raw_names = @schema.columns.map { |c| c.columnName }

        totals = Hash.new(0)
        raw_names.each { |n| totals[n] += 1 }

        seen = Hash.new(0)
        names = raw_names.map do |n|
          totals[n] > 1 ? :"#{n}_#{seen[n] += 1}" : n.to_sym
        end

        # Hive doesn't return schema data for partitions on SELECT * queries.
        # For now they are called :_p1, :_p2, etc. to avoid collisions.
        missing = @example_row.length - names.length
        (1..missing).each { |i| names << :"_p#{i}" }
        names
      end
    end

    # Returns a Hash of column name (Symbol) => type (Symbol), memoized.
    #
    # Types are looked up by position rather than by name so that columns
    # renamed by #column_names (foo_1, foo_2) keep their real type instead of
    # silently falling back to :string.
    def column_type_map
      @column_type_map ||= begin
        definitions = @schema.columns
        map = {}
        column_names.each_with_index do |name, idx|
          map[name] = type_for(definitions[idx])
        end
        map
      end
    end

    # Converts a result row (responding to #colVals) into a Hash of
    # column name => coerced value.
    def coerce_row(row)
      values = row.colVals.map(&:get_value).map(&:value)
      coerced = {}
      column_names.zip(values) do |name, value|
        coerced[name] = coerce_column(name, value)
      end
      coerced
    end

    # Coerces a single raw value according to the type of +column_name+.
    # nil (SQL NULL) is returned as nil rather than being turned into
    # 0 / 0.0 / "" by the conversion method.
    def coerce_column(column_name, value)
      return nil if value.nil?
      type = column_type_map[column_name]
      return INFINITY if (type != :string && value == "Infinity")
      return NAN if (type != :string && value == "NaN")
      return coerce_complex_value(value) if type.to_s =~ /^array/
      conversion_method = TYPES[type]
      conversion_method ? value.send(conversion_method) : value
    end

    # Returns the values of a coerced row Hash in column order.
    def coerce_row_to_array(row)
      column_names.map { |n| row[n] }
    end

    # Parses a JSON-encoded complex value; nil, empty and 'null' become nil.
    def coerce_complex_value(value)
      return nil if value.nil?
      return nil if value.empty?
      return nil if value == 'null'
      JSON.parse(value)
    end

    private

    # Returns the type Symbol for a schema column definition. Columns without
    # a definition (e.g. partitions in SELECT * queries), or whose type name
    # cannot be resolved for any reason, are assumed to be strings.
    # NOTE(review): TYPE_NAMES is not defined in this file; it is presumably
    # provided by the generated Thrift constants -- confirm it is in scope.
    def type_for(definition)
      return :string unless definition
      type_name = begin
        TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase
      rescue StandardError
        nil
      end
      type_name ? type_name.to_sym : :string
    end
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
module RBHive
  # Describes a Hive table (columns, partitions, storage options) and renders
  # the corresponding CREATE TABLE / ALTER TABLE statements as strings.
  class TableSchema
    # A single column or partition definition: name, type, optional comment.
    class Column
      attr_reader :name, :type, :comment

      def initialize(name, type, comment=nil)
        @name = name
        @type = type
        @comment = comment
      end

      # Renders the column as it appears inside a column list.
      def to_s
        rendered = "`#{@name}` #{@type.to_s.upcase}"
        rendered << " COMMENT '#{@comment}'" unless @comment.nil?
        rendered
      end
    end

    attr_accessor :name
    attr_reader :columns, :partitions

    # name    - table name.
    # comment - optional table comment.
    # options - :location, :field_sep, :line_sep, :collection_sep, :stored_as.
    # blk     - optional block evaluated in the context of the new schema, so
    #           #column, #partition and #serde can be called bare.
    def initialize(name, comment=nil, options={}, &blk)
      @name = name
      @comment = comment
      @location = options[:location] || nil
      @field_sep = options[:field_sep] || "\t"
      @line_sep = options[:line_sep] || "\n"
      @collection_sep = options[:collection_sep] || "|"
      @stored_as = options[:stored_as] || :textfile
      @columns = []
      @partitions = []
      @serde_name = nil
      @serde_properties = {}
      instance_eval(&blk) if blk
    end

    # Appends a column definition; returns the columns array.
    def column(name, type, comment=nil)
      @columns.push(Column.new(name, type, comment))
    end

    # Appends a partition definition; returns the partitions array.
    def partition(name, type, comment=nil)
      @partitions.push(Column.new(name, type, comment))
    end

    # Selects a SerDe (and its properties) for the row format.
    def serde(name, properties={})
      @serde_name = name
      @serde_properties = properties
    end

    # Full CREATE [EXTERNAL] TABLE statement, one clause per line.
    def create_table_statement()
      [
        "CREATE #{external}TABLE #{table_statement}",
        "ROW FORMAT #{row_format_statement}",
        "STORED AS #{stored_as}",
        location
      ].join("\n")
    end

    # Storage format, upper-cased (e.g. TEXTFILE).
    def stored_as
      @stored_as.to_s.upcase
    end

    # SERDE clause when a serde was configured, DELIMITED clause otherwise.
    def row_format_statement
      return serde_statement if @serde_name
      delimited_statement
    end

    def delimited_statement
      [
        "DELIMITED",
        "FIELDS TERMINATED BY '#{@field_sep}'",
        "COLLECTION ITEMS TERMINATED BY '#{@collection_sep}'",
        "LINES TERMINATED BY '#{@line_sep}'"
      ].join("\n")
    end

    def serde_statement
      "SERDE '#{@serde_name}'\n#{serde_properties_statement}"
    end

    # WITH SERDEPROPERTIES clause, or '' when no properties were given.
    def serde_properties_statement
      return '' unless @serde_properties.any?
      pairs = @serde_properties.map { |key, value| "\"#{key}\" = \"#{value}\"" }
      "WITH SERDEPROPERTIES (#{pairs.join(",\n")})"
    end

    def replace_columns_statement
      alter_columns_statement("REPLACE")
    end

    def add_columns_statement
      alter_columns_statement("ADD")
    end

    def to_s
      table_statement
    end

    private

    # 'EXTERNAL ' when a location was given, '' otherwise.
    def external
      if @location.nil?
        ''
      else
        'EXTERNAL '
      end
    end

    # Table name, column list, optional comment and partition clause.
    def table_statement
      comment_string = ''
      comment_string = " COMMENT '#{@comment}'" unless @comment.nil?
      "`#{@name}` #{column_statement}#{comment_string}\n#{partition_statement}"
    end

    def location
      if @location.nil?
        ''
      else
        "LOCATION '#{@location}'"
      end
    end

    def alter_columns_statement(add_or_replace)
      "ALTER TABLE `#{name}` #{add_or_replace} COLUMNS #{column_statement}"
    end

    def column_statement
      "(\n" + @columns.join(",\n") + "\n)"
    end

    def partition_statement
      return "" if @partitions.nil? || @partitions.empty?
      "PARTITIONED BY (\n" + @partitions.join(",\n") + "\n)"
    end
  end
end
|
@@ -10,9 +10,15 @@ module Sequel
|
|
10
10
|
# Exception classes the adapter groups together as Impala errors.
# NOTE(review): presumably returned from database_error_classes -- the use
# site is outside this hunk; confirm.
ImpalaExceptions = [
  ::Impala::Error,
  ::Impala::Protocol::Beeswax::BeeswaxException,
  ::Thrift::TransportException,
  IOError
].freeze

# Exceptions that mean the connection itself is broken: they are ignored
# while closing a connection and reported as disconnect errors. Frozen like
# ImpalaExceptions so the shared constant cannot be mutated at runtime.
DisconnectExceptions = [
  ::Thrift::TransportException,
  IOError
].freeze
|
21
|
+
|
16
22
|
set_adapter_scheme :impala
|
17
23
|
|
18
24
|
# Connect to the Impala server. Currently, only the :host and :port options
|
@@ -28,6 +34,7 @@ module Sequel
|
|
28
34
|
|
29
35
|
# Closes the connection +c+. Any of the DisconnectExceptions raised while
# closing is swallowed, in which case nil is returned.
def disconnect_connection(c)
  begin
    c.close
  rescue *DisconnectExceptions
    # Best effort: a failure to close is deliberately ignored.
  end
end
|
32
39
|
|
33
40
|
def execute(sql, opts=OPTS)
|
@@ -54,7 +61,12 @@ module Sequel
|
|
54
61
|
# in most cases that results in an unusable connection, so treat it as a
|
55
62
|
# disconnect error so Sequel will reconnect.
|
56
63
|
def disconnect_error?(exception, opts)
|
57
|
-
exception
|
64
|
+
case exception
|
65
|
+
when *DisconnectExceptions
|
66
|
+
true
|
67
|
+
else
|
68
|
+
super
|
69
|
+
end
|
58
70
|
end
|
59
71
|
|
60
72
|
# Use DESCRIBE to get the column names and types for the table.
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'rbhive'
|
2
|
+
require 'sequel/adapters/shared/impala'
|
3
|
+
|
4
|
+
module Sequel
|
5
|
+
module Rbhive
|
6
|
+
# Sequel database adapter that talks to Impala through an
# RBHive::TCLIConnection.
class Database < Sequel::Database
  include Impala::DatabaseMethods

  # Logger stand-in whose #info discards the message; handed to the
  # connection when no :hive_logger option is supplied.
  NullLogger = Object.new
  def NullLogger.info(str)
    nil
  end

  # Value converters indexed by the column's Thrift type id. A nil entry
  # means the raw value is used as-is. The array must have one slot per type
  # id, including 14, which the type-name table these labels come from does
  # not list; without that slot every entry from DECIMAL onwards sits one
  # index too low and is never applied.
  CONVERSION_PROCS = [
    nil, # 0 => %q"BOOLEAN",
    nil, # 1 => %q"TINYINT",
    nil, # 2 => %q"SMALLINT",
    nil, # 3 => %q"INT",
    nil, # 4 => %q"BIGINT",
    nil, # 5 => %q"FLOAT",
    nil, # 6 => %q"DOUBLE",
    nil, # 7 => %q"STRING",
    nil, # 8 => %q"TIMESTAMP", (replaced per instance in adapter_initialize)
    nil, # 9 => %q"BINARY",
    nil, # 10 => %q"ARRAY",
    nil, # 11 => %q"MAP",
    nil, # 12 => %q"STRUCT",
    nil, # 13 => %q"UNIONTYPE",
    nil, # 14 => user defined type (no conversion)
    # Kernel#BigDecimal works on every bigdecimal version; BigDecimal.new
    # has been removed from current releases.
    lambda{|v| BigDecimal(v)}, # 15 => %q"DECIMAL",
    nil, # 16 => %q"NULL",
    # Date.new needs integers, so the "YYYY-MM-DD" parts are converted.
    lambda{|v| Date.new(*v[0...10].split('-').map(&:to_i))}, # 17 => %q"DATE",
    nil, # 18 => %q"VARCHAR",
    nil, # 19 => %q"CHAR",
  ]

  # Per-database copy of CONVERSION_PROCS (see adapter_initialize).
  attr_reader :conversion_procs

  # Exception classes that #execute converts via raise_error and that are
  # returned from #database_error_classes.
  RbhiveExceptions = [
    RBHive::TCLIConnectionError,
    ::Thrift::TransportException,
    IOError
  ].freeze

  # Exceptions that mean the connection itself is broken: ignored while
  # disconnecting and reported as disconnect errors.
  DisconnectExceptions = [
    ::Thrift::TransportException,
    IOError
  ].freeze

  set_adapter_scheme :rbhive

  # Connect to the Impala server. :host and :port default to 'localhost'
  # and 21050, respectively; :hive_version defaults to 12. The full option
  # hash is also passed through to RBHive::TCLIConnection, and :hive_logger
  # (NullLogger when absent) is used as its logger. The connection is
  # returned with its transport and a session already open.
  def connect(server)
    opts = server_opts(server)
    opts[:hive_version] ||= 12
    conn = RBHive::TCLIConnection.new(opts[:host]||'localhost', opts[:port]||21050, opts, opts[:hive_logger] || NullLogger)
    conn.open
    conn.open_session
    conn
  end

  def database_error_classes
    RbhiveExceptions
  end

  # Closes the session (if one is open) and then the connection itself.
  # The connection is closed even when closing the session fails, and any
  # DisconnectExceptions raised along the way are ignored.
  def disconnect_connection(connection)
    begin
      connection.close_session if connection.session
    ensure
      connection.close
    end
  rescue *DisconnectExceptions
  end

  # Runs +sql+ on a pooled connection, logging it via log_yield. When a
  # block is given it is yielded the connection and the execution result.
  # Always returns nil; RbhiveExceptions are re-raised through raise_error.
  def execute(sql, opts=OPTS)
    synchronize(opts[:server]) do |c|
      begin
        r = log_yield(sql){c.execute(sql)}
        yield(c, r) if block_given?
        nil
      rescue *RbhiveExceptions => e
        raise_error(e)
      end
    end
  end

  private

  # Gives each database its own converter table and installs the timestamp
  # converter in the TIMESTAMP slot (type id 8).
  def adapter_initialize
    @conversion_procs = CONVERSION_PROCS.dup
    @conversion_procs[8] = method(:to_application_timestamp)
  end

  def connection_execute_method
    :execute
  end

  # Impala raises IOError if it detects a problem on the connection, and
  # in most cases that results in an unusable connection, so treat it as a
  # disconnect error so Sequel will reconnect.
  def disconnect_error?(exception, opts)
    case exception
    when *DisconnectExceptions
      true
    else
      super
    end
  end

  # Use DESCRIBE to get the column names and types for the table.
  # Returns an array of [column_name, info_hash] pairs; every column gets
  # :db_type, :type, a nil :default and a false :primary_key.
  def schema_parse_table(table_name, opts)
    m = output_identifier_meth(opts[:dataset])

    table = if opts[:schema]
      Sequel.qualify(opts[:schema], table_name)
    else
      Sequel.identifier(table_name)
    end

    describe(table, opts).map do |row|
      row[:db_type] = row[:type]
      row[:type] = schema_column_type(row[:db_type])
      row[:default] = nil
      row[:primary_key] = false
      [m.call(row.delete(:name)), row]
    end
  end
end
|
128
|
+
|
129
|
+
# Dataset for the rbhive adapter: fetches rows through the TCLI connection
# and escapes string literals for Impala.
class Dataset < Sequel::Dataset
  include Impala::DatasetMethods

  Database::DatasetClass = self

  APOS = "'".freeze

  # Character => escape sequence used inside single-quoted SQL strings.
  STRING_ESCAPES = {
    "\\" => "\\\\".freeze,
    "'" => "\\'".freeze,
    "\n" => "\\n".freeze,
    "\r" => "\\r".freeze,
    "\0" => "\\0".freeze,
    "\b" => "\\b".freeze,
    "\04" => "\\Z".freeze,
    # Tab is intentionally left unescaped. Impala is supposed to support
    # "\\t", but using it breaks things to the point of returning bad data.
    # Without the escape, tabs in the input get converted to spaces, which
    # is better than the alternative.
    # "\t" => "\\t".freeze,
  }.freeze
  STRING_ESCAPE_RE = /(#{Regexp.union(STRING_ESCAPES.keys)})/

  # Executes +sql+ and yields each result row as a hash. Also records the
  # result's column names (passed through output_identifier) in @columns.
  def fetch_rows(sql)
    execute(sql) do |conn, result|
      handle = result.operationHandle
      names, type_ids = conn.get_column_info(handle)
      names.map! { |name| output_identifier(name) }
      @columns = names
      # One converter (or nil) per column, picked by the column's type id.
      procs = db.conversion_procs.values_at(*type_ids)
      conn.yield_hash_rows(handle, names, procs) { |row| yield row }
    end
  end

  private

  # Appends +s+ to +sql+ as a single-quoted literal, replacing every
  # character listed in STRING_ESCAPES with its escape sequence.
  def literal_string_append(sql, s)
    escaped = s.to_s.gsub(STRING_ESCAPE_RE) { |char| STRING_ESCAPES[char] }
    sql << APOS << escaped << APOS
  end
end
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
@@ -98,12 +98,12 @@ module Sequel
|
|
98
98
|
if schema = search_path_table_schemas[table]
|
99
99
|
Sequel.qualify(schema, table)
|
100
100
|
else
|
101
|
-
table
|
101
|
+
Sequel.identifier(table)
|
102
102
|
end
|
103
103
|
when SQL::Identifier
|
104
104
|
implicit_qualify(table.value.to_s)
|
105
105
|
when SQL::AliasedExpression
|
106
|
-
SQL::AliasedExpression.new(implicit_qualify(table),
|
106
|
+
SQL::AliasedExpression.new(implicit_qualify(table.expression), table.alias)
|
107
107
|
else
|
108
108
|
table
|
109
109
|
end
|
@@ -229,7 +229,7 @@ module Sequel
|
|
229
229
|
|
230
230
|
def create_table_sql(name, generator, options)
|
231
231
|
sql = super
|
232
|
-
sql
|
232
|
+
sql += create_table_parameters_sql(options)
|
233
233
|
sql
|
234
234
|
end
|
235
235
|
|
@@ -500,6 +500,10 @@ module Sequel
|
|
500
500
|
true
|
501
501
|
end
|
502
502
|
|
503
|
+
# Capability flag: always reports that common table expressions may be
# used inside subqueries.
def supports_cte_in_subqueries?
  true
end
|
506
|
+
|
503
507
|
# Impala doesn't support derived column lists when aliasing
|
504
508
|
# tables.
|
505
509
|
def supports_derived_column_lists?
|
@@ -618,6 +622,10 @@ module Sequel
|
|
618
622
|
sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE, STRING_ESCAPE_REPLACE) << APOS
|
619
623
|
end
|
620
624
|
|
625
|
+
# Always returns :values.
# NOTE(review): presumably tells Sequel to build multi-row inserts as a
# single INSERT with several VALUES rows -- confirm against Sequel's docs.
def multi_insert_sql_strategy
  :values
end
|
628
|
+
|
621
629
|
# Impala doesn't support escaping of identifiers, so you can't use backtick in
|
622
630
|
# an identifier name.
|
623
631
|
def quoted_identifier_append(sql, name)
|