sequel-impala 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/LICENSE +2 -1
- data/README.md +45 -0
- data/lib/rbhive.rb +8 -0
- data/lib/rbhive/connection.rb +150 -0
- data/lib/rbhive/explain_result.rb +46 -0
- data/lib/rbhive/result_set.rb +37 -0
- data/lib/rbhive/schema_definition.rb +86 -0
- data/lib/rbhive/t_c_l_i_connection.rb +464 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/table_schema.rb +122 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/sequel/adapters/impala.rb +13 -1
- data/lib/sequel/adapters/rbhive.rb +174 -0
- data/lib/sequel/adapters/shared/impala.rb +11 -3
- data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
- data/lib/thrift/facebook_service.rb +700 -0
- data/lib/thrift/fb303_constants.rb +9 -0
- data/lib/thrift/fb303_types.rb +19 -0
- data/lib/thrift/hive_metastore_constants.rb +41 -0
- data/lib/thrift/hive_metastore_types.rb +630 -0
- data/lib/thrift/hive_service_constants.rb +13 -0
- data/lib/thrift/hive_service_types.rb +72 -0
- data/lib/thrift/queryplan_constants.rb +13 -0
- data/lib/thrift/queryplan_types.rb +261 -0
- data/lib/thrift/sasl_client_transport.rb +161 -0
- data/lib/thrift/serde_constants.rb +92 -0
- data/lib/thrift/serde_types.rb +7 -0
- data/lib/thrift/t_c_l_i_service.rb +1054 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
- data/lib/thrift/thrift_hive.rb +508 -0
- data/lib/thrift/thrift_hive_metastore.rb +3856 -0
- data/spec/impala_test.rb +6 -1
- metadata +53 -25
- data/README.rdoc +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03ea5f2607bc4908064302d49640df3a4e34eaa3
|
4
|
+
data.tar.gz: 18c93756bb5918f32cb6f32612856963f634e966
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72ca2b1c7177ecc8c2db06e8b266f4a1ff67085cf7ef77d464e6b7667ce896870d905a691acf4fc2b34e67c90777dc3767cc9b3a1e3a2a9252967beaf507b566
|
7
|
+
data.tar.gz: fb15a8bf19c03e54179666df88da7ffc0f0be90d48a65ada20722da03f46c4fa26d910a2f3c554a93149ea29f92f565fa3bbeb43b875c8ceacfcf413b64aa161
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,19 @@
|
|
1
1
|
=== HEAD
|
2
2
|
|
3
|
+
* Fix disconnect detection in impala and rbhive adapters (jeremyevans)
|
4
|
+
|
5
|
+
* Make implicit qualify return an SQL::Identifier if given an unqualified string (jeremyevans)
|
6
|
+
|
7
|
+
* Fix :search_path option handling when using Sequel::SQL::AliasedExpressions (jeremyevans)
|
8
|
+
|
9
|
+
* Speed up multi_insert and import (jeremyevans)
|
10
|
+
|
11
|
+
* Add rbhive adapter (jeremyevans)
|
12
|
+
|
13
|
+
* Add :empty_null=>:ruby option to csv_to_parquet extension, which can support quoted CSV cells (jeremyevans)
|
14
|
+
|
15
|
+
* Optimize csv_to_parquet extension by not spawning shells or unnecessary processes (jeremyevans)
|
16
|
+
|
17
|
+
=== 1.0.0 (2015-12-04)
|
18
|
+
|
3
19
|
* Initial Public Release
|
data/LICENSE
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
Copyright (c) 2015 Jeremy Evans
|
1
|
+
Copyright (c) 2015-2016 Jeremy Evans
|
2
2
|
Copyright (c) 2013 Colin Marc
|
3
|
+
Copyright (c) [2013] [Forward3D]
|
3
4
|
|
4
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
6
|
of this software and associated documentation files (the "Software"), to
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# sequel-impala
|
2
|
+
|
3
|
+
sequel-impala adds support for Sequel to connect to the Impala database
|
4
|
+
via the included impala driver, and the included jdbc-hive2 driver under JRuby.
|
5
|
+
|
6
|
+
# Source Code
|
7
|
+
|
8
|
+
Source code is available on GitHub at https://github.com/outcomesinsights/sequel-impala
|
9
|
+
|
10
|
+
# Usage
|
11
|
+
|
12
|
+
After installation, Sequel will automatically pick up the adapter as long as
|
13
|
+
the lib directory is in RUBYLIB, if you use a connection string starting with
|
14
|
+
`impala`, or `jdbc:hive2` on JRuby.
|
15
|
+
|
16
|
+
# Connection Strings
|
17
|
+
|
18
|
+
If using the impala driver (default host is localhost, default port is 21000):
|
19
|
+
|
20
|
+
impala://host:port
|
21
|
+
|
22
|
+
If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
|
23
|
+
|
24
|
+
jdbc:hive2://host:port/;auth=noSasl
|
25
|
+
|
26
|
+
# Dependencies
|
27
|
+
|
28
|
+
* sequel 4+
|
29
|
+
* thrift gem
|
30
|
+
|
31
|
+
# License
|
32
|
+
|
33
|
+
MIT/Apache
|
34
|
+
|
35
|
+
# Author
|
36
|
+
|
37
|
+
Ryan Duryea <aguynamedryan@gmail.com>
|
38
|
+
|
39
|
+
Work on sequel-impala is generously funded by [Outcomes Insights, Inc.](http://outins.com)
|
40
|
+
|
41
|
+
# Previous Author
|
42
|
+
|
43
|
+
Jeremy Evans <code@jeremyevans.net>
|
44
|
+
|
45
|
+
Provided initial work on this gem, and continues to maintain [Sequel](http://sequel.jeremyevans.net/). We can't thank you enough!
|
data/lib/rbhive.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
|
2
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
|
3
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
|
4
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
|
5
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
6
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
|
7
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
|
@@ -0,0 +1,150 @@
|
|
1
|
+
# suppress warnings
|
2
|
+
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
|
+
# require thrift autogenerated files
|
4
|
+
require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
|
5
|
+
# require 'thrift'
|
6
|
+
# restore warnings
|
7
|
+
$VERBOSE = old_verbose
|
8
|
+
|
9
|
+
module RBHive
|
10
|
+
# Opens a connection to +server+ on +port+, yields it to the given block and
# always closes the transport afterwards.
#
# server - host name handed to RBHive::Connection.
# port   - port handed to RBHive::Connection (default 10_000).
#
# Returns the value of the block. Any exception raised while opening the
# connection or inside the block propagates once the transport is closed.
def connect(server, port=10_000)
  connection = RBHive::Connection.new(server, port)
  begin
    connection.open
    yield(connection)
  ensure
    # An ensure clause never supplies the method's value, so it only
    # releases the transport here.
    connection.close
  end
end
|
21
|
+
module_function :connect
|
22
|
+
|
23
|
+
class StdOutLogger
|
24
|
+
%w(fatal error warn info debug).each do |level|
|
25
|
+
define_method level.to_sym do |message|
|
26
|
+
STDOUT.puts(message)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Connection
|
32
|
+
attr_reader :client
|
33
|
+
|
34
|
+
def initialize(server, port=10_000, logger=StdOutLogger.new)
|
35
|
+
@socket = Thrift::Socket.new(server, port)
|
36
|
+
@transport = Thrift::BufferedTransport.new(@socket)
|
37
|
+
@protocol = Thrift::BinaryProtocol.new(@transport)
|
38
|
+
@client = Hive::Thrift::ThriftHive::Client.new(@protocol)
|
39
|
+
@logger = logger
|
40
|
+
@logger.info("Connecting to #{server} on port #{port}")
|
41
|
+
@mutex = Mutex.new
|
42
|
+
end
|
43
|
+
|
44
|
+
def open
|
45
|
+
@transport.open
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
@transport.close
|
50
|
+
end
|
51
|
+
|
52
|
+
def client
|
53
|
+
@client
|
54
|
+
end
|
55
|
+
|
56
|
+
def execute(query)
|
57
|
+
execute_safe(query)
|
58
|
+
end
|
59
|
+
|
60
|
+
def explain(query)
|
61
|
+
safe do
|
62
|
+
execute_unsafe("EXPLAIN "+ query)
|
63
|
+
ExplainResult.new(client.fetchAll)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def priority=(priority)
|
68
|
+
set("mapred.job.priority", priority)
|
69
|
+
end
|
70
|
+
|
71
|
+
def queue=(queue)
|
72
|
+
set("mapred.job.queue.name", queue)
|
73
|
+
end
|
74
|
+
|
75
|
+
def set(name,value)
|
76
|
+
@logger.info("Setting #{name}=#{value}")
|
77
|
+
client.execute("SET #{name}=#{value}")
|
78
|
+
end
|
79
|
+
|
80
|
+
def fetch(query)
|
81
|
+
safe do
|
82
|
+
execute_unsafe(query)
|
83
|
+
rows = client.fetchAll
|
84
|
+
the_schema = SchemaDefinition.new(client.getSchema, rows.first)
|
85
|
+
ResultSet.new(rows, the_schema)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def fetch_in_batch(query, batch_size=1_000)
|
90
|
+
safe do
|
91
|
+
execute_unsafe(query)
|
92
|
+
until (next_batch = client.fetchN(batch_size)).empty?
|
93
|
+
the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
|
94
|
+
yield ResultSet.new(next_batch, the_schema)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def first(query)
|
100
|
+
safe do
|
101
|
+
execute_unsafe(query)
|
102
|
+
row = client.fetchOne
|
103
|
+
the_schema = SchemaDefinition.new(client.getSchema, row)
|
104
|
+
ResultSet.new([row], the_schema).first
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
# Builds a SchemaDefinition from the schema the client reports for the last
# executed query.
#
# example_row - optional sample row, either the raw tab-separated String as
#               returned by the server or an Array of already split cells
#               (default: an empty Array).
#
# Returns the SchemaDefinition.
def schema(example_row=[])
  # SchemaDefinition calls String#split on its sample row, which an Array
  # (including the default []) does not support, so arrays are re-joined
  # into the tab-separated form first.
  example_row = example_row.join("\t") if example_row.is_a?(Array)
  safe { SchemaDefinition.new(client.getSchema, example_row) }
end
|
111
|
+
|
112
|
+
def create_table(schema)
|
113
|
+
execute(schema.create_table_statement)
|
114
|
+
end
|
115
|
+
|
116
|
+
def drop_table(name)
|
117
|
+
name = name.name if name.is_a?(TableSchema)
|
118
|
+
execute("DROP TABLE `#{name}`")
|
119
|
+
end
|
120
|
+
|
121
|
+
def replace_columns(schema)
|
122
|
+
execute(schema.replace_columns_statement)
|
123
|
+
end
|
124
|
+
|
125
|
+
def add_columns(schema)
|
126
|
+
execute(schema.add_columns_statement)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Forwards every message this class does not define to the underlying Thrift
# client, passing along the arguments and the block (if one was given).
def method_missing(meth, *args, &block)
  client.send(meth, *args, &block)
end

# Keeps respond_to? and method(...) consistent with the delegation performed
# by method_missing.
def respond_to_missing?(meth, include_private = false)
  client.respond_to?(meth, include_private) || super
end
|
132
|
+
|
133
|
+
private
|
134
|
+
|
135
|
+
def execute_safe(query)
|
136
|
+
safe { execute_unsafe(query) }
|
137
|
+
end
|
138
|
+
|
139
|
+
def execute_unsafe(query)
|
140
|
+
@logger.info("Executing Hive Query: #{query}")
|
141
|
+
client.execute(query)
|
142
|
+
end
|
143
|
+
|
144
|
+
def safe
|
145
|
+
ret = nil
|
146
|
+
@mutex.synchronize { ret = yield }
|
147
|
+
ret
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Wraps the rows returned by an EXPLAIN statement and exposes them grouped by
# section. A row starting with an upper-case letter is treated as a section
# header; the rows after it are that section's details.
class ExplainResult
  # rows - Array of String lines as returned by the server.
  def initialize(rows)
    @rows = rows
  end

  # Returns the first detail line of the "abstract syntax tree" section, or
  # nil when the output contains no such section.
  def ast
    (by_section[:abstract_syntax_tree] || []).first
  end

  # Returns the number of detail lines in the "stage dependencies" section.
  def stage_count
    stage_dependencies.length
  end

  # Returns the detail lines of the "stage dependencies" section, or an
  # empty Array when the section is absent.
  def stage_dependencies
    by_section[:stage_dependencies] || []
  end

  # Returns all rows joined with newlines.
  def to_tsv
    @rows.join("\n")
  end

  # Returns the rows exactly as they were supplied.
  def raw
    @rows
  end

  def to_s
    to_tsv
  end

  private

  # Groups the rows into a Hash of section symbol => Array of stripped detail
  # lines. Header text is lower-cased, spaces become underscores and a
  # trailing colon is removed ("STAGE DEPENDENCIES:" => :stage_dependencies).
  def by_section
    current_section = nil
    @rows.each_with_object({}) do |row, sections|
      if row.match(/^[A-Z]/)
        current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
        sections[current_section] = []
      elsif row.length == 0
        next
      elsif current_section
        sections[current_section] << row.strip
      end
      # Detail rows that appear before any header belong to no section and
      # are skipped rather than appended to a nil bucket.
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module RBHive
|
2
|
+
class ResultSet < Array
|
3
|
+
def initialize(rows, schema)
|
4
|
+
@schema = schema
|
5
|
+
super(rows.map {|r| @schema.coerce_row(r) })
|
6
|
+
end
|
7
|
+
|
8
|
+
def column_names
|
9
|
+
@schema.column_names
|
10
|
+
end
|
11
|
+
|
12
|
+
def column_type_map
|
13
|
+
@schema.column_type_map
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_csv(out_file=nil)
|
17
|
+
to_separated_output(",", out_file)
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_tsv(out_file=nil)
|
21
|
+
to_separated_output("\t", out_file)
|
22
|
+
end
|
23
|
+
|
24
|
+
def as_arrays
|
25
|
+
@as_arrays ||= self.map{ |r| @schema.coerce_row_to_array(r) }
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def to_separated_output(sep, out_file)
|
31
|
+
rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
|
32
|
+
sv = rows.join("\n")
|
33
|
+
return sv if out_file.nil?
|
34
|
+
File.open(out_file, 'w+') { |f| f << sv }
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module RBHive
|
4
|
+
# Describes the columns of a result set and converts the raw tab-separated
# rows returned by the server into Ruby values.
class SchemaDefinition
  attr_reader :schema

  # The fallbacks cover Rubies where Float::NAN / Float::INFINITY are undefined.
  NAN = Float::NAN rescue 0.0/0.0
  INFINITY = Float::INFINITY rescue 1.0/0.0
  # Column type => method sent to the raw cell to convert it.
  TYPES = {
    :boolean => :to_s,
    :string => :to_s,
    :bigint => :to_i,
    :float => :to_f,
    :double => :to_f,
    :int => :to_i,
    :smallint => :to_i,
    :tinyint => :to_i,
  }

  # schema      - object responding to #fieldSchemas, whose entries respond
  #               to #name and #type.
  # example_row - sample row used only to count columns: a tab-separated
  #               String, an Array of cells, or nil for "no sample".
  def initialize(schema, example_row)
    @schema = schema
    @example_row =
      if !example_row
        []
      elsif example_row.is_a?(Array)
        # Already split; callers may pass an (often empty) Array of cells.
        example_row.dup
      else
        example_row.split("\t")
      end
  end

  # Returns the column names as Symbols, one per cell of the example row.
  def column_names
    @column_names ||= begin
      schema_names = @schema.fieldSchemas.map {|c| c.name }

      # In rare cases Hive can return two identical column names
      # consider SELECT a.foo, b.foo...
      # in this case you get two columns called foo with no disambiguation.
      # as a (far from ideal) solution we detect this edge case and rename them
      # a.foo => foo_1, b.foo => foo_2
      # otherwise we will trample one of the columns during Hash mapping.
      s = Hash.new(0)
      schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
      schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
      schema_names.map! { |c| c.gsub('---|---', '_').to_sym }

      # Let's fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
      # For now we will call them :_p1, :_p2, etc. to avoid collisions.
      offset = 0
      while schema_names.length < @example_row.length
        schema_names.push(:"_p#{offset+=1}")
      end
      schema_names
    end
  end

  # Returns a Hash of column name (Symbol) => column type (Symbol).
  def column_type_map
    @column_type_map ||= column_names.each_with_object({}) do |c, hsh|
      definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
      # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
      hsh[c] = definition ? definition.type.to_sym : :string
    end
  end

  # Converts one raw tab-separated row into a Hash of column name => value.
  def coerce_row(row)
    column_names.zip(row.split("\t")).each_with_object({}) do |(column_name, value), hsh|
      hsh[column_name] = coerce_column(column_name, value)
    end
  end

  # Converts a single raw cell according to the type of its column.
  def coerce_column(column_name, value)
    type = column_type_map[column_name]
    return INFINITY if (type != :string && value == "Infinity")
    return NAN if (type != :string && value == "NaN")
    return coerce_complex_value(value) if type.to_s =~ /^array/
    conversion_method = TYPES[type]
    conversion_method ? value.send(conversion_method) : value
  end

  # Returns the values of an already coerced row Hash in column order.
  def coerce_row_to_array(row)
    column_names.map { |n| row[n] }
  end

  # Parses the JSON text of an array column; nil, "" and "null" become nil.
  def coerce_complex_value(value)
    return nil if value.nil?
    return nil if value.length == 0
    return nil if value == 'null'
    JSON.parse(value)
  end
end
|
86
|
+
end
|
@@ -0,0 +1,464 @@
|
|
1
|
+
# suppress warnings
|
2
|
+
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
|
+
|
4
|
+
raise 'Thrift is not loaded' unless defined?(Thrift)
|
5
|
+
raise 'RBHive is not loaded' unless defined?(RBHive)
|
6
|
+
|
7
|
+
# require thrift autogenerated files
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
|
9
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
|
10
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
|
11
|
+
|
12
|
+
# restore warnings
|
13
|
+
$VERBOSE = old_verbose
|
14
|
+
|
15
|
+
# Monkey patch thrift to set an infinite read timeout
|
16
|
+
module Thrift
|
17
|
+
class HTTPClientTransport < BaseTransport
|
18
|
+
def flush
|
19
|
+
http = Net::HTTP.new @url.host, @url.port
|
20
|
+
http.use_ssl = @url.scheme == 'https'
|
21
|
+
http.read_timeout = nil
|
22
|
+
http.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
|
23
|
+
resp = http.post(@url.request_uri, @outbuf, @headers)
|
24
|
+
data = resp.body
|
25
|
+
data = Bytes.force_binary_encoding(data)
|
26
|
+
@inbuf = StringIO.new data
|
27
|
+
@outbuf = Bytes.empty_byte_buffer
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
module RBHive
|
33
|
+
|
34
|
+
HIVE_THRIFT_MAPPING = {
|
35
|
+
10 => 0,
|
36
|
+
11 => 1,
|
37
|
+
12 => 2,
|
38
|
+
13 => 6,
|
39
|
+
:cdh4 => 0,
|
40
|
+
:cdh5 => 4,
|
41
|
+
:PROTOCOL_V1 => 0,
|
42
|
+
:PROTOCOL_V2 => 1,
|
43
|
+
:PROTOCOL_V3 => 2,
|
44
|
+
:PROTOCOL_V4 => 3,
|
45
|
+
:PROTOCOL_V5 => 4,
|
46
|
+
:PROTOCOL_V6 => 5,
|
47
|
+
:PROTOCOL_V7 => 6
|
48
|
+
}
|
49
|
+
|
50
|
+
# Opens a HiveServer2 connection and session, yields the connection to the
# block, then closes the session and the transport again.
#
# server  - host name handed to RBHive::TCLIConnection.
# port    - port handed to RBHive::TCLIConnection (default 10_000).
# options - Hash of connection options; a :logger entry, when present, is
#           used as the logger instead of a new StdOutLogger. The hash passed
#           in is not modified.
#
# Returns the value of the block.
def tcli_connect(server, port = 10_000, options={})
  # Work on a copy: :logger is removed below and the connection removes or
  # defaults further keys, none of which should leak into the caller's hash.
  options = options.dup
  logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
  connection = RBHive::TCLIConnection.new(server, port, options, logger)
  ret = nil
  begin
    connection.open
    connection.open_session
    ret = yield(connection)

  ensure
    # Try to close the session and our connection if those are still open, ignore io errors
    begin
      connection.close_session if connection.session
      connection.close
    rescue IOError
      # noop
    end
  end

  ret
end
|
71
|
+
module_function :tcli_connect
|
72
|
+
|
73
|
+
class StdOutLogger
|
74
|
+
%w(fatal error warn info debug).each do |level|
|
75
|
+
define_method level.to_sym do |message|
|
76
|
+
STDOUT.puts(message)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
class TCLIConnection
|
82
|
+
attr_reader :client
|
83
|
+
|
84
|
+
# Builds (but does not open) a HiveServer2 connection.
#
# server  - host handed to the transport.
# port    - port handed to the transport (default 10_000).
# options - Hash of settings; nil is treated as {}. Keys read here:
#           :transport    - :buffered, :sasl or :http (default :buffered)
#           :hive_version - a key of HIVE_THRIFT_MAPPING (default 10)
#           :timeout      - socket timeout (default 1800)
#           :sasl_params  - required when :transport is :sasl
#           The hash passed in is not modified.
# logger  - object responding to #info (default: a new StdOutLogger).
#
# Raises RuntimeError when options is not a Hash, when :transport is :sasl
# without :sasl_params, or when :hive_version is not a known mapping key.
def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
  options ||= {} # backwards compatibility
  raise "'options' parameter must be a hash" unless options.is_a?(Hash)

  # Work on a copy: keys are deleted and defaulted below, and constructing a
  # connection must not alter the caller's hash as a side effect.
  options = options.dup
  @sasl_params = options.delete(:sasl_params) || {}

  if options[:transport] == :sasl && @sasl_params.empty?
    raise ":transport is set to :sasl, but no :sasl_params option was supplied"
  end

  # Defaults to buffered transport, Hive 0.10, 1800 second timeout
  options[:transport] ||= :buffered
  options[:hive_version] ||= 10
  options[:timeout] ||= 1800
  @options = options
  # Look up the appropriate Thrift protocol version for the supplied Hive version
  @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])

  @logger = logger
  @transport = thrift_transport(server, port)
  @protocol = Thrift::BinaryProtocol.new(@transport)
  @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
  @session = nil
  @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
end
|
108
|
+
|
109
|
+
def thrift_hive_protocol(version)
|
110
|
+
HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
|
111
|
+
end
|
112
|
+
|
113
|
+
# Builds the Thrift transport selected by @options[:transport].
#
# server - host to connect to.
# port   - port to connect to.
#
# Returns a buffered, SASL or HTTP client transport.
# Raises RuntimeError when @options[:transport] is none of :buffered, :sasl
# or :http.
def thrift_transport(server, port)
  @logger.info("Initializing transport #{@options[:transport]}")
  case @options[:transport]
  when :buffered
    return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
  when :sasl
    return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
                                           parse_sasl_params(@sasl_params))
  when :http
    return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
  else
    # Report the configured value; there is no local named `transport` here.
    raise "Unrecognised transport type '#{@options[:transport]}'"
  end
end
|
127
|
+
|
128
|
+
def thrift_socket(server, port, timeout)
|
129
|
+
socket = Thrift::Socket.new(server, port)
|
130
|
+
socket.timeout = timeout
|
131
|
+
socket
|
132
|
+
end
|
133
|
+
|
134
|
+
# Processes SASL connection params and returns a hash with symbol keys or a nil
|
135
|
+
def parse_sasl_params(sasl_params)
|
136
|
+
# Symbolize keys in a hash
|
137
|
+
if sasl_params.kind_of?(Hash)
|
138
|
+
return sasl_params.inject({}) do |memo,(k,v)|
|
139
|
+
memo[k.to_sym] = v;
|
140
|
+
memo
|
141
|
+
end
|
142
|
+
end
|
143
|
+
return nil
|
144
|
+
end
|
145
|
+
|
146
|
+
def open
|
147
|
+
@transport.open
|
148
|
+
end
|
149
|
+
|
150
|
+
def close
|
151
|
+
@transport.close
|
152
|
+
end
|
153
|
+
|
154
|
+
def open_session
|
155
|
+
@session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
|
156
|
+
end
|
157
|
+
|
158
|
+
def close_session
|
159
|
+
@client.CloseSession prepare_close_session
|
160
|
+
@session = nil
|
161
|
+
end
|
162
|
+
|
163
|
+
def session
|
164
|
+
@session && @session.sessionHandle
|
165
|
+
end
|
166
|
+
|
167
|
+
def client
|
168
|
+
@client
|
169
|
+
end
|
170
|
+
|
171
|
+
def execute(query)
|
172
|
+
@logger.info("Executing Hive Query: #{query}")
|
173
|
+
req = prepare_execute_statement(query)
|
174
|
+
exec_result = client.ExecuteStatement(req)
|
175
|
+
raise_error_if_failed!(exec_result)
|
176
|
+
exec_result
|
177
|
+
end
|
178
|
+
|
179
|
+
def priority=(priority)
|
180
|
+
set("mapred.job.priority", priority)
|
181
|
+
end
|
182
|
+
|
183
|
+
def queue=(queue)
|
184
|
+
set("mapred.job.queue.name", queue)
|
185
|
+
end
|
186
|
+
|
187
|
+
def set(name,value)
|
188
|
+
@logger.info("Setting #{name}=#{value}")
|
189
|
+
self.execute("SET #{name}=#{value}")
|
190
|
+
end
|
191
|
+
|
192
|
+
# Async execute
|
193
|
+
def async_execute(query)
|
194
|
+
@logger.info("Executing query asynchronously: #{query}")
|
195
|
+
exec_result = @client.ExecuteStatement(
|
196
|
+
Hive2::Thrift::TExecuteStatementReq.new(
|
197
|
+
sessionHandle: @session.sessionHandle,
|
198
|
+
statement: query,
|
199
|
+
runAsync: true
|
200
|
+
)
|
201
|
+
)
|
202
|
+
raise_error_if_failed!(exec_result)
|
203
|
+
op_handle = exec_result.operationHandle
|
204
|
+
|
205
|
+
# Return handles to get hold of this query / session again
|
206
|
+
{
|
207
|
+
session: @session.sessionHandle,
|
208
|
+
guid: op_handle.operationId.guid,
|
209
|
+
secret: op_handle.operationId.secret
|
210
|
+
}
|
211
|
+
end
|
212
|
+
|
213
|
+
# Is the query complete?
|
214
|
+
def async_is_complete?(handles)
|
215
|
+
async_state(handles) == :finished
|
216
|
+
end
|
217
|
+
|
218
|
+
# Is the query actually running?
|
219
|
+
def async_is_running?(handles)
|
220
|
+
async_state(handles) == :running
|
221
|
+
end
|
222
|
+
|
223
|
+
# Has the query failed?
|
224
|
+
def async_is_failed?(handles)
|
225
|
+
async_state(handles) == :error
|
226
|
+
end
|
227
|
+
|
228
|
+
def async_is_cancelled?(handles)
|
229
|
+
async_state(handles) == :cancelled
|
230
|
+
end
|
231
|
+
|
232
|
+
def async_cancel(handles)
|
233
|
+
@client.CancelOperation(prepare_cancel_request(handles))
|
234
|
+
end
|
235
|
+
|
236
|
+
# Map states to symbols
|
237
|
+
def async_state(handles)
|
238
|
+
response = @client.GetOperationStatus(
|
239
|
+
Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
|
240
|
+
)
|
241
|
+
|
242
|
+
case response.operationState
|
243
|
+
when Hive2::Thrift::TOperationState::FINISHED_STATE
|
244
|
+
return :finished
|
245
|
+
when Hive2::Thrift::TOperationState::INITIALIZED_STATE
|
246
|
+
return :initialized
|
247
|
+
when Hive2::Thrift::TOperationState::RUNNING_STATE
|
248
|
+
return :running
|
249
|
+
when Hive2::Thrift::TOperationState::CANCELED_STATE
|
250
|
+
return :cancelled
|
251
|
+
when Hive2::Thrift::TOperationState::CLOSED_STATE
|
252
|
+
return :closed
|
253
|
+
when Hive2::Thrift::TOperationState::ERROR_STATE
|
254
|
+
return :error
|
255
|
+
when Hive2::Thrift::TOperationState::UKNOWN_STATE
|
256
|
+
return :unknown
|
257
|
+
when Hive2::Thrift::TOperationState::PENDING_STATE
|
258
|
+
return :pending
|
259
|
+
when nil
|
260
|
+
raise "No operation state found for handles - has the session been closed?"
|
261
|
+
else
|
262
|
+
return :state_not_in_protocol
|
263
|
+
end
|
264
|
+
end
|
265
|
+
|
266
|
+
# Async fetch results from an async execute
|
267
|
+
def async_fetch(handles, max_rows = 100)
|
268
|
+
# Can't get data from an unfinished query
|
269
|
+
unless async_is_complete?(handles)
|
270
|
+
raise "Can't perform fetch on a query in state: #{async_state(handles)}"
|
271
|
+
end
|
272
|
+
|
273
|
+
# Fetch and return the first batch of rows (up to max_rows)
|
274
|
+
fetch_rows(prepare_operation_handle(handles), :first, max_rows)
|
275
|
+
end
|
276
|
+
|
277
|
+
# Performs a query on the server, fetches the results in batches of *batch_size* rows
|
278
|
+
# and yields the result batches to a given block as arrays of rows.
|
279
|
+
def async_fetch_in_batch(handles, batch_size = 1000, &block)
|
280
|
+
raise "No block given for the batch fetch request!" unless block_given?
|
281
|
+
# Can't get data from an unfinished query
|
282
|
+
unless async_is_complete?(handles)
|
283
|
+
raise "Can't perform fetch on a query in state: #{async_state(handles)}"
|
284
|
+
end
|
285
|
+
|
286
|
+
# Now let's iterate over the results
|
287
|
+
loop do
|
288
|
+
rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
|
289
|
+
break if rows.empty?
|
290
|
+
yield rows
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
294
|
+
def async_close_session(handles)
|
295
|
+
validate_handles!(handles)
|
296
|
+
@client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
|
297
|
+
end
|
298
|
+
|
299
|
+
def get_column_info(op_handle)
|
300
|
+
cols = get_schema_for(op_handle).columns
|
301
|
+
[cols.map(&:columnName), cols.map{|c| c.typeDesc.types.first.primitiveEntry.type}]
|
302
|
+
end
|
303
|
+
|
304
|
+
def yield_hash_rows(op_handle, columns, convertors)
|
305
|
+
i = -1
|
306
|
+
cols = columns.zip(convertors).map{|col, conv| [i+=1, col, conv]}
|
307
|
+
rows = fetch_rows(op_handle)
|
308
|
+
until rows.empty?
|
309
|
+
rows.each do |row|
|
310
|
+
h = {}
|
311
|
+
vals = row.colVals
|
312
|
+
cols.each do |i, col, conv|
|
313
|
+
v = vals[i].get_value.value
|
314
|
+
h[col] = conv ? conv[v] : v
|
315
|
+
end
|
316
|
+
yield h
|
317
|
+
end
|
318
|
+
rows = fetch_rows(op_handle, :next)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
# Pull rows from the query result
|
323
|
+
def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
|
324
|
+
fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
|
325
|
+
fetch_results = @client.FetchResults(fetch_req)
|
326
|
+
raise_error_if_failed!(fetch_results)
|
327
|
+
fetch_results.results.rows
|
328
|
+
#TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
|
329
|
+
end
|
330
|
+
|
331
|
+
# Performs an EXPLAIN of the supplied query on the server and returns it as an ExplainResult.
|
332
|
+
# (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
|
333
|
+
def explain(query)
|
334
|
+
rows = []
|
335
|
+
fetch_in_batch("EXPLAIN " + query) do |batch|
|
336
|
+
rows << batch.map { |b| b[:Explain] }
|
337
|
+
end
|
338
|
+
ExplainResult.new(rows.flatten)
|
339
|
+
end
|
340
|
+
|
341
|
+
# Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
|
342
|
+
def fetch(query, max_rows = 100)
|
343
|
+
# Execute the query and check the result
|
344
|
+
exec_result = execute(query)
|
345
|
+
raise_error_if_failed!(exec_result)
|
346
|
+
|
347
|
+
# Get search operation handle to fetch the results
|
348
|
+
op_handle = exec_result.operationHandle
|
349
|
+
|
350
|
+
# Fetch the rows
|
351
|
+
fetch_rows(op_handle, :first, max_rows)
|
352
|
+
end
|
353
|
+
|
354
|
+
# Performs a query on the server, fetches the results in batches of *batch_size* rows
|
355
|
+
# and yields the result batches to a given block as arrays of rows.
|
356
|
+
# Executes +query+ and yields its rows to the block in batches.
#
# query      - statement to execute.
# batch_size - maximum number of rows requested per fetch (default 1000).
#
# Yields each non-empty batch of rows; stops at the first empty batch.
# Raises RuntimeError when no block is given.
def fetch_in_batch(query, batch_size = 1000, &block)
  raise "No block given for the batch fetch request!" unless block_given?

  # Execute the query and check the result
  exec_result = execute(query)
  raise_error_if_failed!(exec_result)

  # Get search operation handle to fetch the results
  op_handle = exec_result.operationHandle

  # Now let's iterate over the results. fetch_rows prepares its own request
  # for every batch, so no request object is built up front.
  loop do
    rows = fetch_rows(op_handle, :next, batch_size)
    break if rows.empty?
    yield rows
  end
end
|
376
|
+
|
377
|
+
def create_table(schema)
|
378
|
+
execute(schema.create_table_statement)
|
379
|
+
end
|
380
|
+
|
381
|
+
def drop_table(name)
|
382
|
+
name = name.name if name.is_a?(TableSchema)
|
383
|
+
execute("DROP TABLE `#{name}`")
|
384
|
+
end
|
385
|
+
|
386
|
+
def replace_columns(schema)
|
387
|
+
execute(schema.replace_columns_statement)
|
388
|
+
end
|
389
|
+
|
390
|
+
def add_columns(schema)
|
391
|
+
execute(schema.add_columns_statement)
|
392
|
+
end
|
393
|
+
|
394
|
+
# Forwards every message this class does not define to the underlying Thrift
# client, passing along the arguments and the block (if one was given).
def method_missing(meth, *args, &block)
  client.send(meth, *args, &block)
end

# Keeps respond_to? and method(...) consistent with the delegation performed
# by method_missing.
def respond_to_missing?(meth, include_private = false)
  client.respond_to?(meth, include_private) || super
end
|
397
|
+
|
398
|
+
private
|
399
|
+
|
400
|
+
def prepare_open_session(client_protocol)
|
401
|
+
req = ::Hive2::Thrift::TOpenSessionReq.new( @sasl_params.empty? ? [] : @sasl_params )
|
402
|
+
req.client_protocol = client_protocol
|
403
|
+
req
|
404
|
+
end
|
405
|
+
|
406
|
+
def prepare_close_session
|
407
|
+
::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
|
408
|
+
end
|
409
|
+
|
410
|
+
def prepare_execute_statement(query)
|
411
|
+
::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {"impala.resultset.cache.size"=>"100000"} )
|
412
|
+
end
|
413
|
+
|
414
|
+
# Builds the request used to fetch rows for an operation.
#
# handle      - operation handle whose results are requested.
# orientation - Symbol naming a TFetchOrientation constant without its
#               FETCH_ prefix, e.g. :first or :next (default :first).
# rows        - maximum number of rows to request (default 100).
#
# Returns a TFetchResultsReq.
# Raises ArgumentError when the orientation is not listed in
# TFetchOrientation::VALUE_MAP.
def prepare_fetch_results(handle, orientation=:first, rows=100)
  orientation_value = "FETCH_#{orientation.to_s.upcase}"
  valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
  unless valid_orientations.include?(orientation_value)
    raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
  end
  # Plain constant lookup; no need to evaluate a string of Ruby code.
  orientation_const = ::Hive2::Thrift::TFetchOrientation.const_get(orientation_value)
  ::Hive2::Thrift::TFetchResultsReq.new(
    operationHandle: handle,
    orientation: orientation_const,
    maxRows: rows
  )
end
|
427
|
+
|
428
|
+
def prepare_operation_handle(handles)
|
429
|
+
validate_handles!(handles)
|
430
|
+
Hive2::Thrift::TOperationHandle.new(
|
431
|
+
operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
|
432
|
+
operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
|
433
|
+
hasResultSet: false
|
434
|
+
)
|
435
|
+
end
|
436
|
+
|
437
|
+
def prepare_cancel_request(handles)
|
438
|
+
Hive2::Thrift::TCancelOperationReq.new(
|
439
|
+
operationHandle: prepare_operation_handle(handles)
|
440
|
+
)
|
441
|
+
end
|
442
|
+
|
443
|
+
def validate_handles!(handles)
|
444
|
+
unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
|
445
|
+
raise "Invalid handles hash: #{handles.inspect}"
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
def get_schema_for(handle)
|
450
|
+
req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
|
451
|
+
metadata = client.GetResultSetMetadata( req )
|
452
|
+
metadata.schema
|
453
|
+
end
|
454
|
+
|
455
|
+
# Raises an exception if given operation result is a failure
|
456
|
+
def raise_error_if_failed!(result)
|
457
|
+
return if result.status.statusCode == 0
|
458
|
+
error_message = result.status.errorMessage || 'Execution failed!'
|
459
|
+
raise RBHive::TCLIConnectionError.new(error_message)
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
class TCLIConnectionError < StandardError; end
|
464
|
+
end
|