sequel-impala 1.0.0 → 1.0.1
This diff shows the content of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/LICENSE +2 -1
- data/README.md +45 -0
- data/lib/rbhive.rb +8 -0
- data/lib/rbhive/connection.rb +150 -0
- data/lib/rbhive/explain_result.rb +46 -0
- data/lib/rbhive/result_set.rb +37 -0
- data/lib/rbhive/schema_definition.rb +86 -0
- data/lib/rbhive/t_c_l_i_connection.rb +464 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/table_schema.rb +122 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/sequel/adapters/impala.rb +13 -1
- data/lib/sequel/adapters/rbhive.rb +174 -0
- data/lib/sequel/adapters/shared/impala.rb +11 -3
- data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
- data/lib/thrift/facebook_service.rb +700 -0
- data/lib/thrift/fb303_constants.rb +9 -0
- data/lib/thrift/fb303_types.rb +19 -0
- data/lib/thrift/hive_metastore_constants.rb +41 -0
- data/lib/thrift/hive_metastore_types.rb +630 -0
- data/lib/thrift/hive_service_constants.rb +13 -0
- data/lib/thrift/hive_service_types.rb +72 -0
- data/lib/thrift/queryplan_constants.rb +13 -0
- data/lib/thrift/queryplan_types.rb +261 -0
- data/lib/thrift/sasl_client_transport.rb +161 -0
- data/lib/thrift/serde_constants.rb +92 -0
- data/lib/thrift/serde_types.rb +7 -0
- data/lib/thrift/t_c_l_i_service.rb +1054 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
- data/lib/thrift/thrift_hive.rb +508 -0
- data/lib/thrift/thrift_hive_metastore.rb +3856 -0
- data/spec/impala_test.rb +6 -1
- metadata +53 -25
- data/README.rdoc +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 03ea5f2607bc4908064302d49640df3a4e34eaa3
+  data.tar.gz: 18c93756bb5918f32cb6f32612856963f634e966
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72ca2b1c7177ecc8c2db06e8b266f4a1ff67085cf7ef77d464e6b7667ce896870d905a691acf4fc2b34e67c90777dc3767cc9b3a1e3a2a9252967beaf507b566
+  data.tar.gz: fb15a8bf19c03e54179666df88da7ffc0f0be90d48a65ada20722da03f46c4fa26d910a2f3c554a93149ea29f92f565fa3bbeb43b875c8ceacfcf413b64aa161
data/CHANGELOG
CHANGED
@@ -1,3 +1,19 @@
 === HEAD
 
+* Fix disconnect detection in impala and rbhive adapters (jeremyevans)
+
+* Make implicit qualify return an SQL::Identifier if given an unqualified string (jeremyevans)
+
+* Fix :search_path option handling when using Sequel::SQL::AliasedExpressions (jeremyevans)
+
+* Speed up multi_insert and import (jeremyevans)
+
+* Add rbhive adapter (jeremyevans)
+
+* Add :empty_null=>:ruby option to csv_to_parquet extension, which can support quoted CSV cells (jeremyevans)
+
+* Optimize csv_to_parquet extension by not spawning shells or unnecessary processes (jeremyevans)
+
+=== 1.0.0 (2015-12-04)
+
 * Initial Public Release
data/LICENSE
CHANGED
@@ -1,5 +1,6 @@
-Copyright (c) 2015 Jeremy Evans
+Copyright (c) 2015-2016 Jeremy Evans
 Copyright (c) 2013 Colin Marc
+Copyright (c) [2013] [Forward3D]
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to
data/README.md
ADDED
@@ -0,0 +1,45 @@
+# sequel-impala
+
+sequel-impala adds support for Sequel to connect to the Impala database
+via the included impala driver, and the included jdbc-hive2 driver under JRuby.
+
+# Source Code
+
+Source code is available on GitHub at https://github.com/outcomesinsights/sequel-impala
+
+# Usage
+
+After installation, Sequel will automatically pick up the adapter as long as
+the lib directory is in RUBYLIB, if you use a connection string starting with
+`impala`, or `jdbc:hive2` on JRuby.
+
+# Connection Strings
+
+If using the impala driver (default host is localhost, default port is 21000):
+
+    impala://host:port
+
+If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
+
+    jdbc:hive2://host:port/;auth=noSasl
+
+# Dependencies
+
+* sequel 4+
+* thrift gem
+
+# License
+
+MIT/Apache
+
+# Author
+
+Ryan Duryea <aguynamedryan@gmail.com>
+
+Work on sequel-impala is generously funded by [Outcomes Insights, Inc.](http://outins.com)
+
+# Previous Author
+
+Jeremy Evans <code@jeremyevans.net>
+
+Provided initial work on this gem, and continues to maintain [Sequel](http://sequel.jeremyevans.net/). We can't thank you enough!
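The connection strings above plug directly into Sequel. A minimal sketch, assuming an Impala daemon reachable at the placeholder host `impala-host` (the ports are the defaults quoted in the README):

    require 'sequel'

    # Pure-ruby impala driver (default port 21000):
    DB = Sequel.connect('impala://impala-host:21000')

    # Under JRuby, the bundled jdbc-hive2 driver instead (port 21050 per the README):
    # DB = Sequel.connect('jdbc:hive2://impala-host:21050/;auth=noSasl')

    DB.fetch('SELECT 1 AS one') { |row| p row }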
data/lib/rbhive.rb
ADDED
@@ -0,0 +1,8 @@
+require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
+require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
+require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
+require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
+require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
+require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
+require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
+require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
data/lib/rbhive/connection.rb
ADDED
@@ -0,0 +1,150 @@
+# suppress warnings
+old_verbose, $VERBOSE = $VERBOSE, nil
+# require thrift autogenerated files
+require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
+# require 'thrift'
+# restore warnings
+$VERBOSE = old_verbose
+
+module RBHive
+  def connect(server, port=10_000)
+    connection = RBHive::Connection.new(server, port)
+    ret = nil
+    begin
+      connection.open
+      ret = yield(connection)
+    ensure
+      connection.close
+      ret
+    end
+  end
+  module_function :connect
+
+  class StdOutLogger
+    %w(fatal error warn info debug).each do |level|
+      define_method level.to_sym do |message|
+        STDOUT.puts(message)
+      end
+    end
+  end
+
+  class Connection
+    attr_reader :client
+
+    def initialize(server, port=10_000, logger=StdOutLogger.new)
+      @socket = Thrift::Socket.new(server, port)
+      @transport = Thrift::BufferedTransport.new(@socket)
+      @protocol = Thrift::BinaryProtocol.new(@transport)
+      @client = Hive::Thrift::ThriftHive::Client.new(@protocol)
+      @logger = logger
+      @logger.info("Connecting to #{server} on port #{port}")
+      @mutex = Mutex.new
+    end
+
+    def open
+      @transport.open
+    end
+
+    def close
+      @transport.close
+    end
+
+    def client
+      @client
+    end
+
+    def execute(query)
+      execute_safe(query)
+    end
+
+    def explain(query)
+      safe do
+        execute_unsafe("EXPLAIN "+ query)
+        ExplainResult.new(client.fetchAll)
+      end
+    end
+
+    def priority=(priority)
+      set("mapred.job.priority", priority)
+    end
+
+    def queue=(queue)
+      set("mapred.job.queue.name", queue)
+    end
+
+    def set(name,value)
+      @logger.info("Setting #{name}=#{value}")
+      client.execute("SET #{name}=#{value}")
+    end
+
+    def fetch(query)
+      safe do
+        execute_unsafe(query)
+        rows = client.fetchAll
+        the_schema = SchemaDefinition.new(client.getSchema, rows.first)
+        ResultSet.new(rows, the_schema)
+      end
+    end
+
+    def fetch_in_batch(query, batch_size=1_000)
+      safe do
+        execute_unsafe(query)
+        until (next_batch = client.fetchN(batch_size)).empty?
+          the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
+          yield ResultSet.new(next_batch, the_schema)
+        end
+      end
+    end
+
+    def first(query)
+      safe do
+        execute_unsafe(query)
+        row = client.fetchOne
+        the_schema = SchemaDefinition.new(client.getSchema, row)
+        ResultSet.new([row], the_schema).first
+      end
+    end
+
+    def schema(example_row=[])
+      safe { SchemaDefinition.new(client.getSchema, example_row) }
+    end
+
+    def create_table(schema)
+      execute(schema.create_table_statement)
+    end
+
+    def drop_table(name)
+      name = name.name if name.is_a?(TableSchema)
+      execute("DROP TABLE `#{name}`")
+    end
+
+    def replace_columns(schema)
+      execute(schema.replace_columns_statement)
+    end
+
+    def add_columns(schema)
+      execute(schema.add_columns_statement)
+    end
+
+    def method_missing(meth, *args)
+      client.send(meth, *args)
+    end
+
+    private
+
+    def execute_safe(query)
+      safe { execute_unsafe(query) }
+    end
+
+    def execute_unsafe(query)
+      @logger.info("Executing Hive Query: #{query}")
+      client.execute(query)
+    end
+
+    def safe
+      ret = nil
+      @mutex.synchronize { ret = yield }
+      ret
+    end
+  end
+end
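For orientation, RBHive.connect wraps open and close around the block, so callers never touch the transport directly. A usage sketch, with a placeholder server name and query:

    require 'rbhive'

    RBHive.connect('hive-server.example.com', 10_000) do |conn|
      conn.set('mapred.job.priority', 'HIGH')  # issues a SET statement via the client
      conn.fetch('SELECT city, count(*) AS n FROM users GROUP BY city')
    end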
data/lib/rbhive/explain_result.rb
ADDED
@@ -0,0 +1,46 @@
+class ExplainResult
+  def initialize(rows)
+    @rows = rows
+  end
+
+  def ast
+    by_section[:abstract_syntax_tree].first
+  end
+
+  def stage_count
+    stage_dependencies.length
+  end
+
+  def stage_dependencies
+    by_section[:stage_dependencies] || []
+  end
+
+  def to_tsv
+    @rows.join("\n")
+  end
+
+  def raw
+    @rows
+  end
+
+  def to_s
+    to_tsv
+  end
+
+  private
+
+  def by_section
+    current_section = nil
+    @rows.inject({}) do |sections, row|
+      if row.match(/^[A-Z]/)
+        current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
+        sections[current_section] = []
+      elsif row.length == 0
+        next sections
+      else
+        sections[current_section] << row.strip
+      end
+      sections
+    end
+  end
+end
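ExplainResult groups the raw EXPLAIN rows by their uppercase section headers. An illustrative sketch of that parsing; the row text below is made up, not real EXPLAIN output:

    rows = [
      'ABSTRACT SYNTAX TREE:',
      '  (TOK_QUERY ...)',
      '',
      'STAGE DEPENDENCIES:',
      '  Stage-1 is a root stage'
    ]
    result = ExplainResult.new(rows)
    result.ast                 # => "(TOK_QUERY ...)"
    result.stage_dependencies  # => ["Stage-1 is a root stage"]
    result.stage_count         # => 1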
data/lib/rbhive/result_set.rb
ADDED
@@ -0,0 +1,37 @@
+module RBHive
+  class ResultSet < Array
+    def initialize(rows, schema)
+      @schema = schema
+      super(rows.map {|r| @schema.coerce_row(r) })
+    end
+
+    def column_names
+      @schema.column_names
+    end
+
+    def column_type_map
+      @schema.column_type_map
+    end
+
+    def to_csv(out_file=nil)
+      to_separated_output(",", out_file)
+    end
+
+    def to_tsv(out_file=nil)
+      to_separated_output("\t", out_file)
+    end
+
+    def as_arrays
+      @as_arrays ||= self.map{ |r| @schema.coerce_row_to_array(r) }
+    end
+
+    private
+
+    def to_separated_output(sep, out_file)
+      rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
+      sv = rows.join("\n")
+      return sv if out_file.nil?
+      File.open(out_file, 'w+') { |f| f << sv }
+    end
+  end
+end
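Because ResultSet is an Array of coerced row hashes that keeps its SchemaDefinition, the CSV/TSV exports reuse the schema's column order. A hedged sketch (server, table, and path are placeholders):

    RBHive.connect('hive-server.example.com') do |conn|
      results = conn.fetch('SELECT name, age FROM users LIMIT 10')
      results.column_names             # => [:name, :age]
      csv = results.to_csv             # returns the joined CSV string
      results.to_csv('/tmp/users.csv') # or writes it when given a path
    end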
data/lib/rbhive/schema_definition.rb
ADDED
@@ -0,0 +1,86 @@
+require 'json'
+
+module RBHive
+  class SchemaDefinition
+    attr_reader :schema
+
+    NAN = Float::NAN rescue 0.0/0.0
+    INFINITY = Float::INFINITY rescue 1.0/0.0
+    TYPES = {
+      :boolean => :to_s,
+      :string => :to_s,
+      :bigint => :to_i,
+      :float => :to_f,
+      :double => :to_f,
+      :int => :to_i,
+      :smallint => :to_i,
+      :tinyint => :to_i,
+    }
+
+    def initialize(schema, example_row)
+      @schema = schema
+      @example_row = example_row ? example_row.split("\t") : []
+    end
+
+    def column_names
+      @column_names ||= begin
+        schema_names = @schema.fieldSchemas.map {|c| c.name }
+
+        # In rare cases Hive can return two identical column names;
+        # consider SELECT a.foo, b.foo...
+        # in this case you get two columns called foo with no disambiguation.
+        # As a (far from ideal) solution we detect this edge case and rename them
+        # a.foo => foo_1, b.foo => foo_2;
+        # otherwise we will trample one of the columns during Hash mapping.
+        s = Hash.new(0)
+        schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
+        schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
+        schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
+
+        # Let's fix the fact that Hive doesn't return schema data for partitions on SELECT * queries.
+        # For now we will call them :_p1, :_p2, etc. to avoid collisions.
+        offset = 0
+        while schema_names.length < @example_row.length
+          schema_names.push(:"_p#{offset+=1}")
+        end
+        schema_names
+      end
+    end
+
+    def column_type_map
+      @column_type_map ||= column_names.inject({}) do |hsh, c|
+        definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
+        # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
+        hsh[c] = definition ? definition.type.to_sym : :string
+        hsh
+      end
+    end
+
+    def coerce_row(row)
+      column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
+        hsh[column_name] = coerce_column(column_name, value)
+        hsh
+      end
+    end
+
+    def coerce_column(column_name, value)
+      type = column_type_map[column_name]
+      return INFINITY if (type != :string && value == "Infinity")
+      return NAN if (type != :string && value == "NaN")
+      return coerce_complex_value(value) if type.to_s =~ /^array/
+      conversion_method = TYPES[type]
+      conversion_method ? value.send(conversion_method) : value
+    end
+
+    def coerce_row_to_array(row)
+      column_names.map { |n| row[n] }
+    end
+
+    def coerce_complex_value(value)
+      return nil if value.nil?
+      return nil if value.length == 0
+      return nil if value == 'null'
+      JSON.parse(value)
+    end
+  end
+end
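The TYPES table above drives coercion of the tab-separated Hive rows. A small sketch of coerce_row in isolation; the FieldSchema struct and OpenStruct below stand in for the Thrift schema object and are hypothetical stubs, not part of the gem:

    require 'ostruct'

    # Stub the one attribute SchemaDefinition reads from a Thrift schema:
    # fieldSchemas, a list of objects each with a name and a type.
    FieldSchema = Struct.new(:name, :type)
    thrift_schema = OpenStruct.new(fieldSchemas: [
      FieldSchema.new('age', 'int'),
      FieldSchema.new('score', 'double')
    ])

    defn = RBHive::SchemaDefinition.new(thrift_schema, "30\t1.5")
    defn.coerce_row("30\t1.5")  # => {:age=>30, :score=>1.5}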
data/lib/rbhive/t_c_l_i_connection.rb
ADDED
@@ -0,0 +1,464 @@
+# suppress warnings
+old_verbose, $VERBOSE = $VERBOSE, nil
+
+raise 'Thrift is not loaded' unless defined?(Thrift)
+raise 'RBHive is not loaded' unless defined?(RBHive)
+
+# require thrift autogenerated files
+require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
+require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
+require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
+
+# restore warnings
+$VERBOSE = old_verbose
+
+# Monkey patch thrift to set an infinite read timeout
+module Thrift
+  class HTTPClientTransport < BaseTransport
+    def flush
+      http = Net::HTTP.new @url.host, @url.port
+      http.use_ssl = @url.scheme == 'https'
+      http.read_timeout = nil
+      http.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
+      resp = http.post(@url.request_uri, @outbuf, @headers)
+      data = resp.body
+      data = Bytes.force_binary_encoding(data)
+      @inbuf = StringIO.new data
+      @outbuf = Bytes.empty_byte_buffer
+    end
+  end
+end
+
+module RBHive
+
+  HIVE_THRIFT_MAPPING = {
+    10 => 0,
+    11 => 1,
+    12 => 2,
+    13 => 6,
+    :cdh4 => 0,
+    :cdh5 => 4,
+    :PROTOCOL_V1 => 0,
+    :PROTOCOL_V2 => 1,
+    :PROTOCOL_V3 => 2,
+    :PROTOCOL_V4 => 3,
+    :PROTOCOL_V5 => 4,
+    :PROTOCOL_V6 => 5,
+    :PROTOCOL_V7 => 6
+  }
+
+  def tcli_connect(server, port = 10_000, options={})
+    logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
+    connection = RBHive::TCLIConnection.new(server, port, options, logger)
+    ret = nil
+    begin
+      connection.open
+      connection.open_session
+      ret = yield(connection)
+
+    ensure
+      # Try to close the session and our connection if those are still open, ignore io errors
+      begin
+        connection.close_session if connection.session
+        connection.close
+      rescue IOError => e
+        # noop
+      end
+    end
+
+    ret
+  end
+  module_function :tcli_connect
+
+  class StdOutLogger
+    %w(fatal error warn info debug).each do |level|
+      define_method level.to_sym do |message|
+        STDOUT.puts(message)
+      end
+    end
+  end
+
+  class TCLIConnection
+    attr_reader :client
+
+    def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
+      options ||= {} # backwards compatibility
+      raise "'options' parameter must be a hash" unless options.is_a?(Hash)
+      @sasl_params = options.delete(:sasl_params) || {}
+
+      if options[:transport] == :sasl and @sasl_params.empty?
+        raise ":transport is set to :sasl, but no :sasl_params option was supplied"
+      end
+
+      # Defaults to buffered transport, Hive 0.10, 1800 second timeout
+      options[:transport] ||= :buffered
+      options[:hive_version] ||= 10
+      options[:timeout] ||= 1800
+      @options = options
+      # Look up the appropriate Thrift protocol version for the supplied Hive version
+      @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
+
+      @logger = logger
+      @transport = thrift_transport(server, port)
+      @protocol = Thrift::BinaryProtocol.new(@transport)
+      @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
+      @session = nil
+      @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
+    end
+
+    def thrift_hive_protocol(version)
+      HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
+    end
+
+    def thrift_transport(server, port)
+      @logger.info("Initializing transport #{@options[:transport]}")
+      case @options[:transport]
+      when :buffered
+        return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
+      when :sasl
+        return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
+                                               parse_sasl_params(@sasl_params))
+      when :http
+        return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
+      else
+        raise "Unrecognised transport type '#{@options[:transport]}'"
+      end
+    end
+
+    def thrift_socket(server, port, timeout)
+      socket = Thrift::Socket.new(server, port)
+      socket.timeout = timeout
+      socket
+    end
+
+    # Processes SASL connection params and returns a hash with symbol keys or a nil
+    def parse_sasl_params(sasl_params)
+      # Symbolize keys in a hash
+      if sasl_params.kind_of?(Hash)
+        return sasl_params.inject({}) do |memo,(k,v)|
+          memo[k.to_sym] = v;
+          memo
+        end
+      end
+      return nil
+    end
+
+    def open
+      @transport.open
+    end
+
+    def close
+      @transport.close
+    end
+
+    def open_session
+      @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
+    end
+
+    def close_session
+      @client.CloseSession prepare_close_session
+      @session = nil
+    end
+
+    def session
+      @session && @session.sessionHandle
+    end
+
+    def client
+      @client
+    end
+
+    def execute(query)
+      @logger.info("Executing Hive Query: #{query}")
+      req = prepare_execute_statement(query)
+      exec_result = client.ExecuteStatement(req)
+      raise_error_if_failed!(exec_result)
+      exec_result
+    end
+
+    def priority=(priority)
+      set("mapred.job.priority", priority)
+    end
+
+    def queue=(queue)
+      set("mapred.job.queue.name", queue)
+    end
+
+    def set(name,value)
+      @logger.info("Setting #{name}=#{value}")
+      self.execute("SET #{name}=#{value}")
+    end
+
+    # Async execute
+    def async_execute(query)
+      @logger.info("Executing query asynchronously: #{query}")
+      exec_result = @client.ExecuteStatement(
+        Hive2::Thrift::TExecuteStatementReq.new(
+          sessionHandle: @session.sessionHandle,
+          statement: query,
+          runAsync: true
+        )
+      )
+      raise_error_if_failed!(exec_result)
+      op_handle = exec_result.operationHandle
+
+      # Return handles to get hold of this query / session again
+      {
+        session: @session.sessionHandle,
+        guid: op_handle.operationId.guid,
+        secret: op_handle.operationId.secret
+      }
+    end
+
+    # Is the query complete?
+    def async_is_complete?(handles)
+      async_state(handles) == :finished
+    end
+
+    # Is the query actually running?
+    def async_is_running?(handles)
+      async_state(handles) == :running
+    end
+
+    # Has the query failed?
+    def async_is_failed?(handles)
+      async_state(handles) == :error
+    end
+
+    def async_is_cancelled?(handles)
+      async_state(handles) == :cancelled
+    end
+
+    def async_cancel(handles)
+      @client.CancelOperation(prepare_cancel_request(handles))
+    end
+
+    # Map states to symbols
+    def async_state(handles)
+      response = @client.GetOperationStatus(
+        Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
+      )
+
+      case response.operationState
+      when Hive2::Thrift::TOperationState::FINISHED_STATE
+        return :finished
+      when Hive2::Thrift::TOperationState::INITIALIZED_STATE
+        return :initialized
+      when Hive2::Thrift::TOperationState::RUNNING_STATE
+        return :running
+      when Hive2::Thrift::TOperationState::CANCELED_STATE
+        return :cancelled
+      when Hive2::Thrift::TOperationState::CLOSED_STATE
+        return :closed
+      when Hive2::Thrift::TOperationState::ERROR_STATE
+        return :error
+      when Hive2::Thrift::TOperationState::UKNOWN_STATE # spelled this way in the Hive Thrift IDL
+        return :unknown
+      when Hive2::Thrift::TOperationState::PENDING_STATE
+        return :pending
+      when nil
+        raise "No operation state found for handles - has the session been closed?"
+      else
+        return :state_not_in_protocol
+      end
+    end
+
+    # Async fetch results from an async execute
+    def async_fetch(handles, max_rows = 100)
+      # Can't get data from an unfinished query
+      unless async_is_complete?(handles)
+        raise "Can't perform fetch on a query in state: #{async_state(handles)}"
+      end
+
+      # Fetch and return the rows
+      fetch_rows(prepare_operation_handle(handles), :first, max_rows)
+    end
+
+    # Performs a query on the server, fetches the results in batches of *batch_size* rows
+    # and yields the result batches to a given block as arrays of rows.
+    def async_fetch_in_batch(handles, batch_size = 1000, &block)
+      raise "No block given for the batch fetch request!" unless block_given?
+      # Can't get data from an unfinished query
+      unless async_is_complete?(handles)
+        raise "Can't perform fetch on a query in state: #{async_state(handles)}"
+      end
+
+      # Now let's iterate over the results
+      loop do
+        rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
+        break if rows.empty?
+        yield rows
+      end
+    end
+
+    def async_close_session(handles)
+      validate_handles!(handles)
+      @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
+    end
+
+    def get_column_info(op_handle)
+      cols = get_schema_for(op_handle).columns
+      [cols.map(&:columnName), cols.map{|c| c.typeDesc.types.first.primitiveEntry.type}]
+    end
+
+    def yield_hash_rows(op_handle, columns, convertors)
+      i = -1
+      cols = columns.zip(convertors).map{|col, conv| [i+=1, col, conv]}
+      rows = fetch_rows(op_handle)
+      until rows.empty?
+        rows.each do |row|
+          h = {}
+          vals = row.colVals
+          cols.each do |i, col, conv|
+            v = vals[i].get_value.value
+            h[col] = conv ? conv[v] : v
+          end
+          yield h
+        end
+        rows = fetch_rows(op_handle, :next)
+      end
+    end
+
+    # Pull rows from the query result
+    def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
+      fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
+      fetch_results = @client.FetchResults(fetch_req)
+      raise_error_if_failed!(fetch_results)
+      fetch_results.results.rows
+      #TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
+    end
+
+    # Performs an explain on the supplied query on the server, returns it as an ExplainResult.
+    # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
+    def explain(query)
+      rows = []
+      fetch_in_batch("EXPLAIN " + query) do |batch|
+        rows << batch.map { |b| b[:Explain] }
+      end
+      ExplainResult.new(rows.flatten)
+    end
+
+    # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
+    def fetch(query, max_rows = 100)
+      # Execute the query and check the result
+      exec_result = execute(query)
+      raise_error_if_failed!(exec_result)
+
+      # Get search operation handle to fetch the results
+      op_handle = exec_result.operationHandle
+
+      # Fetch the rows
+      fetch_rows(op_handle, :first, max_rows)
+    end
+
+    # Performs a query on the server, fetches the results in batches of *batch_size* rows
+    # and yields the result batches to a given block as arrays of rows.
+    def fetch_in_batch(query, batch_size = 1000, &block)
+      raise "No block given for the batch fetch request!" unless block_given?
+
+      # Execute the query and check the result
+      exec_result = execute(query)
+      raise_error_if_failed!(exec_result)
+
+      # Get search operation handle to fetch the results
+      op_handle = exec_result.operationHandle
+
+      # Prepare fetch results request
+      fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
+
+      # Now let's iterate over the results
+      loop do
+        rows = fetch_rows(op_handle, :next, batch_size)
+        break if rows.empty?
+        yield rows
+      end
+    end
+
+    def create_table(schema)
+      execute(schema.create_table_statement)
+    end
+
+    def drop_table(name)
+      name = name.name if name.is_a?(TableSchema)
+      execute("DROP TABLE `#{name}`")
+    end
+
+    def replace_columns(schema)
+      execute(schema.replace_columns_statement)
+    end
+
+    def add_columns(schema)
+      execute(schema.add_columns_statement)
+    end
+
+    def method_missing(meth, *args)
+      client.send(meth, *args)
+    end
+
+    private
+
+    def prepare_open_session(client_protocol)
+      req = ::Hive2::Thrift::TOpenSessionReq.new( @sasl_params.empty? ? [] : @sasl_params )
+      req.client_protocol = client_protocol
+      req
+    end
+
+    def prepare_close_session
+      ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
+    end
+
+    def prepare_execute_statement(query)
+      ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {"impala.resultset.cache.size"=>"100000"} )
+    end
+
+    def prepare_fetch_results(handle, orientation=:first, rows=100)
+      orientation_value = "FETCH_#{orientation.to_s.upcase}"
+      valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
+      unless valid_orientations.include?(orientation_value)
+        raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
+      end
+      orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
+      ::Hive2::Thrift::TFetchResultsReq.new(
+        operationHandle: handle,
+        orientation: orientation_const,
+        maxRows: rows
+      )
+    end
+
+    def prepare_operation_handle(handles)
+      validate_handles!(handles)
+      Hive2::Thrift::TOperationHandle.new(
+        operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
+        operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
+        hasResultSet: false
+      )
+    end
+
+    def prepare_cancel_request(handles)
+      Hive2::Thrift::TCancelOperationReq.new(
+        operationHandle: prepare_operation_handle(handles)
+      )
+    end
+
+    def validate_handles!(handles)
+      unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
+        raise "Invalid handles hash: #{handles.inspect}"
+      end
+    end
+
+    def get_schema_for(handle)
+      req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
+      metadata = client.GetResultSetMetadata( req )
+      metadata.schema
+    end
+
+    # Raises an exception if given operation result is a failure
+    def raise_error_if_failed!(result)
+      return if result.status.statusCode == 0
+      error_message = result.status.errorMessage || 'Execution failed!'
+      raise RBHive::TCLIConnectionError.new(error_message)
+    end
+  end
+
+  class TCLIConnectionError < StandardError; end
+end
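A usage sketch of the async round trip exposed above, with placeholder host and query (assumes a HiveServer2 speaking protocol V7, i.e. :hive_version => 13 per HIVE_THRIFT_MAPPING):

    require 'rbhive'

    RBHive.tcli_connect('hs2-host.example.com', 10_000, :hive_version => 13) do |conn|
      handles = conn.async_execute('SELECT count(*) FROM users')
      sleep 1 until conn.async_is_complete?(handles)
      rows = conn.async_fetch(handles, 100)  # up to 100 rows of the finished query
    end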