rbhive 0.2.95 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/LICENSE +20 -0
- data/README.md +204 -0
- data/lib/rbhive/connection.rb +2 -1
- data/lib/rbhive/t_c_l_i_connection.rb +315 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/rbhive.rb +4 -1
- data/lib/thrift/facebook_service.rb +4 -5
- data/lib/thrift/fb303_constants.rb +3 -2
- data/lib/thrift/fb303_types.rb +2 -1
- data/lib/thrift/hive_metastore_constants.rb +3 -3
- data/lib/thrift/hive_metastore_types.rb +176 -14
- data/lib/thrift/hive_service_constants.rb +7 -2
- data/lib/thrift/hive_service_types.rb +53 -48
- data/lib/thrift/queryplan_constants.rb +7 -2
- data/lib/thrift/queryplan_types.rb +225 -217
- data/lib/thrift/sasl_client_transport.rb +97 -0
- data/lib/thrift/serde_constants.rb +5 -3
- data/lib/thrift/serde_types.rb +2 -2
- data/lib/thrift/t_c_l_i_service.rb +892 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +66 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1469 -0
- data/lib/thrift/thrift_hive.rb +405 -401
- data/lib/thrift/thrift_hive_metastore.rb +1452 -203
- data/rbhive.gemspec +24 -0
- metadata +90 -69
- data/lib/thrift/reflection_limited_constants.rb +0 -8
- data/lib/thrift/reflection_limited_types.rb +0 -150
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) [2013] [Forward3D]
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,204 @@
|
|
1
|
+
# RBHive -- Ruby thrift lib for executing Hive queries
|
2
|
+
|
3
|
+
RBHive is a simple Ruby gem to communicate with the [Apache Hive](http://hive.apache.org)
|
4
|
+
Thrift server.
|
5
|
+
|
6
|
+
It supports:
|
7
|
+
* Hiveserver (the original Thrift service shipped with Hive since early releases)
|
8
|
+
* Hiveserver2 (the new, concurrent Thrift service shipped with Hive releases since 0.10)
|
9
|
+
* Any other 100% Hive-compatible Thrift service (e.g. [Sharkserver](https://github.com/amplab/shark))
|
10
|
+
|
11
|
+
It is capable of using the following Thrift transports:
|
12
|
+
* BufferedTransport (the default)
|
13
|
+
* SaslClientTransport ([SASL-enabled](http://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) transport)
|
14
|
+
* HTTPClientTransport (tunnels Thrift over HTTP)
|
15
|
+
|
16
|
+
## About Thrift services and transports
|
17
|
+
|
18
|
+
### Hiveserver
|
19
|
+
|
20
|
+
Hiveserver (the original Thrift interface) only supports a single client at a time. RBHive
|
21
|
+
implements this with the `RBHive::Connection` class. It only supports a single transport,
|
22
|
+
BufferedTransport.
|
23
|
+
|
24
|
+
### Hiveserver2
|
25
|
+
|
26
|
+
[Hiveserver2](https://cwiki.apache.org/confluence/display/Hive/Setting+up+HiveServer2)
|
27
|
+
(the new Thrift interface) can support many concurrent client connections. It is shipped
|
28
|
+
with Hive 0.10 and later. In Hive 0.10, only BufferedTransport and SaslClientTransport are
|
29
|
+
supported; starting with Hive 0.12, HTTPClientTransport is also supported.
|
30
|
+
|
31
|
+
Each of the versions after Hive 0.10 has a slightly different Thrift interface; when
|
32
|
+
connecting, you must specify the Hive version or you may get an exception.
|
33
|
+
|
34
|
+
RBHive implements this client with the `RBHive::TCLIConnection` class.
|
35
|
+
|
36
|
+
### Other Hive-compatible services
|
37
|
+
|
38
|
+
Consult the documentation for the service, as this will vary depending on the service you're using.
|
39
|
+
|
40
|
+
## Connecting to Hiveserver and Hiveserver2
|
41
|
+
|
42
|
+
### Hiveserver
|
43
|
+
|
44
|
+
Since Hiveserver has no options, connection code is very simple:
|
45
|
+
|
46
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
47
|
+
connection.fetch 'SELECT city, country FROM cities'
|
48
|
+
end
|
49
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
50
|
+
|
51
|
+
### Hiveserver2
|
52
|
+
|
53
|
+
Hiveserver2 has several options for how it is run. The connection code takes
|
54
|
+
a hash with these possible parameters:
|
55
|
+
* `:transport` - one of `:buffered` (BufferedTransport), `:http` (HTTPClientTransport), or `:sasl` (SaslClientTransport)
|
56
|
+
* `:hive_version` - the number after the period in the Hive version; e.g. `10`, `11`, `12`
|
57
|
+
* `:timeout` - if using BufferedTransport or SaslClientTransport, this is how long the timeout on the socket will be
|
58
|
+
* `:sasl_params` - if using SaslClientTransport, this is a hash of parameters to set up the SASL connection
|
59
|
+
|
60
|
+
If you pass either an empty hash or nil in place of the options (or do not supply them), the connection
|
61
|
+
is attempted with the Hive version set to 0.10, using `:buffered` as the transport, and a timeout of 1800 seconds.
|
62
|
+
|
63
|
+
Connecting with the defaults:
|
64
|
+
|
65
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
66
|
+
connection.fetch('SHOW TABLES')
|
67
|
+
end
|
68
|
+
|
69
|
+
Connecting with a specific Hive version (0.12 in this case):
|
70
|
+
|
71
|
+
RBHive.tcli_connect('hive.server.address', 10_000, {:hive_version => 12}) do |connection|
|
72
|
+
connection.fetch('SHOW TABLES')
|
73
|
+
end
|
74
|
+
|
75
|
+
Connecting with a specific Hive version (0.12) and using the `:http` transport:
|
76
|
+
|
77
|
+
RBHive.tcli_connect('hive.server.address', 10_000, {:hive_version => 12, :transport => :http}) do |connection|
|
78
|
+
connection.fetch('SHOW TABLES')
|
79
|
+
end
|
80
|
+
|
81
|
+
We have not tested the SASL connection, as we don't run SASL; pull requests and testing are welcomed.
|
82
|
+
|
83
|
+
## Examples
|
84
|
+
|
85
|
+
### Fetching results
|
86
|
+
|
87
|
+
#### Hiveserver
|
88
|
+
|
89
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
90
|
+
connection.fetch 'SELECT city, country FROM cities'
|
91
|
+
end
|
92
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
93
|
+
|
94
|
+
#### Hiveserver2
|
95
|
+
|
96
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
97
|
+
connection.fetch 'SELECT city, country FROM cities'
|
98
|
+
end
|
99
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
100
|
+
|
101
|
+
### Executing a query
|
102
|
+
|
103
|
+
#### Hiveserver
|
104
|
+
|
105
|
+
RBHive.connect('hive.server.address') do |connection|
|
106
|
+
connection.execute 'DROP TABLE cities'
|
107
|
+
end
|
108
|
+
➔ nil
|
109
|
+
|
110
|
+
#### Hiveserver2
|
111
|
+
|
112
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
113
|
+
connection.execute 'DROP TABLE cities'
|
114
|
+
end
|
115
|
+
➔ nil
|
116
|
+
|
117
|
+
### Creating tables
|
118
|
+
|
119
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
120
|
+
column 'name', :string, 'Full name of debtor'
|
121
|
+
column 'address', :string, 'Address of debtor'
|
122
|
+
column 'amount', :float, 'The amount of money borrowed'
|
123
|
+
|
124
|
+
partition 'dated', :string, 'The date money was given'
|
125
|
+
partition 'country', :string, 'The country the person resides in'
|
126
|
+
end
|
127
|
+
|
128
|
+
Then for Hiveserver:
|
129
|
+
|
130
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
131
|
+
connection.create_table(table)
|
132
|
+
end
|
133
|
+
|
134
|
+
Or Hiveserver2:
|
135
|
+
|
136
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
137
|
+
connection.create_table(table)
|
138
|
+
end
|
139
|
+
|
140
|
+
### Modifying table schema
|
141
|
+
|
142
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
143
|
+
column 'name', :string, 'Full name of debtor'
|
144
|
+
column 'address', :string, 'Address of debtor'
|
145
|
+
column 'amount', :float, 'The amount of money borrowed'
|
146
|
+
column 'new_amount', :float, 'The new amount this person somehow convinced me to give them'
|
147
|
+
|
148
|
+
partition 'dated', :string, 'The date money was given'
|
149
|
+
partition 'country', :string, 'The country the person resides in'
|
150
|
+
end
|
151
|
+
|
152
|
+
Then for Hiveserver:
|
153
|
+
|
154
|
+
RBHive.connect('hive.server.address') do |connection|
|
155
|
+
connection.replace_columns(table)
|
156
|
+
end
|
157
|
+
|
158
|
+
Or Hiveserver2:
|
159
|
+
|
160
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
161
|
+
connection.replace_columns(table)
|
162
|
+
end
|
163
|
+
|
164
|
+
### Setting properties
|
165
|
+
|
166
|
+
You can set various properties for Hive tasks, some of which change how they run. Consult the Apache
|
167
|
+
Hive documentation and Hadoop's documentation for the various properties that can be set.
|
168
|
+
For example, you can set the map-reduce job's priority with the following:
|
169
|
+
|
170
|
+
connection.set("mapred.job.priority", "VERY_HIGH")
|
171
|
+
|
172
|
+
### Inspecting tables
|
173
|
+
|
174
|
+
#### Hiveserver
|
175
|
+
|
176
|
+
RBHive.connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
177
|
+
result = connection.fetch("describe some_table")
|
178
|
+
puts result.column_names.inspect
|
179
|
+
puts result.first.inspect
|
180
|
+
}
|
181
|
+
|
182
|
+
#### Hiveserver2
|
183
|
+
|
184
|
+
RBHive.tcli_connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
185
|
+
result = connection.fetch("describe some_table")
|
186
|
+
puts result.column_names.inspect
|
187
|
+
puts result.first.inspect
|
188
|
+
}
|
189
|
+
|
190
|
+
## Testing
|
191
|
+
|
192
|
+
We use RBHive against Hive 0.10, 0.11 and 0.12, and have tested the BufferedTransport and
|
193
|
+
HTTPClientTransport. We use it against both Hiveserver and Hiveserver2 with success.
|
194
|
+
|
195
|
+
We have _not_ tested the SaslClientTransport, and would welcome reports
|
196
|
+
on whether it works correctly.
|
197
|
+
|
198
|
+
## Contributing
|
199
|
+
|
200
|
+
1. Fork it
|
201
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
202
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
203
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
204
|
+
5. Create new Pull Request
|
data/lib/rbhive/connection.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
3
|
# require thrift autogenerated files
|
4
4
|
require File.join(File.dirname(__FILE__), *%w[.. thrift thrift_hive])
|
5
|
+
# require 'thrift'
|
5
6
|
# restore warnings
|
6
7
|
$VERBOSE = old_verbose
|
7
8
|
|
@@ -34,7 +35,7 @@ module RBHive
|
|
34
35
|
@socket = Thrift::Socket.new(server, port)
|
35
36
|
@transport = Thrift::BufferedTransport.new(@socket)
|
36
37
|
@protocol = Thrift::BinaryProtocol.new(@transport)
|
37
|
-
@client = ThriftHive::Client.new(@protocol)
|
38
|
+
@client = Hive::Thrift::ThriftHive::Client.new(@protocol)
|
38
39
|
@logger = logger
|
39
40
|
@logger.info("Connecting to #{server} on port #{port}")
|
40
41
|
@mutex = Mutex.new
|
@@ -0,0 +1,315 @@
|
|
1
|
+
# suppress warnings
|
2
|
+
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
|
+
|
4
|
+
raise 'Thrift is not loaded' unless defined?(Thrift)
|
5
|
+
raise 'RBHive is not loaded' unless defined?(RBHive)
|
6
|
+
|
7
|
+
# require thrift autogenerated files
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
|
9
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
|
10
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
|
11
|
+
|
12
|
+
# restore warnings
|
13
|
+
$VERBOSE = old_verbose
|
14
|
+
|
15
|
+
# Monkey patch thrift to set an infinite read timeout
|
16
|
+
# Reopens Thrift's HTTP client transport to replace #flush with a version
# that disables the HTTP read timeout (read_timeout is set to nil).
module Thrift
  class HTTPClientTransport < BaseTransport
    # Posts the buffered request bytes to @url, stores the binary-encoded
    # response body as the new input buffer and resets the output buffer.
    def flush
      connection = Net::HTTP.new(@url.host, @url.port)
      connection.use_ssl = (@url.scheme == 'https')
      connection.read_timeout = nil
      # The verify mode is only applied to https endpoints.
      connection.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
      response = connection.post(@url.request_uri, @outbuf, @headers)
      @inbuf = StringIO.new(Bytes.force_binary_encoding(response.body))
      @outbuf = Bytes.empty_byte_buffer
    end
  end
end
|
31
|
+
|
32
|
+
module RBHive
|
33
|
+
|
34
|
+
# Lookup table from the supported :hive_version option values (10, 11, 12)
# to the Thrift protocol version value used for that Hive release.
HIVE_THRIFT_MAPPING = { 10 => 0, 11 => 1, 12 => 2 }
|
39
|
+
|
40
|
+
# Opens a HiveServer2 connection and session, yields the connection to the
# given block, and always attempts to close the session and the connection
# afterwards (IOErrors raised while closing are ignored).
#
# The options hash may be omitted. For compatibility with the previous
# signature (server, port=10_000, options), a Hash or nil given as the second
# argument is treated as the options and the default port is used.
#
# @param server [String] host name of the HiveServer2 instance
# @param port [Integer] port to connect to (defaults to 10_000)
# @param options [Hash, nil] connection options passed to TCLIConnection.new
# @return [Object] the value returned by the block
def tcli_connect(server, port = 10_000, options = nil)
  # Old two-argument calls bound the second argument to `options`; keep that
  # working for hashes/nil while letting a bare port number be the port.
  if options.nil? && (port.nil? || port.is_a?(Hash))
    options = port
    port = 10_000
  end

  connection = RBHive::TCLIConnection.new(server, port, options)
  begin
    connection.open
    connection.open_session
    yield(connection)
  ensure
    # Try to close the session and our connection if those are still open, ignore io errors
    begin
      connection.close_session if connection.session
      connection.close
    rescue IOError
      # noop
    end
  end
end
|
60
|
+
module_function :tcli_connect
|
61
|
+
|
62
|
+
# Minimal logger whose fatal/error/warn/info/debug methods all print the
# given message to STDOUT.
class StdOutLogger
  [:fatal, :error, :warn, :info, :debug].each do |severity|
    define_method(severity) { |message| STDOUT.puts(message) }
  end
end
|
69
|
+
|
70
|
+
# Connection to a HiveServer2 instance over the TCLIService Thrift interface.
#
# Builds the Thrift transport selected by options[:transport], wraps it in a
# binary protocol and a TCLIService client, and offers helpers to open/close
# a session and to run statements. Statement execution goes through a mutex.
# Methods not defined here are forwarded to the underlying Thrift client.
class TCLIConnection
  attr_reader :client

  # @param server [String] host name of the HiveServer2 instance
  # @param port [Integer] port to connect to
  # @param options [Hash, nil] :transport (:buffered, :sasl or :http),
  #   :hive_version (a key of HIVE_THRIFT_MAPPING), :timeout and :sasl_params
  # @param logger [#info] object that receives log messages
  # @raise [RuntimeError] if options is not a hash, if :sasl is requested
  #   without :sasl_params, or if the Hive version / transport is unknown
  def initialize(server, port=10_000, options={}, logger=StdOutLogger.new)
    options ||= {} # backwards compatibility
    raise "'options' parameter must be a hash" unless options.is_a?(Hash)

    if options[:transport] == :sasl && options[:sasl_params].nil?
      raise ":transport is set to :sasl, but no :sasl_params option was supplied"
    end

    # Fill in defaults on a copy so the caller's hash is left untouched.
    # Defaults to buffered transport, Hive 0.10, 1800 second timeout
    options = options.dup
    options[:transport] ||= :buffered
    options[:hive_version] ||= 10
    options[:timeout] ||= 1800
    @options = options

    # Look up the appropriate Thrift protocol version for the supplied Hive version
    @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])

    @logger = logger
    @transport = thrift_transport(server, port)
    @protocol = Thrift::BinaryProtocol.new(@transport)
    @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
    @session = nil
    @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
    @mutex = Mutex.new
  end

  # Returns the HIVE_THRIFT_MAPPING entry for the given Hive version.
  # @raise [RuntimeError] if the version has no entry
  def thrift_hive_protocol(version)
    HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
  end

  # Builds the Thrift transport named by @options[:transport].
  # @raise [RuntimeError] if the transport type is not recognised
  def thrift_transport(server, port)
    @logger.info("Initializing transport #{@options[:transport]}")
    case @options[:transport]
    when :buffered
      Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
    when :sasl
      Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
                                      parse_sasl_params(@options[:sasl_params]))
    when :http
      Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
    else
      # Report the configured value; there is no local named `transport` here.
      raise "Unrecognised transport type '#{@options[:transport]}'"
    end
  end

  # Creates a Thrift socket for server/port with the given timeout applied.
  def thrift_socket(server, port, timeout)
    socket = Thrift::Socket.new(server, port)
    socket.timeout = timeout
    socket
  end

  # Processes SASL connection params and returns a hash with symbol keys or a nil
  def parse_sasl_params(sasl_params)
    return nil unless sasl_params.kind_of?(Hash)

    # Symbolize keys in a hash
    sasl_params.each_with_object({}) do |(key, value), symbolized|
      symbolized[key.to_sym] = value
    end
  end

  # Opens the underlying transport.
  def open
    @transport.open
  end

  # Closes the underlying transport.
  def close
    @transport.close
  end

  # Opens a session on the server and remembers the response.
  def open_session
    @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
  end

  # Closes the current session on the server and forgets it.
  def close_session
    @client.CloseSession prepare_close_session
    @session = nil
  end

  # Returns the current session handle, or nil when no session is open.
  def session
    @session && @session.sessionHandle
  end

  # Executes a statement while holding the connection mutex and returns the
  # raw ExecuteStatement response.
  def execute(query)
    execute_safe(query)
  end

  # Sets the map-reduce job priority for subsequent statements.
  def priority=(priority)
    set("mapred.job.priority", priority)
  end

  # Sets the map-reduce job queue name for subsequent statements.
  def queue=(queue)
    set("mapred.job.queue.name", queue)
  end

  # Issues a "SET name=value" statement.
  def set(name,value)
    @logger.info("Setting #{name}=#{value}")
    self.execute("SET #{name}=#{value}")
  end

  # Performs a query on the server, fetches up to *max_rows* rows and returns them
  # wrapped in a TCLIResultSet.
  # @raise [RuntimeError] if executing the query or fetching the results fails
  def fetch(query, max_rows = 100)
    safe do
      # Execute the query and check the result
      exec_result = execute_unsafe(query)
      raise_error_if_failed!(exec_result)

      # Get search operation handle to fetch the results
      op_handle = exec_result.operationHandle

      # Prepare and execute fetch results request
      fetch_req = prepare_fetch_results(op_handle, :first, max_rows)
      fetch_results = client.FetchResults(fetch_req)
      raise_error_if_failed!(fetch_results)

      # Get data rows and format the result
      rows = fetch_results.results.rows
      the_schema = TCLISchemaDefinition.new(get_schema_for( op_handle ), rows.first)
      TCLIResultSet.new(rows, the_schema)
    end
  end

  # Performs a query on the server, fetches the results in batches of *batch_size* rows
  # and yields each batch to the given block as a TCLIResultSet.
  # @raise [RuntimeError] if no block is given, or if executing/fetching fails
  def fetch_in_batch(query, batch_size = 1000, &block)
    raise "No block given for the batch fetch request!" unless block_given?
    safe do
      # Execute the query and check the result
      exec_result = execute_unsafe(query)
      raise_error_if_failed!(exec_result)

      # Get search operation handle to fetch the results
      op_handle = exec_result.operationHandle

      # Prepare fetch results request
      fetch_req = prepare_fetch_results(op_handle, :next, batch_size)

      # Declared outside the loop block so the schema survives across
      # iterations and the metadata is requested once, not once per batch.
      the_schema = nil

      # Now let's iterate over the results
      loop do
        # Fetch next batch and raise an exception if it failed
        fetch_results = client.FetchResults(fetch_req)
        raise_error_if_failed!(fetch_results)

        # Get data rows from the result
        rows = fetch_results.results.rows
        break if rows.empty?

        # Prepare schema definition from the first non-empty batch
        the_schema ||= TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first)

        # Format the results and yield them to the given block
        yield TCLIResultSet.new(rows, the_schema)
      end
    end
  end

  # Executes the CREATE TABLE statement produced by the given schema.
  def create_table(schema)
    execute(schema.create_table_statement)
  end

  # Drops the named table; a TableSchema may be given instead of a name.
  def drop_table(name)
    name = name.name if name.is_a?(TableSchema)
    execute("DROP TABLE `#{name}`")
  end

  # Executes the REPLACE COLUMNS statement produced by the given schema.
  def replace_columns(schema)
    execute(schema.replace_columns_statement)
  end

  # Executes the ADD COLUMNS statement produced by the given schema.
  def add_columns(schema)
    execute(schema.add_columns_statement)
  end

  # Forwards any unknown method to the underlying Thrift client.
  def method_missing(meth, *args)
    client.send(meth, *args)
  end

  # Keeps respond_to? consistent with the forwarding done in method_missing.
  def respond_to_missing?(meth, include_private = false)
    client.respond_to?(meth, include_private) || super
  end

  private

  # Runs the statement while holding the connection mutex.
  def execute_safe(query)
    safe { execute_unsafe(query) }
  end

  # Runs the statement without taking the mutex; callers must hold it.
  def execute_unsafe(query)
    @logger.info("Executing Hive Query: #{query}")
    req = prepare_execute_statement(query)
    client.ExecuteStatement(req)
  end

  # Yields while holding the connection mutex and returns the block's value.
  def safe
    ret = nil
    @mutex.synchronize { ret = yield }
    ret
  end

  # Builds the OpenSession request for the given client protocol version.
  def prepare_open_session(client_protocol)
    req = ::Hive2::Thrift::TOpenSessionReq.new( @options[:sasl_params].nil? ? [] : @options[:sasl_params] )
    req.client_protocol = client_protocol
    req
  end

  # Builds the CloseSession request for the current session handle.
  def prepare_close_session
    ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
  end

  # Builds the ExecuteStatement request for the current session.
  def prepare_execute_statement(query)
    ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {} )
  end

  # Builds a FetchResults request for the given operation handle.
  # @param orientation [Symbol] suffix of a TFetchOrientation constant, e.g. :first or :next
  # @raise [ArgumentError] if the orientation does not name a known constant
  def prepare_fetch_results(handle, orientation=:first, rows=100)
    orientation_value = "FETCH_#{orientation.to_s.upcase}"
    valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
    unless valid_orientations.include?(orientation_value)
      raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
    end
    # Resolve the constant by name instead of evaluating a code string.
    orientation_const = ::Hive2::Thrift::TFetchOrientation.const_get(orientation_value)
    ::Hive2::Thrift::TFetchResultsReq.new(
      operationHandle: handle,
      orientation: orientation_const,
      maxRows: rows
    )
  end

  # Requests the result set metadata for the operation and returns its schema.
  def get_schema_for(handle)
    req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
    metadata = client.GetResultSetMetadata( req )
    metadata.schema
  end

  # Raises an exception if given operation result is a failure
  # (any status code other than 0), using the server's error message if present.
  def raise_error_if_failed!(result)
    return if result.status.statusCode == 0
    error_message = result.status.errorMessage || 'Execution failed!'
    raise error_message
  end
end
|
315
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module RBHive
|
4
|
+
# Wraps the schema returned for a HiveServer2 result set and converts raw row
# values into Ruby values keyed by column name.
class TCLISchemaDefinition
  attr_reader :schema

  # Conversion method applied to a raw value for each known column type.
  # Values of any other type are returned unconverted.
  TYPES = {
    :boolean => :to_s,
    :string => :to_s,
    :bigint => :to_i,
    :float => :to_f,
    :double => :to_f,
    :int => :to_i,
    :smallint => :to_i,
    :tinyint => :to_i,
  }

  # @param schema [#columns] schema object whose columns respond to columnName
  # @param example_row [#colVals, nil] a sample row, used only to detect
  #   columns that are present in the data but missing from the schema
  def initialize(schema, example_row)
    @schema = schema
    @example_row = example_row ? example_row.colVals : []
  end

  # Returns the column names as symbols (memoized).
  def column_names
    @column_names ||= begin
      schema_names = @schema.columns.map {|c| c.columnName }

      # In rare cases Hive can return two identical column names
      # consider SELECT a.foo, b.foo...
      # in this case you get two columns called foo with no disambiguation.
      # as a (far from ideal) solution we detect this edge case and rename them
      # a.foo => foo_1, b.foo => foo_2
      # otherwise we will trample one of the columns during Hash mapping.
      occurrences = Hash.new(0)
      # Tag the second and later occurrences with their running count ...
      schema_names.map! { |c| occurrences[c] += 1; occurrences[c] > 1 ? "#{c}---|---#{occurrences[c]}" : c }
      # ... then tag the first occurrence of every name that was repeated ...
      schema_names.map! { |c| occurrences[c] > 1 ? "#{c}---|---1" : c }
      # ... and finally turn the temporary marker into an underscore.
      schema_names.map! { |c| c.gsub('---|---', '_').to_sym }

      # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
      # For now we will call them :_p1, :_p2, etc. to avoid collisions.
      offset = 0
      while schema_names.length < @example_row.length
        schema_names.push(:"_p#{offset+=1}")
      end
      schema_names
    end
  end

  # Returns a hash of column name => type symbol (memoized). Columns whose
  # type cannot be determined are treated as :string.
  def column_type_map
    @column_type_map ||= column_names.each_with_object({}) do |name, types|
      definition = @schema.columns.find {|col| col.columnName.to_sym == name }
      # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
      # NOTE(review): TYPE_NAMES is not defined in this class; if it does not
      # resolve from here, the inline rescue hides the NameError and every
      # column is treated as :string -- confirm where TYPE_NAMES is defined.
      type = TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase rescue nil
      types[name] = definition && type ? type.to_sym : :string
    end
  end

  # Converts a result row into a hash of column name => coerced value.
  def coerce_row(row)
    raw_values = row.colVals.map(&:get_value).map(&:value)
    column_names.zip(raw_values).each_with_object({}) do |(column_name, value), coerced|
      coerced[column_name] = coerce_column(column_name, value)
    end
  end

  # Converts a single raw value according to the type of the named column.
  # "Infinity"/"NaN" in non-string columns become the float values, nil and
  # 'NULL'/'null' become nil, and array types are parsed as JSON.
  def coerce_column(column_name, value)
    type = column_type_map[column_name]
    return Float::INFINITY if type != :string && value == "Infinity"
    return Float::NAN if type != :string && value == "NaN"
    return nil if value.nil? || value == 'NULL' || value == 'null'
    return coerce_complex_value(value) if type.to_s =~ /^array/
    conversion_method = TYPES[type]
    conversion_method ? value.send(conversion_method) : value
  end

  # Returns the values of a coerced row hash in column order.
  def coerce_row_to_array(row)
    column_names.map { |n| row[n] }
  end

  # Parses a JSON-encoded value; nil, empty and 'null' values yield nil.
  def coerce_complex_value(value)
    return nil if value.nil?
    return nil if value.length == 0
    return nil if value == 'null'
    JSON.parse(value)
  end
end
|
87
|
+
end
|
data/lib/rbhive.rb
CHANGED
@@ -2,4 +2,7 @@ require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
|
|
2
2
|
require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
|
3
3
|
require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
|
4
4
|
require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
|
5
|
-
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
5
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
6
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
|
7
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
|
@@ -1,12 +1,11 @@
|
|
1
1
|
#
|
2
|
-
# Autogenerated by Thrift
|
2
|
+
# Autogenerated by Thrift Compiler (0.9.0)
|
3
3
|
#
|
4
4
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
5
5
|
#
|
6
6
|
|
7
7
|
require 'thrift'
|
8
|
-
|
9
|
-
|
8
|
+
require_relative 'fb303_types'
|
10
9
|
|
11
10
|
module FacebookService
|
12
11
|
class Client
|
@@ -370,13 +369,13 @@ module FacebookService
|
|
370
369
|
SUCCESS = 0
|
371
370
|
|
372
371
|
FIELDS = {
|
373
|
-
SUCCESS => {:type => ::Thrift::Types::I32, :name => 'success', :enum_class => Fb_status}
|
372
|
+
SUCCESS => {:type => ::Thrift::Types::I32, :name => 'success', :enum_class => ::Fb_status}
|
374
373
|
}
|
375
374
|
|
376
375
|
def struct_fields; FIELDS; end
|
377
376
|
|
378
377
|
def validate
|
379
|
-
unless @success.nil? || Fb_status::VALID_VALUES.include?(@success)
|
378
|
+
unless @success.nil? || ::Fb_status::VALID_VALUES.include?(@success)
|
380
379
|
raise ::Thrift::ProtocolException.new(::Thrift::ProtocolException::UNKNOWN, 'Invalid value of field success!')
|
381
380
|
end
|
382
381
|
end
|