rbhive 0.2.95 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/LICENSE +20 -0
- data/README.md +204 -0
- data/lib/rbhive/connection.rb +2 -1
- data/lib/rbhive/t_c_l_i_connection.rb +315 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/rbhive.rb +4 -1
- data/lib/thrift/facebook_service.rb +4 -5
- data/lib/thrift/fb303_constants.rb +3 -2
- data/lib/thrift/fb303_types.rb +2 -1
- data/lib/thrift/hive_metastore_constants.rb +3 -3
- data/lib/thrift/hive_metastore_types.rb +176 -14
- data/lib/thrift/hive_service_constants.rb +7 -2
- data/lib/thrift/hive_service_types.rb +53 -48
- data/lib/thrift/queryplan_constants.rb +7 -2
- data/lib/thrift/queryplan_types.rb +225 -217
- data/lib/thrift/sasl_client_transport.rb +97 -0
- data/lib/thrift/serde_constants.rb +5 -3
- data/lib/thrift/serde_types.rb +2 -2
- data/lib/thrift/t_c_l_i_service.rb +892 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +66 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1469 -0
- data/lib/thrift/thrift_hive.rb +405 -401
- data/lib/thrift/thrift_hive_metastore.rb +1452 -203
- data/rbhive.gemspec +24 -0
- metadata +90 -69
- data/lib/thrift/reflection_limited_constants.rb +0 -8
- data/lib/thrift/reflection_limited_types.rb +0 -150
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) [2013] [Forward3D]
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,204 @@
|
|
1
|
+
# RBHive -- Ruby thrift lib for executing Hive queries
|
2
|
+
|
3
|
+
RBHive is a simple Ruby gem to communicate with the [Apache Hive](http://hive.apache.org)
|
4
|
+
Thrift server.
|
5
|
+
|
6
|
+
It supports:
|
7
|
+
* Hiveserver (the original Thrift service shipped with Hive since early releases)
|
8
|
+
* Hiveserver2 (the new, concurrent Thrift service shipped with Hive releases since 0.10)
|
9
|
+
* Any other 100% Hive-compatible Thrift service (e.g. [Sharkserver](https://github.com/amplab/shark))
|
10
|
+
|
11
|
+
It is capable of using the following Thrift transports:
|
12
|
+
* BufferedTransport (the default)
|
13
|
+
* SaslClientTransport ([SASL-enabled](http://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) transport)
|
14
|
+
* HTTPClientTransport (tunnels Thrift over HTTP)
|
15
|
+
|
16
|
+
## About Thrift services and transports
|
17
|
+
|
18
|
+
### Hiveserver
|
19
|
+
|
20
|
+
Hiveserver (the original Thrift interface) only supports a single client at a time. RBHive
|
21
|
+
implements this with the `RBHive::Connection` class. It only supports a single transport,
|
22
|
+
BufferedTransport.
|
23
|
+
|
24
|
+
### Hiveserver2
|
25
|
+
|
26
|
+
[Hiveserver2](https://cwiki.apache.org/confluence/display/Hive/Setting+up+HiveServer2)
|
27
|
+
(the new Thrift interface) can support many concurrent client connections. It is shipped
|
28
|
+
with Hive 0.10 and later. In Hive 0.10, only BufferedTransport and SaslClientTransport are
|
29
|
+
supported; starting with Hive 0.12, HTTPClientTransport is also supported.
|
30
|
+
|
31
|
+
Each of the versions after Hive 0.10 has a slightly different Thrift interface; when
|
32
|
+
connecting, you must specify the Hive version or you may get an exception.
|
33
|
+
|
34
|
+
RBHive implements this client with the `RBHive::TCLIConnection` class.
|
35
|
+
|
36
|
+
### Other Hive-compatible services
|
37
|
+
|
38
|
+
Consult the documentation for the service, as this will vary depending on the service you're using.
|
39
|
+
|
40
|
+
## Connecting to Hiveserver and Hiveserver2
|
41
|
+
|
42
|
+
### Hiveserver
|
43
|
+
|
44
|
+
Since Hiveserver has no options, connection code is very simple:
|
45
|
+
|
46
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
47
|
+
connection.fetch 'SELECT city, country FROM cities'
|
48
|
+
end
|
49
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
50
|
+
|
51
|
+
### Hiveserver2
|
52
|
+
|
53
|
+
Hiveserver2 has several options for how it is run. The connection code takes
|
54
|
+
a hash with these possible parameters:
|
55
|
+
* `:transport` - one of `:buffered` (BufferedTransport), `:http` (HTTPClientTransport), or `:sasl` (SaslClientTransport)
|
56
|
+
* `:hive_version` - the number after the period in the Hive version; e.g. `10`, `11`, `12`
|
57
|
+
* `:timeout` - if using BufferedTransport or SaslClientTransport, this is how long the timeout on the socket will be
|
58
|
+
* `:sasl_params` - if using SaslClientTransport, this is a hash of parameters to set up the SASL connection
|
59
|
+
|
60
|
+
If you pass either an empty hash or nil in place of the options (or do not supply them), the connection
|
61
|
+
is attempted with the Hive version set to 0.10, using `:buffered` as the transport, and a timeout of 1800 seconds.
|
62
|
+
|
63
|
+
Connecting with the defaults:
|
64
|
+
|
65
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
66
|
+
connection.fetch('SHOW TABLES')
|
67
|
+
end
|
68
|
+
|
69
|
+
Connecting with a specific Hive version (0.12 in this case):
|
70
|
+
|
71
|
+
RBHive.tcli_connect('hive.server.address', 10_000, {:hive_version => 12}) do |connection|
|
72
|
+
connection.fetch('SHOW TABLES')
|
73
|
+
end
|
74
|
+
|
75
|
+
Connecting with a specific Hive version (0.12) and using the `:http` transport:
|
76
|
+
|
77
|
+
RBHive.tcli_connect('hive.server.address', 10_000, {:hive_version => 12, :transport => :http}) do |connection|
|
78
|
+
connection.fetch('SHOW TABLES')
|
79
|
+
end
|
80
|
+
|
81
|
+
We have not tested the SASL connection, as we don't run SASL; pull requests and testing are welcomed.
|
82
|
+
|
83
|
+
## Examples
|
84
|
+
|
85
|
+
### Fetching results
|
86
|
+
|
87
|
+
#### Hiveserver
|
88
|
+
|
89
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
90
|
+
connection.fetch 'SELECT city, country FROM cities'
|
91
|
+
end
|
92
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
93
|
+
|
94
|
+
#### Hiveserver2
|
95
|
+
|
96
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
97
|
+
connection.fetch 'SELECT city, country FROM cities'
|
98
|
+
end
|
99
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
100
|
+
|
101
|
+
### Executing a query
|
102
|
+
|
103
|
+
#### Hiveserver
|
104
|
+
|
105
|
+
RBHive.connect('hive.server.address') do |connection|
|
106
|
+
connection.execute 'DROP TABLE cities'
|
107
|
+
end
|
108
|
+
➔ nil
|
109
|
+
|
110
|
+
#### Hiveserver2
|
111
|
+
|
112
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
113
|
+
connection.execute 'DROP TABLE cities'
|
114
|
+
end
|
115
|
+
➔ nil
|
116
|
+
|
117
|
+
### Creating tables
|
118
|
+
|
119
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
120
|
+
column 'name', :string, 'Full name of debtor'
|
121
|
+
column 'address', :string, 'Address of debtor'
|
122
|
+
column 'amount', :float, 'The amount of money borrowed'
|
123
|
+
|
124
|
+
partition 'dated', :string, 'The date money was given'
|
125
|
+
partition 'country', :string, 'The country the person resides in'
|
126
|
+
end
|
127
|
+
|
128
|
+
Then for Hiveserver:
|
129
|
+
|
130
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
131
|
+
connection.create_table(table)
|
132
|
+
end
|
133
|
+
|
134
|
+
Or Hiveserver2:
|
135
|
+
|
136
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
137
|
+
connection.create_table(table)
|
138
|
+
end
|
139
|
+
|
140
|
+
### Modifying table schema
|
141
|
+
|
142
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
143
|
+
column 'name', :string, 'Full name of debtor'
|
144
|
+
column 'address', :string, 'Address of debtor'
|
145
|
+
column 'amount', :float, 'The amount of money borrowed'
|
146
|
+
column 'new_amount', :float, 'The new amount this person somehow convinced me to give them'
|
147
|
+
|
148
|
+
partition 'dated', :string, 'The date money was given'
|
149
|
+
partition 'country', :string, 'The country the person resides in'
|
150
|
+
end
|
151
|
+
|
152
|
+
Then for Hiveserver:
|
153
|
+
|
154
|
+
RBHive.connect('hive.server.address') do |connection|
|
155
|
+
connection.replace_columns(table)
|
156
|
+
end
|
157
|
+
|
158
|
+
Or Hiveserver2:
|
159
|
+
|
160
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
161
|
+
connection.replace_columns(table)
|
162
|
+
end
|
163
|
+
|
164
|
+
### Setting properties
|
165
|
+
|
166
|
+
You can set various properties for Hive tasks, some of which change how they run. Consult the Apache
|
167
|
+
Hive documentation and Hadoop's documentation for the various properties that can be set.
|
168
|
+
For example, you can set the map-reduce job's priority with the following:
|
169
|
+
|
170
|
+
connection.set("mapred.job.priority", "VERY_HIGH")
|
171
|
+
|
172
|
+
### Inspecting tables
|
173
|
+
|
174
|
+
#### Hiveserver
|
175
|
+
|
176
|
+
RBHive.connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
177
|
+
result = connection.fetch("describe some_table")
|
178
|
+
puts result.column_names.inspect
|
179
|
+
puts result.first.inspect
|
180
|
+
}
|
181
|
+
|
182
|
+
#### Hiveserver2
|
183
|
+
|
184
|
+
RBHive.tcli_connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
185
|
+
result = connection.fetch("describe some_table")
|
186
|
+
puts result.column_names.inspect
|
187
|
+
puts result.first.inspect
|
188
|
+
}
|
189
|
+
|
190
|
+
## Testing
|
191
|
+
|
192
|
+
We use RBHive against Hive 0.10, 0.11 and 0.12, and have tested the BufferedTransport and
|
193
|
+
HTTPClientTransport. We use it against both Hiveserver and Hiveserver2 with success.
|
194
|
+
|
195
|
+
We have _not_ tested the SaslClientTransport, and would welcome reports
|
196
|
+
on whether it works correctly.
|
197
|
+
|
198
|
+
## Contributing
|
199
|
+
|
200
|
+
1. Fork it
|
201
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
202
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
203
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
204
|
+
5. Create new Pull Request
|
data/lib/rbhive/connection.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
3
|
# require thrift autogenerated files
|
4
4
|
require File.join(File.dirname(__FILE__), *%w[.. thrift thrift_hive])
|
5
|
+
# require 'thrift'
|
5
6
|
# restore warnings
|
6
7
|
$VERBOSE = old_verbose
|
7
8
|
|
@@ -34,7 +35,7 @@ module RBHive
|
|
34
35
|
@socket = Thrift::Socket.new(server, port)
|
35
36
|
@transport = Thrift::BufferedTransport.new(@socket)
|
36
37
|
@protocol = Thrift::BinaryProtocol.new(@transport)
|
37
|
-
@client = ThriftHive::Client.new(@protocol)
|
38
|
+
@client = Hive::Thrift::ThriftHive::Client.new(@protocol)
|
38
39
|
@logger = logger
|
39
40
|
@logger.info("Connecting to #{server} on port #{port}")
|
40
41
|
@mutex = Mutex.new
|
@@ -0,0 +1,315 @@
|
|
1
|
+
# suppress warnings
|
2
|
+
old_verbose, $VERBOSE = $VERBOSE, nil
|
3
|
+
|
4
|
+
raise 'Thrift is not loaded' unless defined?(Thrift)
|
5
|
+
raise 'RBHive is not loaded' unless defined?(RBHive)
|
6
|
+
|
7
|
+
# require thrift autogenerated files
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
|
9
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
|
10
|
+
require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
|
11
|
+
|
12
|
+
# restore warnings
|
13
|
+
$VERBOSE = old_verbose
|
14
|
+
|
15
|
+
# Monkey patch thrift to set an infinite read timeout
|
16
|
+
module Thrift
  # Reopens the thrift gem's HTTP transport so that flushing a request never
  # times out while waiting for the server's response.
  class HTTPClientTransport < BaseTransport
    # Posts the buffered request bytes to @url and stores the response body
    # as the new input buffer; the output buffer is reset afterwards.
    def flush
      connection = Net::HTTP.new(@url.host, @url.port)
      secure = @url.scheme == 'https'
      connection.use_ssl = secure
      # nil switches the read timeout off (the point of this patch).
      connection.read_timeout = nil
      connection.verify_mode = @ssl_verify_mode if secure

      response = connection.post(@url.request_uri, @outbuf, @headers)
      @inbuf = StringIO.new(Bytes.force_binary_encoding(response.body))
      @outbuf = Bytes.empty_byte_buffer
    end
  end
end
|
31
|
+
|
32
|
+
module RBHive
|
33
|
+
|
34
|
+
# Hive minor version (the NN in 0.NN) => value sent as the client protocol
# version when a HiveServer2 session is opened.
HIVE_THRIFT_MAPPING = { 10 => 0, 11 => 1, 12 => 2 }
|
39
|
+
|
40
|
+
# Opens a HiveServer2 connection and session, yields the connection to the
# given block and returns the block's value. The session and the transport
# are closed afterwards on a best-effort basis (IOErrors raised while closing
# are ignored).
#
# server  - host name passed to RBHive::TCLIConnection
# port    - port number (default 10_000)
# options - Hash of connection options, or nil/omitted for the defaults
#
# The original signature was (server, port=10_000, options), which made
# +options+ mandatory and bound the second argument of a two-argument call to
# +options+. Both (server, port) and the legacy (server, options_hash) /
# (server, nil) call shapes are accepted here.
def tcli_connect(server, port=10_000, options=nil)
  # Legacy two-argument form: the second argument was the options value.
  if options.nil? && (port.nil? || port.is_a?(Hash))
    options = port
    port = 10_000
  end
  options ||= {}

  connection = RBHive::TCLIConnection.new(server, port, options)
  begin
    connection.open
    connection.open_session
    yield(connection)
  ensure
    # Try to close the session and our connection if those are still open,
    # ignoring io errors. The transport is closed even when closing the
    # session fails, so the socket is not leaked.
    begin
      connection.close_session if connection.session
    rescue IOError
      # noop
    ensure
      begin
        connection.close
      rescue IOError
        # noop
      end
    end
  end
end
|
60
|
+
module_function :tcli_connect
|
61
|
+
|
62
|
+
# Minimal logger: every severity method simply prints its message to STDOUT.
class StdOutLogger
  [:fatal, :error, :warn, :info, :debug].each do |severity|
    define_method(severity) { |message| STDOUT.puts(message) }
  end
end
|
69
|
+
|
70
|
+
# Client for HiveServer2, talking to it through the TCLIService Thrift client.
#
# A connection owns one Thrift transport and at most one session. Statements
# issued through #execute, #fetch and #fetch_in_batch run inside a mutex held
# by this object. Messages not defined here are forwarded to the Thrift client.
class TCLIConnection
  attr_reader :client

  # server  - host name of the HiveServer2 instance
  # port    - port number (default 10_000)
  # options - Hash or nil; recognised keys:
  #             :transport    - :buffered (default), :sasl or :http
  #             :hive_version - a key of HIVE_THRIFT_MAPPING (default 10)
  #             :timeout      - socket timeout, default 1800 (seconds)
  #             :sasl_params  - Hash, required when :transport is :sasl
  # logger  - object responding to #info
  #
  # Raises RuntimeError when options is not a Hash, when :sasl is requested
  # without :sasl_params, or for an unknown Hive version or transport.
  def initialize(server, port=10_000, options={}, logger=StdOutLogger.new)
    options ||= {} # backwards compatibility
    raise "'options' parameter must be a hash" unless options.is_a?(Hash)

    if options[:transport] == :sasl && options[:sasl_params].nil?
      raise ":transport is set to :sasl, but no :sasl_params option was supplied"
    end

    # Apply the defaults to a copy so the caller's hash is left untouched.
    # Defaults to buffered transport, Hive 0.10, 1800 second timeout
    options = options.dup
    options[:transport] ||= :buffered
    options[:hive_version] ||= 10
    options[:timeout] ||= 1800
    @options = options

    # Look up the appropriate Thrift protocol version for the supplied Hive version
    @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])

    @logger = logger
    @transport = thrift_transport(server, port)
    @protocol = Thrift::BinaryProtocol.new(@transport)
    @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
    @session = nil
    @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
    @mutex = Mutex.new
  end

  # Returns the HIVE_THRIFT_MAPPING entry for +version+; raises RuntimeError
  # when the version has no entry.
  def thrift_hive_protocol(version)
    HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
  end

  # Builds the Thrift transport selected by @options[:transport].
  # Raises RuntimeError naming the offending value for an unknown transport.
  def thrift_transport(server, port)
    @logger.info("Initializing transport #{@options[:transport]}")
    case @options[:transport]
    when :buffered
      Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
    when :sasl
      Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
                                      parse_sasl_params(@options[:sasl_params]))
    when :http
      Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
    else
      # The configured value is interpolated here; a bare `transport` is not a
      # local and would be swallowed by method_missing.
      raise "Unrecognised transport type '#{@options[:transport]}'"
    end
  end

  # Returns a Thrift::Socket for server:port with its timeout set.
  def thrift_socket(server, port, timeout)
    socket = Thrift::Socket.new(server, port)
    socket.timeout = timeout
    socket
  end

  # Processes SASL connection params and returns a hash with symbol keys,
  # or nil when +sasl_params+ is not a Hash.
  def parse_sasl_params(sasl_params)
    return nil unless sasl_params.kind_of?(Hash)

    # Symbolize keys in a hash
    sasl_params.each_with_object({}) { |(key, value), memo| memo[key.to_sym] = value }
  end

  # Opens the underlying Thrift transport.
  def open
    @transport.open
  end

  # Closes the underlying Thrift transport.
  def close
    @transport.close
  end

  # Opens a session using the protocol version chosen in #initialize and
  # remembers the response.
  def open_session
    @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
  end

  # Closes the current session and forgets it.
  def close_session
    @client.CloseSession prepare_close_session
    @session = nil
  end

  # Returns the handle of the open session, or nil when no session is open.
  def session
    @session && @session.sessionHandle
  end

  # Runs +query+ while holding the connection mutex and returns the
  # ExecuteStatement response.
  # NOTE(review): the response status is not checked here (unlike #fetch) -
  # confirm whether failed statements are meant to pass silently.
  def execute(query)
    execute_safe(query)
  end

  # Sets mapred.job.priority for subsequent jobs.
  def priority=(priority)
    set("mapred.job.priority", priority)
  end

  # Sets mapred.job.queue.name for subsequent jobs.
  def queue=(queue)
    set("mapred.job.queue.name", queue)
  end

  # Issues "SET name=value" on the server.
  def set(name,value)
    @logger.info("Setting #{name}=#{value}")
    self.execute("SET #{name}=#{value}")
  end

  # Performs a query on the server, fetches up to *max_rows* rows and returns
  # them as a TCLIResultSet. Raises RuntimeError when the server reports a
  # non-zero status for the statement or for the fetch.
  def fetch(query, max_rows = 100)
    safe do
      # Execute the query and check the result
      exec_result = execute_unsafe(query)
      raise_error_if_failed!(exec_result)

      # Get search operation handle to fetch the results
      op_handle = exec_result.operationHandle

      # Prepare and execute fetch results request
      fetch_req = prepare_fetch_results(op_handle, :first, max_rows)
      fetch_results = client.FetchResults(fetch_req)
      raise_error_if_failed!(fetch_results)

      # Get data rows and format the result
      rows = fetch_results.results.rows
      the_schema = TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first)
      TCLIResultSet.new(rows, the_schema)
    end
  end

  # Performs a query on the server, fetches the results in batches of
  # *batch_size* rows and yields each batch to the given block as a
  # TCLIResultSet. Stops at the first empty batch. Raises RuntimeError when no
  # block is given or when the server reports a non-zero status.
  def fetch_in_batch(query, batch_size = 1000, &block)
    raise "No block given for the batch fetch request!" unless block_given?
    safe do
      # Execute the query and check the result
      exec_result = execute_unsafe(query)
      raise_error_if_failed!(exec_result)

      # Get search operation handle to fetch the results
      op_handle = exec_result.operationHandle

      # Prepare fetch results request
      fetch_req = prepare_fetch_results(op_handle, :next, batch_size)

      # Declared outside the loop: a variable first assigned inside the block
      # is fresh on every iteration, which would re-request the result-set
      # metadata for each batch.
      the_schema = nil

      # Now let's iterate over the results
      loop do
        # Fetch next batch and raise an exception if it failed
        fetch_results = client.FetchResults(fetch_req)
        raise_error_if_failed!(fetch_results)

        # Get data rows from the result
        rows = fetch_results.results.rows
        break if rows.empty?

        # Prepare schema definition once, from the first non-empty batch
        the_schema ||= TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first)

        # Format the results and yield them to the given block
        yield TCLIResultSet.new(rows, the_schema)
      end
    end
  end

  # Executes the CREATE TABLE statement produced by +schema+.
  def create_table(schema)
    execute(schema.create_table_statement)
  end

  # Drops the table called +name+; a TableSchema may be passed instead.
  def drop_table(name)
    name = name.name if name.is_a?(TableSchema)
    execute("DROP TABLE `#{name}`")
  end

  # Executes the REPLACE COLUMNS statement produced by +schema+.
  def replace_columns(schema)
    execute(schema.replace_columns_statement)
  end

  # Executes the ADD COLUMNS statement produced by +schema+.
  def add_columns(schema)
    execute(schema.add_columns_statement)
  end

  # Forwards unknown messages (and any block) to the Thrift client.
  def method_missing(meth, *args, &block)
    client.send(meth, *args, &block)
  end

  # Keeps respond_to? consistent with the forwarding done by method_missing.
  def respond_to_missing?(meth, include_private = false)
    client.respond_to?(meth, include_private) || super
  end

  private

  # Runs the statement while holding the connection mutex.
  def execute_safe(query)
    safe { execute_unsafe(query) }
  end

  # Runs the statement without taking the mutex; callers must already hold it.
  def execute_unsafe(query)
    @logger.info("Executing Hive Query: #{query}")
    req = prepare_execute_statement(query)
    client.ExecuteStatement(req)
  end

  # Yields while holding the connection mutex and returns the block's value.
  def safe
    @mutex.synchronize { yield }
  end

  # Builds the OpenSession request, seeded with :sasl_params when supplied.
  def prepare_open_session(client_protocol)
    req = ::Hive2::Thrift::TOpenSessionReq.new( @options[:sasl_params].nil? ? [] : @options[:sasl_params] )
    req.client_protocol = client_protocol
    req
  end

  # Builds the CloseSession request for the current session handle.
  def prepare_close_session
    ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
  end

  # Builds the ExecuteStatement request for +query+ on the current session.
  def prepare_execute_statement(query)
    ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {} )
  end

  # Builds a FetchResults request.
  #
  # orientation - symbol such as :first or :next; "FETCH_<ORIENTATION>" must be
  #               one of TFetchOrientation::VALUE_MAP's values
  # rows        - maximum number of rows to request
  #
  # Raises ArgumentError for an unknown orientation.
  def prepare_fetch_results(handle, orientation=:first, rows=100)
    orientation_value = "FETCH_#{orientation.to_s.upcase}"
    valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
    unless valid_orientations.include?(orientation_value)
      raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
    end
    # Plain constant lookup; no need to eval a string of code.
    orientation_const = ::Hive2::Thrift::TFetchOrientation.const_get(orientation_value)
    ::Hive2::Thrift::TFetchResultsReq.new(
      operationHandle: handle,
      orientation: orientation_const,
      maxRows: rows
    )
  end

  # Requests the result-set metadata for +handle+ and returns its schema.
  def get_schema_for(handle)
    req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
    metadata = client.GetResultSetMetadata( req )
    metadata.schema
  end

  # Raises an exception if given operation result is a failure
  # (any status code other than 0), using the server's error message if present.
  def raise_error_if_failed!(result)
    return if result.status.statusCode == 0
    error_message = result.status.errorMessage || 'Execution failed!'
    raise error_message
  end
end
|
315
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module RBHive
|
4
|
+
# Describes the columns of a HiveServer2 result set and coerces raw row
# values into Ruby values.
class TCLISchemaDefinition
  attr_reader :schema

  # Hive type name => conversion method sent to a raw value.
  TYPES = {
    :boolean  => :to_s,
    :string   => :to_s,
    :float    => :to_f,
    :double   => :to_f,
    :tinyint  => :to_i,
    :smallint => :to_i,
    :int      => :to_i,
    :bigint   => :to_i
  }

  # schema      - object whose #columns respond to #columnName (and #typeDesc)
  # example_row - a result row (responds to #colVals) or nil; only its width
  #               is used, to name columns missing from the schema
  def initialize(schema, example_row)
    @schema = schema
    @example_row = example_row ? example_row.colVals : []
  end

  # Returns the column names as symbols, one per value in the example row.
  def column_names
    @column_names ||= begin
      schema_names = @schema.columns.map { |c| c.columnName }

      # In rare cases Hive can return two identical column names
      # consider SELECT a.foo, b.foo...
      # in this case you get two columns called foo with no disambiguation.
      # as a (far from ideal) solution we detect this edge case and rename them
      # a.foo => foo_1, b.foo => foo_2
      # otherwise we will trample one of the columns during Hash mapping.
      totals = schema_names.each_with_object(Hash.new(0)) { |name, counts| counts[name] += 1 }
      seen = Hash.new(0)
      names = schema_names.map do |name|
        totals[name] > 1 ? :"#{name}_#{seen[name] += 1}" : name.to_sym
      end

      # Let's fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
      # For now we will call them :_p1, :_p2, etc. to avoid collisions.
      offset = 0
      names.push(:"_p#{offset += 1}") while names.length < @example_row.length
      names
    end
  end

  # Returns a Hash of column name => type symbol. Columns whose type cannot be
  # determined (including renamed duplicates and :_pN columns) map to :string.
  def column_type_map
    @column_type_map ||= column_names.each_with_object({}) do |name, map|
      definition = @schema.columns.find { |col| col.columnName.to_sym == name }
      # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
      # NOTE(review): TYPE_NAMES is not defined in this file; if it does not
      # resolve from this scope the NameError is swallowed below and every
      # column maps to :string - TODO confirm where it is defined.
      type = begin
        TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase
      rescue StandardError
        nil
      end
      map[name] = definition && type ? type.to_sym : :string
    end
  end

  # Returns a Hash of column name => coerced value for +row+.
  def coerce_row(row)
    raw_values = row.colVals.map(&:get_value).map(&:value)
    column_names.zip(raw_values).each_with_object({}) do |(column_name, value), coerced|
      coerced[column_name] = coerce_column(column_name, value)
    end
  end

  # Coerces one raw value according to the column's type:
  # "Infinity"/"NaN" become floats for non-string columns, nil/"NULL"/"null"
  # become nil, array types are parsed as JSON, and other types use TYPES.
  def coerce_column(column_name, value)
    type = column_type_map[column_name]
    return Float::INFINITY if type != :string && value == "Infinity"
    return Float::NAN if type != :string && value == "NaN"
    return nil if value.nil? || value == 'NULL' || value == 'null'
    return coerce_complex_value(value) if type.to_s.start_with?('array')

    conversion_method = TYPES[type]
    conversion_method ? value.send(conversion_method) : value
  end

  # Returns the values of +row+ (indexable by column name) in column order.
  def coerce_row_to_array(row)
    column_names.map { |n| row[n] }
  end

  # Parses a JSON-encoded value; nil, empty and 'null' inputs yield nil.
  def coerce_complex_value(value)
    return nil if value.nil?
    return nil if value.length == 0
    return nil if value == 'null'
    JSON.parse(value)
  end
end
|
87
|
+
end
|
data/lib/rbhive.rb
CHANGED
@@ -2,4 +2,7 @@ require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
|
|
2
2
|
require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
|
3
3
|
require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
|
4
4
|
require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
|
5
|
-
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
5
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
6
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
|
7
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
|
@@ -1,12 +1,11 @@
|
|
1
1
|
#
|
2
|
-
# Autogenerated by Thrift
|
2
|
+
# Autogenerated by Thrift Compiler (0.9.0)
|
3
3
|
#
|
4
4
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
5
5
|
#
|
6
6
|
|
7
7
|
require 'thrift'
|
8
|
-
|
9
|
-
|
8
|
+
require_relative 'fb303_types'
|
10
9
|
|
11
10
|
module FacebookService
|
12
11
|
class Client
|
@@ -370,13 +369,13 @@ module FacebookService
|
|
370
369
|
SUCCESS = 0
|
371
370
|
|
372
371
|
FIELDS = {
|
373
|
-
SUCCESS => {:type => ::Thrift::Types::I32, :name => 'success', :enum_class => Fb_status}
|
372
|
+
SUCCESS => {:type => ::Thrift::Types::I32, :name => 'success', :enum_class => ::Fb_status}
|
374
373
|
}
|
375
374
|
|
376
375
|
def struct_fields; FIELDS; end
|
377
376
|
|
378
377
|
def validate
|
379
|
-
unless @success.nil? || Fb_status::VALID_VALUES.include?(@success)
|
378
|
+
unless @success.nil? || ::Fb_status::VALID_VALUES.include?(@success)
|
380
379
|
raise ::Thrift::ProtocolException.new(::Thrift::ProtocolException::UNKNOWN, 'Invalid value of field success!')
|
381
380
|
end
|
382
381
|
end
|