rbhive-u2i 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/CHANGELOG.md +16 -0
- data/Gemfile +3 -0
- data/LICENSE +20 -0
- data/README.md +348 -0
- data/Rakefile +1 -0
- data/lib/rbhive.rb +8 -0
- data/lib/rbhive/connection.rb +150 -0
- data/lib/rbhive/explain_result.rb +46 -0
- data/lib/rbhive/result_set.rb +37 -0
- data/lib/rbhive/schema_definition.rb +87 -0
- data/lib/rbhive/t_c_l_i_connection.rb +441 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +89 -0
- data/lib/rbhive/table_schema.rb +122 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/thrift/facebook_service.rb +700 -0
- data/lib/thrift/fb303_constants.rb +9 -0
- data/lib/thrift/fb303_types.rb +19 -0
- data/lib/thrift/hive_metastore_constants.rb +41 -0
- data/lib/thrift/hive_metastore_types.rb +630 -0
- data/lib/thrift/hive_service_constants.rb +13 -0
- data/lib/thrift/hive_service_types.rb +72 -0
- data/lib/thrift/queryplan_constants.rb +13 -0
- data/lib/thrift/queryplan_types.rb +261 -0
- data/lib/thrift/sasl_client_transport.rb +97 -0
- data/lib/thrift/serde_constants.rb +92 -0
- data/lib/thrift/serde_types.rb +7 -0
- data/lib/thrift/t_c_l_i_service.rb +1054 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1762 -0
- data/lib/thrift/thrift_hive.rb +508 -0
- data/lib/thrift/thrift_hive_metastore.rb +3856 -0
- data/rbhive.gemspec +27 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: aa0f81a2728885fabb85feabc4044b2a29cc26f0
|
4
|
+
data.tar.gz: ec38ad569b040b3b0c646272fb3cd95e99a35beb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 841210a38b540c3e1513bd440de7e65b4fe77ce40ee5847acfa91fffc66da14b8bf0d02417018ac3ac26756385549f72be176162268e8c232f1f70268ed11953
|
7
|
+
data.tar.gz: 064a4baa9bbd6266eee0a5dda8c5cbeab9a1eafe6497ec9f010858304ba144b8b8c790b9ef2dab1c40ad2bdd9e5ed62058e5c6bd347285592fa989bc77dd1923
|
data/.gitignore
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# RBHive changelog
|
2
|
+
|
3
|
+
Versioning prior to 0.5.3 was not tracked, so this changelog only lists changes introduced after 0.5.3.
|
4
|
+
|
5
|
+
## 0.6.0
|
6
|
+
|
7
|
+
0.6.0 introduces one backwards-incompatible change:
|
8
|
+
|
9
|
+
* Behaviour change: RBHive will no longer coerce the strings "NULL" or "null" to the Ruby `nil`; the rationale
|
10
|
+
for this change is that it introduces hard to trace bugs and does not seem to make sense from a logical
|
11
|
+
perspective (Hive's "NULL" is a very different thing to Ruby's `nil`).
|
12
|
+
|
13
|
+
0.6.0 introduces support for Hive 0.13, and for the Hive 0.11 version shipped with CDH5 Beta 1 and Beta 2:
|
14
|
+
|
15
|
+
* Thrift protocol bindings updated to include all the protocols shipped with the Hive 0.13 release.
|
16
|
+
* Allow the user to choose a protocol explicitly; provided helper symbols / lookups for common protocols (e.g. CDH4, CDH5)
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) [2013] [Forward3D]
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,348 @@
|
|
1
|
+
# RBHive - A Ruby Thrift client for Apache Hive
|
2
|
+
|
3
|
+
[![Code Climate](https://codeclimate.com/github/forward3d/rbhive/badges/gpa.svg)](https://codeclimate.com/github/forward3d/rbhive)
|
4
|
+
|
5
|
+
### WARNING
|
6
|
+
|
7
|
+
This is the u2i fork of [rbhive](https://github.com/forward3d/rbhive).
|
8
|
+
|
9
|
+
RBHive is a simple Ruby gem to communicate with the [Apache Hive](http://hive.apache.org)
|
10
|
+
Thrift servers.
|
11
|
+
|
12
|
+
It supports:
|
13
|
+
* Hiveserver (the original Thrift service shipped with Hive since early releases)
|
14
|
+
* Hiveserver2 (the new, concurrent Thrift service shipped with Hive releases since 0.10)
|
15
|
+
* Any other 100% Hive-compatible Thrift service (e.g. [Sharkserver](https://github.com/amplab/shark))
|
16
|
+
|
17
|
+
It is capable of using the following Thrift transports:
|
18
|
+
* BufferedTransport (the default)
|
19
|
+
* SaslClientTransport ([SASL-enabled](http://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) transport)
|
20
|
+
* HTTPClientTransport (tunnels Thrift over HTTP)
|
21
|
+
|
22
|
+
As of version 1.0, it supports asynchronous execution of queries. This allows you to submit
|
23
|
+
a query, disconnect, then reconnect later to check the status and retrieve the results.
|
24
|
+
This frees systems of the need to keep a persistent TCP connection.
|
25
|
+
|
26
|
+
## About Thrift services and transports
|
27
|
+
|
28
|
+
### Hiveserver
|
29
|
+
|
30
|
+
Hiveserver (the original Thrift interface) only supports a single client at a time. RBHive
|
31
|
+
implements this with the `RBHive::Connection` class. It only supports a single transport,
|
32
|
+
BufferedTransport.
|
33
|
+
|
34
|
+
### Hiveserver2
|
35
|
+
|
36
|
+
[Hiveserver2](https://cwiki.apache.org/confluence/display/Hive/Setting+up+HiveServer2)
|
37
|
+
(the new Thrift interface) can support many concurrent client connections. It is shipped
|
38
|
+
with Hive 0.10 and later. In Hive 0.10, only BufferedTransport and SaslClientTransport are
|
39
|
+
supported; starting with Hive 0.12, HTTPClientTransport is also supported.
|
40
|
+
|
41
|
+
Each of the versions after Hive 0.10 has a slightly different Thrift interface; when
|
42
|
+
connecting, you must specify the Hive version or you may get an exception.
|
43
|
+
|
44
|
+
Hiveserver2 supports (in versions later than 0.12) asynchronous query execution. This
|
45
|
+
works by submitting a query and retrieving a handle to the execution process; you can
|
46
|
+
then reconnect at a later time and retrieve the results using this handle.
|
47
|
+
Using the asynchronous methods has some caveats - please read the Asynchronous Execution
|
48
|
+
section of the documentation thoroughly before using them.
|
49
|
+
|
50
|
+
RBHive implements this client with the `RBHive::TCLIConnection` class.
|
51
|
+
|
52
|
+
#### Warning!
|
53
|
+
|
54
|
+
We had to set the following in hive-site.xml to get the BufferedTransport Thrift service
|
55
|
+
to work with RBHive:
|
56
|
+
|
57
|
+
<property>
|
58
|
+
<name>hive.server2.enable.doAs</name>
|
59
|
+
<value>false</value>
|
60
|
+
</property>
|
61
|
+
|
62
|
+
Otherwise you'll get this nasty-looking exception in the logs:
|
63
|
+
|
64
|
+
ERROR server.TThreadPoolServer: Error occurred during processing of message.
|
65
|
+
java.lang.ClassCastException: org.apache.thrift.transport.TSocket cannot be cast to org.apache.thrift.transport.TSaslServerTransport
|
66
|
+
at org.apache.hive.service.auth.TUGIContainingProcessor.process(TUGIContainingProcessor.java:35)
|
67
|
+
at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:206)
|
68
|
+
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
|
69
|
+
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
|
70
|
+
at java.lang.Thread.run(Thread.java:662)
|
71
|
+
|
72
|
+
### Other Hive-compatible services
|
73
|
+
|
74
|
+
Consult the documentation for the service, as this will vary depending on the service you're using.
|
75
|
+
|
76
|
+
## Connecting to Hiveserver and Hiveserver2
|
77
|
+
|
78
|
+
### Hiveserver
|
79
|
+
|
80
|
+
Since Hiveserver has no options, connection code is very simple:
|
81
|
+
|
82
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
83
|
+
connection.fetch 'SELECT city, country FROM cities'
|
84
|
+
end
|
85
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
86
|
+
|
87
|
+
### Hiveserver2
|
88
|
+
|
89
|
+
Hiveserver2 has several options with how it is run. The connection code takes
|
90
|
+
a hash with these possible parameters:
|
91
|
+
* `:transport` - one of `:buffered` (BufferedTransport), `:http` (HTTPClientTransport), or `:sasl` (SaslClientTransport)
|
92
|
+
* `:hive_version` - the number after the period in the Hive version; e.g. `10`, `11`, `12`, `13` or one of
|
93
|
+
a set of symbols; see [Hiveserver2 protocol versions](#hiveserver2-protocol-versions) below for details
|
94
|
+
* `:timeout` - if using BufferedTransport or SaslClientTransport, this is how long the timeout on the socket will be
|
95
|
+
* `:sasl_params` - if using SaslClientTransport, this is a hash of parameters to set up the SASL connection
|
96
|
+
|
97
|
+
If you pass either an empty hash or nil in place of the options (or do not supply them), the connection
|
98
|
+
is attempted with the Hive version set to 0.10, using `:buffered` as the transport, and a timeout of 1800 seconds.
|
99
|
+
|
100
|
+
Connecting with the defaults:
|
101
|
+
|
102
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
103
|
+
connection.fetch('SHOW TABLES')
|
104
|
+
end
|
105
|
+
|
106
|
+
Connecting with a Logger:
|
107
|
+
|
108
|
+
RBHive.tcli_connect('hive.server.address', 10_000, { logger: Logger.new(STDOUT) }) do |connection|
|
109
|
+
connection.fetch('SHOW TABLES')
|
110
|
+
end
|
111
|
+
|
112
|
+
Connecting with a specific Hive version (0.12 in this case):
|
113
|
+
|
114
|
+
RBHive.tcli_connect('hive.server.address', 10_000, { hive_version: 12 }) do |connection|
|
115
|
+
connection.fetch('SHOW TABLES')
|
116
|
+
end
|
117
|
+
|
118
|
+
Connecting with a specific Hive version (0.12) and using the `:http` transport:
|
119
|
+
|
120
|
+
RBHive.tcli_connect('hive.server.address', 10_000, { hive_version: 12, transport: :http }) do |connection|
|
121
|
+
connection.fetch('SHOW TABLES')
|
122
|
+
end
|
123
|
+
|
124
|
+
We have not tested the SASL connection, as we don't run SASL; pull requests and testing are welcomed.
|
125
|
+
|
126
|
+
#### Hiveserver2 protocol versions
|
127
|
+
|
128
|
+
Since the introduction of Hiveserver2 in Hive 0.10, there have been a number of revisions to the Thrift protocol it uses.
|
129
|
+
|
130
|
+
The following table lists the available values you can supply to the `:hive_version` parameter when making a connection
|
131
|
+
to Hiveserver2.
|
132
|
+
|
133
|
+
| value | Thrift protocol version | notes
|
134
|
+
| ------- | ----------------------- | -----
|
135
|
+
| `10` | V1 | First version of the Thrift protocol used only by Hive 0.10
|
136
|
+
| `11` | V2 | Used by the Hive 0.11 release (*but not CDH5 which ships with Hive 0.11!*) - adds asynchronous execution
|
137
|
+
| `12` | V3 | Used by the Hive 0.12 release, adds varchar type and primitive type qualifiers
|
138
|
+
| `13` | V7 | Used by the Hive 0.13 release, adds features from V4, V5 and V6, plus token-based delegation connections
|
139
|
+
| `:cdh4` | V1 | CDH4 uses the V1 protocol as it ships with the upstream Hive 0.10
|
140
|
+
| `:cdh5` | V5 | CDH5 ships with upstream Hive 0.11, but adds patches to bring the Thrift protocol up to V5
|
141
|
+
|
142
|
+
In addition, you can explicitly set the Thrift protocol version according to this table:
|
143
|
+
|
144
|
+
| value | Thrift protocol version | notes
|
145
|
+
| --------------- | ----------------------- | -----
|
146
|
+
| `:PROTOCOL_V1` | V1 | Used by Hive 0.10 release
|
147
|
+
| `:PROTOCOL_V2` | V2 | Used by Hive 0.11 release
|
148
|
+
| `:PROTOCOL_V3` | V3 | Used by Hive 0.12 release
|
149
|
+
| `:PROTOCOL_V4` | V4 | Updated during Hive 0.13 development, adds decimal precision/scale, char type
|
150
|
+
| `:PROTOCOL_V5` | V5 | Updated during Hive 0.13 development, adds error details when GetOperationStatus returns in error state
|
151
|
+
| `:PROTOCOL_V6` | V6 | Updated during Hive 0.13 development, adds binary type for binary payload, uses columnar result set
|
152
|
+
| `:PROTOCOL_V7` | V7 | Used by Hive 0.13 release, support for token-based delegation connections
|
153
|
+
|
154
|
+
## Asynchronous execution with Hiveserver2
|
155
|
+
|
156
|
+
In versions of Hive later than 0.12, the Thrift server supports asynchronous execution.
|
157
|
+
|
158
|
+
The high-level view of using this feature is as follows:
|
159
|
+
|
160
|
+
1. Submit your query using `async_execute(query)`. This function returns a hash
|
161
|
+
with the following keys: `:guid`, `:secret`, and `:session`. You don't need to
|
162
|
+
care about the internals of this hash - all methods that interact with an async
|
163
|
+
query require this hash, and you can just store it and hand it to the methods.
|
164
|
+
2. To check the state of the query, call `async_state(handles)`, where `handles`
|
165
|
+
is the handles hash given to you when you called `async_execute(query)`.
|
166
|
+
3. To retrieve results, call either `async_fetch(handles)` or `async_fetch_in_batch(handles)`,
|
167
|
+
which work in the same way as the non-asynchronous methods.
|
168
|
+
4. When you're done with the query, call `async_close_session(handles)`.
|
169
|
+
|
170
|
+
### Memory leaks
|
171
|
+
|
172
|
+
When you call `async_close_session(handles)`, *all async handles created during this
|
173
|
+
session are closed*.
|
174
|
+
|
175
|
+
If you do not close the sessions you create, *you will leak memory in the Hiveserver2 process*.
|
176
|
+
Be very careful to close your sessions!
|
177
|
+
|
178
|
+
### Method documentation
|
179
|
+
|
180
|
+
#### `async_execute(query)`
|
181
|
+
|
182
|
+
This method submits a query for async execution. The hash you get back is used in the other
|
183
|
+
async methods, and will look like this:
|
184
|
+
|
185
|
+
{
|
186
|
+
:guid => (binary string),
|
187
|
+
:secret => (binary string),
|
188
|
+
:session => (binary string)
|
189
|
+
}
|
190
|
+
|
191
|
+
The Thrift protocol specifies the strings as "binary" - which means they have no encoding.
|
192
|
+
Be *extremely* careful when manipulating or storing these values, as they can quite easily
|
193
|
+
get converted to UTF-8 strings, which will make them invalid when trying to retrieve async data.
|
194
|
+
|
195
|
+
#### `async_state(handles)`
|
196
|
+
|
197
|
+
`handles` is the hash returned by `async_execute(query)`. The state will be a symbol with
|
198
|
+
one of the following values and meanings:
|
199
|
+
|
200
|
+
| symbol | meaning
|
201
|
+
| --------------------- | -------
|
202
|
+
| :initialized | The query is initialized in Hive and ready to run
|
203
|
+
| :running | The query is running (either as a MapReduce job or within process)
|
204
|
+
| :finished | The query is completed and results can be retrieved
|
205
|
+
| :cancelled | The query was cancelled by a user
|
206
|
+
| :closed | Unknown at present
|
207
|
+
| :error | The query is invalid semantically or broken in another way
|
208
|
+
| :unknown | The query is in an unknown state
|
209
|
+
| :pending | The query is ready to run but is not running
|
210
|
+
|
211
|
+
There are also the utility methods `async_is_complete?(handles)`, `async_is_running?(handles)`,
|
212
|
+
`async_is_failed?(handles)` and `async_is_cancelled?(handles)`.
|
213
|
+
|
214
|
+
#### `async_cancel(handles)`
|
215
|
+
|
216
|
+
Calling this method will cancel the query in execution.
|
217
|
+
|
218
|
+
#### `async_fetch(handles)`, `async_fetch_in_batch(handles)`
|
219
|
+
|
220
|
+
These methods let you fetch the results of the async query, if they are complete. If you call
|
221
|
+
these methods on an incomplete query, they will raise an exception. They work in exactly the
|
222
|
+
same way as the normal synchronous methods.
|
223
|
+
|
224
|
+
## Examples
|
225
|
+
|
226
|
+
### Fetching results
|
227
|
+
|
228
|
+
#### Hiveserver
|
229
|
+
|
230
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
231
|
+
connection.fetch 'SELECT city, country FROM cities'
|
232
|
+
end
|
233
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
234
|
+
|
235
|
+
#### Hiveserver2
|
236
|
+
|
237
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
238
|
+
connection.fetch 'SELECT city, country FROM cities'
|
239
|
+
end
|
240
|
+
➔ [{:city => "London", :country => "UK"}, {:city => "Mumbai", :country => "India"}, {:city => "New York", :country => "USA"}]
|
241
|
+
|
242
|
+
### Executing a query
|
243
|
+
|
244
|
+
#### Hiveserver
|
245
|
+
|
246
|
+
RBHive.connect('hive.server.address') do |connection|
|
247
|
+
connection.execute 'DROP TABLE cities'
|
248
|
+
end
|
249
|
+
➔ nil
|
250
|
+
|
251
|
+
#### Hiveserver2
|
252
|
+
|
253
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
254
|
+
connection.execute 'DROP TABLE cities'
|
255
|
+
end
|
256
|
+
➔ nil
|
257
|
+
|
258
|
+
### Creating tables
|
259
|
+
|
260
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
261
|
+
column 'name', :string, 'Full name of debtor'
|
262
|
+
column 'address', :string, 'Address of debtor'
|
263
|
+
column 'amount', :float, 'The amount of money borrowed'
|
264
|
+
|
265
|
+
partition 'dated', :string, 'The date money was given'
|
266
|
+
partition 'country', :string, 'The country the person resides in'
|
267
|
+
end
|
268
|
+
|
269
|
+
Then for Hiveserver:
|
270
|
+
|
271
|
+
RBHive.connect('hive.server.address', 10_000) do |connection|
|
272
|
+
connection.create_table(table)
|
273
|
+
end
|
274
|
+
|
275
|
+
Or Hiveserver2:
|
276
|
+
|
277
|
+
RBHive.tcli_connect('hive.server.address', 10_000) do |connection|
|
278
|
+
connection.create_table(table)
|
279
|
+
end
|
280
|
+
|
281
|
+
### Modifying table schema
|
282
|
+
|
283
|
+
table = TableSchema.new('person', 'List of people that owe me money') do
|
284
|
+
column 'name', :string, 'Full name of debtor'
|
285
|
+
column 'address', :string, 'Address of debtor'
|
286
|
+
column 'amount', :float, 'The amount of money borrowed'
|
287
|
+
column 'new_amount', :float, 'The new amount this person somehow convinced me to give them'
|
288
|
+
|
289
|
+
partition 'dated', :string, 'The date money was given'
|
290
|
+
partition 'country', :string, 'The country the person resides in'
|
291
|
+
end
|
292
|
+
|
293
|
+
Then for Hiveserver:
|
294
|
+
|
295
|
+
RBHive.connect('hive.server.address') do |connection|
|
296
|
+
connection.replace_columns(table)
|
297
|
+
end
|
298
|
+
|
299
|
+
Or Hiveserver2:
|
300
|
+
|
301
|
+
RBHive.tcli_connect('hive.server.address') do |connection|
|
302
|
+
connection.replace_columns(table)
|
303
|
+
end
|
304
|
+
|
305
|
+
### Setting properties
|
306
|
+
|
307
|
+
You can set various properties for Hive tasks, some of which change how they run. Consult the Apache
|
308
|
+
Hive documentation and Hadoop's documentation for the various properties that can be set.
|
309
|
+
For example, you can set the map-reduce job's priority with the following:
|
310
|
+
|
311
|
+
connection.set("mapred.job.priority", "VERY_HIGH")
|
312
|
+
|
313
|
+
### Inspecting tables
|
314
|
+
|
315
|
+
#### Hiveserver
|
316
|
+
|
317
|
+
RBHive.connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
318
|
+
result = connection.fetch("describe some_table")
|
319
|
+
puts result.column_names.inspect
|
320
|
+
puts result.first.inspect
|
321
|
+
}
|
322
|
+
|
323
|
+
#### Hiveserver2
|
324
|
+
|
325
|
+
RBHive.tcli_connect('hive.hadoop.forward.co.uk', 10_000) {|connection|
|
326
|
+
result = connection.fetch("describe some_table")
|
327
|
+
puts result.column_names.inspect
|
328
|
+
puts result.first.inspect
|
329
|
+
}
|
330
|
+
|
331
|
+
## Testing
|
332
|
+
|
333
|
+
We use RBHive against Hive 0.10, 0.11 and 0.12, and have tested the BufferedTransport and
|
334
|
+
HTTPClientTransport. We use it against both Hiveserver and Hiveserver2 with success.
|
335
|
+
|
336
|
+
We have _not_ tested the SaslClientTransport, and would welcome reports
|
337
|
+
on whether it works correctly.
|
338
|
+
|
339
|
+
## Contributing
|
340
|
+
|
341
|
+
We welcome contributions, issues and pull requests. If there's a feature missing in RBHive that you need, or you
|
342
|
+
think you've found a bug, please do not hesitate to create an issue.
|
343
|
+
|
344
|
+
1. Fork it
|
345
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
346
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
347
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
348
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/rbhive.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
|
2
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
|
3
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
|
4
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
|
5
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
6
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
|
7
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
|
# suppress warnings while loading the autogenerated Thrift bindings,
# which are noisy under -w
old_verbose, $VERBOSE = $VERBOSE, nil
# require thrift autogenerated files
require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
# restore warnings
$VERBOSE = old_verbose

module RBHive
  # Opens a connection to a Hiveserver (original Thrift interface),
  # yields it to the given block, and guarantees the transport is closed
  # afterwards, even if the block raises.
  #
  # server - hostname of the Hiveserver
  # port   - Thrift port (defaults to 10_000)
  # logger - any object responding to fatal/error/warn/info/debug
  #
  # Returns the value of the block.
  def connect(server, port=10_000, logger=StdOutLogger.new)
    connection = RBHive::Connection.new(server, port, logger)
    begin
      connection.open
      yield(connection)
    ensure
      connection.close
    end
  end
  module_function :connect

  # Minimal logger that writes every level straight to STDOUT; used as
  # the default when no logger is supplied to `connect`.
  class StdOutLogger
    %w(fatal error warn info debug).each do |level|
      define_method level.to_sym do |message|
        STDOUT.puts(message)
      end
    end
  end

  # A single-client connection to Hiveserver over BufferedTransport.
  # Query-executing methods are serialized through a mutex because the
  # underlying Thrift client is not thread-safe.
  class Connection
    attr_reader :client

    def initialize(server, port=10_000, logger=StdOutLogger.new)
      @socket = Thrift::Socket.new(server, port)
      @transport = Thrift::BufferedTransport.new(@socket)
      @protocol = Thrift::BinaryProtocol.new(@transport)
      @client = Hive::Thrift::ThriftHive::Client.new(@protocol)
      @logger = logger
      @logger.info("#{Time.now}: Connecting to #{server} on port #{port}")
      @mutex = Mutex.new
    end

    # Opens the underlying Thrift transport.
    def open
      @transport.open
    end

    # Closes the underlying Thrift transport.
    def close
      @transport.close
    end

    # Executes a query; the result (if any) must be fetched separately.
    def execute(query)
      execute_safe(query)
    end

    # Runs EXPLAIN on the given query and wraps the output.
    def explain(query)
      safe do
        execute_unsafe("EXPLAIN " + query)
        ExplainResult.new(client.fetchAll)
      end
    end

    # Convenience setter for the MapReduce job priority.
    def priority=(priority)
      set("mapred.job.priority", priority)
    end

    # Convenience setter for the MapReduce job queue.
    def queue=(queue)
      set("mapred.job.queue.name", queue)
    end

    # Sets an arbitrary Hive/Hadoop configuration property.
    def set(name, value)
      @logger.info("Setting #{name}=#{value}")
      client.execute("SET #{name}=#{value}")
    end

    # Executes the query and returns the full result set at once.
    def fetch(query)
      safe do
        execute_unsafe(query)
        rows = client.fetchAll
        the_schema = SchemaDefinition.new(client.getSchema, rows.first)
        ResultSet.new(rows, the_schema)
      end
    end

    # Executes the query and yields ResultSet batches of `batch_size`
    # rows until the server reports no more rows.
    def fetch_in_batch(query, batch_size=1_000)
      safe do
        execute_unsafe(query)
        until (next_batch = client.fetchN(batch_size)).empty?
          # build the schema lazily from the first non-empty batch
          the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
          yield ResultSet.new(next_batch, the_schema)
        end
      end
    end

    # Executes the query and returns only the first row.
    def first(query)
      safe do
        execute_unsafe(query)
        row = client.fetchOne
        the_schema = SchemaDefinition.new(client.getSchema, row)
        ResultSet.new([row], the_schema).first
      end
    end

    # Returns the schema of the last executed query.
    def schema(example_row=[])
      safe { SchemaDefinition.new(client.getSchema, example_row) }
    end

    # Creates a table from a TableSchema definition.
    def create_table(schema)
      execute(schema.create_table_statement)
    end

    # Drops a table; accepts either a name or a TableSchema.
    def drop_table(name)
      name = name.name if name.is_a?(TableSchema)
      execute("DROP TABLE `#{name}`")
    end

    # Replaces the column definitions of an existing table.
    def replace_columns(schema)
      execute(schema.replace_columns_statement)
    end

    # Adds columns to an existing table.
    def add_columns(schema)
      execute(schema.add_columns_statement)
    end

    # Delegates any unknown method straight to the Thrift client,
    # exposing the full generated API.
    def method_missing(meth, *args)
      client.send(meth, *args)
    end

    # Keep respond_to? consistent with the method_missing delegation.
    def respond_to_missing?(meth, include_private = false)
      client.respond_to?(meth, include_private) || super
    end

    private

    # Executes a query while holding the connection mutex.
    def execute_safe(query)
      safe { execute_unsafe(query) }
    end

    # Executes a query without taking the mutex; callers must already
    # hold it (via `safe`).
    def execute_unsafe(query)
      @logger.info("Executing Hive Query: #{query}")
      client.execute(query)
    end

    # Serializes access to the shared Thrift client; returns the value
    # of the block (Mutex#synchronize already propagates it).
    def safe(&block)
      @mutex.synchronize(&block)
    end
  end
end