sequel-impala 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +3 -0
- data/LICENSE +462 -0
- data/README.rdoc +39 -0
- data/Rakefile +39 -0
- data/lib/driver/commons-logging-1.2.jar +0 -0
- data/lib/driver/hadoop-common-2.6.0.jar +0 -0
- data/lib/driver/hadoop-core-2.6.0.jar +0 -0
- data/lib/driver/hive-exec-1.1.0.jar +0 -0
- data/lib/driver/hive-jdbc-1.1.0.jar +0 -0
- data/lib/driver/hive-metastore-1.1.0.jar +0 -0
- data/lib/driver/hive-service-1.1.0.jar +0 -0
- data/lib/driver/httpclient-4.3.jar +0 -0
- data/lib/driver/httpcore-4.3.jar +0 -0
- data/lib/driver/libfb303-0.9.0.jar +0 -0
- data/lib/driver/slf4j-api-1.7.5.jar +0 -0
- data/lib/impala.rb +47 -0
- data/lib/impala/connection.rb +117 -0
- data/lib/impala/cursor.rb +157 -0
- data/lib/impala/protocol.rb +8 -0
- data/lib/impala/protocol/beeswax_constants.rb +15 -0
- data/lib/impala/protocol/beeswax_service.rb +766 -0
- data/lib/impala/protocol/beeswax_types.rb +193 -0
- data/lib/impala/protocol/cli_service_constants.rb +60 -0
- data/lib/impala/protocol/cli_service_types.rb +1452 -0
- data/lib/impala/protocol/facebook_service.rb +706 -0
- data/lib/impala/protocol/fb303_constants.rb +15 -0
- data/lib/impala/protocol/fb303_types.rb +25 -0
- data/lib/impala/protocol/hive_metastore_constants.rb +53 -0
- data/lib/impala/protocol/hive_metastore_types.rb +698 -0
- data/lib/impala/protocol/impala_hive_server2_service.rb +29 -0
- data/lib/impala/protocol/impala_service.rb +377 -0
- data/lib/impala/protocol/impala_service_constants.rb +13 -0
- data/lib/impala/protocol/impala_service_types.rb +90 -0
- data/lib/impala/protocol/status_constants.rb +13 -0
- data/lib/impala/protocol/status_types.rb +46 -0
- data/lib/impala/protocol/t_c_l_i_service.rb +948 -0
- data/lib/impala/protocol/thrift_hive_metastore.rb +4707 -0
- data/lib/impala/version.rb +3 -0
- data/lib/jdbc/hive2.rb +46 -0
- data/lib/sequel/adapters/impala.rb +123 -0
- data/lib/sequel/adapters/jdbc/hive2.rb +26 -0
- data/lib/sequel/adapters/shared/impala.rb +635 -0
- data/lib/sequel/extensions/csv_to_parquet.rb +112 -0
- data/spec/database_test.rb +56 -0
- data/spec/dataset_test.rb +1268 -0
- data/spec/files/bad_down_migration/001_create_alt_basic.rb +4 -0
- data/spec/files/bad_down_migration/002_create_alt_advanced.rb +4 -0
- data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/bad_timestamped_migrations/1273253853_3_create_users.rb +3 -0
- data/spec/files/bad_up_migration/001_create_alt_basic.rb +4 -0
- data/spec/files/bad_up_migration/002_create_alt_advanced.rb +3 -0
- data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/003_3_create_users.rb +4 -0
- data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253853_create_users.rb +4 -0
- data/spec/files/integer_migrations/001_create_sessions.rb +9 -0
- data/spec/files/integer_migrations/002_create_nodes.rb +9 -0
- data/spec/files/integer_migrations/003_3_create_users.rb +4 -0
- data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253853_3_create_users.rb +4 -0
- data/spec/files/reversible_migrations/001_reversible.rb +5 -0
- data/spec/files/reversible_migrations/002_reversible.rb +5 -0
- data/spec/files/reversible_migrations/003_reversible.rb +5 -0
- data/spec/files/reversible_migrations/004_reversible.rb +5 -0
- data/spec/files/reversible_migrations/005_reversible.rb +10 -0
- data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/timestamped_migrations/1273253853_3_create_users.rb +4 -0
- data/spec/impala_test.rb +285 -0
- data/spec/migrator_test.rb +240 -0
- data/spec/plugin_test.rb +91 -0
- data/spec/prepared_statement_test.rb +327 -0
- data/spec/schema_test.rb +356 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/timezone_test.rb +86 -0
- data/spec/type_test.rb +99 -0
- metadata +239 -0
data/README.rdoc
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
= sequel-impala
|
2
|
+
|
3
|
+
sequel-impala adds support for Sequel to connect to the Impala database
|
4
|
+
via the included impala driver, and the included jdbc-hive2 driver under JRuby.
|
5
|
+
|
6
|
+
= Source Code
|
7
|
+
|
8
|
+
Source code is available on GitHub at https://github.com/jeremyevans/sequel-impala
|
9
|
+
|
10
|
+
= Usage
|
11
|
+
|
12
|
+
After installation, Sequel will automatically pick up the adapter as long as
|
13
|
+
the lib directory is in RUBYLIB, if you use a connection string starting with
|
14
|
+
+impala+, or <tt>jdbc:hive2</tt> on JRuby.
|
15
|
+
|
16
|
+
= Connection Strings
|
17
|
+
|
18
|
+
If using the impala driver (default host is localhost, default port is 21000):
|
19
|
+
|
20
|
+
impala://host:port
|
21
|
+
|
22
|
+
If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
|
23
|
+
|
24
|
+
jdbc:hive2://host:port/;auth=noSasl
|
25
|
+
|
26
|
+
= Dependencies
|
27
|
+
|
28
|
+
* sequel 4+
|
29
|
+
* thrift gem
|
30
|
+
|
31
|
+
= License
|
32
|
+
|
33
|
+
MIT/Apache
|
34
|
+
|
35
|
+
= Author
|
36
|
+
|
37
|
+
Jeremy Evans <code@jeremyevans.net>
|
38
|
+
|
39
|
+
Work on sequel-impala is generously funded by Outcomes Insights, Inc.
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require "rake"
require "rake/clean"

# Files/directories removed by the :clean task.
CLEAN.include ["sequel-impala-*.gem", "rdoc"]

desc "Build sequel-impala gem"
task :package=>[:clean] do
  sh %{#{FileUtils::RUBY} -S gem build sequel-impala.gemspec}
end

### Specs

desc "Run specs"
task "spec" do
  # RubyGems is loaded by default, so no -rubygems switch is passed: that
  # switch relied on ubygems.rb, which no longer ships with Ruby 2.5+, and
  # makes the interpreter abort before any spec is loaded.
  sh "#{FileUtils::RUBY} -I lib -e 'ARGV.each{|f| require f}' ./spec/*_test.rb"
end

task :default => :spec

### RDoc

RDOC_DEFAULT_OPTS = ["--quiet", "--line-numbers", "--inline-source", '--title', 'sequel-impala: Sequel support for Impala database']

# Use the hanna template when the hanna-nouveau gem is installed; otherwise
# fall back silently to the default RDoc output format.
begin
  gem 'rdoc'
  gem 'hanna-nouveau'
  RDOC_DEFAULT_OPTS.concat(['-f', 'hanna'])
rescue Gem::LoadError
end

RDOC_OPTS = RDOC_DEFAULT_OPTS + ['--main', 'README.rdoc']

require 'rdoc/task'
RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = "rdoc"
  rdoc.options += RDOC_OPTS
  rdoc.rdoc_files.add %w"README.rdoc CHANGELOG LICENSE lib/**/*.rb"
end
|
39
|
+
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/impala.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
# the generated ruby files use a relative require, so we need to add the
|
3
|
+
# generated directory to $LOAD_PATH
|
4
|
+
this_dir = File.expand_path(File.dirname(__FILE__))
|
5
|
+
gen_dir = File.join(this_dir, 'impala/protocol')
|
6
|
+
$LOAD_PATH.push(gen_dir) unless $LOAD_PATH.include?(gen_dir)
|
7
|
+
|
8
|
+
require 'impala/version'
|
9
|
+
|
10
|
+
require 'thrift'
|
11
|
+
require 'time'
|
12
|
+
require 'impala/protocol'
|
13
|
+
require 'impala/cursor'
|
14
|
+
require 'impala/connection'
|
15
|
+
|
16
|
+
module Impala
  # Host used by {Impala.connect} when none is given.
  DEFAULT_HOST = 'localhost'

  # Port used by {Impala.connect} when none is given.
  DEFAULT_PORT = 21000

  # Base class for all errors raised by this library.
  class Error < StandardError
  end

  # Error class for invalid queries.
  class InvalidQueryError < Error
  end

  # Error class for connection-level failures.
  class ConnectionError < Error
  end

  # Error class for cursor failures.
  class CursorError < Error
  end

  # Connect to an Impala server. Without a block the open connection is
  # returned and the caller is responsible for closing it. With a block the
  # connection is yielded and always closed afterwards, even if the block
  # raises.
  # @param [String] host the hostname or IP address of the Impala server
  # @param [int] port the port that the Impala server is listening on
  # @yieldparam [Connection] conn the open connection. Will be closed once
  #   the block finishes
  # @return [Connection] the open connection, or, if a block is
  #   passed, the return value of the block
  def self.connect(host=DEFAULT_HOST, port=DEFAULT_PORT)
    conn = Connection.new(host, port)
    return conn unless block_given?

    begin
      yield conn
    ensure
      conn.close
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module Impala
  # This object represents a connection to an Impala server. It can be used to
  # perform queries on the database.
  class Connection
    # Log context id passed along with every query sent via executeAndWait.
    LOG_CONTEXT_ID = "impala-ruby"

    # Don't instantiate Connections directly; instead, use {Impala.connect}.
    # The connection is opened immediately.
    # @param [String] host the hostname or IP address of the Impala server
    # @param [int] port the port that the Impala server is listening on
    def initialize(host, port)
      @host = host
      @port = port
      @connected = false
      open
    end

    # @return [String] a short description including host, port and whether
    #   the connection is currently closed
    def inspect
      "#<#{self.class} #{@host}:#{@port}#{open? ? '' : ' (DISCONNECTED)'}>"
    end

    # Open the connection if it's currently closed.
    def open
      return if @connected

      socket = Thrift::Socket.new(@host, @port)

      @transport = Thrift::BufferedTransport.new(socket)
      @transport.open

      proto = Thrift::BinaryProtocol.new(@transport)
      @service = Protocol::ImpalaService::Client.new(proto)
      @connected = true
    end

    # Close this connection. It can still be reopened with {#open}.
    def close
      return unless @connected

      @transport.close
      @connected = false
    end

    # Returns true if the connection is currently open.
    def open?
      @connected
    end

    # Refresh the metadata store.
    # @raise [ConnectionError] if the connection is closed
    def refresh
      raise ConnectionError.new("Connection closed") unless open?
      @service.ResetCatalog
    end

    # Perform a query and return all the results. This will
    # load the entire result set into memory, so if you're dealing with lots
    # of rows, {#execute} may work better.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Array<Hash>] an array of hashes, one for each row.
    def query(raw_query, query_options = {})
      execute(raw_query, query_options).fetch_all
    end

    # Perform a query and return a cursor for iterating over the results.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Cursor] a cursor for the result rows
    # @raise [ConnectionError] if the connection is closed or the query
    #   ended in the EXCEPTION state
    # @raise [InvalidQueryError] if the query is empty
    def execute(raw_query, query_options = {})
      raise ConnectionError.new("Connection closed") unless open?

      query = sanitize_query(raw_query)
      handle = send_query(query, query_options)

      check_result(handle)
      Cursor.new(handle, @service)
    end

    private

    # Strip surrounding whitespace and downcase the leading command word.
    # The rest of the query is passed through untouched so that whitespace
    # inside string literals and newlines terminating "--" comments survive.
    # @param [String] raw_query the query as given by the caller
    # @return [String] the query to send to the server
    # @raise [InvalidQueryError] if the query contains only whitespace
    def sanitize_query(raw_query)
      query = raw_query.strip
      raise InvalidQueryError.new("Empty query") if query.empty?

      query.sub(/\A\S+/) { |command| command.downcase }
    end

    # Build a Beeswax query object and submit it with executeAndWait.
    # @param [String] sanitized_query the query text
    # @param [Hash] query_options :user becomes the hadoop user; every other
    #   entry is sent as a "KEY=value" configuration string
    # @return the query handle returned by the service
    def send_query(sanitized_query, query_options)
      # Work on a copy so the caller's hash is not modified by the delete.
      options = query_options.dup

      query = Protocol::Beeswax::Query.new
      query.query = sanitized_query

      query.hadoop_user = options.delete(:user) if options[:user]
      query.configuration = options.map do |key, value|
        "#{key.upcase}=#{value}"
      end

      @service.executeAndWait(query, LOG_CONTEXT_ID)
    end

    # Verify that the query behind +handle+ did not end in the EXCEPTION
    # state. On any failure (including an error while asking for the state)
    # the handle is closed exactly once before the error is re-raised.
    # @raise [ConnectionError] if the query state is EXCEPTION
    def check_result(handle)
      state = @service.get_state(handle)
      if state == Protocol::Beeswax::QueryState::EXCEPTION
        raise ConnectionError.new("The query was aborted")
      end
    rescue
      close_handle(handle)
      raise
    end

    # Close the given query handle on the server.
    def close_handle(handle)
      @service.close(handle)
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'bigdecimal'

module Impala
  # Cursors are used to iterate over result sets without loading them all
  # into memory at once. This can be useful if you're dealing with lots of
  # rows. It implements Enumerable, so you can use each/select/map/etc.
  class Cursor
    # Number of rows requested per fetch call, and the amount of buffered
    # rows at which {#fetch_more} stops asking for further batches.
    BUFFER_SIZE = 1024
    include Enumerable

    # @return [Boolean] true only for the string 'true'
    def self.typecast_boolean(value)
      value == 'true'
    end

    # @return [Integer] the value converted with String#to_i
    def self.typecast_int(value)
      value.to_i
    end

    # @return [Float] the value converted with String#to_f
    def self.typecast_float(value)
      value.to_f
    end

    # Uses Kernel#BigDecimal; BigDecimal.new is not available in current
    # versions of the bigdecimal library.
    # @return [BigDecimal] the value converted to a BigDecimal
    def self.typecast_decimal(value)
      BigDecimal(value)
    end

    # @return [Time] the value parsed with Time.parse
    def self.typecast_timestamp(value)
      Time.parse(value)
    end

    # Maps column type names from the result schema to conversion callables.
    # Types without an entry are returned as the raw string.
    TYPECAST_MAP = {
      'boolean'=>method(:typecast_boolean),
      'int'=>method(:typecast_int),
      'double'=>method(:typecast_float),
      'decimal'=>method(:typecast_decimal),
      'timestamp'=>method(:typecast_timestamp),
    }
    TYPECAST_MAP['tinyint'] = TYPECAST_MAP['smallint'] = TYPECAST_MAP['bigint'] = TYPECAST_MAP['int']
    TYPECAST_MAP['float'] = TYPECAST_MAP['double']
    TYPECAST_MAP.freeze

    # Textual marker for a NULL value in a raw row.
    NULL = 'NULL'.freeze

    # @return [Array<String>] the column names of the result set
    attr_reader :columns

    # @return [Hash] this cursor's own copy of TYPECAST_MAP; it is consulted
    #   when the first row is parsed
    attr_reader :typecast_map

    # @param handle the query handle the results belong to
    # @param service the service client used to fetch metadata and rows
    def initialize(handle, service)
      @handle = handle
      @service = service

      @row_buffer = []
      @done = false
      @open = true
      @typecast_map = TYPECAST_MAP.dup
      @columns = metadata.schema.fieldSchemas.map(&:name)
    end

    # @return [String] a short description including whether the cursor is
    #   closed
    def inspect
      "#<#{self.class}#{open? ? '' : ' (CLOSED)'}>"
    end

    # Yield each remaining row in turn. Without a block an Enumerator is
    # returned instead.
    # @yieldparam [Hash] row the next row
    def each
      return to_enum(:each) unless block_given?

      while row = fetch_row
        yield row
      end
    end

    # Returns the next available row as a hash, or nil if there are none left.
    # @return [Hash, nil] the next available row, or nil if there are none
    #   left
    # @see #fetch_all
    def fetch_row
      if @row_buffer.empty?
        return nil if @done
        fetch_more
      end

      @row_buffer.shift
    end

    # Returns all the remaining rows in the result set.
    # @return [Array<Hash>] the remaining rows in the result set
    # @see #fetch_row
    def fetch_all
      to_a
    end

    # Close the cursor on the remote server. Once a cursor is closed, you
    # can no longer fetch any rows from it. Closing an already closed cursor
    # does nothing, so it is safe to call this after the cursor closed
    # itself on reaching the end of the results.
    def close
      return unless @open

      @open = false
      @service.close(@handle)
    end

    # Returns true if the cursor is still open.
    def open?
      @open
    end

    # Returns true if there are any more rows to fetch.
    def has_more?
      !@done || !@row_buffer.empty?
    end

    # @return the result of the service's GetRuntimeProfile call for this
    #   cursor's handle
    def runtime_profile
      @service.GetRuntimeProfile(@handle)
    end

    private

    # Result set metadata, requested from the service once and then cached.
    def metadata
      @metadata ||= @service.get_results_metadata(@handle)
    end

    # Fetch batches until the buffer holds BUFFER_SIZE rows or the result
    # set is exhausted.
    def fetch_more
      fetch_batch until @done || @row_buffer.count >= BUFFER_SIZE
    end

    # Fetch one batch of rows into the buffer. When the service reports no
    # more rows, the cursor is marked done and closed.
    # @raise [CursorError] if the cursor is closed or the fetch fails with a
    #   BeeswaxException
    def fetch_batch
      raise CursorError.new("Cursor has expired or been closed") unless @open

      begin
        res = @service.fetch(@handle, false, BUFFER_SIZE)
      rescue Protocol::Beeswax::BeeswaxException
        @open = false
        raise CursorError.new("Cursor has expired or been closed")
      end

      rows = res.data.map { |raw| parse_row(raw) }
      @row_buffer.concat(rows)

      unless res.has_more
        @done = true
        close
      end
    end

    # Convert one raw delimited row into a hash keyed by column name.
    # @param [String] raw the row as returned by the service
    # @return [Hash] column name => typecast value (nil for NULL)
    def parse_row(raw)
      row = {}
      # A negative limit keeps trailing empty fields, so empty strings in
      # the last columns are not silently dropped.
      fields = raw.split(metadata.delim, -1)

      row_convertor.each do |column, typecast, index|
        value = fields[index]
        row[column] =
          if value.nil? || value == NULL
            nil
          elsif typecast
            typecast.call(value)
          else
            value
          end
      end

      row
    end

    # Array of [column name, typecast callable or nil, field index] triples,
    # built once from the schema and the cursor's typecast map.
    def row_convertor
      @row_convertor ||= columns.zip(metadata.schema.fieldSchemas.map{|s| typecast_map[s.type]}, (0...(columns.length)).to_a)
    end
  end
end
|