sequel_impala 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +50 -0
- data/LICENSE +463 -0
- data/README.md +45 -0
- data/Rakefile +39 -0
- data/lib/driver/commons-collections-3.2.1.jar +0 -0
- data/lib/driver/commons-configuration-1.10.jar +0 -0
- data/lib/driver/commons-logging-1.2.jar +0 -0
- data/lib/driver/hadoop-auth-2.9.0.jar +0 -0
- data/lib/driver/hadoop-common-2.9.0.jar +0 -0
- data/lib/driver/hadoop-core-2.6.0.jar +0 -0
- data/lib/driver/hive-exec-1.1.0.jar +0 -0
- data/lib/driver/hive-jdbc-1.1.0.jar +0 -0
- data/lib/driver/hive-metastore-1.1.0.jar +0 -0
- data/lib/driver/hive-service-1.1.0.jar +0 -0
- data/lib/driver/httpclient-4.3.jar +0 -0
- data/lib/driver/httpcore-4.3.jar +0 -0
- data/lib/driver/libfb303-0.9.0.jar +0 -0
- data/lib/driver/log4j-1.2.17.jar +0 -0
- data/lib/driver/slf4j-api-1.7.5.jar +0 -0
- data/lib/driver/stax2-api-3.1.4.jar +0 -0
- data/lib/driver/woodstox-core-asl-4.4.1.jar +0 -0
- data/lib/impala.rb +55 -0
- data/lib/impala/connection.rb +180 -0
- data/lib/impala/cursor.rb +200 -0
- data/lib/impala/progress_reporter.rb +40 -0
- data/lib/impala/protocol.rb +8 -0
- data/lib/impala/protocol/beeswax_constants.rb +15 -0
- data/lib/impala/protocol/beeswax_service.rb +747 -0
- data/lib/impala/protocol/beeswax_types.rb +193 -0
- data/lib/impala/protocol/exec_stats_constants.rb +13 -0
- data/lib/impala/protocol/exec_stats_types.rb +133 -0
- data/lib/impala/protocol/facebook_service.rb +706 -0
- data/lib/impala/protocol/fb303_constants.rb +15 -0
- data/lib/impala/protocol/fb303_types.rb +25 -0
- data/lib/impala/protocol/hive_metastore_constants.rb +53 -0
- data/lib/impala/protocol/hive_metastore_types.rb +698 -0
- data/lib/impala/protocol/impala_hive_server2_service.rb +137 -0
- data/lib/impala/protocol/impala_service.rb +443 -0
- data/lib/impala/protocol/impala_service_constants.rb +13 -0
- data/lib/impala/protocol/impala_service_types.rb +192 -0
- data/lib/impala/protocol/status_constants.rb +13 -0
- data/lib/impala/protocol/status_types.rb +46 -0
- data/lib/impala/protocol/t_c_l_i_service.rb +1108 -0
- data/lib/impala/protocol/t_c_l_i_service_constants.rb +72 -0
- data/lib/impala/protocol/t_c_l_i_service_types.rb +1802 -0
- data/lib/impala/protocol/thrift_hive_metastore.rb +4707 -0
- data/lib/impala/protocol/types_constants.rb +13 -0
- data/lib/impala/protocol/types_types.rb +332 -0
- data/lib/impala/sasl_transport.rb +117 -0
- data/lib/impala/thrift_patch.rb +31 -0
- data/lib/impala/version.rb +3 -0
- data/lib/jdbc/hive2.rb +52 -0
- data/lib/jdbc/impala.rb +50 -0
- data/lib/rbhive.rb +8 -0
- data/lib/rbhive/connection.rb +150 -0
- data/lib/rbhive/explain_result.rb +46 -0
- data/lib/rbhive/result_set.rb +37 -0
- data/lib/rbhive/schema_definition.rb +86 -0
- data/lib/rbhive/t_c_l_i_connection.rb +466 -0
- data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
- data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
- data/lib/rbhive/table_schema.rb +122 -0
- data/lib/rbhive/version.rb +3 -0
- data/lib/sequel/adapters/impala.rb +220 -0
- data/lib/sequel/adapters/jdbc/hive2.rb +36 -0
- data/lib/sequel/adapters/jdbc/impala.rb +38 -0
- data/lib/sequel/adapters/rbhive.rb +177 -0
- data/lib/sequel/adapters/shared/impala.rb +808 -0
- data/lib/sequel/extensions/csv_to_parquet.rb +166 -0
- data/lib/thrift/facebook_service.rb +700 -0
- data/lib/thrift/fb303_constants.rb +9 -0
- data/lib/thrift/fb303_types.rb +19 -0
- data/lib/thrift/hive_metastore_constants.rb +41 -0
- data/lib/thrift/hive_metastore_types.rb +630 -0
- data/lib/thrift/hive_service_constants.rb +13 -0
- data/lib/thrift/hive_service_types.rb +72 -0
- data/lib/thrift/queryplan_constants.rb +13 -0
- data/lib/thrift/queryplan_types.rb +261 -0
- data/lib/thrift/sasl_client_transport.rb +161 -0
- data/lib/thrift/serde_constants.rb +92 -0
- data/lib/thrift/serde_types.rb +7 -0
- data/lib/thrift/t_c_l_i_service.rb +1054 -0
- data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
- data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
- data/lib/thrift/thrift_hive.rb +508 -0
- data/lib/thrift/thrift_hive_metastore.rb +3856 -0
- data/spec/database_test.rb +56 -0
- data/spec/dataset_test.rb +1268 -0
- data/spec/files/bad_down_migration/001_create_alt_basic.rb +4 -0
- data/spec/files/bad_down_migration/002_create_alt_advanced.rb +4 -0
- data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/bad_timestamped_migrations/1273253853_3_create_users.rb +3 -0
- data/spec/files/bad_up_migration/001_create_alt_basic.rb +4 -0
- data/spec/files/bad_up_migration/002_create_alt_advanced.rb +3 -0
- data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/003_3_create_users.rb +4 -0
- data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +9 -0
- data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +9 -0
- data/spec/files/duplicate_timestamped_migrations/1273253853_create_users.rb +4 -0
- data/spec/files/integer_migrations/001_create_sessions.rb +9 -0
- data/spec/files/integer_migrations/002_create_nodes.rb +9 -0
- data/spec/files/integer_migrations/003_3_create_users.rb +4 -0
- data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +9 -0
- data/spec/files/interleaved_timestamped_migrations/1273253853_3_create_users.rb +4 -0
- data/spec/files/reversible_migrations/001_reversible.rb +5 -0
- data/spec/files/reversible_migrations/002_reversible.rb +5 -0
- data/spec/files/reversible_migrations/003_reversible.rb +5 -0
- data/spec/files/reversible_migrations/004_reversible.rb +5 -0
- data/spec/files/reversible_migrations/005_reversible.rb +10 -0
- data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +9 -0
- data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +9 -0
- data/spec/files/timestamped_migrations/1273253853_3_create_users.rb +4 -0
- data/spec/impala_test.rb +290 -0
- data/spec/migrator_test.rb +240 -0
- data/spec/plugin_test.rb +91 -0
- data/spec/prepared_statement_test.rb +327 -0
- data/spec/schema_test.rb +356 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/timezone_test.rb +86 -0
- data/spec/type_test.rb +99 -0
- metadata +294 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'socket'
|
|
2
|
+
|
|
3
|
+
module Thrift
  # Monkey-patch mixin prepended into Thrift transports so that callers can
  # get at the transport right after it is opened.
  #
  # Many Impala queries take a long time (tens of minutes) to complete and
  # we don't want the connection to drop while waiting for the result.
  # Unfortunately, Thrift doesn't supply an easy way to reach the socket it
  # opens to communicate with Impala.
  #
  # NOTE(review): despite the module name, #open itself does not set
  # SO_KEEPALIVE — it only yields @transport (the wrapped inner transport)
  # after opening, so the caller's block is presumably where keepalive is
  # actually enabled. Confirm against the call sites of #open.
  module KeepAlive
    # Opens the transport via the patched class's own #open, then yields the
    # wrapped transport to the caller's block (if any) so it can be tuned
    # (e.g. enabling TCP keepalive on the underlying socket).
    def open
      super
      yield @transport if block_given?
    end
  end

  class BufferedTransport
    prepend KeepAlive
  end

  class ImpalaSaslClientTransport
    prepend KeepAlive
  end
end
|
data/lib/jdbc/hive2.rb
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
warn 'jdbc-hive2 is only for use with JRuby' if (JRUBY_VERSION.nil? rescue true)

module Jdbc
  # Loader for the Hive JDBC driver jars bundled with this gem under
  # lib/driver. Jar paths are relative so they resolve against $LOAD_PATH.
  module Hive2
    DRIVER_VERSION = '1.1.0'
    VERSION = DRIVER_VERSION + '.0'

    # Relative paths of every jar the Hive JDBC driver needs, in the order
    # they should be loaded.
    def self.driver_jar
      %w(
        driver/libfb303-0.9.0.jar
        driver/slf4j-api-1.7.5.jar
        driver/hadoop-common-2.9.0.jar
        driver/hadoop-auth-2.9.0.jar
        driver/hadoop-core-2.6.0.jar
        driver/commons-configuration-1.10.jar
        driver/commons-collections-3.2.1.jar
        driver/commons-logging-1.2.jar
        driver/hive-exec-1.1.0.jar
        driver/hive-jdbc-1.1.0.jar
        driver/hive-metastore-1.1.0.jar
        driver/hive-service-1.1.0.jar
        driver/httpcore-4.3.jar
        driver/httpclient-4.3.jar
        driver/log4j-1.2.17.jar
        driver/woodstox-core-asl-4.4.1.jar
        driver/stax2-api-3.1.4.jar
      )
    end

    # Loads every driver jar, either via Kernel#load (default) or
    # Kernel#require when passed :require.
    def self.load_driver(method = :load)
      driver_jar.each { |jar| send method, jar }
    end

    # Fully-qualified Java class name of the Hive JDBC driver.
    def self.driver_name
      'org.apache.hive.jdbc.HiveDriver'
    end

    if defined?(JRUBY_VERSION) && # enable backwards-compat behavior
       (Java::JavaLang::Boolean.get_boolean('jdbc.driver.autoload'))
      warn "autoloading jdbc driver on require 'jdbc/hive2'" if $VERBOSE
      load_driver :require
    end
  end
end
|
data/lib/jdbc/impala.rb
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
warn 'jdbc-impala is only for use with JRuby' if (JRUBY_VERSION.nil? rescue true)

module Jdbc
  # Loader for the Cloudera Impala JDBC 4.1 driver. The jars are not bundled
  # with the gem; they must live in the directory named by IMPALA_JDBC_JARS,
  # otherwise this file refuses to load.
  module Impala
    DRIVER_VERSION = '2.5.41.1061'
    VERSION = DRIVER_VERSION

    # Directory holding the driver jars; validated at load time.
    JAR_ROOT = ENV['IMPALA_JDBC_JARS']
    unless JAR_ROOT && File.directory?(JAR_ROOT)
      warn "must specify IMPALA_JDBC_JARS environment variable for directory containing necessary jar files for Impala JDBC 4.1 driver version #{VERSION}"
      raise LoadError, "cannot load such file -- jdbc/impala"
    end

    # Absolute paths of every jar required by the driver, in load order.
    def self.driver_jar
      jars = %w(
        ImpalaJDBC41.jar
        TCLIServiceClient.jar
        commons-codec-1.3.jar
        commons-logging-1.1.1.jar
        hive_metastore.jar
        hive_service.jar
        httpclient-4.1.3.jar
        httpcore-4.1.3.jar
        libfb303-0.9.0.jar
        libthrift-0.9.0.jar
        log4j-1.2.14.jar
        ql.jar
        slf4j-api-1.5.11.jar
        slf4j-log4j12-1.5.11.jar
        zookeeper-3.4.6.jar
      )
      jars.map { |jar| File.join(JAR_ROOT, jar) }
    end

    # Loads every driver jar, either via Kernel#load (default) or
    # Kernel#require when passed :require.
    def self.load_driver(method = :load)
      driver_jar.each { |jar| send method, jar }
    end

    # Fully-qualified Java class name of the Impala JDBC driver.
    def self.driver_name
      'com.cloudera.impala.jdbc41.Driver'
    end

    if defined?(JRUBY_VERSION) && # enable backwards-compat behavior
       (Java::JavaLang::Boolean.get_boolean('jdbc.driver.autoload'))
      warn "autoloading jdbc driver on require 'jdbc/impala'" if $VERBOSE
      load_driver :require
    end
  end
end
|
|
50
|
+
|
data/lib/rbhive.rb
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
|
|
2
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
|
|
3
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
|
|
4
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
|
|
5
|
+
require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
|
|
6
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
|
|
7
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
|
|
8
|
+
require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# suppress warnings
|
|
2
|
+
old_verbose, $VERBOSE = $VERBOSE, nil
|
|
3
|
+
# require thrift autogenerated files
|
|
4
|
+
require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
|
|
5
|
+
# require 'thrift'
|
|
6
|
+
# restore warnings
|
|
7
|
+
$VERBOSE = old_verbose
|
|
8
|
+
|
|
9
|
+
module RBHive
  # Opens a connection to a HiveServer1 instance, yields it to the given
  # block, and guarantees the connection is closed afterwards.
  # Returns the block's value.
  def connect(server, port=10_000)
    connection = RBHive::Connection.new(server, port)
    begin
      connection.open
      yield(connection)
    ensure
      connection.close
    end
  end
  module_function :connect

  # Minimal logger that writes every severity level straight to STDOUT.
  class StdOutLogger
    %w(fatal error warn info debug).each do |level|
      define_method level.to_sym do |message|
        STDOUT.puts(message)
      end
    end
  end

  # Wrapper around the Thrift HiveServer1 (ThriftHive) client. All query
  # execution is serialized through a mutex because the underlying Thrift
  # client is not thread-safe.
  class Connection
    attr_reader :client

    def initialize(server, port=10_000, logger=StdOutLogger.new)
      @socket = Thrift::Socket.new(server, port)
      @transport = Thrift::BufferedTransport.new(@socket)
      @protocol = Thrift::BinaryProtocol.new(@transport)
      @client = Hive::Thrift::ThriftHive::Client.new(@protocol)
      @logger = logger
      @logger.info("Connecting to #{server} on port #{port}")
      @mutex = Mutex.new
    end

    # Opens the underlying transport.
    def open
      @transport.open
    end

    # Closes the underlying transport.
    def close
      @transport.close
    end

    # Executes a query under the mutex; returns the raw client result.
    def execute(query)
      execute_safe(query)
    end

    # Runs EXPLAIN on the query and wraps the resulting rows.
    def explain(query)
      safe do
        execute_unsafe("EXPLAIN "+ query)
        ExplainResult.new(client.fetchAll)
      end
    end

    # Sets the MapReduce job priority for subsequent queries.
    def priority=(priority)
      set("mapred.job.priority", priority)
    end

    # Sets the MapReduce queue for subsequent queries.
    def queue=(queue)
      set("mapred.job.queue.name", queue)
    end

    # Issues "SET name=value" on the Hive session.
    def set(name,value)
      @logger.info("Setting #{name}=#{value}")
      client.execute("SET #{name}=#{value}")
    end

    # Executes a query and returns every row as a coerced ResultSet.
    def fetch(query)
      safe do
        execute_unsafe(query)
        rows = client.fetchAll
        the_schema = SchemaDefinition.new(client.getSchema, rows.first)
        ResultSet.new(rows, the_schema)
      end
    end

    # Executes a query and yields ResultSets of at most batch_size rows.
    def fetch_in_batch(query, batch_size=1_000)
      safe do
        execute_unsafe(query)
        until (next_batch = client.fetchN(batch_size)).empty?
          # Schema is derived lazily from the first batch's first row.
          the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
          yield ResultSet.new(next_batch, the_schema)
        end
      end
    end

    # Executes a query and returns only the first (coerced) row.
    def first(query)
      safe do
        execute_unsafe(query)
        row = client.fetchOne
        the_schema = SchemaDefinition.new(client.getSchema, row)
        ResultSet.new([row], the_schema).first
      end
    end

    # Schema of the most recently executed query.
    def schema(example_row=[])
      safe { SchemaDefinition.new(client.getSchema, example_row) }
    end

    # Creates a table from a TableSchema-like object.
    def create_table(schema)
      execute(schema.create_table_statement)
    end

    # Drops a table by name (or by TableSchema).
    def drop_table(name)
      name = name.name if name.is_a?(TableSchema)
      execute("DROP TABLE `#{name}`")
    end

    def replace_columns(schema)
      execute(schema.replace_columns_statement)
    end

    def add_columns(schema)
      execute(schema.add_columns_statement)
    end

    # Delegates unknown methods to the underlying Thrift client.
    def method_missing(meth, *args)
      client.send(meth, *args)
    end

    # Keeps #respond_to? consistent with the delegation above.
    def respond_to_missing?(meth, include_private = false)
      client.respond_to?(meth, include_private) || super
    end

    private

    def execute_safe(query)
      safe { execute_unsafe(query) }
    end

    def execute_unsafe(query)
      @logger.info("Executing Hive Query: #{query}")
      client.execute(query)
    end

    # Serializes access to the shared Thrift client; returns the block value.
    def safe
      @mutex.synchronize { yield }
    end
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Wraps the raw line output of a Hive "EXPLAIN <query>" statement and
# exposes its sections (abstract syntax tree, stage dependencies, ...).
class ExplainResult
  # rows:: array of plain-text lines as returned by EXPLAIN.
  def initialize(rows)
    @rows = rows
  end

  # First line of the "ABSTRACT SYNTAX TREE" section, or nil if absent.
  def ast
    section = by_section[:abstract_syntax_tree]
    section && section.first
  end

  # Number of stages listed under "STAGE DEPENDENCIES".
  def stage_count
    stage_dependencies.length
  end

  # Lines of the "STAGE DEPENDENCIES" section ([] when missing).
  def stage_dependencies
    by_section[:stage_dependencies] || []
  end

  # Raw plan joined into a single newline-separated string.
  def to_tsv
    @rows.join("\n")
  end

  # The unprocessed row array.
  def raw
    @rows
  end

  def to_s
    to_tsv
  end

  private

  # Parses @rows once into { :section_name => [stripped lines] }, memoized.
  # A line starting with an uppercase letter opens a new section
  # (e.g. "STAGE DEPENDENCIES:" => :stage_dependencies); blank lines are
  # skipped; everything else is appended to the current section.
  def by_section
    @by_section ||= begin
      current_section = nil
      @rows.inject({}) do |sections, row|
        if row.match(/^[A-Z]/)
          current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
          sections[current_section] = []
        elsif row.length == 0
          # blank separator line — skip
        elsif current_section
          sections[current_section] << row.strip
        end
        # Lines before any section header are ignored (previously raised
        # NoMethodError on nil).
        sections
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module RBHive
  # An Array of schema-coerced row hashes, plus export helpers that render
  # the rows back out as separated text.
  class ResultSet < Array
    def initialize(rows, schema)
      @schema = schema
      coerced = rows.map { |raw_row| @schema.coerce_row(raw_row) }
      super(coerced)
    end

    # Column names as reported by the schema.
    def column_names
      @schema.column_names
    end

    # Mapping of column name => Hive type symbol.
    def column_type_map
      @schema.column_type_map
    end

    # Comma-separated rendering; writes to out_file when given, otherwise
    # returns the string.
    def to_csv(out_file=nil)
      to_separated_output(",", out_file)
    end

    # Tab-separated rendering; same out_file semantics as #to_csv.
    def to_tsv(out_file=nil)
      to_separated_output("\t", out_file)
    end

    # Rows as positional arrays in column order (memoized).
    def as_arrays
      @as_arrays ||= map { |row| @schema.coerce_row_to_array(row) }
    end

    private

    def to_separated_output(sep, out_file)
      lines = map { |row| @schema.coerce_row_to_array(row).join(sep) }
      document = lines.join("\n")
      return document if out_file.nil?
      File.open(out_file, 'w+') { |f| f << document }
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
|
|
3
|
+
module RBHive
  # Maps Hive's tab-separated string rows onto named, type-coerced Ruby
  # values using the Thrift schema returned by the server.
  class SchemaDefinition
    attr_reader :schema

    # Fall back to hand-built NaN/Infinity on very old Rubies that lack the
    # Float constants.
    NAN = Float::NAN rescue 0.0/0.0
    INFINITY = Float::INFINITY rescue 1.0/0.0
    # Hive type => String conversion method used by #coerce_column.
    # Types absent from this map are passed through unchanged.
    TYPES = {
      :boolean => :to_s,
      :string => :to_s,
      :bigint => :to_i,
      :float => :to_f,
      :double => :to_f,
      :int => :to_i,
      :smallint => :to_i,
      :tinyint => :to_i,
    }

    # schema:: Thrift schema object exposing #fieldSchemas.
    # example_row:: a sample tab-separated row string (or nil); used to
    #               detect extra columns the schema doesn't describe.
    def initialize(schema, example_row)
      @schema = schema
      @example_row = example_row ? example_row.split("\t") : []
    end

    # Column names as symbols, deduplicated and padded (memoized).
    def column_names
      @column_names ||= begin
        schema_names = @schema.fieldSchemas.map {|c| c.name }

        # In rare cases Hive can return two identical column names
        # consider SELECT a.foo, b.foo...
        # in this case you get two columns called foo with no disambiguation.
        # as a (far from ideal) solution we detect this edge case and rename them
        # a.foo => foo1, b.foo => foo2
        # otherwise we will trample one of the columns during Hash mapping.
        #
        # Two passes: the first tags the 2nd, 3rd, ... occurrences with
        # "---|---N"; the second retroactively tags the FIRST occurrence of
        # any name seen more than once with "---|---1"; the third turns the
        # placeholder into "_" and symbolizes (foo_1, foo_2, ...).
        s = Hash.new(0)
        schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
        schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
        schema_names.map! { |c| c.gsub('---|---', '_').to_sym }

        # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
        # For now we will call them :_p1, :_p2, etc. to avoid collisions.
        offset = 0
        while schema_names.length < @example_row.length
          schema_names.push(:"_p#{offset+=1}")
        end
        schema_names
      end
    end

    # Mapping of column name symbol => Hive type symbol (memoized).
    def column_type_map
      @column_type_map ||= column_names.inject({}) do |hsh, c|
        definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
        # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
        hsh[c] = definition ? definition.type.to_sym : :string
        hsh
      end
    end

    # Splits a raw tab-separated row string into a Hash of coerced values
    # keyed by column name.
    def coerce_row(row)
      column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
        hsh[column_name] = coerce_column(column_name, value)
        hsh
      end
    end

    # Converts one raw string cell to the Ruby type implied by the schema.
    # Handles Hive's textual "Infinity"/"NaN" markers for numeric columns
    # and JSON-encoded array types.
    def coerce_column(column_name, value)
      type = column_type_map[column_name]
      return INFINITY if (type != :string && value == "Infinity")
      return NAN if (type != :string && value == "NaN")
      return coerce_complex_value(value) if type.to_s =~ /^array/
      conversion_method = TYPES[type]
      conversion_method ? value.send(conversion_method) : value
    end

    # Projects an already-coerced row Hash into an array in column order.
    def coerce_row_to_array(row)
      column_names.map { |n| row[n] }
    end

    # Parses a JSON-encoded complex (array) cell; nil/empty/"null" => nil.
    def coerce_complex_value(value)
      return nil if value.nil?
      return nil if value.length == 0
      return nil if value == 'null'
      JSON.parse(value)
    end
  end
end
|