sequel-impala 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/LICENSE +2 -1
  4. data/README.md +45 -0
  5. data/lib/rbhive.rb +8 -0
  6. data/lib/rbhive/connection.rb +150 -0
  7. data/lib/rbhive/explain_result.rb +46 -0
  8. data/lib/rbhive/result_set.rb +37 -0
  9. data/lib/rbhive/schema_definition.rb +86 -0
  10. data/lib/rbhive/t_c_l_i_connection.rb +464 -0
  11. data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
  12. data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
  13. data/lib/rbhive/table_schema.rb +122 -0
  14. data/lib/rbhive/version.rb +3 -0
  15. data/lib/sequel/adapters/impala.rb +13 -1
  16. data/lib/sequel/adapters/rbhive.rb +174 -0
  17. data/lib/sequel/adapters/shared/impala.rb +11 -3
  18. data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
  19. data/lib/thrift/facebook_service.rb +700 -0
  20. data/lib/thrift/fb303_constants.rb +9 -0
  21. data/lib/thrift/fb303_types.rb +19 -0
  22. data/lib/thrift/hive_metastore_constants.rb +41 -0
  23. data/lib/thrift/hive_metastore_types.rb +630 -0
  24. data/lib/thrift/hive_service_constants.rb +13 -0
  25. data/lib/thrift/hive_service_types.rb +72 -0
  26. data/lib/thrift/queryplan_constants.rb +13 -0
  27. data/lib/thrift/queryplan_types.rb +261 -0
  28. data/lib/thrift/sasl_client_transport.rb +161 -0
  29. data/lib/thrift/serde_constants.rb +92 -0
  30. data/lib/thrift/serde_types.rb +7 -0
  31. data/lib/thrift/t_c_l_i_service.rb +1054 -0
  32. data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
  33. data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
  34. data/lib/thrift/thrift_hive.rb +508 -0
  35. data/lib/thrift/thrift_hive_metastore.rb +3856 -0
  36. data/spec/impala_test.rb +6 -1
  37. metadata +53 -25
  38. data/README.rdoc +0 -39
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1583131d7fd60fe171fa38b12e51553760f4636f
4
- data.tar.gz: 85c496bf9bdf022c29d9a309d7a1e2f2b202a288
3
+ metadata.gz: 03ea5f2607bc4908064302d49640df3a4e34eaa3
4
+ data.tar.gz: 18c93756bb5918f32cb6f32612856963f634e966
5
5
  SHA512:
6
- metadata.gz: 82f28a92091bd3a46992f2eceac625ed30f9107cdc80e6495af4a45b76fcd55b4df23c0cac768ccce48038d3ba888e018db8df5184fc8846e7f17d793a0d6ff6
7
- data.tar.gz: 93809720a5187f6c9eafe2708bf4cf86c1575caa5a0079077f7fef2d3b76a2dac6420b94c4b8b44d7a3e9a6336bdc049accce88ea80b301dee7049684c6ea135
6
+ metadata.gz: 72ca2b1c7177ecc8c2db06e8b266f4a1ff67085cf7ef77d464e6b7667ce896870d905a691acf4fc2b34e67c90777dc3767cc9b3a1e3a2a9252967beaf507b566
7
+ data.tar.gz: fb15a8bf19c03e54179666df88da7ffc0f0be90d48a65ada20722da03f46c4fa26d910a2f3c554a93149ea29f92f565fa3bbeb43b875c8ceacfcf413b64aa161
data/CHANGELOG CHANGED
@@ -1,3 +1,19 @@
1
1
  === HEAD
2
2
 
3
+ * Fix disconnect detection in impala and rbhive adapters (jeremyevans)
4
+
5
+ * Make implicit qualify return an SQL::Identifier if given an unqualified string (jeremyevans)
6
+
7
+ * Fix :search_path option handling when using Sequel::SQL::AliasedExpressions (jeremyevans)
8
+
9
+ * Speed up multi_insert and import (jeremyevans)
10
+
11
+ * Add rbhive adapter (jeremyevans)
12
+
13
+ * Add :empty_null=>:ruby option to csv_to_parquet extension, which can support quoted CSV cells (jeremyevans)
14
+
15
+ * Optimize csv_to_parquet extension by not spawning shells or unnecessary processes (jeremyevans)
16
+
17
+ === 1.0.0 (2015-12-04)
18
+
3
19
  * Initial Public Release
data/LICENSE CHANGED
@@ -1,5 +1,6 @@
1
- Copyright (c) 2015 Jeremy Evans
1
+ Copyright (c) 2015-2016 Jeremy Evans
2
2
  Copyright (c) 2013 Colin Marc
3
+ Copyright (c) [2013] [Forward3D]
3
4
 
4
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
6
  of this software and associated documentation files (the "Software"), to
@@ -0,0 +1,45 @@
1
+ # sequel-impala
2
+
3
+ sequel-impala adds support for Sequel to connect to the Impala database
4
+ via the included impala driver, and the included jdbc-hive2 driver under JRuby.
5
+
6
+ # Source Code
7
+
8
+ Source code is available on GitHub at https://github.com/outcomesinsights/sequel-impala
9
+
10
+ # Usage
11
+
12
+ After installation, Sequel will automatically pick up the adapter as long as
13
+ the lib directory is in RUBYLIB, if you use a connection string starting with
14
+ `impala`, or `jdbc:hive2` on JRuby.
15
+
16
+ # Connection Strings
17
+
18
+ If using the impala driver (default host is localhost, default port is 21000):
19
+
20
+ impala://host:port
21
+
22
+ If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
23
+
24
+ jdbc:hive2://host:port/;auth=noSasl
25
+
26
+ # Dependencies
27
+
28
+ * sequel 4+
29
+ * thrift gem
30
+
31
+ # License
32
+
33
+ MIT/Apache
34
+
35
+ # Author
36
+
37
+ Ryan Duryea <aguynamedryan@gmail.com>
38
+
39
+ Work on sequel-impala is generously funded by [Outcomes Insights, Inc.](http://outins.com)
40
+
41
+ # Previous Author
42
+
43
+ Jeremy Evans <code@jeremyevans.net>
44
+
45
+ Provided initial work on this gem, and continues to maintain [Sequel](http://sequel.jeremyevans.net/). We can't thank you enough!
@@ -0,0 +1,8 @@
1
+ require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
2
+ require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
3
+ require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
4
+ require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
5
+ require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
6
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
7
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
8
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
@@ -0,0 +1,150 @@
1
+ # suppress warnings
2
+ old_verbose, $VERBOSE = $VERBOSE, nil
3
+ # require thrift autogenerated files
4
+ require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
5
+ # require 'thrift'
6
+ # restore warnings
7
+ $VERBOSE = old_verbose
8
+
9
+ module RBHive
10
+ def connect(server, port=10_000)
11
+ connection = RBHive::Connection.new(server, port)
12
+ ret = nil
13
+ begin
14
+ connection.open
15
+ ret = yield(connection)
16
+ ensure
17
+ connection.close
18
+ ret
19
+ end
20
+ end
21
+ module_function :connect
22
+
23
+ class StdOutLogger
24
+ %w(fatal error warn info debug).each do |level|
25
+ define_method level.to_sym do |message|
26
+ STDOUT.puts(message)
27
+ end
28
+ end
29
+ end
30
+
31
+ class Connection
32
+ attr_reader :client
33
+
34
+ def initialize(server, port=10_000, logger=StdOutLogger.new)
35
+ @socket = Thrift::Socket.new(server, port)
36
+ @transport = Thrift::BufferedTransport.new(@socket)
37
+ @protocol = Thrift::BinaryProtocol.new(@transport)
38
+ @client = Hive::Thrift::ThriftHive::Client.new(@protocol)
39
+ @logger = logger
40
+ @logger.info("Connecting to #{server} on port #{port}")
41
+ @mutex = Mutex.new
42
+ end
43
+
44
+ def open
45
+ @transport.open
46
+ end
47
+
48
+ def close
49
+ @transport.close
50
+ end
51
+
52
+ def client
53
+ @client
54
+ end
55
+
56
+ def execute(query)
57
+ execute_safe(query)
58
+ end
59
+
60
+ def explain(query)
61
+ safe do
62
+ execute_unsafe("EXPLAIN "+ query)
63
+ ExplainResult.new(client.fetchAll)
64
+ end
65
+ end
66
+
67
+ def priority=(priority)
68
+ set("mapred.job.priority", priority)
69
+ end
70
+
71
+ def queue=(queue)
72
+ set("mapred.job.queue.name", queue)
73
+ end
74
+
75
+ def set(name,value)
76
+ @logger.info("Setting #{name}=#{value}")
77
+ client.execute("SET #{name}=#{value}")
78
+ end
79
+
80
+ def fetch(query)
81
+ safe do
82
+ execute_unsafe(query)
83
+ rows = client.fetchAll
84
+ the_schema = SchemaDefinition.new(client.getSchema, rows.first)
85
+ ResultSet.new(rows, the_schema)
86
+ end
87
+ end
88
+
89
+ def fetch_in_batch(query, batch_size=1_000)
90
+ safe do
91
+ execute_unsafe(query)
92
+ until (next_batch = client.fetchN(batch_size)).empty?
93
+ the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
94
+ yield ResultSet.new(next_batch, the_schema)
95
+ end
96
+ end
97
+ end
98
+
99
+ def first(query)
100
+ safe do
101
+ execute_unsafe(query)
102
+ row = client.fetchOne
103
+ the_schema = SchemaDefinition.new(client.getSchema, row)
104
+ ResultSet.new([row], the_schema).first
105
+ end
106
+ end
107
+
108
+ def schema(example_row=[])
109
+ safe { SchemaDefinition.new(client.getSchema, example_row) }
110
+ end
111
+
112
+ def create_table(schema)
113
+ execute(schema.create_table_statement)
114
+ end
115
+
116
+ def drop_table(name)
117
+ name = name.name if name.is_a?(TableSchema)
118
+ execute("DROP TABLE `#{name}`")
119
+ end
120
+
121
+ def replace_columns(schema)
122
+ execute(schema.replace_columns_statement)
123
+ end
124
+
125
+ def add_columns(schema)
126
+ execute(schema.add_columns_statement)
127
+ end
128
+
129
+ def method_missing(meth, *args)
130
+ client.send(meth, *args)
131
+ end
132
+
133
+ private
134
+
135
+ def execute_safe(query)
136
+ safe { execute_unsafe(query) }
137
+ end
138
+
139
+ def execute_unsafe(query)
140
+ @logger.info("Executing Hive Query: #{query}")
141
+ client.execute(query)
142
+ end
143
+
144
+ def safe
145
+ ret = nil
146
+ @mutex.synchronize { ret = yield }
147
+ ret
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,46 @@
1
+ class ExplainResult
2
+ def initialize(rows)
3
+ @rows = rows
4
+ end
5
+
6
+ def ast
7
+ by_section[:abstract_syntax_tree].first
8
+ end
9
+
10
+ def stage_count
11
+ stage_dependencies.length
12
+ end
13
+
14
+ def stage_dependencies
15
+ by_section[:stage_dependencies] || []
16
+ end
17
+
18
+ def to_tsv
19
+ @rows.join("\n")
20
+ end
21
+
22
+ def raw
23
+ @rows
24
+ end
25
+
26
+ def to_s
27
+ to_tsv
28
+ end
29
+
30
+ private
31
+
32
+ def by_section
33
+ current_section = nil
34
+ @rows.inject({}) do |sections, row|
35
+ if row.match(/^[A-Z]/)
36
+ current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
37
+ sections[current_section] = []
38
+ elsif row.length == 0
39
+ next sections
40
+ else
41
+ sections[current_section] << row.strip
42
+ end
43
+ sections
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,37 @@
1
+ module RBHive
2
+ class ResultSet < Array
3
+ def initialize(rows, schema)
4
+ @schema = schema
5
+ super(rows.map {|r| @schema.coerce_row(r) })
6
+ end
7
+
8
+ def column_names
9
+ @schema.column_names
10
+ end
11
+
12
+ def column_type_map
13
+ @schema.column_type_map
14
+ end
15
+
16
+ def to_csv(out_file=nil)
17
+ to_separated_output(",", out_file)
18
+ end
19
+
20
+ def to_tsv(out_file=nil)
21
+ to_separated_output("\t", out_file)
22
+ end
23
+
24
+ def as_arrays
25
+ @as_arrays ||= self.map{ |r| @schema.coerce_row_to_array(r) }
26
+ end
27
+
28
+ private
29
+
30
+ def to_separated_output(sep, out_file)
31
+ rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
32
+ sv = rows.join("\n")
33
+ return sv if out_file.nil?
34
+ File.open(out_file, 'w+') { |f| f << sv }
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,86 @@
1
+ require 'json'
2
+
3
+ module RBHive
4
+ class SchemaDefinition
5
+ attr_reader :schema
6
+
7
+ NAN = Float::NAN rescue 0.0/0.0
8
+ INFINITY = Float::INFINITY rescue 1.0/0.0
9
+ TYPES = {
10
+ :boolean => :to_s,
11
+ :string => :to_s,
12
+ :bigint => :to_i,
13
+ :float => :to_f,
14
+ :double => :to_f,
15
+ :int => :to_i,
16
+ :smallint => :to_i,
17
+ :tinyint => :to_i,
18
+ }
19
+
20
+ def initialize(schema, example_row)
21
+ @schema = schema
22
+ @example_row = example_row ? example_row.split("\t") : []
23
+ end
24
+
25
+ def column_names
26
+ @column_names ||= begin
27
+ schema_names = @schema.fieldSchemas.map {|c| c.name }
28
+
29
+ # In rare cases Hive can return two identical column names
30
+ # consider SELECT a.foo, b.foo...
31
+ # in this case you get two columns called foo with no disambiguation.
32
+ # as a (far from ideal) solution we detect this edge case and rename them
33
+ # a.foo => foo1, b.foo => foo2
34
+ # otherwise we will trample one of the columns during Hash mapping.
35
+ s = Hash.new(0)
36
+ schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
37
+ schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
38
+ schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
39
+
40
+ # Let's fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
41
+ # For now we will call them :_p1, :_p2, etc. to avoid collisions.
42
+ offset = 0
43
+ while schema_names.length < @example_row.length
44
+ schema_names.push(:"_p#{offset+=1}")
45
+ end
46
+ schema_names
47
+ end
48
+ end
49
+
50
+ def column_type_map
51
+ @column_type_map ||= column_names.inject({}) do |hsh, c|
52
+ definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
53
+ # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
54
+ hsh[c] = definition ? definition.type.to_sym : :string
55
+ hsh
56
+ end
57
+ end
58
+
59
+ def coerce_row(row)
60
+ column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
61
+ hsh[column_name] = coerce_column(column_name, value)
62
+ hsh
63
+ end
64
+ end
65
+
66
+ def coerce_column(column_name, value)
67
+ type = column_type_map[column_name]
68
+ return INFINITY if (type != :string && value == "Infinity")
69
+ return NAN if (type != :string && value == "NaN")
70
+ return coerce_complex_value(value) if type.to_s =~ /^array/
71
+ conversion_method = TYPES[type]
72
+ conversion_method ? value.send(conversion_method) : value
73
+ end
74
+
75
+ def coerce_row_to_array(row)
76
+ column_names.map { |n| row[n] }
77
+ end
78
+
79
+ def coerce_complex_value(value)
80
+ return nil if value.nil?
81
+ return nil if value.length == 0
82
+ return nil if value == 'null'
83
+ JSON.parse(value)
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,464 @@
1
+ # suppress warnings
2
+ old_verbose, $VERBOSE = $VERBOSE, nil
3
+
4
+ raise 'Thrift is not loaded' unless defined?(Thrift)
5
+ raise 'RBHive is not loaded' unless defined?(RBHive)
6
+
7
+ # require thrift autogenerated files
8
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
9
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
10
+ require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
11
+
12
+ # restore warnings
13
+ $VERBOSE = old_verbose
14
+
15
+ # Monkey patch thrift to set an infinite read timeout
16
+ module Thrift
17
+ class HTTPClientTransport < BaseTransport
18
+ def flush
19
+ http = Net::HTTP.new @url.host, @url.port
20
+ http.use_ssl = @url.scheme == 'https'
21
+ http.read_timeout = nil
22
+ http.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
23
+ resp = http.post(@url.request_uri, @outbuf, @headers)
24
+ data = resp.body
25
+ data = Bytes.force_binary_encoding(data)
26
+ @inbuf = StringIO.new data
27
+ @outbuf = Bytes.empty_byte_buffer
28
+ end
29
+ end
30
+ end
31
+
32
+ module RBHive
33
+
34
+ HIVE_THRIFT_MAPPING = {
35
+ 10 => 0,
36
+ 11 => 1,
37
+ 12 => 2,
38
+ 13 => 6,
39
+ :cdh4 => 0,
40
+ :cdh5 => 4,
41
+ :PROTOCOL_V1 => 0,
42
+ :PROTOCOL_V2 => 1,
43
+ :PROTOCOL_V3 => 2,
44
+ :PROTOCOL_V4 => 3,
45
+ :PROTOCOL_V5 => 4,
46
+ :PROTOCOL_V6 => 5,
47
+ :PROTOCOL_V7 => 6
48
+ }
49
+
50
+ def tcli_connect(server, port = 10_000, options={})
51
+ logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
52
+ connection = RBHive::TCLIConnection.new(server, port, options, logger)
53
+ ret = nil
54
+ begin
55
+ connection.open
56
+ connection.open_session
57
+ ret = yield(connection)
58
+
59
+ ensure
60
+ # Try to close the session and our connection if those are still open, ignore io errors
61
+ begin
62
+ connection.close_session if connection.session
63
+ connection.close
64
+ rescue IOError => e
65
+ # noop
66
+ end
67
+ end
68
+
69
+ ret
70
+ end
71
+ module_function :tcli_connect
72
+
73
+ class StdOutLogger
74
+ %w(fatal error warn info debug).each do |level|
75
+ define_method level.to_sym do |message|
76
+ STDOUT.puts(message)
77
+ end
78
+ end
79
+ end
80
+
81
+ class TCLIConnection
82
+ attr_reader :client
83
+
84
+ def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
85
+ options ||= {} # backwards compatibility
86
+ raise "'options' parameter must be a hash" unless options.is_a?(Hash)
87
+ @sasl_params = options.delete(:sasl_params) || {}
88
+
89
+ if options[:transport] == :sasl and @sasl_params.empty?
90
+ raise ":transport is set to :sasl, but no :sasl_params option was supplied"
91
+ end
92
+
93
+ # Defaults to buffered transport, Hive 0.10, 1800 second timeout
94
+ options[:transport] ||= :buffered
95
+ options[:hive_version] ||= 10
96
+ options[:timeout] ||= 1800
97
+ @options = options
98
+ # Look up the appropriate Thrift protocol version for the supplied Hive version
99
+ @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
100
+
101
+ @logger = logger
102
+ @transport = thrift_transport(server, port)
103
+ @protocol = Thrift::BinaryProtocol.new(@transport)
104
+ @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
105
+ @session = nil
106
+ @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
107
+ end
108
+
109
+ def thrift_hive_protocol(version)
110
+ HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
111
+ end
112
+
113
+ def thrift_transport(server, port)
114
+ @logger.info("Initializing transport #{@options[:transport]}")
115
+ case @options[:transport]
116
+ when :buffered
117
+ return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
118
+ when :sasl
119
+ return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
120
+ parse_sasl_params(@sasl_params))
121
+ when :http
122
+ return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
123
+ else
124
+ raise "Unrecognised transport type '#{transport}'"
125
+ end
126
+ end
127
+
128
+ def thrift_socket(server, port, timeout)
129
+ socket = Thrift::Socket.new(server, port)
130
+ socket.timeout = timeout
131
+ socket
132
+ end
133
+
134
+ # Processes SASL connection params and returns a hash with symbol keys or a nil
135
+ def parse_sasl_params(sasl_params)
136
+ # Symbolize keys in a hash
137
+ if sasl_params.kind_of?(Hash)
138
+ return sasl_params.inject({}) do |memo,(k,v)|
139
+ memo[k.to_sym] = v;
140
+ memo
141
+ end
142
+ end
143
+ return nil
144
+ end
145
+
146
+ def open
147
+ @transport.open
148
+ end
149
+
150
+ def close
151
+ @transport.close
152
+ end
153
+
154
+ def open_session
155
+ @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
156
+ end
157
+
158
+ def close_session
159
+ @client.CloseSession prepare_close_session
160
+ @session = nil
161
+ end
162
+
163
+ def session
164
+ @session && @session.sessionHandle
165
+ end
166
+
167
+ def client
168
+ @client
169
+ end
170
+
171
+ def execute(query)
172
+ @logger.info("Executing Hive Query: #{query}")
173
+ req = prepare_execute_statement(query)
174
+ exec_result = client.ExecuteStatement(req)
175
+ raise_error_if_failed!(exec_result)
176
+ exec_result
177
+ end
178
+
179
+ def priority=(priority)
180
+ set("mapred.job.priority", priority)
181
+ end
182
+
183
+ def queue=(queue)
184
+ set("mapred.job.queue.name", queue)
185
+ end
186
+
187
+ def set(name,value)
188
+ @logger.info("Setting #{name}=#{value}")
189
+ self.execute("SET #{name}=#{value}")
190
+ end
191
+
192
+ # Async execute
193
+ def async_execute(query)
194
+ @logger.info("Executing query asynchronously: #{query}")
195
+ exec_result = @client.ExecuteStatement(
196
+ Hive2::Thrift::TExecuteStatementReq.new(
197
+ sessionHandle: @session.sessionHandle,
198
+ statement: query,
199
+ runAsync: true
200
+ )
201
+ )
202
+ raise_error_if_failed!(exec_result)
203
+ op_handle = exec_result.operationHandle
204
+
205
+ # Return handles to get hold of this query / session again
206
+ {
207
+ session: @session.sessionHandle,
208
+ guid: op_handle.operationId.guid,
209
+ secret: op_handle.operationId.secret
210
+ }
211
+ end
212
+
213
+ # Is the query complete?
214
+ def async_is_complete?(handles)
215
+ async_state(handles) == :finished
216
+ end
217
+
218
+ # Is the query actually running?
219
+ def async_is_running?(handles)
220
+ async_state(handles) == :running
221
+ end
222
+
223
+ # Has the query failed?
224
+ def async_is_failed?(handles)
225
+ async_state(handles) == :error
226
+ end
227
+
228
+ def async_is_cancelled?(handles)
229
+ async_state(handles) == :cancelled
230
+ end
231
+
232
+ def async_cancel(handles)
233
+ @client.CancelOperation(prepare_cancel_request(handles))
234
+ end
235
+
236
+ # Map states to symbols
237
+ def async_state(handles)
238
+ response = @client.GetOperationStatus(
239
+ Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
240
+ )
241
+
242
+ case response.operationState
243
+ when Hive2::Thrift::TOperationState::FINISHED_STATE
244
+ return :finished
245
+ when Hive2::Thrift::TOperationState::INITIALIZED_STATE
246
+ return :initialized
247
+ when Hive2::Thrift::TOperationState::RUNNING_STATE
248
+ return :running
249
+ when Hive2::Thrift::TOperationState::CANCELED_STATE
250
+ return :cancelled
251
+ when Hive2::Thrift::TOperationState::CLOSED_STATE
252
+ return :closed
253
+ when Hive2::Thrift::TOperationState::ERROR_STATE
254
+ return :error
255
+ when Hive2::Thrift::TOperationState::UKNOWN_STATE
256
+ return :unknown
257
+ when Hive2::Thrift::TOperationState::PENDING_STATE
258
+ return :pending
259
+ when nil
260
+ raise "No operation state found for handles - has the session been closed?"
261
+ else
262
+ return :state_not_in_protocol
263
+ end
264
+ end
265
+
266
+ # Async fetch results from an async execute
267
+ def async_fetch(handles, max_rows = 100)
268
+ # Can't get data from an unfinished query
269
+ unless async_is_complete?(handles)
270
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
271
+ end
272
+
273
+ # Fetch and return the first batch of rows
274
+ fetch_rows(prepare_operation_handle(handles), :first, max_rows)
275
+ end
276
+
277
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
278
+ # and yields the result batches to a given block as arrays of rows.
279
+ def async_fetch_in_batch(handles, batch_size = 1000, &block)
280
+ raise "No block given for the batch fetch request!" unless block_given?
281
+ # Can't get data from an unfinished query
282
+ unless async_is_complete?(handles)
283
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
284
+ end
285
+
286
+ # Now let's iterate over the results
287
+ loop do
288
+ rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
289
+ break if rows.empty?
290
+ yield rows
291
+ end
292
+ end
293
+
294
+ def async_close_session(handles)
295
+ validate_handles!(handles)
296
+ @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
297
+ end
298
+
299
+ def get_column_info(op_handle)
300
+ cols = get_schema_for(op_handle).columns
301
+ [cols.map(&:columnName), cols.map{|c| c.typeDesc.types.first.primitiveEntry.type}]
302
+ end
303
+
304
+ def yield_hash_rows(op_handle, columns, convertors)
305
+ i = -1
306
+ cols = columns.zip(convertors).map{|col, conv| [i+=1, col, conv]}
307
+ rows = fetch_rows(op_handle)
308
+ until rows.empty?
309
+ rows.each do |row|
310
+ h = {}
311
+ vals = row.colVals
312
+ cols.each do |i, col, conv|
313
+ v = vals[i].get_value.value
314
+ h[col] = conv ? conv[v] : v
315
+ end
316
+ yield h
317
+ end
318
+ rows = fetch_rows(op_handle, :next)
319
+ end
320
+ end
321
+
322
+ # Pull rows from the query result
323
+ def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
324
+ fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
325
+ fetch_results = @client.FetchResults(fetch_req)
326
+ raise_error_if_failed!(fetch_results)
327
+ fetch_results.results.rows
328
+ #TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
329
+ end
330
+
331
+ # Performs an explain on the supplied query on the server, returns it as an ExplainResult.
332
+ # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
333
+ def explain(query)
334
+ rows = []
335
+ fetch_in_batch("EXPLAIN " + query) do |batch|
336
+ rows << batch.map { |b| b[:Explain] }
337
+ end
338
+ ExplainResult.new(rows.flatten)
339
+ end
340
+
341
+ # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
342
+ def fetch(query, max_rows = 100)
343
+ # Execute the query and check the result
344
+ exec_result = execute(query)
345
+ raise_error_if_failed!(exec_result)
346
+
347
+ # Get search operation handle to fetch the results
348
+ op_handle = exec_result.operationHandle
349
+
350
+ # Fetch the rows
351
+ fetch_rows(op_handle, :first, max_rows)
352
+ end
353
+
354
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
355
+ # and yields the result batches to a given block as arrays of rows.
356
+ def fetch_in_batch(query, batch_size = 1000, &block)
357
+ raise "No block given for the batch fetch request!" unless block_given?
358
+
359
+ # Execute the query and check the result
360
+ exec_result = execute(query)
361
+ raise_error_if_failed!(exec_result)
362
+
363
+ # Get search operation handle to fetch the results
364
+ op_handle = exec_result.operationHandle
365
+
366
+ # Prepare fetch results request
367
+ fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
368
+
369
+ # Now let's iterate over the results
370
+ loop do
371
+ rows = fetch_rows(op_handle, :next, batch_size)
372
+ break if rows.empty?
373
+ yield rows
374
+ end
375
+ end
376
+
377
+ def create_table(schema)
378
+ execute(schema.create_table_statement)
379
+ end
380
+
381
+ def drop_table(name)
382
+ name = name.name if name.is_a?(TableSchema)
383
+ execute("DROP TABLE `#{name}`")
384
+ end
385
+
386
+ def replace_columns(schema)
387
+ execute(schema.replace_columns_statement)
388
+ end
389
+
390
+ def add_columns(schema)
391
+ execute(schema.add_columns_statement)
392
+ end
393
+
394
+ def method_missing(meth, *args)
395
+ client.send(meth, *args)
396
+ end
397
+
398
+ private
399
+
400
+ def prepare_open_session(client_protocol)
401
+ req = ::Hive2::Thrift::TOpenSessionReq.new( @sasl_params.empty? ? [] : @sasl_params )
402
+ req.client_protocol = client_protocol
403
+ req
404
+ end
405
+
406
+ def prepare_close_session
407
+ ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
408
+ end
409
+
410
+ def prepare_execute_statement(query)
411
+ ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {"impala.resultset.cache.size"=>"100000"} )
412
+ end
413
+
414
+ def prepare_fetch_results(handle, orientation=:first, rows=100)
415
+ orientation_value = "FETCH_#{orientation.to_s.upcase}"
416
+ valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
417
+ unless valid_orientations.include?(orientation_value)
418
+ raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
419
+ end
420
+ orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
421
+ ::Hive2::Thrift::TFetchResultsReq.new(
422
+ operationHandle: handle,
423
+ orientation: orientation_const,
424
+ maxRows: rows
425
+ )
426
+ end
427
+
428
+ def prepare_operation_handle(handles)
429
+ validate_handles!(handles)
430
+ Hive2::Thrift::TOperationHandle.new(
431
+ operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
432
+ operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
433
+ hasResultSet: false
434
+ )
435
+ end
436
+
437
+ def prepare_cancel_request(handles)
438
+ Hive2::Thrift::TCancelOperationReq.new(
439
+ operationHandle: prepare_operation_handle(handles)
440
+ )
441
+ end
442
+
443
+ def validate_handles!(handles)
444
+ unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
445
+ raise "Invalid handles hash: #{handles.inspect}"
446
+ end
447
+ end
448
+
449
+ def get_schema_for(handle)
450
+ req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
451
+ metadata = client.GetResultSetMetadata( req )
452
+ metadata.schema
453
+ end
454
+
455
+ # Raises an exception if given operation result is a failure
456
+ def raise_error_if_failed!(result)
457
+ return if result.status.statusCode == 0
458
+ error_message = result.status.errorMessage || 'Execution failed!'
459
+ raise RBHive::TCLIConnectionError.new(error_message)
460
+ end
461
+ end
462
+
463
+ class TCLIConnectionError < StandardError; end
464
+ end