sequel-impala 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/LICENSE +2 -1
  4. data/README.md +45 -0
  5. data/lib/rbhive.rb +8 -0
  6. data/lib/rbhive/connection.rb +150 -0
  7. data/lib/rbhive/explain_result.rb +46 -0
  8. data/lib/rbhive/result_set.rb +37 -0
  9. data/lib/rbhive/schema_definition.rb +86 -0
  10. data/lib/rbhive/t_c_l_i_connection.rb +464 -0
  11. data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
  12. data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
  13. data/lib/rbhive/table_schema.rb +122 -0
  14. data/lib/rbhive/version.rb +3 -0
  15. data/lib/sequel/adapters/impala.rb +13 -1
  16. data/lib/sequel/adapters/rbhive.rb +174 -0
  17. data/lib/sequel/adapters/shared/impala.rb +11 -3
  18. data/lib/sequel/extensions/csv_to_parquet.rb +68 -14
  19. data/lib/thrift/facebook_service.rb +700 -0
  20. data/lib/thrift/fb303_constants.rb +9 -0
  21. data/lib/thrift/fb303_types.rb +19 -0
  22. data/lib/thrift/hive_metastore_constants.rb +41 -0
  23. data/lib/thrift/hive_metastore_types.rb +630 -0
  24. data/lib/thrift/hive_service_constants.rb +13 -0
  25. data/lib/thrift/hive_service_types.rb +72 -0
  26. data/lib/thrift/queryplan_constants.rb +13 -0
  27. data/lib/thrift/queryplan_types.rb +261 -0
  28. data/lib/thrift/sasl_client_transport.rb +161 -0
  29. data/lib/thrift/serde_constants.rb +92 -0
  30. data/lib/thrift/serde_types.rb +7 -0
  31. data/lib/thrift/t_c_l_i_service.rb +1054 -0
  32. data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
  33. data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
  34. data/lib/thrift/thrift_hive.rb +508 -0
  35. data/lib/thrift/thrift_hive_metastore.rb +3856 -0
  36. data/spec/impala_test.rb +6 -1
  37. metadata +53 -25
  38. data/README.rdoc +0 -39
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1583131d7fd60fe171fa38b12e51553760f4636f
4
- data.tar.gz: 85c496bf9bdf022c29d9a309d7a1e2f2b202a288
3
+ metadata.gz: 03ea5f2607bc4908064302d49640df3a4e34eaa3
4
+ data.tar.gz: 18c93756bb5918f32cb6f32612856963f634e966
5
5
  SHA512:
6
- metadata.gz: 82f28a92091bd3a46992f2eceac625ed30f9107cdc80e6495af4a45b76fcd55b4df23c0cac768ccce48038d3ba888e018db8df5184fc8846e7f17d793a0d6ff6
7
- data.tar.gz: 93809720a5187f6c9eafe2708bf4cf86c1575caa5a0079077f7fef2d3b76a2dac6420b94c4b8b44d7a3e9a6336bdc049accce88ea80b301dee7049684c6ea135
6
+ metadata.gz: 72ca2b1c7177ecc8c2db06e8b266f4a1ff67085cf7ef77d464e6b7667ce896870d905a691acf4fc2b34e67c90777dc3767cc9b3a1e3a2a9252967beaf507b566
7
+ data.tar.gz: fb15a8bf19c03e54179666df88da7ffc0f0be90d48a65ada20722da03f46c4fa26d910a2f3c554a93149ea29f92f565fa3bbeb43b875c8ceacfcf413b64aa161
data/CHANGELOG CHANGED
@@ -1,3 +1,19 @@
1
1
  === HEAD
2
2
 
3
+ * Fix disconnect detection in impala and rbhive adapters (jeremyevans)
4
+
5
+ * Make implicit qualify return an SQL::Identifier if given an unqualified string (jeremyevans)
6
+
7
+ * Fix :search_path option handling when using Sequel::SQL::AliasedExpressions (jeremyevans)
8
+
9
+ * Speed up multi_insert and import (jeremyevans)
10
+
11
+ * Add rbhive adapter (jeremyevans)
12
+
13
+ * Add :empty_null=>:ruby option to csv_to_parquet extension, which can support quoted CSV cells (jeremyevans)
14
+
15
+ * Optimize csv_to_parquet extension by not spawning shells or unnecessary processes (jeremyevans)
16
+
17
+ === 1.0.0 (2015-12-04)
18
+
3
19
  * Initial Public Release
data/LICENSE CHANGED
@@ -1,5 +1,6 @@
1
- Copyright (c) 2015 Jeremy Evans
1
+ Copyright (c) 2015-2016 Jeremy Evans
2
2
  Copyright (c) 2013 Colin Marc
3
+ Copyright (c) [2013] [Forward3D]
3
4
 
4
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
6
  of this software and associated documentation files (the "Software"), to
@@ -0,0 +1,45 @@
1
+ # sequel-impala
2
+
3
+ sequel-impala adds support for Sequel to connect to the Impala database
4
+ via the included impala driver, and the included jdbc-hive2 driver under JRuby.
5
+
6
+ # Source Code
7
+
8
+ Source code is available on GitHub at https://github.com/outcomesinsights/sequel-impala
9
+
10
+ # Usage
11
+
12
+ After installation, Sequel will automatically pick up the adapter as long as
13
+ the lib directory is in RUBYLIB, if you use a connection string starting with
14
+ `impala`, or `jdbc:hive2` on JRuby.
15
+
16
+ # Connection Strings
17
+
18
+ If using the impala driver (default host is localhost, default port is 21000):
19
+
20
+ impala://host:port
21
+
22
+ If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
23
+
24
+ jdbc:hive2://host:port/;auth=noSasl
25
+
26
+ # Dependencies
27
+
28
+ * sequel 4+
29
+ * thrift gem
30
+
31
+ # License
32
+
33
+ MIT/Apache
34
+
35
+ # Author
36
+
37
+ Ryan Duryea <aguynamedryan@gmail.com>
38
+
39
+ Work on sequel-impala is generously funded by [Outcomes Insights, Inc.](http://outins.com)
40
+
41
+ # Previous Author
42
+
43
+ Jeremy Evans <code@jeremyevans.net>
44
+
45
+ Provided initial work on this gem, and continues to maintain [Sequel](http://sequel.jeremyevans.net/). We can't thank you enough!
@@ -0,0 +1,8 @@
1
+ require File.join(File.dirname(__FILE__), 'rbhive', 'connection')
2
+ require File.join(File.dirname(__FILE__), 'rbhive', 'table_schema')
3
+ require File.join(File.dirname(__FILE__), 'rbhive', 'result_set')
4
+ require File.join(File.dirname(__FILE__), 'rbhive', 'explain_result')
5
+ require File.join(File.dirname(__FILE__), 'rbhive', 'schema_definition')
6
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_result_set])
7
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_schema_definition])
8
+ require File.join(File.dirname(__FILE__), *%w[rbhive t_c_l_i_connection])
@@ -0,0 +1,150 @@
1
+ # suppress warnings
2
+ old_verbose, $VERBOSE = $VERBOSE, nil
3
+ # require thrift autogenerated files
4
+ require File.join(File.split(File.dirname(__FILE__)).first, *%w[thrift thrift_hive])
5
+ # require 'thrift'
6
+ # restore warnings
7
+ $VERBOSE = old_verbose
8
+
9
+ module RBHive
10
+ def connect(server, port=10_000)
11
+ connection = RBHive::Connection.new(server, port)
12
+ ret = nil
13
+ begin
14
+ connection.open
15
+ ret = yield(connection)
16
+ ensure
17
+ connection.close
18
+ ret
19
+ end
20
+ end
21
+ module_function :connect
22
+
23
+ class StdOutLogger
24
+ %w(fatal error warn info debug).each do |level|
25
+ define_method level.to_sym do |message|
26
+ STDOUT.puts(message)
27
+ end
28
+ end
29
+ end
30
+
31
+ class Connection
32
+ attr_reader :client
33
+
34
+ def initialize(server, port=10_000, logger=StdOutLogger.new)
35
+ @socket = Thrift::Socket.new(server, port)
36
+ @transport = Thrift::BufferedTransport.new(@socket)
37
+ @protocol = Thrift::BinaryProtocol.new(@transport)
38
+ @client = Hive::Thrift::ThriftHive::Client.new(@protocol)
39
+ @logger = logger
40
+ @logger.info("Connecting to #{server} on port #{port}")
41
+ @mutex = Mutex.new
42
+ end
43
+
44
+ def open
45
+ @transport.open
46
+ end
47
+
48
+ def close
49
+ @transport.close
50
+ end
51
+
52
+ def client
53
+ @client
54
+ end
55
+
56
+ def execute(query)
57
+ execute_safe(query)
58
+ end
59
+
60
+ def explain(query)
61
+ safe do
62
+ execute_unsafe("EXPLAIN "+ query)
63
+ ExplainResult.new(client.fetchAll)
64
+ end
65
+ end
66
+
67
+ def priority=(priority)
68
+ set("mapred.job.priority", priority)
69
+ end
70
+
71
+ def queue=(queue)
72
+ set("mapred.job.queue.name", queue)
73
+ end
74
+
75
+ def set(name,value)
76
+ @logger.info("Setting #{name}=#{value}")
77
+ client.execute("SET #{name}=#{value}")
78
+ end
79
+
80
+ def fetch(query)
81
+ safe do
82
+ execute_unsafe(query)
83
+ rows = client.fetchAll
84
+ the_schema = SchemaDefinition.new(client.getSchema, rows.first)
85
+ ResultSet.new(rows, the_schema)
86
+ end
87
+ end
88
+
89
+ def fetch_in_batch(query, batch_size=1_000)
90
+ safe do
91
+ execute_unsafe(query)
92
+ until (next_batch = client.fetchN(batch_size)).empty?
93
+ the_schema ||= SchemaDefinition.new(client.getSchema, next_batch.first)
94
+ yield ResultSet.new(next_batch, the_schema)
95
+ end
96
+ end
97
+ end
98
+
99
+ def first(query)
100
+ safe do
101
+ execute_unsafe(query)
102
+ row = client.fetchOne
103
+ the_schema = SchemaDefinition.new(client.getSchema, row)
104
+ ResultSet.new([row], the_schema).first
105
+ end
106
+ end
107
+
108
+ def schema(example_row=[])
109
+ safe { SchemaDefinition.new(client.getSchema, example_row) }
110
+ end
111
+
112
+ def create_table(schema)
113
+ execute(schema.create_table_statement)
114
+ end
115
+
116
+ def drop_table(name)
117
+ name = name.name if name.is_a?(TableSchema)
118
+ execute("DROP TABLE `#{name}`")
119
+ end
120
+
121
+ def replace_columns(schema)
122
+ execute(schema.replace_columns_statement)
123
+ end
124
+
125
+ def add_columns(schema)
126
+ execute(schema.add_columns_statement)
127
+ end
128
+
129
+ def method_missing(meth, *args)
130
+ client.send(meth, *args)
131
+ end
132
+
133
+ private
134
+
135
+ def execute_safe(query)
136
+ safe { execute_unsafe(query) }
137
+ end
138
+
139
+ def execute_unsafe(query)
140
+ @logger.info("Executing Hive Query: #{query}")
141
+ client.execute(query)
142
+ end
143
+
144
+ def safe
145
+ ret = nil
146
+ @mutex.synchronize { ret = yield }
147
+ ret
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,46 @@
1
+ class ExplainResult
2
+ def initialize(rows)
3
+ @rows = rows
4
+ end
5
+
6
+ def ast
7
+ by_section[:abstract_syntax_tree].first
8
+ end
9
+
10
+ def stage_count
11
+ stage_dependencies.length
12
+ end
13
+
14
+ def stage_dependencies
15
+ by_section[:stage_dependencies] || []
16
+ end
17
+
18
+ def to_tsv
19
+ @rows.join("\n")
20
+ end
21
+
22
+ def raw
23
+ @rows
24
+ end
25
+
26
+ def to_s
27
+ to_tsv
28
+ end
29
+
30
+ private
31
+
32
+ def by_section
33
+ current_section = nil
34
+ @rows.inject({}) do |sections, row|
35
+ if row.match(/^[A-Z]/)
36
+ current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
37
+ sections[current_section] = []
38
+ elsif row.length == 0
39
+ next sections
40
+ else
41
+ sections[current_section] << row.strip
42
+ end
43
+ sections
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,37 @@
1
+ module RBHive
2
+ class ResultSet < Array
3
+ def initialize(rows, schema)
4
+ @schema = schema
5
+ super(rows.map {|r| @schema.coerce_row(r) })
6
+ end
7
+
8
+ def column_names
9
+ @schema.column_names
10
+ end
11
+
12
+ def column_type_map
13
+ @schema.column_type_map
14
+ end
15
+
16
+ def to_csv(out_file=nil)
17
+ to_separated_output(",", out_file)
18
+ end
19
+
20
+ def to_tsv(out_file=nil)
21
+ to_separated_output("\t", out_file)
22
+ end
23
+
24
+ def as_arrays
25
+ @as_arrays ||= self.map{ |r| @schema.coerce_row_to_array(r) }
26
+ end
27
+
28
+ private
29
+
30
+ def to_separated_output(sep, out_file)
31
+ rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
32
+ sv = rows.join("\n")
33
+ return sv if out_file.nil?
34
+ File.open(out_file, 'w+') { |f| f << sv }
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,86 @@
1
+ require 'json'
2
+
3
+ module RBHive
4
+ class SchemaDefinition
5
+ attr_reader :schema
6
+
7
+ NAN = Float::NAN rescue 0.0/0.0
8
+ INFINITY = Float::INFINITY rescue 1.0/0.0
9
+ TYPES = {
10
+ :boolean => :to_s,
11
+ :string => :to_s,
12
+ :bigint => :to_i,
13
+ :float => :to_f,
14
+ :double => :to_f,
15
+ :int => :to_i,
16
+ :smallint => :to_i,
17
+ :tinyint => :to_i,
18
+ }
19
+
20
+ def initialize(schema, example_row)
21
+ @schema = schema
22
+ @example_row = example_row ? example_row.split("\t") : []
23
+ end
24
+
25
+ def column_names
26
+ @column_names ||= begin
27
+ schema_names = @schema.fieldSchemas.map {|c| c.name }
28
+
29
+ # In rare cases Hive can return two identical column names
30
+ # consider SELECT a.foo, b.foo...
31
+ # in this case you get two columns called foo with no disambiguation.
32
+ # as a (far from ideal) solution we detect this edge case and rename them
33
+ # a.foo => foo1, b.foo => foo2
34
+ # otherwise we will trample one of the columns during Hash mapping.
35
+ s = Hash.new(0)
36
+ schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
37
+ schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
38
+ schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
39
+
40
+ # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
41
+ # For now we will call them :_p1, :_p2, etc. to avoid collisions.
42
+ offset = 0
43
+ while schema_names.length < @example_row.length
44
+ schema_names.push(:"_p#{offset+=1}")
45
+ end
46
+ schema_names
47
+ end
48
+ end
49
+
50
+ def column_type_map
51
+ @column_type_map ||= column_names.inject({}) do |hsh, c|
52
+ definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
53
+ # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
54
+ hsh[c] = definition ? definition.type.to_sym : :string
55
+ hsh
56
+ end
57
+ end
58
+
59
+ def coerce_row(row)
60
+ column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
61
+ hsh[column_name] = coerce_column(column_name, value)
62
+ hsh
63
+ end
64
+ end
65
+
66
+ def coerce_column(column_name, value)
67
+ type = column_type_map[column_name]
68
+ return INFINITY if (type != :string && value == "Infinity")
69
+ return NAN if (type != :string && value == "NaN")
70
+ return coerce_complex_value(value) if type.to_s =~ /^array/
71
+ conversion_method = TYPES[type]
72
+ conversion_method ? value.send(conversion_method) : value
73
+ end
74
+
75
+ def coerce_row_to_array(row)
76
+ column_names.map { |n| row[n] }
77
+ end
78
+
79
+ def coerce_complex_value(value)
80
+ return nil if value.nil?
81
+ return nil if value.length == 0
82
+ return nil if value == 'null'
83
+ JSON.parse(value)
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,464 @@
1
+ # suppress warnings
2
+ old_verbose, $VERBOSE = $VERBOSE, nil
3
+
4
+ raise 'Thrift is not loaded' unless defined?(Thrift)
5
+ raise 'RBHive is not loaded' unless defined?(RBHive)
6
+
7
+ # require thrift autogenerated files
8
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
9
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
10
+ require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
11
+
12
+ # restore warnings
13
+ $VERBOSE = old_verbose
14
+
15
+ # Monkey patch thrift to set an infinite read timeout
16
+ module Thrift
17
+ class HTTPClientTransport < BaseTransport
18
+ def flush
19
+ http = Net::HTTP.new @url.host, @url.port
20
+ http.use_ssl = @url.scheme == 'https'
21
+ http.read_timeout = nil
22
+ http.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
23
+ resp = http.post(@url.request_uri, @outbuf, @headers)
24
+ data = resp.body
25
+ data = Bytes.force_binary_encoding(data)
26
+ @inbuf = StringIO.new data
27
+ @outbuf = Bytes.empty_byte_buffer
28
+ end
29
+ end
30
+ end
31
+
32
+ module RBHive
33
+
34
+ HIVE_THRIFT_MAPPING = {
35
+ 10 => 0,
36
+ 11 => 1,
37
+ 12 => 2,
38
+ 13 => 6,
39
+ :cdh4 => 0,
40
+ :cdh5 => 4,
41
+ :PROTOCOL_V1 => 0,
42
+ :PROTOCOL_V2 => 1,
43
+ :PROTOCOL_V3 => 2,
44
+ :PROTOCOL_V4 => 3,
45
+ :PROTOCOL_V5 => 4,
46
+ :PROTOCOL_V6 => 5,
47
+ :PROTOCOL_V7 => 6
48
+ }
49
+
50
+ def tcli_connect(server, port = 10_000, options={})
51
+ logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
52
+ connection = RBHive::TCLIConnection.new(server, port, options, logger)
53
+ ret = nil
54
+ begin
55
+ connection.open
56
+ connection.open_session
57
+ ret = yield(connection)
58
+
59
+ ensure
60
+ # Try to close the session and our connection if those are still open, ignore io errors
61
+ begin
62
+ connection.close_session if connection.session
63
+ connection.close
64
+ rescue IOError => e
65
+ # noop
66
+ end
67
+ end
68
+
69
+ ret
70
+ end
71
+ module_function :tcli_connect
72
+
73
+ class StdOutLogger
74
+ %w(fatal error warn info debug).each do |level|
75
+ define_method level.to_sym do |message|
76
+ STDOUT.puts(message)
77
+ end
78
+ end
79
+ end
80
+
81
+ class TCLIConnection
82
+ attr_reader :client
83
+
84
+ def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
85
+ options ||= {} # backwards compatibility
86
+ raise "'options' parameter must be a hash" unless options.is_a?(Hash)
87
+ @sasl_params = options.delete(:sasl_params) || {}
88
+
89
+ if options[:transport] == :sasl and @sasl_params.empty?
90
+ raise ":transport is set to :sasl, but no :sasl_params option was supplied"
91
+ end
92
+
93
+ # Defaults to buffered transport, Hive 0.10, 1800 second timeout
94
+ options[:transport] ||= :buffered
95
+ options[:hive_version] ||= 10
96
+ options[:timeout] ||= 1800
97
+ @options = options
98
+ # Look up the appropriate Thrift protocol version for the supplied Hive version
99
+ @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
100
+
101
+ @logger = logger
102
+ @transport = thrift_transport(server, port)
103
+ @protocol = Thrift::BinaryProtocol.new(@transport)
104
+ @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
105
+ @session = nil
106
+ @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
107
+ end
108
+
109
+ def thrift_hive_protocol(version)
110
+ HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
111
+ end
112
+
113
+ def thrift_transport(server, port)
114
+ @logger.info("Initializing transport #{@options[:transport]}")
115
+ case @options[:transport]
116
+ when :buffered
117
+ return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
118
+ when :sasl
119
+ return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
120
+ parse_sasl_params(@sasl_params))
121
+ when :http
122
+ return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
123
+ else
124
+ raise "Unrecognised transport type '#{transport}'"
125
+ end
126
+ end
127
+
128
+ def thrift_socket(server, port, timeout)
129
+ socket = Thrift::Socket.new(server, port)
130
+ socket.timeout = timeout
131
+ socket
132
+ end
133
+
134
+ # Processes SASL connection params and returns a hash with symbol keys or a nil
135
+ def parse_sasl_params(sasl_params)
136
+ # Symbilize keys in a hash
137
+ if sasl_params.kind_of?(Hash)
138
+ return sasl_params.inject({}) do |memo,(k,v)|
139
+ memo[k.to_sym] = v;
140
+ memo
141
+ end
142
+ end
143
+ return nil
144
+ end
145
+
146
+ def open
147
+ @transport.open
148
+ end
149
+
150
+ def close
151
+ @transport.close
152
+ end
153
+
154
+ def open_session
155
+ @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
156
+ end
157
+
158
+ def close_session
159
+ @client.CloseSession prepare_close_session
160
+ @session = nil
161
+ end
162
+
163
+ def session
164
+ @session && @session.sessionHandle
165
+ end
166
+
167
+ def client
168
+ @client
169
+ end
170
+
171
+ def execute(query)
172
+ @logger.info("Executing Hive Query: #{query}")
173
+ req = prepare_execute_statement(query)
174
+ exec_result = client.ExecuteStatement(req)
175
+ raise_error_if_failed!(exec_result)
176
+ exec_result
177
+ end
178
+
179
+ def priority=(priority)
180
+ set("mapred.job.priority", priority)
181
+ end
182
+
183
+ def queue=(queue)
184
+ set("mapred.job.queue.name", queue)
185
+ end
186
+
187
+ def set(name,value)
188
+ @logger.info("Setting #{name}=#{value}")
189
+ self.execute("SET #{name}=#{value}")
190
+ end
191
+
192
+ # Async execute
193
+ def async_execute(query)
194
+ @logger.info("Executing query asynchronously: #{query}")
195
+ exec_result = @client.ExecuteStatement(
196
+ Hive2::Thrift::TExecuteStatementReq.new(
197
+ sessionHandle: @session.sessionHandle,
198
+ statement: query,
199
+ runAsync: true
200
+ )
201
+ )
202
+ raise_error_if_failed!(exec_result)
203
+ op_handle = exec_result.operationHandle
204
+
205
+ # Return handles to get hold of this query / session again
206
+ {
207
+ session: @session.sessionHandle,
208
+ guid: op_handle.operationId.guid,
209
+ secret: op_handle.operationId.secret
210
+ }
211
+ end
212
+
213
+ # Is the query complete?
214
+ def async_is_complete?(handles)
215
+ async_state(handles) == :finished
216
+ end
217
+
218
+ # Is the query actually running?
219
+ def async_is_running?(handles)
220
+ async_state(handles) == :running
221
+ end
222
+
223
+ # Has the query failed?
224
+ def async_is_failed?(handles)
225
+ async_state(handles) == :error
226
+ end
227
+
228
+ def async_is_cancelled?(handles)
229
+ async_state(handles) == :cancelled
230
+ end
231
+
232
+ def async_cancel(handles)
233
+ @client.CancelOperation(prepare_cancel_request(handles))
234
+ end
235
+
236
+ # Map states to symbols
237
+ def async_state(handles)
238
+ response = @client.GetOperationStatus(
239
+ Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
240
+ )
241
+
242
+ case response.operationState
243
+ when Hive2::Thrift::TOperationState::FINISHED_STATE
244
+ return :finished
245
+ when Hive2::Thrift::TOperationState::INITIALIZED_STATE
246
+ return :initialized
247
+ when Hive2::Thrift::TOperationState::RUNNING_STATE
248
+ return :running
249
+ when Hive2::Thrift::TOperationState::CANCELED_STATE
250
+ return :cancelled
251
+ when Hive2::Thrift::TOperationState::CLOSED_STATE
252
+ return :closed
253
+ when Hive2::Thrift::TOperationState::ERROR_STATE
254
+ return :error
255
+ when Hive2::Thrift::TOperationState::UKNOWN_STATE
256
+ return :unknown
257
+ when Hive2::Thrift::TOperationState::PENDING_STATE
258
+ return :pending
259
+ when nil
260
+ raise "No operation state found for handles - has the session been closed?"
261
+ else
262
+ return :state_not_in_protocol
263
+ end
264
+ end
265
+
266
+ # Async fetch results from an async execute
267
+ def async_fetch(handles, max_rows = 100)
268
+ # Can't get data from an unfinished query
269
+ unless async_is_complete?(handles)
270
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
271
+ end
272
+
273
+ # Fetch and
274
+ fetch_rows(prepare_operation_handle(handles), :first, max_rows)
275
+ end
276
+
277
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
278
+ # and yields the result batches to a given block as arrays of rows.
279
+ def async_fetch_in_batch(handles, batch_size = 1000, &block)
280
+ raise "No block given for the batch fetch request!" unless block_given?
281
+ # Can't get data from an unfinished query
282
+ unless async_is_complete?(handles)
283
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
284
+ end
285
+
286
+ # Now let's iterate over the results
287
+ loop do
288
+ rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
289
+ break if rows.empty?
290
+ yield rows
291
+ end
292
+ end
293
+
294
+ def async_close_session(handles)
295
+ validate_handles!(handles)
296
+ @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
297
+ end
298
+
299
+ def get_column_info(op_handle)
300
+ cols = get_schema_for(op_handle).columns
301
+ [cols.map(&:columnName), cols.map{|c| c.typeDesc.types.first.primitiveEntry.type}]
302
+ end
303
+
304
+ def yield_hash_rows(op_handle, columns, convertors)
305
+ i = -1
306
+ cols = columns.zip(convertors).map{|col, conv| [i+=1, col, conv]}
307
+ rows = fetch_rows(op_handle)
308
+ until rows.empty?
309
+ rows.each do |row|
310
+ h = {}
311
+ vals = row.colVals
312
+ cols.each do |i, col, conv|
313
+ v = vals[i].get_value.value
314
+ h[col] = conv ? conv[v] : v
315
+ end
316
+ yield h
317
+ end
318
+ rows = fetch_rows(op_handle, :next)
319
+ end
320
+ end
321
+
322
+ # Pull rows from the query result
323
+ def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
324
+ fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
325
+ fetch_results = @client.FetchResults(fetch_req)
326
+ raise_error_if_failed!(fetch_results)
327
+ fetch_results.results.rows
328
+ #TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
329
+ end
330
+
331
+ # Performs a explain on the supplied query on the server, returns it as a ExplainResult.
332
+ # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
333
+ def explain(query)
334
+ rows = []
335
+ fetch_in_batch("EXPLAIN " + query) do |batch|
336
+ rows << batch.map { |b| b[:Explain] }
337
+ end
338
+ ExplainResult.new(rows.flatten)
339
+ end
340
+
341
+ # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
342
+ def fetch(query, max_rows = 100)
343
+ # Execute the query and check the result
344
+ exec_result = execute(query)
345
+ raise_error_if_failed!(exec_result)
346
+
347
+ # Get search operation handle to fetch the results
348
+ op_handle = exec_result.operationHandle
349
+
350
+ # Fetch the rows
351
+ fetch_rows(op_handle, :first, max_rows)
352
+ end
353
+
354
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
355
+ # and yields the result batches to a given block as arrays of rows.
356
+ def fetch_in_batch(query, batch_size = 1000, &block)
357
+ raise "No block given for the batch fetch request!" unless block_given?
358
+
359
+ # Execute the query and check the result
360
+ exec_result = execute(query)
361
+ raise_error_if_failed!(exec_result)
362
+
363
+ # Get search operation handle to fetch the results
364
+ op_handle = exec_result.operationHandle
365
+
366
+ # Prepare fetch results request
367
+ fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
368
+
369
+ # Now let's iterate over the results
370
+ loop do
371
+ rows = fetch_rows(op_handle, :next, batch_size)
372
+ break if rows.empty?
373
+ yield rows
374
+ end
375
+ end
376
+
377
+ def create_table(schema)
378
+ execute(schema.create_table_statement)
379
+ end
380
+
381
+ def drop_table(name)
382
+ name = name.name if name.is_a?(TableSchema)
383
+ execute("DROP TABLE `#{name}`")
384
+ end
385
+
386
+ def replace_columns(schema)
387
+ execute(schema.replace_columns_statement)
388
+ end
389
+
390
+ def add_columns(schema)
391
+ execute(schema.add_columns_statement)
392
+ end
393
+
394
+ def method_missing(meth, *args)
395
+ client.send(meth, *args)
396
+ end
397
+
398
+ private
399
+
400
+ def prepare_open_session(client_protocol)
401
+ req = ::Hive2::Thrift::TOpenSessionReq.new( @sasl_params.empty? ? [] : @sasl_params )
402
+ req.client_protocol = client_protocol
403
+ req
404
+ end
405
+
406
+ def prepare_close_session
407
+ ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
408
+ end
409
+
410
+ def prepare_execute_statement(query)
411
+ ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {"impala.resultset.cache.size"=>"100000"} )
412
+ end
413
+
414
+ def prepare_fetch_results(handle, orientation=:first, rows=100)
415
+ orientation_value = "FETCH_#{orientation.to_s.upcase}"
416
+ valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
417
+ unless valid_orientations.include?(orientation_value)
418
+ raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
419
+ end
420
+ orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
421
+ ::Hive2::Thrift::TFetchResultsReq.new(
422
+ operationHandle: handle,
423
+ orientation: orientation_const,
424
+ maxRows: rows
425
+ )
426
+ end
427
+
428
+ def prepare_operation_handle(handles)
429
+ validate_handles!(handles)
430
+ Hive2::Thrift::TOperationHandle.new(
431
+ operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
432
+ operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
433
+ hasResultSet: false
434
+ )
435
+ end
436
+
437
+ def prepare_cancel_request(handles)
438
+ Hive2::Thrift::TCancelOperationReq.new(
439
+ operationHandle: prepare_operation_handle(handles)
440
+ )
441
+ end
442
+
443
+ def validate_handles!(handles)
444
+ unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
445
+ raise "Invalid handles hash: #{handles.inspect}"
446
+ end
447
+ end
448
+
449
+ def get_schema_for(handle)
450
+ req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
451
+ metadata = client.GetResultSetMetadata( req )
452
+ metadata.schema
453
+ end
454
+
455
+ # Raises an exception if given operation result is a failure
456
+ def raise_error_if_failed!(result)
457
+ return if result.status.statusCode == 0
458
+ error_message = result.status.errorMessage || 'Execution failed!'
459
+ raise RBHive::TCLIConnectionError.new(error_message)
460
+ end
461
+ end
462
+
463
+ class TCLIConnectionError < StandardError; end
464
+ end