rbhive-u2i 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ class ExplainResult
+   def initialize(rows)
+     @rows = rows
+   end
+
+   def ast
+     by_section[:abstract_syntax_tree].first
+   end
+
+   def stage_count
+     stage_dependencies.length
+   end
+
+   def stage_dependencies
+     by_section[:stage_dependencies] || []
+   end
+
+   def to_tsv
+     @rows.join("\n")
+   end
+
+   def raw
+     @rows
+   end
+
+   def to_s
+     to_tsv
+   end
+
+   private
+
+   def by_section
+     current_section = nil
+     @rows.inject({}) do |sections, row|
+       if row.match(/^[A-Z]/)
+         current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
+         sections[current_section] = []
+       elsif row.length == 0
+         next sections
+       else
+         sections[current_section] << row.strip
+       end
+       sections
+     end
+   end
+ end
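
For context, a minimal sketch of how ExplainResult slices raw EXPLAIN output into sections. The sample rows below are illustrative rather than actual Hive output; section keys are derived from the uppercase header lines ("ABSTRACT SYNTAX TREE:" becomes :abstract_syntax_tree, and so on).

    rows = [
      "ABSTRACT SYNTAX TREE:",
      "  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME web_logs))))",
      "",
      "STAGE DEPENDENCIES:",
      "  Stage-1 is a root stage",
      "  Stage-0 depends on stages: Stage-1"
    ]

    result = ExplainResult.new(rows)
    result.ast          # => "(TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME web_logs))))"
    result.stage_count  # => 2
    result.to_tsv       # => the raw rows joined with newlines
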
@@ -0,0 +1,37 @@
+ module RBHive
+   class ResultSet < Array
+     def initialize(rows, schema)
+       @schema = schema
+       super(rows.map { |r| @schema.coerce_row(r) })
+     end
+
+     def column_names
+       @schema.column_names
+     end
+
+     def column_type_map
+       @schema.column_type_map
+     end
+
+     def to_csv(out_file=nil)
+       to_separated_output(",", out_file)
+     end
+
+     def to_tsv(out_file=nil)
+       to_separated_output("\t", out_file)
+     end
+
+     def as_arrays
+       @as_arrays ||= self.map { |r| @schema.coerce_row_to_array(r) }
+     end
+
+     private
+
+     def to_separated_output(sep, out_file)
+       rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
+       sv = rows.join("\n")
+       return sv if out_file.nil?
+       File.open(out_file, 'w+') { |f| f << sv }
+     end
+   end
+ end
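
A minimal sketch of the contract ResultSet expects from its schema argument. StubSchema below is illustrative and not part of the gem; in practice a SchemaDefinition instance (next file) plays this role, turning tab-separated HiveServer rows into hashes keyed by column name.

    class StubSchema
      def column_names
        [:id, :name]
      end

      def column_type_map
        { :id => :int, :name => :string }
      end

      # Raw rows arrive as tab-separated strings
      def coerce_row(row)
        id, name = row.split("\t")
        { :id => id.to_i, :name => name }
      end

      def coerce_row_to_array(row)
        column_names.map { |n| row[n] }
      end
    end

    rs = RBHive::ResultSet.new(["1\talice", "2\tbob"], StubSchema.new)
    rs.first    # => { :id => 1, :name => "alice" }
    rs.to_csv   # => "1,alice\n2,bob"
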
@@ -0,0 +1,87 @@
+ require 'json'
+
+ module RBHive
+   class SchemaDefinition
+     attr_reader :schema
+
+     NAN = Float::NAN rescue 0.0/0.0
+     INFINITY = Float::INFINITY rescue 1.0/0.0
+     TYPES = {
+       :boolean => :to_s,
+       :string => :to_s,
+       :bigint => :to_i,
+       :float => :to_f,
+       :double => :to_f,
+       :int => :to_i,
+       :smallint => :to_i,
+       :tinyint => :to_i,
+     }
+
+     def initialize(schema, example_row)
+       @schema = schema
+       @example_row = example_row ? example_row.split("\t") : []
+     end
+
+     def column_names
+       @column_names ||= begin
+         schema_names = @schema.fieldSchemas.map { |c| c.name }
+
+         # In rare cases Hive can return two identical column names;
+         # consider SELECT a.foo, b.foo ...
+         # In this case you get two columns called foo with no disambiguation.
+         # As a (far from ideal) solution we detect this edge case and rename them
+         # a.foo => foo_1, b.foo => foo_2;
+         # otherwise we would trample one of the columns during Hash mapping.
+         s = Hash.new(0)
+         schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
+         schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
+         schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
+
+         # Let's fix the fact that Hive doesn't return schema data for partitions on SELECT * queries.
+         # For now we will call them :_p1, :_p2, etc. to avoid collisions.
+         offset = 0
+         while schema_names.length < @example_row.length
+           schema_names.push(:"_p#{offset += 1}")
+         end
+         schema_names
+       end
+     end
+
+     def column_type_map
+       @column_type_map ||= column_names.inject({}) do |hsh, c|
+         definition = @schema.fieldSchemas.find { |s| s.name.to_sym == c }
+         # If the column isn't in the schema (e.g. partitions in SELECT * queries), assume it is a string
+         hsh[c] = definition ? definition.type.to_sym : :string
+         hsh
+       end
+     end
+
+     def coerce_row(row)
+       column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
+         hsh[column_name] = coerce_column(column_name, value)
+         hsh
+       end
+     end
+
+     def coerce_column(column_name, value)
+       return nil if value.nil?
+       type = column_type_map[column_name]
+       return INFINITY if (type != :string && value == "Infinity")
+       return NAN if (type != :string && value == "NaN")
+       return coerce_complex_value(value) if type.to_s =~ /^array/
+       conversion_method = TYPES[type]
+       conversion_method ? value.send(conversion_method) : value
+     end
+
+     def coerce_row_to_array(row)
+       column_names.map { |n| row[n] }
+     end
+
+     def coerce_complex_value(value)
+       return nil if value.nil?
+       return nil if value.length == 0
+       return nil if value == 'null'
+       JSON.parse(value)
+     end
+   end
+ end
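
A hedged sketch of SchemaDefinition's coercion behaviour. The OpenStruct stands in for the Thrift result-set metadata (anything responding to #fieldSchemas, whose items respond to #name and #type); the column names and values are made up for illustration.

    require 'ostruct'

    # Stand-in for the Thrift metadata object; illustrative only
    metadata = OpenStruct.new(
      fieldSchemas: [
        OpenStruct.new(name: "id",    type: "int"),
        OpenStruct.new(name: "score", type: "double"),
        OpenStruct.new(name: "tags",  type: "array<string>")
      ]
    )

    schema = RBHive::SchemaDefinition.new(metadata, "1\t2.5\t[\"a\",\"b\"]")

    schema.column_names                         # => [:id, :score, :tags]
    schema.coerce_row("1\t2.5\t[\"a\",\"b\"]")  # => { :id => 1, :score => 2.5, :tags => ["a", "b"] }
    schema.coerce_row("2\tNaN\tnull")           # => { :id => 2, :score => Float::NAN, :tags => nil }
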
@@ -0,0 +1,441 @@
+ # suppress warnings
+ old_verbose, $VERBOSE = $VERBOSE, nil
+
+ raise 'Thrift is not loaded' unless defined?(Thrift)
+ raise 'RBHive is not loaded' unless defined?(RBHive)
+
+ # require thrift autogenerated files
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
+ require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
+
+ # restore warnings
+ $VERBOSE = old_verbose
+
+ # Monkey patch thrift to set an infinite read timeout
+ module Thrift
+   class HTTPClientTransport < BaseTransport
+     def flush
+       http = Net::HTTP.new @url.host, @url.port
+       http.use_ssl = @url.scheme == 'https'
+       http.read_timeout = nil
+       http.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
+       resp = http.post(@url.request_uri, @outbuf, @headers)
+       data = resp.body
+       data = Bytes.force_binary_encoding(data)
+       @inbuf = StringIO.new data
+       @outbuf = Bytes.empty_byte_buffer
+     end
+   end
+ end
+
+ module RBHive
+
+   HIVE_THRIFT_MAPPING = {
+     10 => 0,
+     11 => 1,
+     12 => 2,
+     13 => 6,
+     :cdh4 => 0,
+     :cdh5 => 4,
+     :PROTOCOL_V1 => 0,
+     :PROTOCOL_V2 => 1,
+     :PROTOCOL_V3 => 2,
+     :PROTOCOL_V4 => 3,
+     :PROTOCOL_V5 => 4,
+     :PROTOCOL_V6 => 5,
+     :PROTOCOL_V7 => 6
+   }
+
+   def tcli_connect(server, port = 10_000, options)
+     logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
+     connection = RBHive::TCLIConnection.new(server, port, options, logger)
+     ret = nil
+     begin
+       connection.open
+       connection.open_session
+       ret = yield(connection)
+
+     ensure
+       # Try to close the session and our connection if those are still open, ignore io errors
+       begin
+         connection.close_session if connection.session
+         connection.close
+       rescue IOError => e
+         # noop
+       end
+     end
+
+     ret
+   end
+   module_function :tcli_connect
+
+   class StdOutLogger
+     %w(fatal error warn info debug).each do |level|
+       define_method level.to_sym do |message|
+         STDOUT.puts(message)
+       end
+     end
+   end
+
+   class TCLIConnection
+     attr_reader :client
+
+     def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
+       options ||= {} # backwards compatibility
+       raise "'options' parameter must be a hash" unless options.is_a?(Hash)
+
+       if options[:transport] == :sasl and options[:sasl_params].nil?
+         raise ":transport is set to :sasl, but no :sasl_params option was supplied"
+       end
+
+       # Defaults to buffered transport, Hive 0.10, 1800 second timeout
+       options[:transport] ||= :buffered
+       options[:hive_version] ||= 10
+       options[:timeout] ||= 1800
+       @options = options
+
+       # Look up the appropriate Thrift protocol version for the supplied Hive version
+       @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
+
+       @logger = logger
+       @transport = thrift_transport(server, port)
+       @protocol = Thrift::BinaryProtocol.new(@transport)
+       @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
+       @session = nil
+       @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
+     end
+
+     def thrift_hive_protocol(version)
+       HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
+     end
+
+     def thrift_transport(server, port)
+       @logger.info("Initializing transport #{@options[:transport]}")
+       case @options[:transport]
+       when :buffered
+         return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
+       when :sasl
+         return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
+                                                parse_sasl_params(@options[:sasl_params]))
+       when :http
+         return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
+       else
+         raise "Unrecognised transport type '#{@options[:transport]}'"
+       end
+     end
+
+     def thrift_socket(server, port, timeout)
+       socket = Thrift::Socket.new(server, port)
+       socket.timeout = timeout
+       socket
+     end
+
+     # Processes SASL connection params and returns a hash with symbol keys, or nil
+     def parse_sasl_params(sasl_params)
+       # Symbolize keys in a hash
+       if sasl_params.kind_of?(Hash)
+         return sasl_params.inject({}) do |memo, (k, v)|
+           memo[k.to_sym] = v
+           memo
+         end
+       end
+       return nil
+     end
+
+     def open
+       @transport.open
+     end
+
+     def close
+       @transport.close
+     end
+
+     def open_session
+       @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
+     end
+
+     def close_session
+       @client.CloseSession prepare_close_session
+       @session = nil
+     end
+
+     def session
+       @session && @session.sessionHandle
+     end
+
+     def client
+       @client
+     end
+
+     def execute(query)
+       @logger.info("Executing Hive Query: #{query}")
+       req = prepare_execute_statement(query)
+       exec_result = client.ExecuteStatement(req)
+       raise_error_if_failed!(exec_result)
+       exec_result
+     end
+
+     def priority=(priority)
+       set("mapred.job.priority", priority)
+     end
+
+     def queue=(queue)
+       set("mapred.job.queue.name", queue)
+     end
+
+     def set(name, value)
+       @logger.info("Setting #{name}=#{value}")
+       self.execute("SET #{name}=#{value}")
+     end
+
+     # Async execute
+     def async_execute(query)
+       @logger.info("Executing query asynchronously: #{query}")
+       exec_result = @client.ExecuteStatement(
+         Hive2::Thrift::TExecuteStatementReq.new(
+           sessionHandle: @session.sessionHandle,
+           statement: query,
+           runAsync: true
+         )
+       )
+       raise_error_if_failed!(exec_result)
+       op_handle = exec_result.operationHandle
+
+       # Return handles to get hold of this query / session again
+       {
+         session: @session.sessionHandle,
+         guid: op_handle.operationId.guid,
+         secret: op_handle.operationId.secret
+       }
+     end
+
+     # Is the query complete?
+     def async_is_complete?(handles)
+       async_state(handles) == :finished
+     end
+
+     # Is the query actually running?
+     def async_is_running?(handles)
+       async_state(handles) == :running
+     end
+
+     # Has the query failed?
+     def async_is_failed?(handles)
+       async_state(handles) == :error
+     end
+
+     def async_is_cancelled?(handles)
+       async_state(handles) == :cancelled
+     end
+
+     def async_cancel(handles)
+       @client.CancelOperation(prepare_cancel_request(handles))
+     end
+
+     # Map states to symbols
+     def async_state(handles)
+       response = @client.GetOperationStatus(
+         Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
+       )
+
+       case response.operationState
+       when Hive2::Thrift::TOperationState::FINISHED_STATE
+         return :finished
+       when Hive2::Thrift::TOperationState::INITIALIZED_STATE
+         return :initialized
+       when Hive2::Thrift::TOperationState::RUNNING_STATE
+         return :running
+       when Hive2::Thrift::TOperationState::CANCELED_STATE
+         return :cancelled
+       when Hive2::Thrift::TOperationState::CLOSED_STATE
+         return :closed
+       when Hive2::Thrift::TOperationState::ERROR_STATE
+         return :error
+       when Hive2::Thrift::TOperationState::UKNOWN_STATE # sic: the constant is spelled this way in Hive's Thrift definition
+         return :unknown
+       when Hive2::Thrift::TOperationState::PENDING_STATE
+         return :pending
+       when nil
+         raise "No operation state found for handles - has the session been closed?"
+       else
+         return :state_not_in_protocol
+       end
+     end
+
+     # Fetch results from an async execute
+     def async_fetch(handles, max_rows = 100)
+       # Can't get data from an unfinished query
+       unless async_is_complete?(handles)
+         raise "Can't perform fetch on a query in state: #{async_state(handles)}"
+       end
+
+       # Fetch and return the results
+       fetch_rows(prepare_operation_handle(handles), :first, max_rows)
+     end
+
+     # Fetches the results of a finished async query in batches of *batch_size* rows
+     # and yields each batch to the given block as an array of rows.
+     def async_fetch_in_batch(handles, batch_size = 1000, &block)
+       raise "No block given for the batch fetch request!" unless block_given?
+       # Can't get data from an unfinished query
+       unless async_is_complete?(handles)
+         raise "Can't perform fetch on a query in state: #{async_state(handles)}"
+       end
+
+       # Now let's iterate over the results
+       loop do
+         rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
+         break if rows.empty?
+         yield rows
+       end
+     end
+
+     def async_close_session(handles)
+       validate_handles!(handles)
+       @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
+     end
+
+     # Pull rows from the query result
+     def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
+       fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
+       fetch_results = @client.FetchResults(fetch_req)
+       raise_error_if_failed!(fetch_results)
+       rows = fetch_results.results.rows
+       TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
+     end
+
+     # Performs an EXPLAIN on the supplied query and returns the result as an ExplainResult.
+     # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
+     def explain(query)
+       rows = []
+       fetch_in_batch("EXPLAIN " + query) do |batch|
+         rows << batch.map { |b| b[:Explain] }
+       end
+       ExplainResult.new(rows.flatten)
+     end
+
+     # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
+     def fetch(query, max_rows = 100)
+       # Execute the query and check the result
+       exec_result = execute(query)
+       raise_error_if_failed!(exec_result)
+
+       # Get the operation handle used to fetch the results
+       op_handle = exec_result.operationHandle
+
+       # Fetch the rows
+       fetch_rows(op_handle, :first, max_rows)
+     end
+
+     # Performs a query on the server, fetches the results in batches of *batch_size* rows
+     # and yields the result batches to a given block as arrays of rows.
+     def fetch_in_batch(query, batch_size = 1000, &block)
+       raise "No block given for the batch fetch request!" unless block_given?
+
+       # Execute the query and check the result
+       exec_result = execute(query)
+       raise_error_if_failed!(exec_result)
+
+       # Get the operation handle used to fetch the results
+       op_handle = exec_result.operationHandle
+
+       # NOTE: this prepared request is unused; fetch_rows below builds its own request
+       fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
+
+       # Now let's iterate over the results
+       loop do
+         rows = fetch_rows(op_handle, :next, batch_size)
+         break if rows.empty?
+         yield rows
+       end
+     end
+
+     def create_table(schema)
+       execute(schema.create_table_statement)
+     end
+
+     def drop_table(name)
+       name = name.name if name.is_a?(TableSchema)
+       execute("DROP TABLE `#{name}`")
+     end
+
+     def replace_columns(schema)
+       execute(schema.replace_columns_statement)
+     end
+
+     def add_columns(schema)
+       execute(schema.add_columns_statement)
+     end
+
+     def method_missing(meth, *args)
+       client.send(meth, *args)
+     end
+
+     private
+
+     def prepare_open_session(client_protocol)
+       req = ::Hive2::Thrift::TOpenSessionReq.new( @options[:sasl_params].nil? ? [] : @options[:sasl_params] )
+       req.client_protocol = client_protocol
+       req
+     end
+
+     def prepare_close_session
+       ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
+     end
+
+     def prepare_execute_statement(query)
+       ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {} )
+     end
+
+     def prepare_fetch_results(handle, orientation=:first, rows=100)
+       orientation_value = "FETCH_#{orientation.to_s.upcase}"
+       valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
+       unless valid_orientations.include?(orientation_value)
+         raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
+       end
+       orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
+       ::Hive2::Thrift::TFetchResultsReq.new(
+         operationHandle: handle,
+         orientation: orientation_const,
+         maxRows: rows
+       )
+     end
+
+     def prepare_operation_handle(handles)
+       validate_handles!(handles)
+       Hive2::Thrift::TOperationHandle.new(
+         operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
+         operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
+         hasResultSet: false
+       )
+     end
+
+     def prepare_cancel_request(handles)
+       Hive2::Thrift::TCancelOperationReq.new(
+         operationHandle: prepare_operation_handle(handles)
+       )
+     end
+
+     def validate_handles!(handles)
+       unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
+         raise "Invalid handles hash: #{handles.inspect}"
+       end
+     end
+
+     def get_schema_for(handle)
+       req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
+       metadata = client.GetResultSetMetadata( req )
+       metadata.schema
+     end
+
+     # Raises an exception if the given operation result is a failure
+     def raise_error_if_failed!(result)
+       return if result.status.statusCode == 0
+       error_message = result.status.errorMessage || 'Execution failed!'
+       raise RBHive::TCLIConnectionError.new(error_message)
+     end
+   end
+
+   class TCLIConnectionError < StandardError; end
+ end
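
Putting the pieces together, a hypothetical usage sketch of the HiveServer2 (TCLI) interface defined above; the host, table names and queries are placeholders, and error handling is omitted.

    # Hypothetical usage; host, port and queries are placeholders
    RBHive.tcli_connect('hive.example.com', 10_000, transport: :buffered, hive_version: 12) do |connection|
      # Synchronous fetch of up to 100 rows
      rows = connection.fetch("SELECT * FROM web_logs LIMIT 100")
      puts rows.column_names.inspect

      # Batched fetch: yields arrays of rows, 1000 at a time
      connection.fetch_in_batch("SELECT * FROM web_logs") do |batch|
        puts batch.length
      end

      # Asynchronous execution: submit, poll, then fetch
      handles = connection.async_execute("SELECT COUNT(*) FROM web_logs")
      sleep 1 until connection.async_is_complete?(handles)
      connection.async_fetch(handles)
    end
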