rbhive-u2i 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
1
# Wraps the text rows of a Hive EXPLAIN result and exposes its sections.
#
# A row that starts with an upper-case letter (for example
# "STAGE DEPENDENCIES:") opens a new section; the non-empty rows that follow
# are collected, stripped, under that section's snake_cased symbol.
class ExplainResult
  # @param rows [Array<String>] the EXPLAIN output, one line per element
  def initialize(rows)
    @rows = rows
  end

  # @return [String, nil] first line of the :abstract_syntax_tree section, or
  #   nil when the output has no such section
  def ast
    (by_section[:abstract_syntax_tree] || []).first
  end

  # @return [Integer] number of lines in the :stage_dependencies section
  def stage_count
    stage_dependencies.length
  end

  # @return [Array<String>] lines of the :stage_dependencies section
  #   (empty when the section is absent)
  def stage_dependencies
    by_section[:stage_dependencies] || []
  end

  # @return [String] the raw rows joined with newlines
  def to_tsv
    @rows.join("\n")
  end

  # @return [Array<String>] the rows exactly as they were supplied
  def raw
    @rows
  end

  def to_s
    to_tsv
  end

  private

  # Groups the rows into a Hash of section symbol => Array of stripped lines.
  def by_section
    current_section = nil
    @rows.each_with_object({}) do |row, sections|
      if row.match(/^[A-Z]/)
        current_section = row.chomp(':').downcase.gsub(' ', '_').to_sym
        sections[current_section] = []
      elsif row.length == 0
        next
      elsif current_section
        # Detail rows seen before any section header have nowhere to go and
        # are skipped instead of raising on a nil section.
        sections[current_section] << row.strip
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module RBHive
  # An Array of coerced rows together with the schema that coerced them.
  #
  # Each raw row is passed through +schema.coerce_row+ on construction; the
  # schema is also used to turn rows back into ordered arrays for output.
  class ResultSet < Array
    # @param rows [Array, nil] raw rows; nil is treated as an empty result
    # @param schema [#coerce_row, #coerce_row_to_array, #column_names, #column_type_map]
    def initialize(rows, schema)
      @schema = schema
      super(Array(rows).map { |r| @schema.coerce_row(r) })
    end

    def column_names
      @schema.column_names
    end

    def column_type_map
      @schema.column_type_map
    end

    # Comma-separated rendering; returned as a String, or written to
    # +out_file+ when a path is given. Values are joined as-is (no quoting).
    def to_csv(out_file=nil)
      to_separated_output(",", out_file)
    end

    # Tab-separated rendering; same +out_file+ behaviour as #to_csv.
    def to_tsv(out_file=nil)
      to_separated_output("\t", out_file)
    end

    # @return [Array<Array>] every row as an array of values (memoized)
    def as_arrays
      @as_arrays ||= map { |r| @schema.coerce_row_to_array(r) }
    end

    private

    # Joins each row's values with +sep+ and the rows with newlines.
    def to_separated_output(sep, out_file)
      lines = map { |r| @schema.coerce_row_to_array(r).join(sep) }
      sv = lines.join("\n")
      return sv if out_file.nil?
      File.open(out_file, 'w+') { |f| f << sv }
    end
  end
end
@@ -0,0 +1,87 @@
1
require 'json'

module RBHive
  # Describes the columns of a result and coerces tab-separated row strings
  # into Hashes keyed by column name.
  class SchemaDefinition
    attr_reader :schema

    # Fallbacks cover Ruby versions without Float::NAN / Float::INFINITY.
    NAN = Float::NAN rescue 0.0/0.0
    INFINITY = Float::INFINITY rescue 1.0/0.0

    # Column type => String method used to convert a raw value.
    TYPES = {
      :boolean => :to_s,
      :string => :to_s,
      :bigint => :to_i,
      :float => :to_f,
      :double => :to_f,
      :int => :to_i,
      :smallint => :to_i,
      :tinyint => :to_i,
    }.freeze

    # @param schema [#fieldSchemas] fields must respond to +name+ and +type+
    # @param example_row [String, nil] one tab-separated row, used only to
    #   count how many columns the data really has
    def initialize(schema, example_row)
      @schema = schema
      # A negative limit keeps trailing empty fields, so the column count is
      # right even when the last values of the row are empty strings.
      @example_row = example_row ? example_row.split("\t", -1) : []
    end

    # @return [Array<Symbol>] one unique symbol per column, in result order
    def column_names
      @column_names ||= begin
        raw_names = @schema.fieldSchemas.map { |c| c.name }

        # In rare cases Hive can return two identical column names
        # consider SELECT a.foo, b.foo...
        # in this case you get two columns called foo with no disambiguation.
        # as a (far from ideal) solution we detect this edge case and rename them
        # a.foo => foo_1, b.foo => foo_2
        # otherwise we will trample one of the columns during Hash mapping.
        totals = Hash.new(0)
        raw_names.each { |n| totals[n] += 1 }
        seen = Hash.new(0)
        names = raw_names.map do |n|
          totals[n] > 1 ? :"#{n}_#{seen[n] += 1}" : n.to_sym
        end

        # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
        # For now we will call them :_p1, :_p2, etc. to avoid collisions.
        1.upto(@example_row.length - names.length) { |i| names.push(:"_p#{i}") }
        names
      end
    end

    # @return [Hash{Symbol => Symbol}] column name => declared type
    def column_type_map
      @column_type_map ||= begin
        fields = @schema.fieldSchemas.to_a
        type_map = {}
        column_names.each_with_index do |name, index|
          # Look the field up by position, not by name: renamed duplicates
          # (foo_1, foo_2) no longer match any field name.
          definition = fields[index]
          # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
          type_map[name] = definition ? definition.type.to_sym : :string
        end
        type_map
      end
    end

    # @param row [String] one tab-separated row
    # @return [Hash{Symbol => Object}] column name => coerced value; columns
    #   with no corresponding field in the row map to nil
    def coerce_row(row)
      column_names.zip(row.split("\t", -1)).inject({}) do |hsh, (column_name, value)|
        hsh[column_name] = coerce_column(column_name, value)
        hsh
      end
    end

    # Converts one raw value according to the column's declared type.
    # Types with no entry in TYPES are returned unchanged.
    def coerce_column(column_name, value)
      return nil if value.nil?
      type = column_type_map[column_name]
      if type != :string
        return INFINITY if value == "Infinity"
        # String#to_f / #to_i would silently turn this into zero.
        return -INFINITY if value == "-Infinity"
        return NAN if value == "NaN"
      end
      return coerce_complex_value(value) if type.to_s =~ /^array/
      conversion_method = TYPES[type]
      conversion_method ? value.send(conversion_method) : value
    end

    # @param row [Hash] a row produced by #coerce_row
    # @return [Array] the row's values in column order
    def coerce_row_to_array(row)
      column_names.map { |n| row[n] }
    end

    # Parses a JSON-encoded value; nil, "" and the literal 'null' give nil.
    def coerce_complex_value(value)
      return nil if value.nil?
      return nil if value.length == 0
      return nil if value == 'null'
      JSON.parse(value)
    end
  end
end
@@ -0,0 +1,441 @@
1
# suppress warnings while the thrift files below are loaded
old_verbose, $VERBOSE = $VERBOSE, nil

begin
  raise 'Thrift is not loaded' unless defined?(Thrift)
  raise 'RBHive is not loaded' unless defined?(RBHive)

  # require thrift autogenerated files
  %w[t_c_l_i_service_constants t_c_l_i_service sasl_client_transport].each do |lib|
    require File.join(File.dirname(__FILE__), '..', 'thrift', lib)
  end
ensure
  # restore warnings, also when one of the checks or requires above raised
  $VERBOSE = old_verbose
end
14
+
15
+ # Monkey patch thrift to set an infinite read timeout
16
module Thrift
  class HTTPClientTransport < BaseTransport
    # Posts the buffered request to @url and makes the response body
    # available for reading through @inbuf.
    #
    # Differs from the stock implementation by disabling the read timeout
    # (+read_timeout = nil+).
    def flush
      https = @url.scheme == 'https'
      http = Net::HTTP.new @url.host, @url.port
      http.use_ssl = https
      http.read_timeout = nil
      http.verify_mode = @ssl_verify_mode if https
      resp = http.post(@url.request_uri, @outbuf, @headers)
      data = Bytes.force_binary_encoding(resp.body)
      @inbuf = StringIO.new data
    ensure
      # Always start the next request from an empty buffer, even when the
      # POST above raised; otherwise the stale payload would be sent again
      # in front of the next request.
      @outbuf = Bytes.empty_byte_buffer
    end
  end
end
31
+
32
+ module RBHive
33
+
34
+ HIVE_THRIFT_MAPPING = {
35
+ 10 => 0,
36
+ 11 => 1,
37
+ 12 => 2,
38
+ 13 => 6,
39
+ :cdh4 => 0,
40
+ :cdh5 => 4,
41
+ :PROTOCOL_V1 => 0,
42
+ :PROTOCOL_V2 => 1,
43
+ :PROTOCOL_V3 => 2,
44
+ :PROTOCOL_V4 => 3,
45
+ :PROTOCOL_V5 => 4,
46
+ :PROTOCOL_V6 => 5,
47
+ :PROTOCOL_V7 => 6
48
+ }
49
+
50
+ def tcli_connect(server, port = 10_000, options)
51
+ logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
52
+ connection = RBHive::TCLIConnection.new(server, port, options, logger)
53
+ ret = nil
54
+ begin
55
+ connection.open
56
+ connection.open_session
57
+ ret = yield(connection)
58
+
59
+ ensure
60
+ # Try to close the session and our connection if those are still open, ignore io errors
61
+ begin
62
+ connection.close_session if connection.session
63
+ connection.close
64
+ rescue IOError => e
65
+ # noop
66
+ end
67
+ end
68
+
69
+ ret
70
+ end
71
+ module_function :tcli_connect
72
+
73
+ class StdOutLogger
74
+ %w(fatal error warn info debug).each do |level|
75
+ define_method level.to_sym do |message|
76
+ STDOUT.puts(message)
77
+ end
78
+ end
79
+ end
80
+
81
+ class TCLIConnection
82
+ attr_reader :client
83
+
84
+ def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
85
+ options ||= {} # backwards compatibility
86
+ raise "'options' parameter must be a hash" unless options.is_a?(Hash)
87
+
88
+ if options[:transport] == :sasl and options[:sasl_params].nil?
89
+ raise ":transport is set to :sasl, but no :sasl_params option was supplied"
90
+ end
91
+
92
+ # Defaults to buffered transport, Hive 0.10, 1800 second timeout
93
+ options[:transport] ||= :buffered
94
+ options[:hive_version] ||= 10
95
+ options[:timeout] ||= 1800
96
+ @options = options
97
+
98
+ # Look up the appropriate Thrift protocol version for the supplied Hive version
99
+ @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
100
+
101
+ @logger = logger
102
+ @transport = thrift_transport(server, port)
103
+ @protocol = Thrift::BinaryProtocol.new(@transport)
104
+ @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
105
+ @session = nil
106
+ @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
107
+ end
108
+
109
+ def thrift_hive_protocol(version)
110
+ HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
111
+ end
112
+
113
+ def thrift_transport(server, port)
114
+ @logger.info("Initializing transport #{@options[:transport]}")
115
+ case @options[:transport]
116
+ when :buffered
117
+ return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
118
+ when :sasl
119
+ return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
120
+ parse_sasl_params(@options[:sasl_params]))
121
+ when :http
122
+ return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
123
+ else
124
+ raise "Unrecognised transport type '#{transport}'"
125
+ end
126
+ end
127
+
128
+ def thrift_socket(server, port, timeout)
129
+ socket = Thrift::Socket.new(server, port)
130
+ socket.timeout = timeout
131
+ socket
132
+ end
133
+
134
+ # Processes SASL connection params and returns a hash with symbol keys or a nil
135
+ def parse_sasl_params(sasl_params)
136
+ # Symbilize keys in a hash
137
+ if sasl_params.kind_of?(Hash)
138
+ return sasl_params.inject({}) do |memo,(k,v)|
139
+ memo[k.to_sym] = v;
140
+ memo
141
+ end
142
+ end
143
+ return nil
144
+ end
145
+
146
+ def open
147
+ @transport.open
148
+ end
149
+
150
+ def close
151
+ @transport.close
152
+ end
153
+
154
+ def open_session
155
+ @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
156
+ end
157
+
158
+ def close_session
159
+ @client.CloseSession prepare_close_session
160
+ @session = nil
161
+ end
162
+
163
+ def session
164
+ @session && @session.sessionHandle
165
+ end
166
+
167
+ def client
168
+ @client
169
+ end
170
+
171
+ def execute(query)
172
+ @logger.info("Executing Hive Query: #{query}")
173
+ req = prepare_execute_statement(query)
174
+ exec_result = client.ExecuteStatement(req)
175
+ raise_error_if_failed!(exec_result)
176
+ exec_result
177
+ end
178
+
179
+ def priority=(priority)
180
+ set("mapred.job.priority", priority)
181
+ end
182
+
183
+ def queue=(queue)
184
+ set("mapred.job.queue.name", queue)
185
+ end
186
+
187
+ def set(name,value)
188
+ @logger.info("Setting #{name}=#{value}")
189
+ self.execute("SET #{name}=#{value}")
190
+ end
191
+
192
+ # Async execute
193
+ def async_execute(query)
194
+ @logger.info("Executing query asynchronously: #{query}")
195
+ exec_result = @client.ExecuteStatement(
196
+ Hive2::Thrift::TExecuteStatementReq.new(
197
+ sessionHandle: @session.sessionHandle,
198
+ statement: query,
199
+ runAsync: true
200
+ )
201
+ )
202
+ raise_error_if_failed!(exec_result)
203
+ op_handle = exec_result.operationHandle
204
+
205
+ # Return handles to get hold of this query / session again
206
+ {
207
+ session: @session.sessionHandle,
208
+ guid: op_handle.operationId.guid,
209
+ secret: op_handle.operationId.secret
210
+ }
211
+ end
212
+
213
+ # Is the query complete?
214
+ def async_is_complete?(handles)
215
+ async_state(handles) == :finished
216
+ end
217
+
218
+ # Is the query actually running?
219
+ def async_is_running?(handles)
220
+ async_state(handles) == :running
221
+ end
222
+
223
+ # Has the query failed?
224
+ def async_is_failed?(handles)
225
+ async_state(handles) == :error
226
+ end
227
+
228
+ def async_is_cancelled?(handles)
229
+ async_state(handles) == :cancelled
230
+ end
231
+
232
+ def async_cancel(handles)
233
+ @client.CancelOperation(prepare_cancel_request(handles))
234
+ end
235
+
236
+ # Map states to symbols
237
+ def async_state(handles)
238
+ response = @client.GetOperationStatus(
239
+ Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
240
+ )
241
+
242
+ case response.operationState
243
+ when Hive2::Thrift::TOperationState::FINISHED_STATE
244
+ return :finished
245
+ when Hive2::Thrift::TOperationState::INITIALIZED_STATE
246
+ return :initialized
247
+ when Hive2::Thrift::TOperationState::RUNNING_STATE
248
+ return :running
249
+ when Hive2::Thrift::TOperationState::CANCELED_STATE
250
+ return :cancelled
251
+ when Hive2::Thrift::TOperationState::CLOSED_STATE
252
+ return :closed
253
+ when Hive2::Thrift::TOperationState::ERROR_STATE
254
+ return :error
255
+ when Hive2::Thrift::TOperationState::UKNOWN_STATE
256
+ return :unknown
257
+ when Hive2::Thrift::TOperationState::PENDING_STATE
258
+ return :pending
259
+ when nil
260
+ raise "No operation state found for handles - has the session been closed?"
261
+ else
262
+ return :state_not_in_protocol
263
+ end
264
+ end
265
+
266
+ # Async fetch results from an async execute
267
+ def async_fetch(handles, max_rows = 100)
268
+ # Can't get data from an unfinished query
269
+ unless async_is_complete?(handles)
270
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
271
+ end
272
+
273
+ # Fetch and
274
+ fetch_rows(prepare_operation_handle(handles), :first, max_rows)
275
+ end
276
+
277
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
278
+ # and yields the result batches to a given block as arrays of rows.
279
+ def async_fetch_in_batch(handles, batch_size = 1000, &block)
280
+ raise "No block given for the batch fetch request!" unless block_given?
281
+ # Can't get data from an unfinished query
282
+ unless async_is_complete?(handles)
283
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
284
+ end
285
+
286
+ # Now let's iterate over the results
287
+ loop do
288
+ rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
289
+ break if rows.empty?
290
+ yield rows
291
+ end
292
+ end
293
+
294
+ def async_close_session(handles)
295
+ validate_handles!(handles)
296
+ @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
297
+ end
298
+
299
+ # Pull rows from the query result
300
+ def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
301
+ fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
302
+ fetch_results = @client.FetchResults(fetch_req)
303
+ raise_error_if_failed!(fetch_results)
304
+ rows = fetch_results.results.rows
305
+ TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
306
+ end
307
+
308
+ # Performs a explain on the supplied query on the server, returns it as a ExplainResult.
309
+ # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
310
+ def explain(query)
311
+ rows = []
312
+ fetch_in_batch("EXPLAIN " + query) do |batch|
313
+ rows << batch.map { |b| b[:Explain] }
314
+ end
315
+ ExplainResult.new(rows.flatten)
316
+ end
317
+
318
+ # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
319
+ def fetch(query, max_rows = 100)
320
+ # Execute the query and check the result
321
+ exec_result = execute(query)
322
+ raise_error_if_failed!(exec_result)
323
+
324
+ # Get search operation handle to fetch the results
325
+ op_handle = exec_result.operationHandle
326
+
327
+ # Fetch the rows
328
+ fetch_rows(op_handle, :first, max_rows)
329
+ end
330
+
331
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
332
+ # and yields the result batches to a given block as arrays of rows.
333
+ def fetch_in_batch(query, batch_size = 1000, &block)
334
+ raise "No block given for the batch fetch request!" unless block_given?
335
+
336
+ # Execute the query and check the result
337
+ exec_result = execute(query)
338
+ raise_error_if_failed!(exec_result)
339
+
340
+ # Get search operation handle to fetch the results
341
+ op_handle = exec_result.operationHandle
342
+
343
+ # Prepare fetch results request
344
+ fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
345
+
346
+ # Now let's iterate over the results
347
+ loop do
348
+ rows = fetch_rows(op_handle, :next, batch_size)
349
+ break if rows.empty?
350
+ yield rows
351
+ end
352
+ end
353
+
354
+ def create_table(schema)
355
+ execute(schema.create_table_statement)
356
+ end
357
+
358
+ def drop_table(name)
359
+ name = name.name if name.is_a?(TableSchema)
360
+ execute("DROP TABLE `#{name}`")
361
+ end
362
+
363
+ def replace_columns(schema)
364
+ execute(schema.replace_columns_statement)
365
+ end
366
+
367
+ def add_columns(schema)
368
+ execute(schema.add_columns_statement)
369
+ end
370
+
371
+ def method_missing(meth, *args)
372
+ client.send(meth, *args)
373
+ end
374
+
375
+ private
376
+
377
+ def prepare_open_session(client_protocol)
378
+ req = ::Hive2::Thrift::TOpenSessionReq.new( @options[:sasl_params].nil? ? [] : @options[:sasl_params] )
379
+ req.client_protocol = client_protocol
380
+ req
381
+ end
382
+
383
+ def prepare_close_session
384
+ ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
385
+ end
386
+
387
+ def prepare_execute_statement(query)
388
+ ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {} )
389
+ end
390
+
391
+ def prepare_fetch_results(handle, orientation=:first, rows=100)
392
+ orientation_value = "FETCH_#{orientation.to_s.upcase}"
393
+ valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
394
+ unless valid_orientations.include?(orientation_value)
395
+ raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
396
+ end
397
+ orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
398
+ ::Hive2::Thrift::TFetchResultsReq.new(
399
+ operationHandle: handle,
400
+ orientation: orientation_const,
401
+ maxRows: rows
402
+ )
403
+ end
404
+
405
+ def prepare_operation_handle(handles)
406
+ validate_handles!(handles)
407
+ Hive2::Thrift::TOperationHandle.new(
408
+ operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
409
+ operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
410
+ hasResultSet: false
411
+ )
412
+ end
413
+
414
+ def prepare_cancel_request(handles)
415
+ Hive2::Thrift::TCancelOperationReq.new(
416
+ operationHandle: prepare_operation_handle(handles)
417
+ )
418
+ end
419
+
420
+ def validate_handles!(handles)
421
+ unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
422
+ raise "Invalid handles hash: #{handles.inspect}"
423
+ end
424
+ end
425
+
426
+ def get_schema_for(handle)
427
+ req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
428
+ metadata = client.GetResultSetMetadata( req )
429
+ metadata.schema
430
+ end
431
+
432
+ # Raises an exception if given operation result is a failure
433
+ def raise_error_if_failed!(result)
434
+ return if result.status.statusCode == 0
435
+ error_message = result.status.errorMessage || 'Execution failed!'
436
+ raise RBHive::TCLIConnectionError.new(error_message)
437
+ end
438
+ end
439
+
440
+ class TCLIConnectionError < StandardError; end
441
+ end