rbhive-vidma 1.0.2.pre1.pre.thrift0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
# Wraps the raw output rows of a Hive EXPLAIN statement and exposes the
# interesting sections (AST, stage dependencies) as structured accessors.
class ExplainResult
  # rows - array of output lines as returned by the EXPLAIN query
  def initialize(rows)
    @rows = rows
  end

  # The first line of the abstract syntax tree section.
  def ast
    by_section[:abstract_syntax_tree].first
  end

  # Number of stages the query plan contains.
  def stage_count
    stage_dependencies.length
  end

  # Lines of the "STAGE DEPENDENCIES" section, or [] when absent.
  def stage_dependencies
    by_section[:stage_dependencies] || []
  end

  # All raw lines joined with newlines.
  def to_tsv
    @rows.join("\n")
  end

  # The unprocessed row array.
  def raw
    @rows
  end

  def to_s
    to_tsv
  end

  private

  # Splits the raw lines into sections keyed by symbolized headers.
  # A line starting with an uppercase letter opens a new section
  # (eg "STAGE DEPENDENCIES:" => :stage_dependencies); indented lines are
  # stripped and collected under the current section; blank lines are skipped.
  def by_section
    section = nil
    @rows.each_with_object({}) do |line, sections|
      if line.match(/^[A-Z]/)
        section = line.chomp(':').downcase.gsub(' ', '_').to_sym
        sections[section] = []
      elsif !line.empty?
        sections[section] << line.strip
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module RBHive
  # An Array of result rows (as hashes keyed by column name) coerced through
  # the supplied schema definition, with CSV/TSV export helpers.
  class ResultSet < Array
    # rows   - raw row data from the server
    # schema - a schema definition responding to coerce_row, column_names,
    #          column_type_map and coerce_row_to_array
    def initialize(rows, schema)
      @schema = schema
      super(rows.map { |r| @schema.coerce_row(r) })
    end

    def column_names
      @schema.column_names
    end

    def column_type_map
      @schema.column_type_map
    end

    # Returns the rows as comma-separated text, or writes them to out_file
    # when given. NOTE: values are not quoted/escaped.
    def to_csv(out_file = nil)
      to_separated_output(",", out_file)
    end

    # Returns the rows as tab-separated text, or writes them to out_file
    # when given.
    def to_tsv(out_file = nil)
      to_separated_output("\t", out_file)
    end

    # Rows as arrays ordered by column_names (memoized).
    def as_arrays
      @as_arrays ||= self.map { |r| @schema.coerce_row_to_array(r) }
    end

    private

    # FIX: renamed from the misspelled `to_seperated_output` (private, so the
    # rename is invisible to callers).
    # Joins each row with +sep+ and rows with newlines; returns the string
    # when out_file is nil, otherwise writes it to out_file.
    def to_separated_output(sep, out_file)
      rows = self.map { |r| @schema.coerce_row_to_array(r).join(sep) }
      sv = rows.join("\n")
      return sv if out_file.nil?
      File.open(out_file, 'w+') { |f| f << sv }
    end
  end
end
@@ -0,0 +1,86 @@
1
require 'json'

module RBHive
  # Interprets a Thrift result-set schema: derives (de-duplicated) column
  # names, maps columns to Ruby coercions, and converts raw tab-separated
  # rows into hashes or arrays of typed values.
  class SchemaDefinition
    attr_reader :schema

    # rescue fallbacks cover old Rubies without Float::NAN / Float::INFINITY
    NAN = Float::NAN rescue 0.0/0.0
    INFINITY = Float::INFINITY rescue 1.0/0.0
    TYPES = {
      :boolean => :to_s,
      :string => :to_s,
      :bigint => :to_i,
      :float => :to_f,
      :double => :to_f,
      :int => :to_i,
      :smallint => :to_i,
      :tinyint => :to_i,
    }

    # schema      - Thrift schema object responding to fieldSchemas
    # example_row - one raw tab-separated row (used to detect extra
    #               partition columns), or nil
    def initialize(schema, example_row)
      @schema = schema
      @example_row = example_row ? example_row.split("\t") : []
    end

    # Symbolized column names, memoized.
    def column_names
      @column_names ||= begin
        names = @schema.fieldSchemas.map { |field| field.name }

        # Hive can return two identical column names (SELECT a.foo, b.foo
        # yields two columns called foo with no disambiguation). Rename such
        # duplicates foo_1, foo_2, ... so hash mapping does not clobber one.
        seen = Hash.new(0)
        names.map! { |n| seen[n] += 1; seen[n] > 1 ? "#{n}---|---#{seen[n]}" : n }
        names.map! { |n| seen[n] > 1 ? "#{n}---|---1" : n }
        names.map! { |n| n.gsub('---|---', '_').to_sym }

        # Hive omits schema data for partition columns on SELECT * queries;
        # pad with :_p1, :_p2, ... so row zipping stays aligned.
        counter = 0
        names.push(:"_p#{counter += 1}") while names.length < @example_row.length
        names
      end
    end

    # Hash of column name => Hive type symbol; unknown columns (eg padded
    # partition columns) default to :string. Memoized.
    def column_type_map
      @column_type_map ||= column_names.each_with_object({}) do |name, map|
        definition = @schema.fieldSchemas.find { |s| s.name.to_sym == name }
        map[name] = definition ? definition.type.to_sym : :string
      end
    end

    # Splits a raw tab-separated row and coerces each value, returning a
    # hash keyed by column name.
    def coerce_row(row)
      values = row.split("\t")
      column_names.each_with_index.each_with_object({}) do |(name, idx), hsh|
        hsh[name] = coerce_column(name, values[idx])
      end
    end

    # Coerces a single raw value according to the column's type; special
    # cases Infinity/NaN for non-string columns and JSON-decodes array types.
    def coerce_column(column_name, value)
      type = column_type_map[column_name]
      return INFINITY if (type != :string && value == "Infinity")
      return NAN if (type != :string && value == "NaN")
      return coerce_complex_value(value) if type.to_s =~ /^array/
      conversion = TYPES[type]
      conversion ? value.send(conversion) : value
    end

    # Orders an already-coerced row hash into an array following column_names.
    def coerce_row_to_array(row)
      column_names.map { |name| row[name] }
    end

    # JSON-decodes a complex (array) value; nil for nil/empty/'null' input.
    def coerce_complex_value(value)
      return nil if value.nil? || value.length == 0 || value == 'null'
      JSON.parse(value)
    end
  end
end
@@ -0,0 +1,439 @@
1
+ # suppress warnings
2
+ old_verbose, $VERBOSE = $VERBOSE, nil
3
+
4
+ raise 'Thrift is not loaded' unless defined?(Thrift)
5
+ raise 'RBHive is not loaded' unless defined?(RBHive)
6
+
7
+ # require thrift autogenerated files
8
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service_constants])
9
+ require File.join(File.dirname(__FILE__), *%w[.. thrift t_c_l_i_service])
10
+ require File.join(File.dirname(__FILE__), *%w[.. thrift sasl_client_transport])
11
+
12
+ # restore warnings
13
+ $VERBOSE = old_verbose
14
+
15
# Monkey patch thrift to set an infinite read timeout
module Thrift
  class HTTPClientTransport < BaseTransport
    # Same as the upstream flush, except read_timeout is disabled so that
    # long-running Hive queries do not time out the HTTP request.
    def flush
      connection = Net::HTTP.new @url.host, @url.port
      connection.use_ssl = @url.scheme == 'https'
      connection.read_timeout = nil
      connection.verify_mode = @ssl_verify_mode if @url.scheme == 'https'
      response = connection.post(@url.request_uri, @outbuf, @headers)
      body = Bytes.force_binary_encoding(response.body)
      @inbuf = StringIO.new body
      @outbuf = Bytes.empty_byte_buffer
    end
  end
end
31
+
32
+ module RBHive
33
+
34
+ HIVE_THRIFT_MAPPING = {
35
+ 10 => 0,
36
+ 11 => 1,
37
+ 12 => 2,
38
+ 13 => 6,
39
+ :cdh4 => 0,
40
+ :cdh5 => 4,
41
+ :PROTOCOL_V1 => 0,
42
+ :PROTOCOL_V2 => 1,
43
+ :PROTOCOL_V3 => 2,
44
+ :PROTOCOL_V4 => 3,
45
+ :PROTOCOL_V5 => 4,
46
+ :PROTOCOL_V6 => 5,
47
+ :PROTOCOL_V7 => 6
48
+ }
49
+
50
+ def tcli_connect(server, port = 10_000, options)
51
+ logger = options.key?(:logger) ? options.delete(:logger) : StdOutLogger.new
52
+ connection = RBHive::TCLIConnection.new(server, port, options, logger)
53
+ ret = nil
54
+ begin
55
+ connection.open
56
+ connection.open_session
57
+ ret = yield(connection)
58
+
59
+ ensure
60
+ # Try to close the session and our connection if those are still open, ignore io errors
61
+ begin
62
+ connection.close_session if connection.session
63
+ connection.close
64
+ rescue IOError => e
65
+ # noop
66
+ end
67
+ end
68
+
69
+ ret
70
+ end
71
+ module_function :tcli_connect
72
+
73
+ class StdOutLogger
74
+ %w(fatal error warn info debug).each do |level|
75
+ define_method level.to_sym do |message|
76
+ STDOUT.puts(message)
77
+ end
78
+ end
79
+ end
80
+
81
+ class TCLIConnection
82
+ attr_reader :client
83
+
84
+ def initialize(server, port = 10_000, options = {}, logger = StdOutLogger.new)
85
+ options ||= {} # backwards compatibility
86
+ raise "'options' parameter must be a hash" unless options.is_a?(Hash)
87
+
88
+ if options[:transport] == :sasl and options[:sasl_params].nil?
89
+ raise ":transport is set to :sasl, but no :sasl_params option was supplied"
90
+ end
91
+
92
+ # Defaults to buffered transport, Hive 0.10, 1800 second timeout
93
+ options[:transport] ||= :buffered
94
+ options[:hive_version] ||= 10
95
+ options[:timeout] ||= 1800
96
+ @options = options
97
+
98
+ # Look up the appropriate Thrift protocol version for the supplied Hive version
99
+ @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
100
+
101
+ @logger = logger
102
+ @transport = thrift_transport(server, port)
103
+ @protocol = Thrift::BinaryProtocol.new(@transport)
104
+ @client = Hive2::Thrift::TCLIService::Client.new(@protocol)
105
+ @session = nil
106
+ @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
107
+ end
108
+
109
+ def thrift_hive_protocol(version)
110
+ HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
111
+ end
112
+
113
+ def thrift_transport(server, port)
114
+ @logger.info("Initializing transport #{@options[:transport]}")
115
+ case @options[:transport]
116
+ when :buffered
117
+ return Thrift::BufferedTransport.new(thrift_socket(server, port, @options[:timeout]))
118
+ when :sasl
119
+ return Thrift::SaslClientTransport.new(thrift_socket(server, port, @options[:timeout]),
120
+ parse_sasl_params(@options[:sasl_params]))
121
+ when :http
122
+ return Thrift::HTTPClientTransport.new("http://#{server}:#{port}/cliservice")
123
+ else
124
+ raise "Unrecognised transport type '#{transport}'"
125
+ end
126
+ end
127
+
128
+ def thrift_socket(server, port, timeout)
129
+ socket = Thrift::Socket.new(server, port)
130
+ socket.timeout = timeout
131
+ socket
132
+ end
133
+
134
+ # Processes SASL connection params and returns a hash with symbol keys or a nil
135
+ def parse_sasl_params(sasl_params)
136
+ # Symbilize keys in a hash
137
+ if sasl_params.kind_of?(Hash)
138
+ return sasl_params.inject({}) do |memo,(k,v)|
139
+ memo[k.to_sym] = v;
140
+ memo
141
+ end
142
+ end
143
+ return nil
144
+ end
145
+
146
+ def open
147
+ @transport.open
148
+ end
149
+
150
+ def close
151
+ @transport.close
152
+ end
153
+
154
+ def open_session
155
+ @session = @client.OpenSession(prepare_open_session(@thrift_protocol_version))
156
+ end
157
+
158
+ def close_session
159
+ @client.CloseSession prepare_close_session
160
+ @session = nil
161
+ end
162
+
163
+ def session
164
+ @session && @session.sessionHandle
165
+ end
166
+
167
+ def client
168
+ @client
169
+ end
170
+
171
+ def execute(query)
172
+ @logger.info("Executing Hive Query: #{query}")
173
+ req = prepare_execute_statement(query)
174
+ exec_result = client.ExecuteStatement(req)
175
+ raise_error_if_failed!(exec_result)
176
+ exec_result
177
+ end
178
+
179
+ def priority=(priority)
180
+ set("mapred.job.priority", priority)
181
+ end
182
+
183
+ def queue=(queue)
184
+ set("mapred.job.queue.name", queue)
185
+ end
186
+
187
+ def set(name,value)
188
+ @logger.info("Setting #{name}=#{value}")
189
+ self.execute("SET #{name}=#{value}")
190
+ end
191
+
192
+ # Async execute
193
+ def async_execute(query)
194
+ @logger.info("Executing query asynchronously: #{query}")
195
+ op_handle = @client.ExecuteStatement(
196
+ Hive2::Thrift::TExecuteStatementReq.new(
197
+ sessionHandle: @session.sessionHandle,
198
+ statement: query,
199
+ runAsync: true
200
+ )
201
+ ).operationHandle
202
+
203
+ # Return handles to get hold of this query / session again
204
+ {
205
+ session: @session.sessionHandle,
206
+ guid: op_handle.operationId.guid,
207
+ secret: op_handle.operationId.secret
208
+ }
209
+ end
210
+
211
+ # Is the query complete?
212
+ def async_is_complete?(handles)
213
+ async_state(handles) == :finished
214
+ end
215
+
216
+ # Is the query actually running?
217
+ def async_is_running?(handles)
218
+ async_state(handles) == :running
219
+ end
220
+
221
+ # Has the query failed?
222
+ def async_is_failed?(handles)
223
+ async_state(handles) == :error
224
+ end
225
+
226
+ def async_is_cancelled?(handles)
227
+ async_state(handles) == :cancelled
228
+ end
229
+
230
+ def async_cancel(handles)
231
+ @client.CancelOperation(prepare_cancel_request(handles))
232
+ end
233
+
234
+ # Map states to symbols
235
+ def async_state(handles)
236
+ response = @client.GetOperationStatus(
237
+ Hive2::Thrift::TGetOperationStatusReq.new(operationHandle: prepare_operation_handle(handles))
238
+ )
239
+ puts response.operationState
240
+ case response.operationState
241
+ when Hive2::Thrift::TOperationState::FINISHED_STATE
242
+ return :finished
243
+ when Hive2::Thrift::TOperationState::INITIALIZED_STATE
244
+ return :initialized
245
+ when Hive2::Thrift::TOperationState::RUNNING_STATE
246
+ return :running
247
+ when Hive2::Thrift::TOperationState::CANCELED_STATE
248
+ return :cancelled
249
+ when Hive2::Thrift::TOperationState::CLOSED_STATE
250
+ return :closed
251
+ when Hive2::Thrift::TOperationState::ERROR_STATE
252
+ return :error
253
+ when Hive2::Thrift::TOperationState::UKNOWN_STATE
254
+ return :unknown
255
+ when Hive2::Thrift::TOperationState::PENDING_STATE
256
+ return :pending
257
+ when nil
258
+ raise "No operation state found for handles - has the session been closed?"
259
+ else
260
+ return :state_not_in_protocol
261
+ end
262
+ end
263
+
264
+ # Async fetch results from an async execute
265
+ def async_fetch(handles, max_rows = 100)
266
+ # Can't get data from an unfinished query
267
+ unless async_is_complete?(handles)
268
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
269
+ end
270
+
271
+ # Fetch and
272
+ fetch_rows(prepare_operation_handle(handles), :first, max_rows)
273
+ end
274
+
275
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
276
+ # and yields the result batches to a given block as arrays of rows.
277
+ def async_fetch_in_batch(handles, batch_size = 1000, &block)
278
+ raise "No block given for the batch fetch request!" unless block_given?
279
+ # Can't get data from an unfinished query
280
+ unless async_is_complete?(handles)
281
+ raise "Can't perform fetch on a query in state: #{async_state(handles)}"
282
+ end
283
+
284
+ # Now let's iterate over the results
285
+ loop do
286
+ rows = fetch_rows(prepare_operation_handle(handles), :next, batch_size)
287
+ break if rows.empty?
288
+ yield rows
289
+ end
290
+ end
291
+
292
+ def async_close_session(handles)
293
+ validate_handles!(handles)
294
+ @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
295
+ end
296
+
297
+ # Pull rows from the query result
298
+ def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
299
+ fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
300
+ fetch_results = @client.FetchResults(fetch_req)
301
+ raise_error_if_failed!(fetch_results)
302
+ rows = fetch_results.results.rows
303
+ TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
304
+ end
305
+
306
+ # Performs a explain on the supplied query on the server, returns it as a ExplainResult.
307
+ # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
308
+ def explain(query)
309
+ rows = []
310
+ fetch_in_batch("EXPLAIN " + query) do |batch|
311
+ rows << batch.map { |b| b[:Explain] }
312
+ end
313
+ ExplainResult.new(rows.flatten)
314
+ end
315
+
316
+ # Performs a query on the server, fetches up to *max_rows* rows and returns them as an array.
317
+ def fetch(query, max_rows = 100)
318
+ # Execute the query and check the result
319
+ exec_result = execute(query)
320
+ raise_error_if_failed!(exec_result)
321
+
322
+ # Get search operation handle to fetch the results
323
+ op_handle = exec_result.operationHandle
324
+
325
+ # Fetch the rows
326
+ fetch_rows(op_handle, :first, max_rows)
327
+ end
328
+
329
+ # Performs a query on the server, fetches the results in batches of *batch_size* rows
330
+ # and yields the result batches to a given block as arrays of rows.
331
+ def fetch_in_batch(query, batch_size = 1000, &block)
332
+ raise "No block given for the batch fetch request!" unless block_given?
333
+
334
+ # Execute the query and check the result
335
+ exec_result = execute(query)
336
+ raise_error_if_failed!(exec_result)
337
+
338
+ # Get search operation handle to fetch the results
339
+ op_handle = exec_result.operationHandle
340
+
341
+ # Prepare fetch results request
342
+ fetch_req = prepare_fetch_results(op_handle, :next, batch_size)
343
+
344
+ # Now let's iterate over the results
345
+ loop do
346
+ rows = fetch_rows(op_handle, :next, batch_size)
347
+ break if rows.empty?
348
+ yield rows
349
+ end
350
+ end
351
+
352
+ def create_table(schema)
353
+ execute(schema.create_table_statement)
354
+ end
355
+
356
+ def drop_table(name)
357
+ name = name.name if name.is_a?(TableSchema)
358
+ execute("DROP TABLE `#{name}`")
359
+ end
360
+
361
+ def replace_columns(schema)
362
+ execute(schema.replace_columns_statement)
363
+ end
364
+
365
+ def add_columns(schema)
366
+ execute(schema.add_columns_statement)
367
+ end
368
+
369
+ def method_missing(meth, *args)
370
+ client.send(meth, *args)
371
+ end
372
+
373
+ private
374
+
375
+ def prepare_open_session(client_protocol)
376
+ req = ::Hive2::Thrift::TOpenSessionReq.new( @options[:sasl_params].nil? ? [] : @options[:sasl_params] )
377
+ req.client_protocol = client_protocol
378
+ req
379
+ end
380
+
381
+ def prepare_close_session
382
+ ::Hive2::Thrift::TCloseSessionReq.new( sessionHandle: self.session )
383
+ end
384
+
385
+ def prepare_execute_statement(query)
386
+ ::Hive2::Thrift::TExecuteStatementReq.new( sessionHandle: self.session, statement: query.to_s, confOverlay: {} )
387
+ end
388
+
389
+ def prepare_fetch_results(handle, orientation=:first, rows=100)
390
+ orientation_value = "FETCH_#{orientation.to_s.upcase}"
391
+ valid_orientations = ::Hive2::Thrift::TFetchOrientation::VALUE_MAP.values
392
+ unless valid_orientations.include?(orientation_value)
393
+ raise ArgumentError, "Invalid orientation: #{orientation.inspect}"
394
+ end
395
+ orientation_const = eval("::Hive2::Thrift::TFetchOrientation::#{orientation_value}")
396
+ ::Hive2::Thrift::TFetchResultsReq.new(
397
+ operationHandle: handle,
398
+ orientation: orientation_const,
399
+ maxRows: rows
400
+ )
401
+ end
402
+
403
+ def prepare_operation_handle(handles)
404
+ validate_handles!(handles)
405
+ Hive2::Thrift::TOperationHandle.new(
406
+ operationId: Hive2::Thrift::THandleIdentifier.new(guid: handles[:guid], secret: handles[:secret]),
407
+ operationType: Hive2::Thrift::TOperationType::EXECUTE_STATEMENT,
408
+ hasResultSet: false
409
+ )
410
+ end
411
+
412
+ def prepare_cancel_request(handles)
413
+ Hive2::Thrift::TCancelOperationReq.new(
414
+ operationHandle: prepare_operation_handle(handles)
415
+ )
416
+ end
417
+
418
+ def validate_handles!(handles)
419
+ unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
420
+ raise "Invalid handles hash: #{handles.inspect}"
421
+ end
422
+ end
423
+
424
+ def get_schema_for(handle)
425
+ req = ::Hive2::Thrift::TGetResultSetMetadataReq.new( operationHandle: handle )
426
+ metadata = client.GetResultSetMetadata( req )
427
+ metadata.schema
428
+ end
429
+
430
+ # Raises an exception if given operation result is a failure
431
+ def raise_error_if_failed!(result)
432
+ return if result.status.statusCode == 0
433
+ error_message = result.status.errorMessage || 'Execution failed!'
434
+ raise RBHive::TCLIConnectionError.new(error_message)
435
+ end
436
+ end
437
+
438
+ class TCLIConnectionError < StandardError; end
439
+ end