sequel-impala 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +45 -0
  3. data/lib/impala.rb +14 -6
  4. data/lib/impala/connection.rb +46 -23
  5. data/lib/impala/cursor.rb +48 -4
  6. data/lib/impala/progress_reporter.rb +40 -0
  7. data/lib/impala/protocol/beeswax_constants.rb +1 -1
  8. data/lib/impala/protocol/beeswax_service.rb +1 -20
  9. data/lib/impala/protocol/beeswax_types.rb +1 -1
  10. data/lib/impala/protocol/exec_stats_constants.rb +13 -0
  11. data/lib/impala/protocol/exec_stats_types.rb +133 -0
  12. data/lib/impala/protocol/facebook_service.rb +3 -3
  13. data/lib/impala/protocol/fb303_constants.rb +1 -1
  14. data/lib/impala/protocol/fb303_types.rb +1 -1
  15. data/lib/impala/protocol/hive_metastore_constants.rb +1 -1
  16. data/lib/impala/protocol/hive_metastore_types.rb +1 -1
  17. data/lib/impala/protocol/impala_hive_server2_service.rb +111 -3
  18. data/lib/impala/protocol/impala_service.rb +67 -1
  19. data/lib/impala/protocol/impala_service_constants.rb +1 -1
  20. data/lib/impala/protocol/impala_service_types.rb +109 -7
  21. data/lib/impala/protocol/status_constants.rb +1 -1
  22. data/lib/impala/protocol/status_types.rb +1 -1
  23. data/lib/impala/protocol/t_c_l_i_service.rb +884 -724
  24. data/lib/impala/protocol/t_c_l_i_service_constants.rb +72 -0
  25. data/lib/impala/protocol/t_c_l_i_service_types.rb +1799 -0
  26. data/lib/impala/protocol/thrift_hive_metastore.rb +1 -1
  27. data/lib/impala/protocol/types_constants.rb +13 -0
  28. data/lib/impala/protocol/types_types.rb +332 -0
  29. data/lib/impala/sasl_transport.rb +117 -0
  30. data/lib/impala/thrift_patch.rb +42 -0
  31. data/lib/rbhive/connection.rb +25 -25
  32. data/lib/rbhive/explain_result.rb +9 -9
  33. data/lib/rbhive/schema_definition.rb +12 -12
  34. data/lib/rbhive/t_c_l_i_connection.rb +28 -26
  35. data/lib/rbhive/t_c_l_i_schema_definition.rb +1 -1
  36. data/lib/rbhive/table_schema.rb +1 -1
  37. data/lib/sequel/adapters/impala.rb +63 -6
  38. data/lib/sequel/adapters/jdbc/hive2.rb +1 -1
  39. data/lib/sequel/adapters/rbhive.rb +3 -2
  40. data/lib/sequel/adapters/shared/impala.rb +133 -25
  41. data/lib/thrift/sasl_client_transport.rb +2 -2
  42. data/lib/thrift/thrift_hive.rb +2 -2
  43. data/lib/thrift/thrift_hive_metastore.rb +2 -2
  44. data/spec/dataset_test.rb +85 -85
  45. data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  46. data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +1 -1
  47. data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +1 -1
  48. data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +1 -1
  49. data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +1 -1
  50. data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +1 -1
  51. data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  52. data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +1 -1
  53. data/spec/files/integer_migrations/001_create_sessions.rb +1 -1
  54. data/spec/files/integer_migrations/002_create_nodes.rb +1 -1
  55. data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  56. data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +1 -1
  57. data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +1 -1
  58. data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +1 -1
  59. data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +1 -1
  60. data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +1 -1
  61. data/spec/migrator_test.rb +2 -2
  62. data/spec/prepared_statement_test.rb +12 -12
  63. data/spec/schema_test.rb +6 -6
  64. data/spec/type_test.rb +8 -8
  65. metadata +30 -11
  66. data/CHANGELOG +0 -19
  67. data/lib/impala/protocol/cli_service_constants.rb +0 -60
  68. data/lib/impala/protocol/cli_service_types.rb +0 -1452
@@ -0,0 +1,42 @@
1
+ require 'socket'
2
+
3
+ module Thrift
4
+ module KeepAlive
5
+ # We'll override #open so that once the socket is opened
6
+ # we enable keepalive on it
7
+ #
8
+ # Many queries are going to take a long time (10s of minutes) to complete
9
+ # and we don't want the connection to close while we wait for the
10
+ # query to return.
11
+ #
12
+ # Unfortunately, Thrift doesn't supply an easy way to get to the
13
+ # socket that it opens to communicate with Impala.
14
+ #
15
+ # I figured that while I was in here, monkey-patching a way to get
16
+ # to the socket, I might as well just enable keepalive here
17
+ # instead.
18
+ def open
19
+ super
20
+ s = @transport.handle
21
+ s.setsockopt(::Socket::SOL_SOCKET, ::Socket::SO_KEEPALIVE, true)
22
+
23
+ # Apparently Mac OS X (Darwin) doesn't implement the SOL_TCP options below
24
+ # so we'll hope keep alive works under Mac OS X, but in production
25
+ # we Dockerize Jigsaw, so these options should be available when
26
+ # we're running on Linux
27
+ if defined? ::Socket::SOL_TCP
28
+ s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPIDLE, 60)
29
+ s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPINTVL, 10)
30
+ s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPCNT, 5)
31
+ end
32
+ end
33
+ end
34
+
35
+ class BufferedTransport
36
+ prepend KeepAlive
37
+ end
38
+
39
+ class ImpalaSaslClientTransport
40
+ prepend KeepAlive
41
+ end
42
+ end
@@ -19,18 +19,18 @@ module RBHive
19
19
  end
20
20
  end
21
21
  module_function :connect
22
-
22
+
23
23
  class StdOutLogger
24
- %w(fatal error warn info debug).each do |level|
24
+ %w(fatal error warn info debug).each do |level|
25
25
  define_method level.to_sym do |message|
26
26
  STDOUT.puts(message)
27
27
  end
28
28
  end
29
29
  end
30
-
30
+
31
31
  class Connection
32
32
  attr_reader :client
33
-
33
+
34
34
  def initialize(server, port=10_000, logger=StdOutLogger.new)
35
35
  @socket = Thrift::Socket.new(server, port)
36
36
  @transport = Thrift::BufferedTransport.new(@socket)
@@ -40,43 +40,43 @@ module RBHive
40
40
  @logger.info("Connecting to #{server} on port #{port}")
41
41
  @mutex = Mutex.new
42
42
  end
43
-
43
+
44
44
  def open
45
45
  @transport.open
46
46
  end
47
-
47
+
48
48
  def close
49
49
  @transport.close
50
50
  end
51
-
51
+
52
52
  def client
53
53
  @client
54
54
  end
55
-
55
+
56
56
  def execute(query)
57
57
  execute_safe(query)
58
58
  end
59
-
59
+
60
60
  def explain(query)
61
61
  safe do
62
62
  execute_unsafe("EXPLAIN "+ query)
63
63
  ExplainResult.new(client.fetchAll)
64
64
  end
65
65
  end
66
-
66
+
67
67
  def priority=(priority)
68
68
  set("mapred.job.priority", priority)
69
69
  end
70
-
70
+
71
71
  def queue=(queue)
72
72
  set("mapred.job.queue.name", queue)
73
73
  end
74
-
74
+
75
75
  def set(name,value)
76
76
  @logger.info("Setting #{name}=#{value}")
77
77
  client.execute("SET #{name}=#{value}")
78
78
  end
79
-
79
+
80
80
  def fetch(query)
81
81
  safe do
82
82
  execute_unsafe(query)
@@ -85,7 +85,7 @@ module RBHive
85
85
  ResultSet.new(rows, the_schema)
86
86
  end
87
87
  end
88
-
88
+
89
89
  def fetch_in_batch(query, batch_size=1_000)
90
90
  safe do
91
91
  execute_unsafe(query)
@@ -95,7 +95,7 @@ module RBHive
95
95
  end
96
96
  end
97
97
  end
98
-
98
+
99
99
  def first(query)
100
100
  safe do
101
101
  execute_unsafe(query)
@@ -104,43 +104,43 @@ module RBHive
104
104
  ResultSet.new([row], the_schema).first
105
105
  end
106
106
  end
107
-
107
+
108
108
  def schema(example_row=[])
109
109
  safe { SchemaDefinition.new(client.getSchema, example_row) }
110
110
  end
111
-
111
+
112
112
  def create_table(schema)
113
113
  execute(schema.create_table_statement)
114
114
  end
115
-
115
+
116
116
  def drop_table(name)
117
117
  name = name.name if name.is_a?(TableSchema)
118
118
  execute("DROP TABLE `#{name}`")
119
119
  end
120
-
120
+
121
121
  def replace_columns(schema)
122
122
  execute(schema.replace_columns_statement)
123
123
  end
124
-
124
+
125
125
  def add_columns(schema)
126
126
  execute(schema.add_columns_statement)
127
127
  end
128
-
128
+
129
129
  def method_missing(meth, *args)
130
130
  client.send(meth, *args)
131
131
  end
132
-
132
+
133
133
  private
134
-
134
+
135
135
  def execute_safe(query)
136
136
  safe { execute_unsafe(query) }
137
137
  end
138
-
138
+
139
139
  def execute_unsafe(query)
140
140
  @logger.info("Executing Hive Query: #{query}")
141
141
  client.execute(query)
142
142
  end
143
-
143
+
144
144
  def safe
145
145
  ret = nil
146
146
  @mutex.synchronize { ret = yield }
@@ -2,33 +2,33 @@ class ExplainResult
2
2
  def initialize(rows)
3
3
  @rows = rows
4
4
  end
5
-
5
+
6
6
  def ast
7
7
  by_section[:abstract_syntax_tree].first
8
8
  end
9
-
9
+
10
10
  def stage_count
11
11
  stage_dependencies.length
12
12
  end
13
-
13
+
14
14
  def stage_dependencies
15
15
  by_section[:stage_dependencies] || []
16
16
  end
17
-
17
+
18
18
  def to_tsv
19
19
  @rows.join("\n")
20
20
  end
21
-
21
+
22
22
  def raw
23
23
  @rows
24
24
  end
25
-
25
+
26
26
  def to_s
27
27
  to_tsv
28
28
  end
29
-
29
+
30
30
  private
31
-
31
+
32
32
  def by_section
33
33
  current_section = nil
34
34
  @rows.inject({}) do |sections, row|
@@ -43,4 +43,4 @@ class ExplainResult
43
43
  sections
44
44
  end
45
45
  end
46
- end
46
+ end
@@ -3,10 +3,10 @@ require 'json'
3
3
  module RBHive
4
4
  class SchemaDefinition
5
5
  attr_reader :schema
6
-
6
+
7
7
  NAN = Float::NAN rescue 0.0/0.0
8
8
  INFINITY = Float::INFINITY rescue 1.0/0.0
9
- TYPES = {
9
+ TYPES = {
10
10
  :boolean => :to_s,
11
11
  :string => :to_s,
12
12
  :bigint => :to_i,
@@ -16,16 +16,16 @@ module RBHive
16
16
  :smallint => :to_i,
17
17
  :tinyint => :to_i,
18
18
  }
19
-
19
+
20
20
  def initialize(schema, example_row)
21
21
  @schema = schema
22
22
  @example_row = example_row ? example_row.split("\t") : []
23
23
  end
24
-
24
+
25
25
  def column_names
26
26
  @column_names ||= begin
27
27
  schema_names = @schema.fieldSchemas.map {|c| c.name }
28
-
28
+
29
29
  # In rare cases Hive can return two identical column names
30
30
  # consider SELECT a.foo, b.foo...
31
31
  # in this case you get two columns called foo with no disambiguation.
@@ -36,7 +36,7 @@ module RBHive
36
36
  schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
37
37
  schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
38
38
  schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
39
-
39
+
40
40
  # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
41
41
  # For now we will call them :_p1, :_p2, etc. to avoid collisions.
42
42
  offset = 0
@@ -46,23 +46,23 @@ module RBHive
46
46
  schema_names
47
47
  end
48
48
  end
49
-
49
+
50
50
  def column_type_map
51
- @column_type_map ||= column_names.inject({}) do |hsh, c|
51
+ @column_type_map ||= column_names.inject({}) do |hsh, c|
52
52
  definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
53
53
  # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
54
54
  hsh[c] = definition ? definition.type.to_sym : :string
55
55
  hsh
56
56
  end
57
57
  end
58
-
58
+
59
59
  def coerce_row(row)
60
60
  column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
61
61
  hsh[column_name] = coerce_column(column_name, value)
62
62
  hsh
63
63
  end
64
64
  end
65
-
65
+
66
66
  def coerce_column(column_name, value)
67
67
  type = column_type_map[column_name]
68
68
  return INFINITY if (type != :string && value == "Infinity")
@@ -71,11 +71,11 @@ module RBHive
71
71
  conversion_method = TYPES[type]
72
72
  conversion_method ? value.send(conversion_method) : value
73
73
  end
74
-
74
+
75
75
  def coerce_row_to_array(row)
76
76
  column_names.map { |n| row[n] }
77
77
  end
78
-
78
+
79
79
  def coerce_complex_value(value)
80
80
  return nil if value.nil?
81
81
  return nil if value.length == 0
@@ -30,7 +30,7 @@ module Thrift
30
30
  end
31
31
 
32
32
  module RBHive
33
-
33
+
34
34
  HIVE_THRIFT_MAPPING = {
35
35
  10 => 0,
36
36
  11 => 1,
@@ -85,11 +85,11 @@ module RBHive
85
85
  options ||= {} # backwards compatibility
86
86
  raise "'options' parameter must be a hash" unless options.is_a?(Hash)
87
87
  @sasl_params = options.delete(:sasl_params) || {}
88
-
88
+
89
89
  if options[:transport] == :sasl and @sasl_params.empty?
90
90
  raise ":transport is set to :sasl, but no :sasl_params option was supplied"
91
91
  end
92
-
92
+
93
93
  # Defaults to buffered transport, Hive 0.10, 1800 second timeout
94
94
  options[:transport] ||= :buffered
95
95
  options[:hive_version] ||= 10
@@ -97,7 +97,7 @@ module RBHive
97
97
  @options = options
98
98
  # Look up the appropriate Thrift protocol version for the supplied Hive version
99
99
  @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
100
-
100
+
101
101
  @logger = logger
102
102
  @transport = thrift_transport(server, port)
103
103
  @protocol = Thrift::BinaryProtocol.new(@transport)
@@ -105,11 +105,11 @@ module RBHive
105
105
  @session = nil
106
106
  @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
107
107
  end
108
-
108
+
109
109
  def thrift_hive_protocol(version)
110
110
  HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
111
111
  end
112
-
112
+
113
113
  def thrift_transport(server, port)
114
114
  @logger.info("Initializing transport #{@options[:transport]}")
115
115
  case @options[:transport]
@@ -188,7 +188,7 @@ module RBHive
188
188
  @logger.info("Setting #{name}=#{value}")
189
189
  self.execute("SET #{name}=#{value}")
190
190
  end
191
-
191
+
192
192
  # Async execute
193
193
  def async_execute(query)
194
194
  @logger.info("Executing query asynchronously: #{query}")
@@ -204,35 +204,35 @@ module RBHive
204
204
 
205
205
  # Return handles to get hold of this query / session again
206
206
  {
207
- session: @session.sessionHandle,
208
- guid: op_handle.operationId.guid,
207
+ session: @session.sessionHandle,
208
+ guid: op_handle.operationId.guid,
209
209
  secret: op_handle.operationId.secret
210
210
  }
211
211
  end
212
-
212
+
213
213
  # Is the query complete?
214
214
  def async_is_complete?(handles)
215
215
  async_state(handles) == :finished
216
216
  end
217
-
217
+
218
218
  # Is the query actually running?
219
219
  def async_is_running?(handles)
220
220
  async_state(handles) == :running
221
221
  end
222
-
222
+
223
223
  # Has the query failed?
224
224
  def async_is_failed?(handles)
225
225
  async_state(handles) == :error
226
226
  end
227
-
227
+
228
228
  def async_is_cancelled?(handles)
229
229
  async_state(handles) == :cancelled
230
230
  end
231
-
231
+
232
232
  def async_cancel(handles)
233
233
  @client.CancelOperation(prepare_cancel_request(handles))
234
234
  end
235
-
235
+
236
236
  # Map states to symbols
237
237
  def async_state(handles)
238
238
  response = @client.GetOperationStatus(
@@ -262,18 +262,18 @@ module RBHive
262
262
  return :state_not_in_protocol
263
263
  end
264
264
  end
265
-
265
+
266
266
  # Async fetch results from an async execute
267
267
  def async_fetch(handles, max_rows = 100)
268
268
  # Can't get data from an unfinished query
269
269
  unless async_is_complete?(handles)
270
270
  raise "Can't perform fetch on a query in state: #{async_state(handles)}"
271
271
  end
272
-
272
+
273
273
  # Fetch and
274
274
  fetch_rows(prepare_operation_handle(handles), :first, max_rows)
275
275
  end
276
-
276
+
277
277
  # Performs a query on the server, fetches the results in batches of *batch_size* rows
278
278
  # and yields the result batches to a given block as arrays of rows.
279
279
  def async_fetch_in_batch(handles, batch_size = 1000, &block)
@@ -290,7 +290,7 @@ module RBHive
290
290
  yield rows
291
291
  end
292
292
  end
293
-
293
+
294
294
  def async_close_session(handles)
295
295
  validate_handles!(handles)
296
296
  @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
@@ -311,14 +311,16 @@ module RBHive
311
311
  vals = row.colVals
312
312
  cols.each do |i, col, conv|
313
313
  v = vals[i].get_value.value
314
- h[col] = conv ? conv[v] : v
314
+ h[col] = unless v.nil?
315
+ conv ? conv[v] : v
316
+ end
315
317
  end
316
318
  yield h
317
319
  end
318
320
  rows = fetch_rows(op_handle, :next)
319
321
  end
320
322
  end
321
-
323
+
322
324
  # Pull rows from the query result
323
325
  def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
324
326
  fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
@@ -327,7 +329,7 @@ module RBHive
327
329
  fetch_results.results.rows
328
330
  #TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
329
331
  end
330
-
332
+
331
333
  # Performs a explain on the supplied query on the server, returns it as a ExplainResult.
332
334
  # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
333
335
  def explain(query)
@@ -346,7 +348,7 @@ module RBHive
346
348
 
347
349
  # Get search operation handle to fetch the results
348
350
  op_handle = exec_result.operationHandle
349
-
351
+
350
352
  # Fetch the rows
351
353
  fetch_rows(op_handle, :first, max_rows)
352
354
  end
@@ -355,7 +357,7 @@ module RBHive
355
357
  # and yields the result batches to a given block as arrays of rows.
356
358
  def fetch_in_batch(query, batch_size = 1000, &block)
357
359
  raise "No block given for the batch fetch request!" unless block_given?
358
-
360
+
359
361
  # Execute the query and check the result
360
362
  exec_result = execute(query)
361
363
  raise_error_if_failed!(exec_result)
@@ -433,13 +435,13 @@ module RBHive
433
435
  hasResultSet: false
434
436
  )
435
437
  end
436
-
438
+
437
439
  def prepare_cancel_request(handles)
438
440
  Hive2::Thrift::TCancelOperationReq.new(
439
441
  operationHandle: prepare_operation_handle(handles)
440
442
  )
441
443
  end
442
-
444
+
443
445
  def validate_handles!(handles)
444
446
  unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
445
447
  raise "Invalid handles hash: #{handles.inspect}"