sequel-impala 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +45 -0
  3. data/lib/impala.rb +14 -6
  4. data/lib/impala/connection.rb +46 -23
  5. data/lib/impala/cursor.rb +48 -4
  6. data/lib/impala/progress_reporter.rb +40 -0
  7. data/lib/impala/protocol/beeswax_constants.rb +1 -1
  8. data/lib/impala/protocol/beeswax_service.rb +1 -20
  9. data/lib/impala/protocol/beeswax_types.rb +1 -1
  10. data/lib/impala/protocol/exec_stats_constants.rb +13 -0
  11. data/lib/impala/protocol/exec_stats_types.rb +133 -0
  12. data/lib/impala/protocol/facebook_service.rb +3 -3
  13. data/lib/impala/protocol/fb303_constants.rb +1 -1
  14. data/lib/impala/protocol/fb303_types.rb +1 -1
  15. data/lib/impala/protocol/hive_metastore_constants.rb +1 -1
  16. data/lib/impala/protocol/hive_metastore_types.rb +1 -1
  17. data/lib/impala/protocol/impala_hive_server2_service.rb +111 -3
  18. data/lib/impala/protocol/impala_service.rb +67 -1
  19. data/lib/impala/protocol/impala_service_constants.rb +1 -1
  20. data/lib/impala/protocol/impala_service_types.rb +109 -7
  21. data/lib/impala/protocol/status_constants.rb +1 -1
  22. data/lib/impala/protocol/status_types.rb +1 -1
  23. data/lib/impala/protocol/t_c_l_i_service.rb +884 -724
  24. data/lib/impala/protocol/t_c_l_i_service_constants.rb +72 -0
  25. data/lib/impala/protocol/t_c_l_i_service_types.rb +1799 -0
  26. data/lib/impala/protocol/thrift_hive_metastore.rb +1 -1
  27. data/lib/impala/protocol/types_constants.rb +13 -0
  28. data/lib/impala/protocol/types_types.rb +332 -0
  29. data/lib/impala/sasl_transport.rb +117 -0
  30. data/lib/impala/thrift_patch.rb +42 -0
  31. data/lib/rbhive/connection.rb +25 -25
  32. data/lib/rbhive/explain_result.rb +9 -9
  33. data/lib/rbhive/schema_definition.rb +12 -12
  34. data/lib/rbhive/t_c_l_i_connection.rb +28 -26
  35. data/lib/rbhive/t_c_l_i_schema_definition.rb +1 -1
  36. data/lib/rbhive/table_schema.rb +1 -1
  37. data/lib/sequel/adapters/impala.rb +63 -6
  38. data/lib/sequel/adapters/jdbc/hive2.rb +1 -1
  39. data/lib/sequel/adapters/rbhive.rb +3 -2
  40. data/lib/sequel/adapters/shared/impala.rb +133 -25
  41. data/lib/thrift/sasl_client_transport.rb +2 -2
  42. data/lib/thrift/thrift_hive.rb +2 -2
  43. data/lib/thrift/thrift_hive_metastore.rb +2 -2
  44. data/spec/dataset_test.rb +85 -85
  45. data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  46. data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +1 -1
  47. data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +1 -1
  48. data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +1 -1
  49. data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +1 -1
  50. data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +1 -1
  51. data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  52. data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +1 -1
  53. data/spec/files/integer_migrations/001_create_sessions.rb +1 -1
  54. data/spec/files/integer_migrations/002_create_nodes.rb +1 -1
  55. data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +1 -1
  56. data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +1 -1
  57. data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +1 -1
  58. data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +1 -1
  59. data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +1 -1
  60. data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +1 -1
  61. data/spec/migrator_test.rb +2 -2
  62. data/spec/prepared_statement_test.rb +12 -12
  63. data/spec/schema_test.rb +6 -6
  64. data/spec/type_test.rb +8 -8
  65. metadata +30 -11
  66. data/CHANGELOG +0 -19
  67. data/lib/impala/protocol/cli_service_constants.rb +0 -60
  68. data/lib/impala/protocol/cli_service_types.rb +0 -1452
@@ -0,0 +1,42 @@
+require 'socket'
+
+module Thrift
+  module KeepAlive
+    # We'll override #open so that once the socket is opened
+    # we enable keepalive on it
+    #
+    # Many queries are going to take a long time (10s of minutes) to complete
+    # and we don't want the connection to close while we wait for the
+    # query to return.
+    #
+    # Unfortunately, Thrift doesn't supply an easy way to get to the
+    # socket that it opens to communicate with Impala.
+    #
+    # I figured that while I was in here, monkey-patching a way to get
+    # to the socket, I might as well just enable keepalive here
+    # instead.
+    def open
+      super
+      s = @transport.handle
+      s.setsockopt(::Socket::SOL_SOCKET, ::Socket::SO_KEEPALIVE, true)
+
+      # Apparently Mac OS X (Darwin) doesn't implement the SOL_TCP options below
+      # so we'll hope keep alive works under Mac OS X, but in production
+      # we Dockerize Jigsaw, so these options should be available when
+      # we're running on Linux
+      if defined? ::Socket::SOL_TCP
+        s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPIDLE, 60)
+        s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPINTVL, 10)
+        s.setsockopt(::Socket::SOL_TCP, ::Socket::TCP_KEEPCNT, 5)
+      end
+    end
+  end
+
+  class BufferedTransport
+    prepend KeepAlive
+  end
+
+  class ImpalaSaslClientTransport
+    prepend KeepAlive
+  end
+end
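This new file relies on Module#prepend so the overriding #open can call the transport's original implementation via super before flipping SO_KEEPALIVE on the underlying socket. A minimal, self-contained sketch of the same pattern (ToyTransport, KeepAliveDemo and the @socket variable are illustrative stand-ins, not part of the gem):

    require 'socket'

    # Illustrative only: prepend a module so #open can call the original via super,
    # then enable keepalive on the socket the original #open created.
    module KeepAliveDemo
      def open
        super                                   # run the original #open first
        @socket.setsockopt(::Socket::SOL_SOCKET, ::Socket::SO_KEEPALIVE, true)
        @socket
      end
    end

    class ToyTransport
      def initialize(host, port)
        @host = host
        @port = port
      end

      def open
        @socket = TCPSocket.new(@host, @port)   # the prepended #open sees this via @socket
      end
    end

    ToyTransport.prepend(KeepAliveDemo)

    # sock = ToyTransport.new('impala.example.com', 21000).open
    # sock.getsockopt(:SOCKET, :KEEPALIVE).bool  # => true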
@@ -19,18 +19,18 @@ module RBHive
  end
  end
  module_function :connect
-
+
  class StdOutLogger
- %w(fatal error warn info debug).each do |level|
+ %w(fatal error warn info debug).each do |level|
  define_method level.to_sym do |message|
  STDOUT.puts(message)
  end
  end
  end
-
+
  class Connection
  attr_reader :client
-
+
  def initialize(server, port=10_000, logger=StdOutLogger.new)
  @socket = Thrift::Socket.new(server, port)
  @transport = Thrift::BufferedTransport.new(@socket)
@@ -40,43 +40,43 @@ module RBHive
  @logger.info("Connecting to #{server} on port #{port}")
  @mutex = Mutex.new
  end
-
+
  def open
  @transport.open
  end
-
+
  def close
  @transport.close
  end
-
+
  def client
  @client
  end
-
+
  def execute(query)
  execute_safe(query)
  end
-
+
  def explain(query)
  safe do
  execute_unsafe("EXPLAIN "+ query)
  ExplainResult.new(client.fetchAll)
  end
  end
-
+
  def priority=(priority)
  set("mapred.job.priority", priority)
  end
-
+
  def queue=(queue)
  set("mapred.job.queue.name", queue)
  end
-
+
  def set(name,value)
  @logger.info("Setting #{name}=#{value}")
  client.execute("SET #{name}=#{value}")
  end
-
+
  def fetch(query)
  safe do
  execute_unsafe(query)
@@ -85,7 +85,7 @@ module RBHive
  ResultSet.new(rows, the_schema)
  end
  end
-
+
  def fetch_in_batch(query, batch_size=1_000)
  safe do
  execute_unsafe(query)
@@ -95,7 +95,7 @@ module RBHive
  end
  end
  end
-
+
  def first(query)
  safe do
  execute_unsafe(query)
@@ -104,43 +104,43 @@ module RBHive
  ResultSet.new([row], the_schema).first
  end
  end
-
+
  def schema(example_row=[])
  safe { SchemaDefinition.new(client.getSchema, example_row) }
  end
-
+
  def create_table(schema)
  execute(schema.create_table_statement)
  end
-
+
  def drop_table(name)
  name = name.name if name.is_a?(TableSchema)
  execute("DROP TABLE `#{name}`")
  end
-
+
  def replace_columns(schema)
  execute(schema.replace_columns_statement)
  end
-
+
  def add_columns(schema)
  execute(schema.add_columns_statement)
  end
-
+
  def method_missing(meth, *args)
  client.send(meth, *args)
  end
-
+
  private
-
+
  def execute_safe(query)
  safe { execute_unsafe(query) }
  end
-
+
  def execute_unsafe(query)
  @logger.info("Executing Hive Query: #{query}")
  client.execute(query)
  end
-
+
  def safe
  ret = nil
  @mutex.synchronize { ret = yield }
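The hunks above are whitespace-only cleanups; the synchronous Connection API (execute, fetch, set, and friends) is unchanged. A hedged usage sketch, assuming the vendored RBHive.connect behaves like upstream rbhive's block-yielding helper (host, port and query are placeholders):

    # Illustrative only: RBHive.connect is assumed to open the transport,
    # yield the connection and close it when the block returns.
    RBHive.connect('hive.example.com', 10_000) do |connection|
      connection.set('mapred.job.queue.name', 'adhoc')   # wraps "SET name=value"
      result = connection.fetch('SELECT country, COUNT(*) FROM users GROUP BY country')
      result.each { |row| puts row.inspect }             # ResultSet behaves like an array of rows
    end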
@@ -2,33 +2,33 @@ class ExplainResult
  def initialize(rows)
  @rows = rows
  end
-
+
  def ast
  by_section[:abstract_syntax_tree].first
  end
-
+
  def stage_count
  stage_dependencies.length
  end
-
+
  def stage_dependencies
  by_section[:stage_dependencies] || []
  end
-
+
  def to_tsv
  @rows.join("\n")
  end
-
+
  def raw
  @rows
  end
-
+
  def to_s
  to_tsv
  end
-
+
  private
-
+
  def by_section
  current_section = nil
  @rows.inject({}) do |sections, row|
@@ -43,4 +43,4 @@ class ExplainResult
  sections
  end
  end
- end
+ end
@@ -3,10 +3,10 @@ require 'json'
  module RBHive
  class SchemaDefinition
  attr_reader :schema
-
+
  NAN = Float::NAN rescue 0.0/0.0
  INFINITY = Float::INFINITY rescue 1.0/0.0
- TYPES = {
+ TYPES = {
  :boolean => :to_s,
  :string => :to_s,
  :bigint => :to_i,
@@ -16,16 +16,16 @@ module RBHive
  :smallint => :to_i,
  :tinyint => :to_i,
  }
-
+
  def initialize(schema, example_row)
  @schema = schema
  @example_row = example_row ? example_row.split("\t") : []
  end
-
+
  def column_names
  @column_names ||= begin
  schema_names = @schema.fieldSchemas.map {|c| c.name }
-
+
  # In rare cases Hive can return two identical column names
  # consider SELECT a.foo, b.foo...
  # in this case you get two columns called foo with no disambiguation.
@@ -36,7 +36,7 @@ module RBHive
  schema_names.map! { |c| s[c] += 1; s[c] > 1 ? "#{c}---|---#{s[c]}" : c }
  schema_names.map! { |c| s[c] > 1 ? "#{c}---|---1" : c }
  schema_names.map! { |c| c.gsub('---|---', '_').to_sym }
-
+
  # Lets fix the fact that Hive doesn't return schema data for partitions on SELECT * queries
  # For now we will call them :_p1, :_p2, etc. to avoid collisions.
  offset = 0
@@ -46,23 +46,23 @@ module RBHive
  schema_names
  end
  end
-
+
  def column_type_map
- @column_type_map ||= column_names.inject({}) do |hsh, c|
+ @column_type_map ||= column_names.inject({}) do |hsh, c|
  definition = @schema.fieldSchemas.find {|s| s.name.to_sym == c }
  # If the column isn't in the schema (eg partitions in SELECT * queries) assume they are strings
  hsh[c] = definition ? definition.type.to_sym : :string
  hsh
  end
  end
-
+
  def coerce_row(row)
  column_names.zip(row.split("\t")).inject({}) do |hsh, (column_name, value)|
  hsh[column_name] = coerce_column(column_name, value)
  hsh
  end
  end
-
+
  def coerce_column(column_name, value)
  type = column_type_map[column_name]
  return INFINITY if (type != :string && value == "Infinity")
@@ -71,11 +71,11 @@ module RBHive
  conversion_method = TYPES[type]
  conversion_method ? value.send(conversion_method) : value
  end
-
+
  def coerce_row_to_array(row)
  column_names.map { |n| row[n] }
  end
-
+
  def coerce_complex_value(value)
  return nil if value.nil?
  return nil if value.length == 0
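The coercion path shown above maps each Hive column type to a Ruby conversion method and sends it to the raw string value. A reduced, standalone illustration of that idea (the TYPES subset, column map and sample row are invented for the example):

    # Illustrative only: a cut-down version of the TYPES lookup used by
    # SchemaDefinition#coerce_column, applied to one tab-separated row.
    TYPES   = { :string => :to_s, :bigint => :to_i, :double => :to_f }
    columns = { :name => :string, :visits => :bigint, :score => :double }
    raw_row = "alice\t42\t3.14"

    coerced = columns.keys.zip(raw_row.split("\t")).map do |column, value|
      conversion = TYPES[columns[column]]
      [column, conversion ? value.send(conversion) : value]
    end.to_h

    # => {:name=>"alice", :visits=>42, :score=>3.14}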
@@ -30,7 +30,7 @@ module Thrift
  end

  module RBHive
-
+
  HIVE_THRIFT_MAPPING = {
  10 => 0,
  11 => 1,
@@ -85,11 +85,11 @@ module RBHive
  options ||= {} # backwards compatibility
  raise "'options' parameter must be a hash" unless options.is_a?(Hash)
  @sasl_params = options.delete(:sasl_params) || {}
-
+
  if options[:transport] == :sasl and @sasl_params.empty?
  raise ":transport is set to :sasl, but no :sasl_params option was supplied"
  end
-
+
  # Defaults to buffered transport, Hive 0.10, 1800 second timeout
  options[:transport] ||= :buffered
  options[:hive_version] ||= 10
@@ -97,7 +97,7 @@ module RBHive
  @options = options
  # Look up the appropriate Thrift protocol version for the supplied Hive version
  @thrift_protocol_version = thrift_hive_protocol(options[:hive_version])
-
+
  @logger = logger
  @transport = thrift_transport(server, port)
  @protocol = Thrift::BinaryProtocol.new(@transport)
@@ -105,11 +105,11 @@ module RBHive
  @session = nil
  @logger.info("Connecting to HiveServer2 #{server} on port #{port}")
  end
-
+
  def thrift_hive_protocol(version)
  HIVE_THRIFT_MAPPING[version] || raise("Invalid Hive version")
  end
-
+
  def thrift_transport(server, port)
  @logger.info("Initializing transport #{@options[:transport]}")
  case @options[:transport]
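Per the constructor above, :transport defaults to :buffered, :hive_version defaults to 10, and requesting :sasl without :sasl_params raises. A hedged example of an options hash that passes those checks (the credential keys are placeholders; the SASL transport defines the exact parameter names it expects):

    # Illustrative only: values that satisfy the validation in the constructor above.
    options = {
      transport:    :sasl,
      sasl_params:  { username: 'hive_user', password: 'secret' },  # must be non-empty when transport is :sasl
      hive_version: 12                                              # looked up in HIVE_THRIFT_MAPPING
    }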
@@ -188,7 +188,7 @@ module RBHive
  @logger.info("Setting #{name}=#{value}")
  self.execute("SET #{name}=#{value}")
  end
-
+
  # Async execute
  def async_execute(query)
  @logger.info("Executing query asynchronously: #{query}")
@@ -204,35 +204,35 @@ module RBHive

  # Return handles to get hold of this query / session again
  {
- session: @session.sessionHandle,
- guid: op_handle.operationId.guid,
+ session: @session.sessionHandle,
+ guid: op_handle.operationId.guid,
  secret: op_handle.operationId.secret
  }
  end
-
+
  # Is the query complete?
  def async_is_complete?(handles)
  async_state(handles) == :finished
  end
-
+
  # Is the query actually running?
  def async_is_running?(handles)
  async_state(handles) == :running
  end
-
+
  # Has the query failed?
  def async_is_failed?(handles)
  async_state(handles) == :error
  end
-
+
  def async_is_cancelled?(handles)
  async_state(handles) == :cancelled
  end
-
+
  def async_cancel(handles)
  @client.CancelOperation(prepare_cancel_request(handles))
  end
-
+
  # Map states to symbols
  def async_state(handles)
  response = @client.GetOperationStatus(
@@ -262,18 +262,18 @@ module RBHive
  return :state_not_in_protocol
  end
  end
-
+
  # Async fetch results from an async execute
  def async_fetch(handles, max_rows = 100)
  # Can't get data from an unfinished query
  unless async_is_complete?(handles)
  raise "Can't perform fetch on a query in state: #{async_state(handles)}"
  end
-
+
  # Fetch and
  fetch_rows(prepare_operation_handle(handles), :first, max_rows)
  end
-
+
  # Performs a query on the server, fetches the results in batches of *batch_size* rows
  # and yields the result batches to a given block as arrays of rows.
  def async_fetch_in_batch(handles, batch_size = 1000, &block)
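The async API cleaned up above returns a handles hash (session:, guid:, secret:) from async_execute, which the other async_* helpers accept. A hedged polling sketch, assuming the vendored RBHive module exposes the same block-yielding tcli_connect helper as upstream rbhive (host, port and query are placeholders):

    # Illustrative polling loop only; error handling kept minimal on purpose.
    RBHive.tcli_connect('hiveserver2.example.com', 10_000, transport: :buffered) do |conn|
      handles = conn.async_execute('SELECT COUNT(*) FROM events')

      # Poll until the server reports a terminal state.
      sleep 5 until conn.async_is_complete?(handles) || conn.async_is_failed?(handles)

      if conn.async_is_failed?(handles)
        warn "query failed: #{conn.async_state(handles)}"
      else
        rows = conn.async_fetch(handles, 500)   # raises unless the query is finished
        puts rows.length
      end

      conn.async_close_session(handles)
    end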
@@ -290,7 +290,7 @@ module RBHive
  yield rows
  end
  end
-
+
  def async_close_session(handles)
  validate_handles!(handles)
  @client.CloseSession(Hive2::Thrift::TCloseSessionReq.new( sessionHandle: handles[:session] ))
@@ -311,14 +311,16 @@ module RBHive
  vals = row.colVals
  cols.each do |i, col, conv|
  v = vals[i].get_value.value
- h[col] = conv ? conv[v] : v
+ h[col] = unless v.nil?
+ conv ? conv[v] : v
+ end
  end
  yield h
  end
  rows = fetch_rows(op_handle, :next)
  end
  end
-
+
  # Pull rows from the query result
  def fetch_rows(op_handle, orientation = :first, max_rows = 1000)
  fetch_req = prepare_fetch_results(op_handle, orientation, max_rows)
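The substantive change in this hunk is the new nil guard: a NULL column value now stays nil instead of being passed to the column's conversion proc. A reduced illustration of the difference (the lambda stands in for the conv proc built elsewhere in the class):

    # Illustrative only: why the nil guard matters for NULL columns.
    conv = ->(v) { Integer(v) }   # stand-in for a column conversion proc
    v = nil                       # a NULL value coming back from HiveServer2

    # Before: the proc runs on nil and raises.
    # conv ? conv[v] : v          # => TypeError: can't convert nil into Integer

    # After: NULL stays nil and the proc only runs on real values.
    value = unless v.nil?
      conv ? conv[v] : v
    end
    value                         # => nil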
@@ -327,7 +329,7 @@ module RBHive
  fetch_results.results.rows
  #TCLIResultSet.new(rows, TCLISchemaDefinition.new(get_schema_for(op_handle), rows.first))
  end
-
+
  # Performs a explain on the supplied query on the server, returns it as a ExplainResult.
  # (Only works on 0.12 if you have this patch - https://issues.apache.org/jira/browse/HIVE-5492)
  def explain(query)
@@ -346,7 +348,7 @@ module RBHive

  # Get search operation handle to fetch the results
  op_handle = exec_result.operationHandle
-
+
  # Fetch the rows
  fetch_rows(op_handle, :first, max_rows)
  end
@@ -355,7 +357,7 @@ module RBHive
  # and yields the result batches to a given block as arrays of rows.
  def fetch_in_batch(query, batch_size = 1000, &block)
  raise "No block given for the batch fetch request!" unless block_given?
-
+
  # Execute the query and check the result
  exec_result = execute(query)
  raise_error_if_failed!(exec_result)
@@ -433,13 +435,13 @@ module RBHive
  hasResultSet: false
  )
  end
-
+
  def prepare_cancel_request(handles)
  Hive2::Thrift::TCancelOperationReq.new(
  operationHandle: prepare_operation_handle(handles)
  )
  end
-
+
  def validate_handles!(handles)
  unless handles.has_key?(:guid) and handles.has_key?(:secret) and handles.has_key?(:session)
  raise "Invalid handles hash: #{handles.inspect}"