impala 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # impala-ruby
2
2
 
3
- This is an ruby client for [Cloudera's Impala][1]. You use it like this:
3
+ This is a ruby client for [Cloudera Impala][1]. You use it like this:
4
4
 
5
5
  ```ruby
6
6
  require 'impala'
data/impala.gemspec CHANGED
@@ -13,6 +13,8 @@ Gem::Specification.new do |gem|
13
13
  gem.homepage = "https://github.com/colinmarc/impala-ruby"
14
14
 
15
15
  gem.add_dependency('thrift', '~> 0.9.1')
16
+ gem.add_dependency('rack')
17
+ gem.add_dependency('thin')
16
18
 
17
19
  gem.add_development_dependency('rake')
18
20
  gem.add_development_dependency('pry')
data/lib/impala.rb CHANGED
@@ -14,7 +14,7 @@ require 'impala/cursor'
14
14
  require 'impala/connection'
15
15
 
16
16
  module Impala
17
- KNOWN_COMMANDS = ['select', 'insert', 'show', 'describe', 'use', 'explain', 'create', 'drop']
17
+ KNOWN_COMMANDS = ['select', 'insert', 'show', 'describe', 'use', 'explain', 'create', 'drop', 'invalidate', 'with']
18
18
  DEFAULT_HOST = 'localhost'
19
19
  DEFAULT_PORT = 21000
20
20
  class InvalidQueryError < StandardError; end
@@ -2,7 +2,7 @@ module Impala
2
2
  # This object represents a connection to an Impala server. It can be used to
3
3
  # perform queries on the database.
4
4
  class Connection
5
- SLEEP_INTERVAL = 0.1
5
+ LOG_CONTEXT_ID = "impala-ruby"
6
6
 
7
7
  # Don't instantiate Connections directly; instead, use {Impala.connect}.
8
8
  def initialize(host, port)
@@ -43,7 +43,7 @@ module Impala
43
43
  @connected
44
44
  end
45
45
 
46
- # Refresh the metadata store
46
+ # Refresh the metadata store.
47
47
  def refresh
48
48
  raise ConnectionError.new("Connection closed") unless open?
49
49
  @service.ResetCatalog
@@ -53,21 +53,27 @@ module Impala
53
53
  # load the entire result set into memory, so if you're dealing with lots
54
54
  # of rows, {#execute} may work better.
55
55
  # @param [String] query the query you want to run
56
+ # @param [Hash] query_options the user and configuration options for the query;
57
+ # for keys other than :user, see TImpalaQueryOptions in ImpalaService.thrift
58
+ # @option query_options [String] :user the user who runs the query
56
59
  # @return [Array<Hash>] an array of hashes, one for each row.
57
- def query(raw_query)
58
- execute(raw_query).fetch_all
60
+ def query(raw_query, query_options = {})
61
+ execute(raw_query, query_options).fetch_all
59
62
  end
60
63
 
61
64
  # Perform a query and return a cursor for iterating over the results.
62
65
  # @param [String] query the query you want to run
66
+ # @param [Hash] query_options the user and configuration options for the query;
67
+ # for keys other than :user, see TImpalaQueryOptions in ImpalaService.thrift
68
+ # @option query_options [String] :user the user who runs the query
63
69
  # @return [Cursor] a cursor for the result rows
64
- def execute(raw_query)
70
+ def execute(raw_query, query_options = {})
65
71
  raise ConnectionError.new("Connection closed") unless open?
66
72
 
67
73
  query = sanitize_query(raw_query)
68
- handle = send_query(query)
74
+ handle = send_query(query, query_options)
69
75
 
70
- wait_for_result(handle)
76
+ check_result(handle)
71
77
  Cursor.new(handle, @service)
72
78
  end
73
79
 
@@ -85,25 +91,23 @@ module Impala
85
91
  ([command] + words[1..-1]).join(' ')
86
92
  end
87
93
 
88
- def send_query(sanitized_query)
94
+ def send_query(sanitized_query, query_options)
89
95
  query = Protocol::Beeswax::Query.new
90
96
  query.query = sanitized_query
91
97
 
92
- @service.query(query)
98
+ query.hadoop_user = query_options.delete(:user) if query_options[:user]
99
+ query.configuration = query_options.map do |key, value|
100
+ "#{key.upcase}=#{value}"
101
+ end
102
+
103
+ @service.executeAndWait(query, LOG_CONTEXT_ID)
93
104
  end
94
105
 
95
- def wait_for_result(handle)
96
- #TODO select here, or something
97
- while true
98
- state = @service.get_state(handle)
99
- if state == Protocol::Beeswax::QueryState::FINISHED
100
- break
101
- elsif state == Protocol::Beeswax::QueryState::EXCEPTION
102
- close_handle(handle)
103
- raise ConnectionError.new("The query was aborted")
104
- end
105
-
106
- sleep(SLEEP_INTERVAL)
106
+ def check_result(handle)
107
+ state = @service.get_state(handle)
108
+ if state == Protocol::Beeswax::QueryState::EXCEPTION
109
+ close_handle(handle)
110
+ raise ConnectionError.new("The query was aborted")
107
111
  end
108
112
  rescue
109
113
  close_handle(handle)
@@ -1,3 +1,3 @@
1
1
  module Impala
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/test/test_impala.rb CHANGED
@@ -46,7 +46,7 @@ describe Impala::Connection do
46
46
  end
47
47
  end
48
48
 
49
- describe '#wait_for_result' do
49
+ describe '#check_result' do
50
50
  before do
51
51
  Impala::Connection.any_instance.stubs(:open)
52
52
  @connection = Impala::Connection.new('test', 1234)
@@ -58,7 +58,46 @@ describe Impala::Connection do
58
58
  handle = stub()
59
59
  @service.expects(:close).with(handle).once
60
60
  @service.expects(:get_state).raises(StandardError)
61
- assert_raises(StandardError) { @connection.send(:wait_for_result, handle) }
61
+ assert_raises(StandardError) { @connection.send(:check_result, handle) }
62
62
  end
63
63
  end
64
- end
64
+
65
+ describe '#execute' do
66
+ before do
67
+ Impala::Connection.any_instance.stubs(:open)
68
+ Impala::Cursor.stubs(:new)
69
+ @connection = Impala::Connection.new('test', 1234)
70
+ @connection.stubs(:open? => true, :sanitize_query => 'sanitized_query', :check_result => nil)
71
+ end
72
+
73
+ it 'should call Protocol::ImpalaService::Client#executeAndWait with the sanitized query' do
74
+ query = Impala::Protocol::Beeswax::Query.new
75
+ query.query = 'sanitized_query'
76
+ query.configuration = []
77
+
78
+ @service = stub()
79
+ @service.expects(:executeAndWait).with(query, Impala::Connection::LOG_CONTEXT_ID).once
80
+ @connection.instance_variable_set('@service', @service)
81
+
82
+ @connection.execute('query')
83
+ end
84
+
85
+ it 'should call Protocol::ImpalaService::Client#executeAndWait with the hadoop_user and configuration if passed as parameter' do
86
+ query = Impala::Protocol::Beeswax::Query.new
87
+ query.query = 'sanitized_query'
88
+ query.hadoop_user = 'impala'
89
+ query.configuration = %w|NUM_SCANNER_THREADS=8 MEM_LIMIT=3221225472|
90
+
91
+ @service = stub()
92
+ @service.expects(:executeAndWait).with(query, Impala::Connection::LOG_CONTEXT_ID).once
93
+ @connection.instance_variable_set('@service', @service)
94
+
95
+ opt = {
96
+ :user => 'impala',
97
+ :num_scanner_threads => 8,
98
+ :mem_limit => 3221225472
99
+ }
100
+ @connection.execute('query', opt)
101
+ end
102
+ end
103
+ end
@@ -16,140 +16,171 @@ def connect
16
16
  Impala.connect(host, port)
17
17
  end
18
18
 
19
- describe 'basic connected tests' do
19
+ describe 'connected tests' do
20
20
  before do
21
21
  skip unless IMPALA_SERVER
22
22
  @connection = connect
23
23
  end
24
24
 
25
- it 'can connect' do
26
- assert_instance_of(Impala::Connection, @connection)
27
- assert(@connection.open?, "the connection should be open")
28
- end
25
+ describe 'basic tests' do
26
+ it 'can connect' do
27
+ assert_instance_of(Impala::Connection, @connection)
28
+ assert(@connection.open?, "the connection should be open")
29
+ end
29
30
 
30
- it 'can refresh the catalog' do
31
- @connection.refresh
32
- end
31
+ it 'can refresh the catalog' do
32
+ @connection.refresh
33
+ end
33
34
 
34
- it 'can run a basic query' do
35
- ret = @connection.query('SELECT "foo" AS foo')
36
- assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
37
- end
35
+ it 'can refresh metadata' do
36
+ @connection.query('invalidate metadata')
37
+ end
38
38
 
39
- it 'can handle boolean values' do
40
- ret = @connection.query('SELECT TRUE AS foo')
41
- assert_equal([{:foo=>true}], ret, "the result should be a bool")
42
- end
39
+ it 'can run a basic query' do
40
+ ret = @connection.query('SELECT "foo" AS foo')
41
+ assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
42
+ end
43
43
 
44
- it 'can handle double values' do
45
- ret = @connection.query("SELECT 1.23 AS foo")
46
- assert_equal([{:foo=>1.23}], ret, "the result should be a float")
47
- end
44
+ it 'can run a basic query with some query options as specified user' do
45
+ ret = @connection.query('SELECT "foo" AS foo',
46
+ :user => 'someoneelse',
47
+ :mem_limit => 1234567890,
48
+ :max_scan_range_length => 1024 * 1024 * 1024)
49
+ assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
50
+ end
48
51
 
49
- it 'can handle float values' do
50
- ret = @connection.query("SELECT CAST(1.23 AS float) as foo")
51
- assert_instance_of(Float, ret.first[:foo], "the result should be a float")
52
- end
52
+ it 'can handle boolean values' do
53
+ ret = @connection.query('SELECT TRUE AS foo')
54
+ assert_equal([{:foo=>true}], ret, "the result should be a bool")
55
+ end
53
56
 
54
- it 'can handle timestamp values' do
55
- ret = @connection.query("SELECT NOW() AS foo")
56
- assert_instance_of(Time, ret.first[:foo])
57
- end
57
+ it 'can handle double values' do
58
+ ret = @connection.query('SELECT 1.23 AS foo')
59
+ assert_equal([{:foo=>1.23}], ret, "the result should be a float")
60
+ end
58
61
 
59
- it 'can successfully refresh the metadata store' do
60
- ret = @connection.refresh
61
- end
62
- end
62
+ it 'can handle float values' do
63
+ ret = @connection.query('SELECT CAST(1.23 AS float) as foo')
64
+ assert_instance_of(Float, ret.first[:foo], "the result should be a float")
65
+ end
63
66
 
64
- describe 'with a test database' do
65
- before do
66
- @database = '_impala_ruby_test'
67
- @connection.query("CREATE DATABASE IF NOT EXISTS #{@database}")
68
- end
67
+ it 'can handle timestamp values' do
68
+ ret = @connection.query('SELECT NOW() AS foo')
69
+ assert_instance_of(Time, ret.first[:foo], "the result should be a timestamp")
70
+ end
69
71
 
70
- after do
71
- @connection.query('DROP DATABASE IF EXISTS _impala_ruby_test')
72
- end
72
+ it 'can handle null values' do
73
+ ret = @connection.query('SELECT NULL AS nothing')
74
+ assert_equal(nil, ret.first[:nothing], "the result should be nil")
75
+ end
73
76
 
74
- it 'can use the database' do
75
- @connection.query("USE #{@database}")
76
- @connection.query("USE default")
77
+ it 'can successfully refresh the metadata store' do
78
+ ret = @connection.refresh
79
+ end
77
80
  end
78
81
 
79
- describe 'and a test table' do
82
+ describe 'with a test database' do
80
83
  before do
81
- @table = "#{@database}.foobar"
82
- @connection.query("CREATE TABLE #{@table} (i INT)")
84
+ @database = '_impala_ruby_test'
85
+ @connection.query("CREATE DATABASE IF NOT EXISTS #{@database}")
83
86
  end
84
87
 
85
88
  after do
86
- @connection.query("DROP TABLE #{@table}")
89
+ @connection.query("DROP DATABASE IF EXISTS #{@database}") if @connection
87
90
  end
88
91
 
89
- it 'deals with empty tables correctly when using #query' do
90
- res = @connection.query("SELECT * FROM #{@table}")
91
- assert_equal([], res, "the result set should be empty")
92
+ it 'can use the database' do
93
+ @connection.query("USE #{@database}")
94
+ @connection.query("USE default")
92
95
  end
93
96
 
94
- it 'deals with empty tables correctly when using a cursor' do
95
- cursor = @connection.execute("SELECT * FROM #{@table}")
96
- assert_equal(false, cursor.has_more?, "has_more? should be false")
97
- assert_nil(cursor.fetch_row, "calls to fetch_row should be nil")
98
- end
99
-
100
- describe 'with data' do
97
+ describe 'and a test table' do
101
98
  before do
102
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
103
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
104
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
99
+ @table = "#{@database}.foobar"
100
+ @connection.query("CREATE TABLE #{@table} (i INT)")
105
101
  end
106
102
 
107
- it 'can insert into the table' do
108
- @connection.query("INSERT INTO #{@table} (i) SELECT 2")
103
+ after do
104
+ @connection.query("DROP TABLE #{@table}") if @connection
109
105
  end
110
106
 
111
- it 'can select from the table using #query' do
107
+ it 'deals with empty tables correctly when using #query' do
112
108
  res = @connection.query("SELECT * FROM #{@table}")
113
- assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
109
+ assert_equal([], res, "the result set should be empty")
114
110
  end
115
111
 
116
- it 'can create a cursor and fetch one row at a time' do
112
+ it 'deals with empty tables correctly when using a cursor' do
117
113
  cursor = @connection.execute("SELECT * FROM #{@table}")
118
- assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
114
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
115
+ assert_nil(cursor.fetch_row, "calls to fetch_row should be nil")
116
+ end
119
117
 
120
- 3.times do
121
- row = cursor.fetch_row
122
- assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
118
+ describe 'with data' do
119
+ before do
120
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
121
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
122
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
123
123
  end
124
124
 
125
- assert_equal(false, cursor.has_more?, "has_more? should be false")
126
- assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
127
- end
125
+ it 'can handle the keyword "with"' do
126
+ res = @connection.query("with bar as (select * from #{@table}) select * from bar")
127
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
128
+ end
128
129
 
129
- it 'can use a cursor to deal with lots of data' do
130
- 10.times { @connection.query("INSERT INTO #{@table} SELECT * FROM #{@table}") }
131
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
132
- count = @connection.query("SELECT COUNT(*) as n from #{@table}")[0][:n]
133
- assert(count > Impala::Cursor::BUFFER_SIZE) # otherwise the test is pointless
130
+ it 'can insert into the table' do
131
+ @connection.query("INSERT INTO #{@table} (i) SELECT 2")
132
+ end
134
133
 
135
- cursor = @connection.execute("SELECT * FROM #{@table}")
136
- assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
134
+ it 'can select from the table using #query' do
135
+ res = @connection.query("SELECT * FROM #{@table}")
136
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
137
+ end
138
+
139
+ it 'can create a cursor and fetch one row at a time' do
140
+ cursor = @connection.execute("SELECT * FROM #{@table}")
141
+ assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
142
+
143
+ 3.times do
144
+ row = cursor.fetch_row
145
+ assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
146
+ end
137
147
 
138
- # fetch one to fill the buffer
139
- row = cursor.fetch_row
140
- assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
148
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
149
+ assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
150
+ end
151
+
152
+ it 'can use a cursor to deal with lots of data' do
153
+ 10.times { @connection.query("INSERT INTO #{@table} SELECT * FROM #{@table}") }
154
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
155
+ count = @connection.query("SELECT COUNT(*) as n from #{@table}")[0][:n]
156
+ assert(count > Impala::Cursor::BUFFER_SIZE) # otherwise the test is pointless
141
157
 
142
- buffer_size = cursor.instance_variable_get('@row_buffer').size
143
- assert_equal(Impala::Cursor::BUFFER_SIZE - 1, buffer_size, "it should only buffer #{Impala::Cursor::BUFFER_SIZE} rows into memory")
158
+ cursor = @connection.execute("SELECT * FROM #{@table}")
159
+ assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
144
160
 
145
- (count - 1).times do
161
+ # fetch one to fill the buffer
146
162
  row = cursor.fetch_row
147
163
  assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
164
+
165
+ buffer_size = cursor.instance_variable_get('@row_buffer').size
166
+ assert_equal(Impala::Cursor::BUFFER_SIZE - 1, buffer_size, "it should only buffer #{Impala::Cursor::BUFFER_SIZE} rows into memory")
167
+
168
+ (count - 1).times do
169
+ row = cursor.fetch_row
170
+ assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
171
+ end
172
+
173
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
174
+ assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
148
175
  end
149
176
 
150
- assert_equal(false, cursor.has_more?, "has_more? should be false")
151
- assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
177
+ it 'can handle interspersed NULL values' do
178
+ @connection.query("INSERT INTO #{@table} (i) SELECT NULL")
179
+ res = @connection.query("SELECT * FROM #{@table} ORDER BY i DESC LIMIT 4")
180
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}, {:i => nil}], res)
181
+ end
152
182
  end
183
+
153
184
  end
154
185
  end
155
186
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: impala
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-17 00:00:00.000000000 Z
12
+ date: 2013-11-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: thrift
@@ -27,6 +27,38 @@ dependencies:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
29
  version: 0.9.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: rack
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: thin
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
30
62
  - !ruby/object:Gem::Dependency
31
63
  name: rake
32
64
  requirement: !ruby/object:Gem::Requirement