impala 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # impala-ruby
2
2
 
3
- This is an ruby client for [Cloudera's Impala][1]. You use it like this:
3
+ This is a ruby client for [Cloudera Impala][1]. You use it like this:
4
4
 
5
5
  ```ruby
6
6
  require 'impala'
data/impala.gemspec CHANGED
@@ -13,6 +13,8 @@ Gem::Specification.new do |gem|
13
13
  gem.homepage = "https://github.com/colinmarc/impala-ruby"
14
14
 
15
15
  gem.add_dependency('thrift', '~> 0.9.1')
16
+ gem.add_dependency('rack')
17
+ gem.add_dependency('thin')
16
18
 
17
19
  gem.add_development_dependency('rake')
18
20
  gem.add_development_dependency('pry')
data/lib/impala.rb CHANGED
@@ -14,7 +14,7 @@ require 'impala/cursor'
14
14
  require 'impala/connection'
15
15
 
16
16
  module Impala
17
- KNOWN_COMMANDS = ['select', 'insert', 'show', 'describe', 'use', 'explain', 'create', 'drop']
17
+ KNOWN_COMMANDS = ['select', 'insert', 'show', 'describe', 'use', 'explain', 'create', 'drop', 'invalidate', 'with']
18
18
  DEFAULT_HOST = 'localhost'
19
19
  DEFAULT_PORT = 21000
20
20
  class InvalidQueryError < StandardError; end
@@ -2,7 +2,7 @@ module Impala
2
2
  # This object represents a connection to an Impala server. It can be used to
3
3
  # perform queries on the database.
4
4
  class Connection
5
- SLEEP_INTERVAL = 0.1
5
+ LOG_CONTEXT_ID = "impala-ruby"
6
6
 
7
7
  # Don't instantiate Connections directly; instead, use {Impala.connect}.
8
8
  def initialize(host, port)
@@ -43,7 +43,7 @@ module Impala
43
43
  @connected
44
44
  end
45
45
 
46
- # Refresh the metadata store
46
+ # Refresh the metadata store.
47
47
  def refresh
48
48
  raise ConnectionError.new("Connection closed") unless open?
49
49
  @service.ResetCatalog
@@ -53,21 +53,27 @@ module Impala
53
53
  # load the entire result set into memory, so if you're dealing with lots
54
54
  # of rows, {#execute} may work better.
55
55
  # @param [String] query the query you want to run
56
+ # @param [Hash] query_options the user and query configuration options;
57
+ # for keys other than :user, see TImpalaQueryOptions in ImpalaService.thrift
58
+ # @option query_options [String] :user the user who runs the query
56
59
  # @return [Array<Hash>] an array of hashes, one for each row.
57
- def query(raw_query)
58
- execute(raw_query).fetch_all
60
+ def query(raw_query, query_options = {})
61
+ execute(raw_query, query_options).fetch_all
59
62
  end
60
63
 
61
64
  # Perform a query and return a cursor for iterating over the results.
62
65
  # @param [String] query the query you want to run
66
+ # @param [Hash] query_options the user and query configuration options;
67
+ # for keys other than :user, see TImpalaQueryOptions in ImpalaService.thrift
68
+ # @option query_options [String] :user the user who runs the query
63
69
  # @return [Cursor] a cursor for the result rows
64
- def execute(raw_query)
70
+ def execute(raw_query, query_options = {})
65
71
  raise ConnectionError.new("Connection closed") unless open?
66
72
 
67
73
  query = sanitize_query(raw_query)
68
- handle = send_query(query)
74
+ handle = send_query(query, query_options)
69
75
 
70
- wait_for_result(handle)
76
+ check_result(handle)
71
77
  Cursor.new(handle, @service)
72
78
  end
73
79
 
@@ -85,25 +91,23 @@ module Impala
85
91
  ([command] + words[1..-1]).join(' ')
86
92
  end
87
93
 
88
- def send_query(sanitized_query)
94
+ def send_query(sanitized_query, query_options)
89
95
  query = Protocol::Beeswax::Query.new
90
96
  query.query = sanitized_query
91
97
 
92
- @service.query(query)
98
+ query.hadoop_user = query_options.delete(:user) if query_options[:user]
99
+ query.configuration = query_options.map do |key, value|
100
+ "#{key.upcase}=#{value}"
101
+ end
102
+
103
+ @service.executeAndWait(query, LOG_CONTEXT_ID)
93
104
  end
94
105
 
95
- def wait_for_result(handle)
96
- #TODO select here, or something
97
- while true
98
- state = @service.get_state(handle)
99
- if state == Protocol::Beeswax::QueryState::FINISHED
100
- break
101
- elsif state == Protocol::Beeswax::QueryState::EXCEPTION
102
- close_handle(handle)
103
- raise ConnectionError.new("The query was aborted")
104
- end
105
-
106
- sleep(SLEEP_INTERVAL)
106
+ def check_result(handle)
107
+ state = @service.get_state(handle)
108
+ if state == Protocol::Beeswax::QueryState::EXCEPTION
109
+ close_handle(handle)
110
+ raise ConnectionError.new("The query was aborted")
107
111
  end
108
112
  rescue
109
113
  close_handle(handle)
@@ -1,3 +1,3 @@
1
1
  module Impala
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/test/test_impala.rb CHANGED
@@ -46,7 +46,7 @@ describe Impala::Connection do
46
46
  end
47
47
  end
48
48
 
49
- describe '#wait_for_result' do
49
+ describe '#check_result' do
50
50
  before do
51
51
  Impala::Connection.any_instance.stubs(:open)
52
52
  @connection = Impala::Connection.new('test', 1234)
@@ -58,7 +58,46 @@ describe Impala::Connection do
58
58
  handle = stub()
59
59
  @service.expects(:close).with(handle).once
60
60
  @service.expects(:get_state).raises(StandardError)
61
- assert_raises(StandardError) { @connection.send(:wait_for_result, handle) }
61
+ assert_raises(StandardError) { @connection.send(:check_result, handle) }
62
62
  end
63
63
  end
64
- end
64
+
65
+ describe '#execute' do
66
+ before do
67
+ Impala::Connection.any_instance.stubs(:open)
68
+ Impala::Cursor.stubs(:new)
69
+ @connection = Impala::Connection.new('test', 1234)
70
+ @connection.stubs(:open? => true, :sanitize_query => 'sanitized_query', :check_result => nil)
71
+ end
72
+
73
+ it 'should call Protocol::ImpalaService::Client#executeAndWait with the sanitized query' do
74
+ query = Impala::Protocol::Beeswax::Query.new
75
+ query.query = 'sanitized_query'
76
+ query.configuration = []
77
+
78
+ @service = stub()
79
+ @service.expects(:executeAndWait).with(query, Impala::Connection::LOG_CONTEXT_ID).once
80
+ @connection.instance_variable_set('@service', @service)
81
+
82
+ @connection.execute('query')
83
+ end
84
+
85
+ it 'should call Protocol::ImpalaService::Client#executeAndWait with the hadoop_user and configuration if passed as parameter' do
86
+ query = Impala::Protocol::Beeswax::Query.new
87
+ query.query = 'sanitized_query'
88
+ query.hadoop_user = 'impala'
89
+ query.configuration = %w|NUM_SCANNER_THREADS=8 MEM_LIMIT=3221225472|
90
+
91
+ @service = stub()
92
+ @service.expects(:executeAndWait).with(query, Impala::Connection::LOG_CONTEXT_ID).once
93
+ @connection.instance_variable_set('@service', @service)
94
+
95
+ opt = {
96
+ :user => 'impala',
97
+ :num_scanner_threads => 8,
98
+ :mem_limit => 3221225472
99
+ }
100
+ @connection.execute('query', opt)
101
+ end
102
+ end
103
+ end
@@ -16,140 +16,171 @@ def connect
16
16
  Impala.connect(host, port)
17
17
  end
18
18
 
19
- describe 'basic connected tests' do
19
+ describe 'connected tests' do
20
20
  before do
21
21
  skip unless IMPALA_SERVER
22
22
  @connection = connect
23
23
  end
24
24
 
25
- it 'can connect' do
26
- assert_instance_of(Impala::Connection, @connection)
27
- assert(@connection.open?, "the connection should be open")
28
- end
25
+ describe 'basic tests' do
26
+ it 'can connect' do
27
+ assert_instance_of(Impala::Connection, @connection)
28
+ assert(@connection.open?, "the connection should be open")
29
+ end
29
30
 
30
- it 'can refresh the catalog' do
31
- @connection.refresh
32
- end
31
+ it 'can refresh the catalog' do
32
+ @connection.refresh
33
+ end
33
34
 
34
- it 'can run a basic query' do
35
- ret = @connection.query('SELECT "foo" AS foo')
36
- assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
37
- end
35
+ it 'can refresh metadata' do
36
+ @connection.query('invalidate metadata')
37
+ end
38
38
 
39
- it 'can handle boolean values' do
40
- ret = @connection.query('SELECT TRUE AS foo')
41
- assert_equal([{:foo=>true}], ret, "the result should be a bool")
42
- end
39
+ it 'can run a basic query' do
40
+ ret = @connection.query('SELECT "foo" AS foo')
41
+ assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
42
+ end
43
43
 
44
- it 'can handle double values' do
45
- ret = @connection.query("SELECT 1.23 AS foo")
46
- assert_equal([{:foo=>1.23}], ret, "the result should be a float")
47
- end
44
+ it 'can run a basic query with some query options as specified user' do
45
+ ret = @connection.query('SELECT "foo" AS foo',
46
+ :user => 'someoneelse',
47
+ :mem_limit => 1234567890,
48
+ :max_scan_range_length => 1024 * 1024 * 1024)
49
+ assert_equal([{:foo=>'foo'}], ret, "the result should be a list of hashes")
50
+ end
48
51
 
49
- it 'can handle float values' do
50
- ret = @connection.query("SELECT CAST(1.23 AS float) as foo")
51
- assert_instance_of(Float, ret.first[:foo], "the result should be a float")
52
- end
52
+ it 'can handle boolean values' do
53
+ ret = @connection.query('SELECT TRUE AS foo')
54
+ assert_equal([{:foo=>true}], ret, "the result should be a bool")
55
+ end
53
56
 
54
- it 'can handle timestamp values' do
55
- ret = @connection.query("SELECT NOW() AS foo")
56
- assert_instance_of(Time, ret.first[:foo])
57
- end
57
+ it 'can handle double values' do
58
+ ret = @connection.query('SELECT 1.23 AS foo')
59
+ assert_equal([{:foo=>1.23}], ret, "the result should be a float")
60
+ end
58
61
 
59
- it 'can successfully refresh the metadata store' do
60
- ret = @connection.refresh
61
- end
62
- end
62
+ it 'can handle float values' do
63
+ ret = @connection.query('SELECT CAST(1.23 AS float) as foo')
64
+ assert_instance_of(Float, ret.first[:foo], "the result should be a float")
65
+ end
63
66
 
64
- describe 'with a test database' do
65
- before do
66
- @database = '_impala_ruby_test'
67
- @connection.query("CREATE DATABASE IF NOT EXISTS #{@database}")
68
- end
67
+ it 'can handle timestamp values' do
68
+ ret = @connection.query('SELECT NOW() AS foo')
69
+ assert_instance_of(Time, ret.first[:foo], "the result should be a timestamp")
70
+ end
69
71
 
70
- after do
71
- @connection.query('DROP DATABASE IF EXISTS _impala_ruby_test')
72
- end
72
+ it 'can handle null values' do
73
+ ret = @connection.query('SELECT NULL AS nothing')
74
+ assert_equal(nil, ret.first[:nothing], "the result should be nil")
75
+ end
73
76
 
74
- it 'can use the database' do
75
- @connection.query("USE #{@database}")
76
- @connection.query("USE default")
77
+ it 'can successfully refresh the metadata store' do
78
+ ret = @connection.refresh
79
+ end
77
80
  end
78
81
 
79
- describe 'and a test table' do
82
+ describe 'with a test database' do
80
83
  before do
81
- @table = "#{@database}.foobar"
82
- @connection.query("CREATE TABLE #{@table} (i INT)")
84
+ @database = '_impala_ruby_test'
85
+ @connection.query("CREATE DATABASE IF NOT EXISTS #{@database}")
83
86
  end
84
87
 
85
88
  after do
86
- @connection.query("DROP TABLE #{@table}")
89
+ @connection.query("DROP DATABASE IF EXISTS #{@database}") if @connection
87
90
  end
88
91
 
89
- it 'deals with empty tables correctly when using #query' do
90
- res = @connection.query("SELECT * FROM #{@table}")
91
- assert_equal([], res, "the result set should be empty")
92
+ it 'can use the database' do
93
+ @connection.query("USE #{@database}")
94
+ @connection.query("USE default")
92
95
  end
93
96
 
94
- it 'deals with empty tables correctly when using a cursor' do
95
- cursor = @connection.execute("SELECT * FROM #{@table}")
96
- assert_equal(false, cursor.has_more?, "has_more? should be false")
97
- assert_nil(cursor.fetch_row, "calls to fetch_row should be nil")
98
- end
99
-
100
- describe 'with data' do
97
+ describe 'and a test table' do
101
98
  before do
102
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
103
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
104
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
99
+ @table = "#{@database}.foobar"
100
+ @connection.query("CREATE TABLE #{@table} (i INT)")
105
101
  end
106
102
 
107
- it 'can insert into the table' do
108
- @connection.query("INSERT INTO #{@table} (i) SELECT 2")
103
+ after do
104
+ @connection.query("DROP TABLE #{@table}") if @connection
109
105
  end
110
106
 
111
- it 'can select from the table using #query' do
107
+ it 'deals with empty tables correctly when using #query' do
112
108
  res = @connection.query("SELECT * FROM #{@table}")
113
- assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
109
+ assert_equal([], res, "the result set should be empty")
114
110
  end
115
111
 
116
- it 'can create a cursor and fetch one row at a time' do
112
+ it 'deals with empty tables correctly when using a cursor' do
117
113
  cursor = @connection.execute("SELECT * FROM #{@table}")
118
- assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
114
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
115
+ assert_nil(cursor.fetch_row, "calls to fetch_row should be nil")
116
+ end
119
117
 
120
- 3.times do
121
- row = cursor.fetch_row
122
- assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
118
+ describe 'with data' do
119
+ before do
120
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
121
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
122
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
123
123
  end
124
124
 
125
- assert_equal(false, cursor.has_more?, "has_more? should be false")
126
- assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
127
- end
125
+ it 'can handle the keywoard "with"' do
126
+ res = @connection.query("with bar as (select * from #{@table}) select * from bar")
127
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
128
+ end
128
129
 
129
- it 'can use a cursor to deal with lots of data' do
130
- 10.times { @connection.query("INSERT INTO #{@table} SELECT * FROM #{@table}") }
131
- @connection.query("INSERT INTO #{@table} (i) SELECT 1")
132
- count = @connection.query("SELECT COUNT(*) as n from #{@table}")[0][:n]
133
- assert(count > Impala::Cursor::BUFFER_SIZE) # otherwise the test is pointless
130
+ it 'can insert into the table' do
131
+ @connection.query("INSERT INTO #{@table} (i) SELECT 2")
132
+ end
134
133
 
135
- cursor = @connection.execute("SELECT * FROM #{@table}")
136
- assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
134
+ it 'can select from the table using #query' do
135
+ res = @connection.query("SELECT * FROM #{@table}")
136
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}], res)
137
+ end
138
+
139
+ it 'can create a cursor and fetch one row at a time' do
140
+ cursor = @connection.execute("SELECT * FROM #{@table}")
141
+ assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
142
+
143
+ 3.times do
144
+ row = cursor.fetch_row
145
+ assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
146
+ end
137
147
 
138
- # fetch one to fill the buffer
139
- row = cursor.fetch_row
140
- assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
148
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
149
+ assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
150
+ end
151
+
152
+ it 'can use a cursor to deal with lots of data' do
153
+ 10.times { @connection.query("INSERT INTO #{@table} SELECT * FROM #{@table}") }
154
+ @connection.query("INSERT INTO #{@table} (i) SELECT 1")
155
+ count = @connection.query("SELECT COUNT(*) as n from #{@table}")[0][:n]
156
+ assert(count > Impala::Cursor::BUFFER_SIZE) # otherwise the test is pointless
141
157
 
142
- buffer_size = cursor.instance_variable_get('@row_buffer').size
143
- assert_equal(Impala::Cursor::BUFFER_SIZE - 1, buffer_size, "it should only buffer #{Impala::Cursor::BUFFER_SIZE} rows into memory")
158
+ cursor = @connection.execute("SELECT * FROM #{@table}")
159
+ assert_instance_of(Impala::Cursor, cursor, "the result should be a cursor")
144
160
 
145
- (count - 1).times do
161
+ # fetch one to fill the buffer
146
162
  row = cursor.fetch_row
147
163
  assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
164
+
165
+ buffer_size = cursor.instance_variable_get('@row_buffer').size
166
+ assert_equal(Impala::Cursor::BUFFER_SIZE - 1, buffer_size, "it should only buffer #{Impala::Cursor::BUFFER_SIZE} rows into memory")
167
+
168
+ (count - 1).times do
169
+ row = cursor.fetch_row
170
+ assert_equal({:i=>1}, row, "the row should be a hash with the correct result")
171
+ end
172
+
173
+ assert_equal(false, cursor.has_more?, "has_more? should be false")
174
+ assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
148
175
  end
149
176
 
150
- assert_equal(false, cursor.has_more?, "has_more? should be false")
151
- assert_nil(cursor.fetch_row, "subsequent calls to fetch_row should be nil")
177
+ it 'can handle interspersed NULL values' do
178
+ @connection.query("INSERT INTO #{@table} (i) SELECT NULL")
179
+ res = @connection.query("SELECT * FROM #{@table} ORDER BY i DESC LIMIT 4")
180
+ assert_equal([{:i => 1}, {:i => 1}, {:i => 1}, {:i => nil}], res)
181
+ end
152
182
  end
183
+
153
184
  end
154
185
  end
155
186
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: impala
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-09-17 00:00:00.000000000 Z
12
+ date: 2013-11-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: thrift
@@ -27,6 +27,38 @@ dependencies:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
29
  version: 0.9.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: rack
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: thin
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
30
62
  - !ruby/object:Gem::Dependency
31
63
  name: rake
32
64
  requirement: !ruby/object:Gem::Requirement