sequel-impala 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +3 -0
  3. data/LICENSE +462 -0
  4. data/README.rdoc +39 -0
  5. data/Rakefile +39 -0
  6. data/lib/driver/commons-logging-1.2.jar +0 -0
  7. data/lib/driver/hadoop-common-2.6.0.jar +0 -0
  8. data/lib/driver/hadoop-core-2.6.0.jar +0 -0
  9. data/lib/driver/hive-exec-1.1.0.jar +0 -0
  10. data/lib/driver/hive-jdbc-1.1.0.jar +0 -0
  11. data/lib/driver/hive-metastore-1.1.0.jar +0 -0
  12. data/lib/driver/hive-service-1.1.0.jar +0 -0
  13. data/lib/driver/httpclient-4.3.jar +0 -0
  14. data/lib/driver/httpcore-4.3.jar +0 -0
  15. data/lib/driver/libfb303-0.9.0.jar +0 -0
  16. data/lib/driver/slf4j-api-1.7.5.jar +0 -0
  17. data/lib/impala.rb +47 -0
  18. data/lib/impala/connection.rb +117 -0
  19. data/lib/impala/cursor.rb +157 -0
  20. data/lib/impala/protocol.rb +8 -0
  21. data/lib/impala/protocol/beeswax_constants.rb +15 -0
  22. data/lib/impala/protocol/beeswax_service.rb +766 -0
  23. data/lib/impala/protocol/beeswax_types.rb +193 -0
  24. data/lib/impala/protocol/cli_service_constants.rb +60 -0
  25. data/lib/impala/protocol/cli_service_types.rb +1452 -0
  26. data/lib/impala/protocol/facebook_service.rb +706 -0
  27. data/lib/impala/protocol/fb303_constants.rb +15 -0
  28. data/lib/impala/protocol/fb303_types.rb +25 -0
  29. data/lib/impala/protocol/hive_metastore_constants.rb +53 -0
  30. data/lib/impala/protocol/hive_metastore_types.rb +698 -0
  31. data/lib/impala/protocol/impala_hive_server2_service.rb +29 -0
  32. data/lib/impala/protocol/impala_service.rb +377 -0
  33. data/lib/impala/protocol/impala_service_constants.rb +13 -0
  34. data/lib/impala/protocol/impala_service_types.rb +90 -0
  35. data/lib/impala/protocol/status_constants.rb +13 -0
  36. data/lib/impala/protocol/status_types.rb +46 -0
  37. data/lib/impala/protocol/t_c_l_i_service.rb +948 -0
  38. data/lib/impala/protocol/thrift_hive_metastore.rb +4707 -0
  39. data/lib/impala/version.rb +3 -0
  40. data/lib/jdbc/hive2.rb +46 -0
  41. data/lib/sequel/adapters/impala.rb +123 -0
  42. data/lib/sequel/adapters/jdbc/hive2.rb +26 -0
  43. data/lib/sequel/adapters/shared/impala.rb +635 -0
  44. data/lib/sequel/extensions/csv_to_parquet.rb +112 -0
  45. data/spec/database_test.rb +56 -0
  46. data/spec/dataset_test.rb +1268 -0
  47. data/spec/files/bad_down_migration/001_create_alt_basic.rb +4 -0
  48. data/spec/files/bad_down_migration/002_create_alt_advanced.rb +4 -0
  49. data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  50. data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  51. data/spec/files/bad_timestamped_migrations/1273253853_3_create_users.rb +3 -0
  52. data/spec/files/bad_up_migration/001_create_alt_basic.rb +4 -0
  53. data/spec/files/bad_up_migration/002_create_alt_advanced.rb +3 -0
  54. data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +9 -0
  55. data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +9 -0
  56. data/spec/files/convert_to_timestamp_migrations/003_3_create_users.rb +4 -0
  57. data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +9 -0
  58. data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +9 -0
  59. data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  60. data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +9 -0
  61. data/spec/files/duplicate_timestamped_migrations/1273253853_create_users.rb +4 -0
  62. data/spec/files/integer_migrations/001_create_sessions.rb +9 -0
  63. data/spec/files/integer_migrations/002_create_nodes.rb +9 -0
  64. data/spec/files/integer_migrations/003_3_create_users.rb +4 -0
  65. data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  66. data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +9 -0
  67. data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  68. data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +9 -0
  69. data/spec/files/interleaved_timestamped_migrations/1273253853_3_create_users.rb +4 -0
  70. data/spec/files/reversible_migrations/001_reversible.rb +5 -0
  71. data/spec/files/reversible_migrations/002_reversible.rb +5 -0
  72. data/spec/files/reversible_migrations/003_reversible.rb +5 -0
  73. data/spec/files/reversible_migrations/004_reversible.rb +5 -0
  74. data/spec/files/reversible_migrations/005_reversible.rb +10 -0
  75. data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +9 -0
  76. data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +9 -0
  77. data/spec/files/timestamped_migrations/1273253853_3_create_users.rb +4 -0
  78. data/spec/impala_test.rb +285 -0
  79. data/spec/migrator_test.rb +240 -0
  80. data/spec/plugin_test.rb +91 -0
  81. data/spec/prepared_statement_test.rb +327 -0
  82. data/spec/schema_test.rb +356 -0
  83. data/spec/spec_helper.rb +15 -0
  84. data/spec/timezone_test.rb +86 -0
  85. data/spec/type_test.rb +99 -0
  86. metadata +239 -0
@@ -0,0 +1,39 @@
1
+ = sequel-impala
2
+
3
+ sequel-impala adds support for Sequel to connect to the Impala database
4
+ via the included impala driver, and the included jdbc-hive2 driver under JRuby.
5
+
6
+ = Source Code
7
+
8
+ Source code is available on GitHub at https://github.com/jeremyevans/sequel-impala
9
+
10
+ = Usage
11
+
12
+ After installation, Sequel will automatically pick up the adapter when you use
13
+ a connection string starting with +impala+ (or <tt>jdbc:hive2</tt> on JRuby),
14
+ as long as the lib directory is in RUBYLIB.
15
+
16
+ = Connection Strings
17
+
18
+ If using the impala driver (default host is localhost, default port is 21000):
19
+
20
+ impala://host:port
21
+
22
+ If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
23
+
24
+ jdbc:hive2://host:port/;auth=noSasl
25
+
26
+ = Dependencies
27
+
28
+ * sequel 4+
29
+ * thrift gem
30
+
31
+ = License
32
+
33
+ MIT/Apache
34
+
35
+ = Author
36
+
37
+ Jeremy Evans <code@jeremyevans.net>
38
+
39
+ Work on sequel-impala is generously funded by Outcomes Insights, Inc.
@@ -0,0 +1,39 @@
1
require "rake"
require "rake/clean"

# Artifacts removed by `rake clean`: built gems and generated RDoc output.
CLEAN.include ["sequel-impala-*.gem", "rdoc"]

desc "Build sequel-impala gem"
task :package=>[:clean] do
  sh %{#{FileUtils::RUBY} -S gem build sequel-impala.gemspec}
end

### Specs

desc "Run specs"
task "spec" do
  # Load RubyGems with -rrubygems rather than the historical -rubygems
  # shorthand: the shorthand requires "ubygems", a file that no longer ships
  # with Ruby 2.5+, whereas requiring "rubygems" works on every Ruby version.
  sh "#{FileUtils::RUBY} -rrubygems -I lib -e 'ARGV.each{|f| require f}' ./spec/*_test.rb"
end

task :default => :spec

### RDoc

RDOC_DEFAULT_OPTS = ["--quiet", "--line-numbers", "--inline-source", '--title', 'sequel-impala: Sequel support for Impala database']

# Use the hanna template when both rdoc and hanna-nouveau can be activated;
# otherwise silently fall back to the stock RDoc formatter.
begin
  gem 'rdoc'
  gem 'hanna-nouveau'
  RDOC_DEFAULT_OPTS.concat(['-f', 'hanna'])
rescue Gem::LoadError
end

RDOC_OPTS = RDOC_DEFAULT_OPTS + ['--main', 'README.rdoc']

require 'rdoc/task'
RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = "rdoc"
  rdoc.options += RDOC_OPTS
  rdoc.rdoc_files.add %w"README.rdoc CHANGELOG LICENSE lib/**/*.rb"
end
39
+
@@ -0,0 +1,47 @@
1
+
2
+ # the generated ruby files use a relative require, so we need to add the
3
+ # generated directory to $LOAD_PATH
4
+ this_dir = File.expand_path(File.dirname(__FILE__))
5
+ gen_dir = File.join(this_dir, 'impala/protocol')
6
+ $LOAD_PATH.push(gen_dir) unless $LOAD_PATH.include?(gen_dir)
7
+
8
+ require 'impala/version'
9
+
10
+ require 'thrift'
11
+ require 'time'
12
+ require 'impala/protocol'
13
+ require 'impala/cursor'
14
+ require 'impala/connection'
15
+
16
+ module Impala
17
+ DEFAULT_HOST = 'localhost'
18
+ DEFAULT_PORT = 21000
19
+ class Error < StandardError; end
20
+ class InvalidQueryError < Error; end
21
+ class ConnectionError < Error; end
22
+ class CursorError < Error; end
23
+
24
+ # Connect to an Impala server. If a block is given, it will close the
25
+ # connection after yielding the connection to the block.
26
+ # @param [String] host the hostname or IP address of the Impala server
27
+ # @param [int] port the port that the Impala server is listening on
28
+ # @yieldparam [Connection] conn the open connection. Will be closed once the block
29
+ # finishes
30
+ # @return [Connection] the open connection, or, if a block is
31
+ # passed, the return value of the block
32
+ def self.connect(host=DEFAULT_HOST, port=DEFAULT_PORT)
33
+ connection = Connection.new(host, port)
34
+
35
+ if block_given?
36
+ begin
37
+ ret = yield connection
38
+ ensure
39
+ connection.close
40
+ end
41
+ else
42
+ ret = connection
43
+ end
44
+
45
+ ret
46
+ end
47
+ end
@@ -0,0 +1,117 @@
1
module Impala
  # This object represents a connection to an Impala server. It can be used to
  # perform queries on the database.
  class Connection
    # Passed to the service as the log context id with every executed query.
    LOG_CONTEXT_ID = "impala-ruby"

    # Don't instantiate Connections directly; instead, use {Impala.connect}.
    # The connection is opened as part of construction.
    # @param [String] host the hostname or IP address of the Impala server
    # @param [int] port the port that the Impala server is listening on
    def initialize(host, port)
      @host = host
      @port = port
      @connected = false
      open
    end

    # @return [String] a short description including host, port and whether
    #   the connection is currently closed
    def inspect
      "#<#{self.class} #{@host}:#{@port}#{open? ? '' : ' (DISCONNECTED)'}>"
    end

    # Open the connection if it's currently closed.
    def open
      return if @connected

      socket = Thrift::Socket.new(@host, @port)

      @transport = Thrift::BufferedTransport.new(socket)
      @transport.open

      proto = Thrift::BinaryProtocol.new(@transport)
      @service = Protocol::ImpalaService::Client.new(proto)
      @connected = true
    end

    # Close this connection. It can still be reopened with {#open}.
    def close
      return unless @connected

      @transport.close
      @connected = false
    end

    # Returns true if the connection is currently open.
    def open?
      @connected
    end

    # Refresh the metadata store.
    # @raise [ConnectionError] if the connection is closed
    def refresh
      raise ConnectionError, "Connection closed" unless open?
      @service.ResetCatalog
    end

    # Perform a query and return all the results. This will
    # load the entire result set into memory, so if you're dealing with lots
    # of rows, {#execute} may work better.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Array<Hash>] an array of hashes, one for each row.
    def query(raw_query, query_options = {})
      execute(raw_query, query_options).fetch_all
    end

    # Perform a query and return a cursor for iterating over the results.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Cursor] a cursor for the result rows
    # @raise [ConnectionError] if the connection is closed or the query
    #   ended in the EXCEPTION state
    # @raise [InvalidQueryError] if the query is blank
    def execute(raw_query, query_options = {})
      raise ConnectionError, "Connection closed" unless open?

      query = sanitize_query(raw_query)
      handle = send_query(query, query_options)

      check_result(handle)
      Cursor.new(handle, @service)
    end

    private

    # Strip surrounding whitespace and lowercase the leading command word.
    #
    # Everything after the first word is passed through untouched. Collapsing
    # interior whitespace would alter string literals and would fold the line
    # following a "--" comment into that comment.
    # @raise [InvalidQueryError] if the query contains only whitespace
    def sanitize_query(raw_query)
      query = raw_query.strip
      raise InvalidQueryError, "Empty query" if query.empty?

      query.sub(/\A\S+/) { |command| command.downcase }
    end

    # Build the Beeswax query object and run it, blocking until the service
    # returns a handle.
    #
    # The options hash is copied first so the caller's hash is never mutated.
    # :user becomes the hadoop user; every other entry is sent as an
    # upper-cased "KEY=value" configuration string.
    def send_query(sanitized_query, query_options)
      query = Protocol::Beeswax::Query.new
      query.query = sanitized_query

      options = query_options.dup
      user = options.delete(:user)
      query.hadoop_user = user if user
      query.configuration = options.map do |key, value|
        "#{key.upcase}=#{value}"
      end

      @service.executeAndWait(query, LOG_CONTEXT_ID)
    end

    # Verify that the query behind +handle+ did not end in the EXCEPTION
    # state. On any failure the handle is closed exactly once and the
    # original error is re-raised.
    # @raise [ConnectionError] if the query state is EXCEPTION
    def check_result(handle)
      state = @service.get_state(handle)
      return unless state == Protocol::Beeswax::QueryState::EXCEPTION

      raise ConnectionError, "The query was aborted"
    rescue => error
      begin
        close_handle(handle)
      rescue StandardError
        # Cleanup is best-effort: a failure to close must not hide the
        # error that brought us here.
      end
      raise error
    end

    # Ask the service to release the given query handle.
    def close_handle(handle)
      @service.close(handle)
    end
  end
end
@@ -0,0 +1,157 @@
1
require 'bigdecimal'

module Impala
  # Cursors are used to iterate over result sets without loading them all
  # into memory at once. This can be useful if you're dealing with lots of
  # rows. It implements Enumerable, so you can use each/select/map/etc.
  class Cursor
    # Number of rows requested from the service per fetch, and the minimum
    # number of rows buffered before a refill stops (unless results run out).
    BUFFER_SIZE = 1024
    include Enumerable

    # Only the exact string 'true' is treated as true.
    def self.typecast_boolean(value)
      value == 'true'
    end

    def self.typecast_int(value)
      value.to_i
    end

    def self.typecast_float(value)
      value.to_f
    end

    # Uses Kernel#BigDecimal; BigDecimal.new is not available in current
    # bigdecimal releases.
    def self.typecast_decimal(value)
      BigDecimal(value)
    end

    def self.typecast_timestamp(value)
      Time.parse(value)
    end

    # Maps a column type name from the result metadata to the callable that
    # converts the raw string value. Types without an entry are returned as
    # the raw string.
    TYPECAST_MAP = {
      'boolean'=>method(:typecast_boolean),
      'int'=>method(:typecast_int),
      'double'=>method(:typecast_float),
      'decimal'=>method(:typecast_decimal),
      'timestamp'=>method(:typecast_timestamp),
    }
    TYPECAST_MAP['tinyint'] = TYPECAST_MAP['smallint'] = TYPECAST_MAP['bigint'] = TYPECAST_MAP['int']
    TYPECAST_MAP['float'] = TYPECAST_MAP['double']
    TYPECAST_MAP.freeze

    # Marker the service sends for a NULL value.
    NULL = 'NULL'.freeze

    # @return [Array<String>] the column names of the result set
    attr_reader :columns

    # @return [Hash] per-cursor, modifiable copy of TYPECAST_MAP
    attr_reader :typecast_map

    # Fetches the result metadata right away in order to populate #columns.
    # @param handle the query handle returned by the service
    # @param service the service client used for all further requests
    def initialize(handle, service)
      @handle = handle
      @service = service

      @row_buffer = []
      @done = false
      @open = true
      @typecast_map = TYPECAST_MAP.dup
      @columns = metadata.schema.fieldSchemas.map(&:name)
    end

    def inspect
      "#<#{self.class}#{open? ? '' : ' (CLOSED)'}>"
    end

    # Yields each remaining row as a hash.
    # @return [Enumerator] if no block is given
    def each
      return enum_for(:each) unless block_given?

      while row = fetch_row
        yield row
      end
    end

    # Returns the next available row as a hash, or nil if there are none left.
    # @return [Hash, nil] the next available row, or nil if there are none
    #   left
    # @see #fetch_all
    def fetch_row
      if @row_buffer.empty?
        return nil if @done

        fetch_more
      end

      @row_buffer.shift
    end

    # Returns all the remaining rows in the result set.
    # @return [Array<Hash>] the remaining rows in the result set
    # @see #fetch_row
    def fetch_all
      to_a
    end

    # Close the cursor on the remote server. Once a cursor is closed, you
    # can no longer fetch any rows from it. Calling this on an already
    # closed cursor does nothing; the cursor also closes itself once the
    # last batch has been fetched.
    def close
      return unless @open

      @open = false
      @service.close(@handle)
    end

    # Returns true if the cursor is still open.
    def open?
      @open
    end

    # Returns true if there are any more rows to fetch.
    def has_more?
      !@done || !@row_buffer.empty?
    end

    # Asks the service for the runtime profile of this cursor's query handle.
    def runtime_profile
      @service.GetRuntimeProfile(@handle)
    end

    private

    # Result metadata for the handle, requested once and then cached.
    def metadata
      @metadata ||= @service.get_results_metadata(@handle)
    end

    # Refill the buffer until it holds at least BUFFER_SIZE rows or the
    # result set is exhausted.
    def fetch_more
      fetch_batch until @done || @row_buffer.count >= BUFFER_SIZE
    end

    # Fetch one batch of rows into the buffer. When the service reports that
    # nothing is left, mark the cursor done and close it.
    # @raise [CursorError] if the cursor is closed or the fetch is rejected
    def fetch_batch
      raise CursorError, "Cursor has expired or been closed" unless @open

      begin
        res = @service.fetch(@handle, false, BUFFER_SIZE)
      rescue Protocol::Beeswax::BeeswaxException
        @open = false
        raise CursorError, "Cursor has expired or been closed"
      end

      rows = res.data.map { |raw| parse_row(raw) }
      @row_buffer.concat(rows)

      unless res.has_more
        @done = true
        close
      end
    end

    # Turn one delimited raw row into a hash of column name => typecast value.
    # NULL markers become nil; types without a converter stay raw strings.
    def parse_row(raw)
      row = {}
      # The -1 limit keeps trailing empty fields, so an empty string in the
      # last column is not dropped and misread as a missing value.
      fields = raw.split(metadata.delim, -1)

      row_convertor.each do |column, converter, index|
        value = fields[index]
        row[column] = (converter ? converter.call(value) : value unless value == NULL)
      end

      row
    end

    # Cached list of [column name, converter or nil, field index] triples.
    def row_convertor
      @row_convertor ||= columns.zip(metadata.schema.fieldSchemas.map{|s| typecast_map[s.type]}, (0...(columns.length)).to_a)
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'impala/protocol/impala_service'
2
+
3
+ module Impala
4
+ # Namespace for the thrift-generated code that, taken as a whole, defines
5
+ # the Impala protocol. It is intentionally left empty in this file.
6
+ module Protocol
7
+ end
8
+ end
@@ -0,0 +1,15 @@
1
+ #
2
+ # Autogenerated by Thrift Compiler (0.9.1)
3
+ #
4
+ # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5
+ #
6
+
7
+ require 'thrift'
8
+ require 'beeswax_types'
9
+
10
+ module Impala
11
+ module Protocol
12
+ # Generated namespace with an empty body; presumably the Beeswax thrift
+ # definition declares no constants (confirm against the .thrift source).
+ module Beeswax
13
+ end
14
+ end
15
+ end