sequel-impala 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +3 -0
  3. data/LICENSE +462 -0
  4. data/README.rdoc +39 -0
  5. data/Rakefile +39 -0
  6. data/lib/driver/commons-logging-1.2.jar +0 -0
  7. data/lib/driver/hadoop-common-2.6.0.jar +0 -0
  8. data/lib/driver/hadoop-core-2.6.0.jar +0 -0
  9. data/lib/driver/hive-exec-1.1.0.jar +0 -0
  10. data/lib/driver/hive-jdbc-1.1.0.jar +0 -0
  11. data/lib/driver/hive-metastore-1.1.0.jar +0 -0
  12. data/lib/driver/hive-service-1.1.0.jar +0 -0
  13. data/lib/driver/httpclient-4.3.jar +0 -0
  14. data/lib/driver/httpcore-4.3.jar +0 -0
  15. data/lib/driver/libfb303-0.9.0.jar +0 -0
  16. data/lib/driver/slf4j-api-1.7.5.jar +0 -0
  17. data/lib/impala.rb +47 -0
  18. data/lib/impala/connection.rb +117 -0
  19. data/lib/impala/cursor.rb +157 -0
  20. data/lib/impala/protocol.rb +8 -0
  21. data/lib/impala/protocol/beeswax_constants.rb +15 -0
  22. data/lib/impala/protocol/beeswax_service.rb +766 -0
  23. data/lib/impala/protocol/beeswax_types.rb +193 -0
  24. data/lib/impala/protocol/cli_service_constants.rb +60 -0
  25. data/lib/impala/protocol/cli_service_types.rb +1452 -0
  26. data/lib/impala/protocol/facebook_service.rb +706 -0
  27. data/lib/impala/protocol/fb303_constants.rb +15 -0
  28. data/lib/impala/protocol/fb303_types.rb +25 -0
  29. data/lib/impala/protocol/hive_metastore_constants.rb +53 -0
  30. data/lib/impala/protocol/hive_metastore_types.rb +698 -0
  31. data/lib/impala/protocol/impala_hive_server2_service.rb +29 -0
  32. data/lib/impala/protocol/impala_service.rb +377 -0
  33. data/lib/impala/protocol/impala_service_constants.rb +13 -0
  34. data/lib/impala/protocol/impala_service_types.rb +90 -0
  35. data/lib/impala/protocol/status_constants.rb +13 -0
  36. data/lib/impala/protocol/status_types.rb +46 -0
  37. data/lib/impala/protocol/t_c_l_i_service.rb +948 -0
  38. data/lib/impala/protocol/thrift_hive_metastore.rb +4707 -0
  39. data/lib/impala/version.rb +3 -0
  40. data/lib/jdbc/hive2.rb +46 -0
  41. data/lib/sequel/adapters/impala.rb +123 -0
  42. data/lib/sequel/adapters/jdbc/hive2.rb +26 -0
  43. data/lib/sequel/adapters/shared/impala.rb +635 -0
  44. data/lib/sequel/extensions/csv_to_parquet.rb +112 -0
  45. data/spec/database_test.rb +56 -0
  46. data/spec/dataset_test.rb +1268 -0
  47. data/spec/files/bad_down_migration/001_create_alt_basic.rb +4 -0
  48. data/spec/files/bad_down_migration/002_create_alt_advanced.rb +4 -0
  49. data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  50. data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  51. data/spec/files/bad_timestamped_migrations/1273253853_3_create_users.rb +3 -0
  52. data/spec/files/bad_up_migration/001_create_alt_basic.rb +4 -0
  53. data/spec/files/bad_up_migration/002_create_alt_advanced.rb +3 -0
  54. data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +9 -0
  55. data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +9 -0
  56. data/spec/files/convert_to_timestamp_migrations/003_3_create_users.rb +4 -0
  57. data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +9 -0
  58. data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +9 -0
  59. data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  60. data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +9 -0
  61. data/spec/files/duplicate_timestamped_migrations/1273253853_create_users.rb +4 -0
  62. data/spec/files/integer_migrations/001_create_sessions.rb +9 -0
  63. data/spec/files/integer_migrations/002_create_nodes.rb +9 -0
  64. data/spec/files/integer_migrations/003_3_create_users.rb +4 -0
  65. data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  66. data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +9 -0
  67. data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  68. data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +9 -0
  69. data/spec/files/interleaved_timestamped_migrations/1273253853_3_create_users.rb +4 -0
  70. data/spec/files/reversible_migrations/001_reversible.rb +5 -0
  71. data/spec/files/reversible_migrations/002_reversible.rb +5 -0
  72. data/spec/files/reversible_migrations/003_reversible.rb +5 -0
  73. data/spec/files/reversible_migrations/004_reversible.rb +5 -0
  74. data/spec/files/reversible_migrations/005_reversible.rb +10 -0
  75. data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +9 -0
  76. data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +9 -0
  77. data/spec/files/timestamped_migrations/1273253853_3_create_users.rb +4 -0
  78. data/spec/impala_test.rb +285 -0
  79. data/spec/migrator_test.rb +240 -0
  80. data/spec/plugin_test.rb +91 -0
  81. data/spec/prepared_statement_test.rb +327 -0
  82. data/spec/schema_test.rb +356 -0
  83. data/spec/spec_helper.rb +15 -0
  84. data/spec/timezone_test.rb +86 -0
  85. data/spec/type_test.rb +99 -0
  86. metadata +239 -0
@@ -0,0 +1,39 @@
1
+ = sequel-impala
2
+
3
+ sequel-impala adds support for Sequel to connect to the Impala database
4
+ via the included impala driver, and the included jdbc-hive2 driver under JRuby.
5
+
6
+ = Source Code
7
+
8
+ Source code is available on GitHub at https://github.com/jeremyevans/sequel-impala
9
+
10
+ = Usage
11
+
12
+ After installation, Sequel will automatically pick up the adapter when you
13
+ use a connection string starting with +impala+ (or <tt>jdbc:hive2</tt> on
14
+ JRuby), as long as the lib directory is in RUBYLIB.
15
+
16
+ = Connection Strings
17
+
18
+ If using the impala driver (default host is localhost, default port is 21000):
19
+
20
+ impala://host:port
21
+
22
+ If using the jdbc:hive2 driver on JRuby (port 21050 works in testing):
23
+
24
+ jdbc:hive2://host:port/;auth=noSasl
25
+
26
+ = Dependencies
27
+
28
+ * sequel 4+
29
+ * thrift gem
30
+
31
+ = License
32
+
33
+ MIT/Apache
34
+
35
+ = Author
36
+
37
+ Jeremy Evans <code@jeremyevans.net>
38
+
39
+ Work on sequel-impala is generously funded by Outcomes Insights, Inc.
@@ -0,0 +1,39 @@
1
# Rake tasks for sequel-impala: building the gem, running the specs and
# generating the RDoc documentation.
require "rake"
require "rake/clean"

# `rake clean` removes built gem files and the generated rdoc directory.
CLEAN.include ["sequel-impala-*.gem", "rdoc"]

desc "Build sequel-impala gem"
task :package=>[:clean] do
  sh %{#{FileUtils::RUBY} -S gem build sequel-impala.gemspec}
end

### Specs

desc "Run specs"
task "spec" do
  # RubyGems is loaded by default, so no -rubygems switch is passed; that
  # switch fails on Rubies that no longer ship ubygems.rb.
  sh "#{FileUtils::RUBY} -I lib -e 'ARGV.each{|f| require f}' ./spec/*_test.rb"
end

task :default => :spec

### RDoc

RDOC_DEFAULT_OPTS = ["--quiet", "--line-numbers", "--inline-source", '--title', 'sequel-impala: Sequel support for Impala database']

# Use the hanna template when both gems can be activated; otherwise keep the
# default RDoc formatter.
begin
  gem 'rdoc'
  gem 'hanna-nouveau'
  RDOC_DEFAULT_OPTS.concat(['-f', 'hanna'])
rescue Gem::LoadError
end

RDOC_OPTS = RDOC_DEFAULT_OPTS + ['--main', 'README.rdoc']

require 'rdoc/task'
RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = "rdoc"
  rdoc.options += RDOC_OPTS
  rdoc.rdoc_files.add %w"README.rdoc CHANGELOG LICENSE lib/**/*.rb"
end
39
+
@@ -0,0 +1,47 @@
1
+
2
+ # the generated ruby files use a relative require, so we need to add the
3
+ # generated directory to $LOAD_PATH
4
+ this_dir = File.expand_path(File.dirname(__FILE__))
5
+ gen_dir = File.join(this_dir, 'impala/protocol')
6
+ $LOAD_PATH.push(gen_dir) unless $LOAD_PATH.include?(gen_dir)
7
+
8
+ require 'impala/version'
9
+
10
+ require 'thrift'
11
+ require 'time'
12
+ require 'impala/protocol'
13
+ require 'impala/cursor'
14
+ require 'impala/connection'
15
+
16
+ module Impala
17
+ DEFAULT_HOST = 'localhost'
18
+ DEFAULT_PORT = 21000
19
+ class Error < StandardError; end
20
+ class InvalidQueryError < Error; end
21
+ class ConnectionError < Error; end
22
+ class CursorError < Error; end
23
+
24
+ # Connect to an Impala server. If a block is given, it will close the
25
+ # connection after yielding the connection to the block.
26
+ # @param [String] host the hostname or IP address of the Impala server
27
+ # @param [int] port the port that the Impala server is listening on
28
+ # @yieldparam [Connection] conn the open connection. Will be closed once the block
29
+ # finishes
30
+ # @return [Connection] the open connection, or, if a block is
31
+ # passed, the return value of the block
32
+ def self.connect(host=DEFAULT_HOST, port=DEFAULT_PORT)
33
+ connection = Connection.new(host, port)
34
+
35
+ if block_given?
36
+ begin
37
+ ret = yield connection
38
+ ensure
39
+ connection.close
40
+ end
41
+ else
42
+ ret = connection
43
+ end
44
+
45
+ ret
46
+ end
47
+ end
@@ -0,0 +1,117 @@
1
module Impala
  # This object represents a connection to an Impala server. It can be used to
  # perform queries on the database.
  class Connection
    # Passed as the second argument of every executeAndWait call.
    LOG_CONTEXT_ID = "impala-ruby"

    # Don't instantiate Connections directly; instead, use {Impala.connect}.
    # The connection is opened as part of construction.
    # @param [String] host host handed to the Thrift socket
    # @param [Integer] port port handed to the Thrift socket
    def initialize(host, port)
      @host = host
      @port = port
      @connected = false
      open
    end

    # @return [String] class name, host and port, plus a marker when the
    #   connection is not open
    def inspect
      "#<#{self.class} #{@host}:#{@port}#{open? ? '' : ' (DISCONNECTED)'}>"
    end

    # Open the connection if it's currently closed.
    def open
      return if @connected

      socket = Thrift::Socket.new(@host, @port)

      @transport = Thrift::BufferedTransport.new(socket)
      @transport.open

      proto = Thrift::BinaryProtocol.new(@transport)
      @service = Protocol::ImpalaService::Client.new(proto)
      @connected = true
    end

    # Close this connection. It can still be reopened with {#open}.
    def close
      return unless @connected

      @transport.close
      @connected = false
    end

    # Returns true if the connection is currently open.
    def open?
      @connected
    end

    # Refresh the metadata store.
    # @raise [ConnectionError] if the connection is closed
    def refresh
      raise ConnectionError, "Connection closed" unless open?
      @service.ResetCatalog
    end

    # Perform a query and return all the results. This will
    # load the entire result set into memory, so if you're dealing with lots
    # of rows, {#execute} may work better.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Array<Hash>] an array of hashes, one for each row.
    def query(raw_query, query_options = {})
      execute(raw_query, query_options).fetch_all
    end

    # Perform a query and return a cursor for iterating over the results.
    # @param [String] raw_query the query you want to run
    # @param [Hash] query_options the options to set user and configuration
    #   except for :user, see TImpalaQueryOptions in ImpalaService.thrift
    # @option query_options [String] :user the user runs the query
    # @return [Cursor] a cursor for the result rows
    # @raise [ConnectionError] if the connection is closed or the query was
    #   aborted
    # @raise [InvalidQueryError] if the query is blank
    def execute(raw_query, query_options = {})
      raise ConnectionError, "Connection closed" unless open?

      query = sanitize_query(raw_query)
      handle = send_query(query, query_options)

      check_result(handle)
      Cursor.new(handle, @service)
    end

    private

    # Strips surrounding whitespace and downcases the leading command word.
    # Everything after the first word is passed through untouched, so
    # whitespace inside string literals and line breaks that end `--`
    # comments are preserved.
    # @raise [InvalidQueryError] if the query contains only whitespace
    def sanitize_query(raw_query)
      query = raw_query.strip
      raise InvalidQueryError, "Empty query" if query.empty?

      query.sub(/\A\S+/) { |command| command.downcase }
    end

    # Builds the Beeswax query object and runs it through executeAndWait.
    # @return the handle returned by the service
    def send_query(sanitized_query, query_options)
      query = Protocol::Beeswax::Query.new
      query.query = sanitized_query

      # Work on a copy so the caller's hash is left as it was passed in.
      options = query_options.dup
      user = options.delete(:user)
      query.hadoop_user = user if user
      query.configuration = options.map do |key, value|
        "#{key.upcase}=#{value}"
      end

      @service.executeAndWait(query, LOG_CONTEXT_ID)
    end

    # Raises if the query behind +handle+ ended in the EXCEPTION state. On
    # any failure (including a failing state lookup) the handle is closed
    # exactly once before the error is re-raised.
    # @raise [ConnectionError] if the query was aborted
    def check_result(handle)
      state = @service.get_state(handle)
      if state == Protocol::Beeswax::QueryState::EXCEPTION
        raise ConnectionError, "The query was aborted"
      end
    rescue
      # Single cleanup point; closing here as well as before the raise above
      # would close the same handle twice.
      close_handle(handle)
      raise
    end

    def close_handle(handle)
      @service.close(handle)
    end
  end
end
@@ -0,0 +1,157 @@
1
require 'bigdecimal'

module Impala
  # Cursors are used to iterate over result sets without loading them all
  # into memory at once. This can be useful if you're dealing with lots of
  # rows. It implements Enumerable, so you can use each/select/map/etc.
  class Cursor
    include Enumerable

    # Row count passed to each service fetch call, and the fill target for
    # the local row buffer.
    BUFFER_SIZE = 1024

    # Field text that is converted to nil when a row is parsed.
    NULL = 'NULL'.freeze

    # @return [Boolean] true only for the exact text 'true'
    def self.typecast_boolean(value)
      value == 'true'
    end

    # @return [Integer] the field converted with String#to_i
    def self.typecast_int(value)
      value.to_i
    end

    # @return [Float] the field converted with String#to_f
    def self.typecast_float(value)
      value.to_f
    end

    # Uses Kernel#BigDecimal; BigDecimal.new is no longer provided by the
    # bigdecimal library.
    # @return [BigDecimal]
    def self.typecast_decimal(value)
      BigDecimal(value)
    end

    # @return [Time] the field parsed with Time.parse
    def self.typecast_timestamp(value)
      Time.parse(value)
    end

    # Maps a column type name to the callable that converts its field text.
    # Types without an entry are returned as the raw field string.
    TYPECAST_MAP = {
      'boolean'=>method(:typecast_boolean),
      'int'=>method(:typecast_int),
      'double'=>method(:typecast_float),
      'decimal'=>method(:typecast_decimal),
      'timestamp'=>method(:typecast_timestamp),
    }
    TYPECAST_MAP['tinyint'] = TYPECAST_MAP['smallint'] = TYPECAST_MAP['bigint'] = TYPECAST_MAP['int']
    TYPECAST_MAP['float'] = TYPECAST_MAP['double']
    TYPECAST_MAP.freeze

    # @return [Array<String>] column names taken from the result metadata
    attr_reader :columns

    # @return [Hash] this cursor's own unfrozen copy of TYPECAST_MAP
    attr_reader :typecast_map

    # @param handle the query handle passed to every service call
    # @param service the client used to fetch metadata and rows
    def initialize(handle, service)
      @handle = handle
      @service = service

      @row_buffer = []
      @done = false
      @open = true
      @typecast_map = TYPECAST_MAP.dup
      @columns = metadata.schema.fieldSchemas.map(&:name)
    end

    # @return [String] class name, plus a marker once the cursor is closed
    def inspect
      "#<#{self.class}#{open? ? '' : ' (CLOSED)'}>"
    end

    # Yields each remaining row as a hash. Without a block an Enumerator is
    # returned instead.
    def each
      return enum_for(:each) unless block_given?

      while row = fetch_row
        yield row
      end
    end

    # Returns the next available row as a hash, or nil if there are none left.
    # @return [Hash, nil] the next available row, or nil if there are none
    #   left
    # @see #fetch_all
    def fetch_row
      if @row_buffer.empty?
        return nil if @done

        fetch_more
      end

      @row_buffer.shift
    end

    # Returns all the remaining rows in the result set.
    # @return [Array<Hash>] the remaining rows in the result set
    # @see #fetch_row
    def fetch_all
      to_a
    end

    # Close the cursor on the remote server. Once a cursor is closed, you
    # can no longer fetch any rows from it. Closing an already closed cursor
    # does nothing, so an explicit close after the result set was exhausted
    # does not send a second close for the same handle.
    def close
      return unless @open

      @open = false
      @service.close(@handle)
    end

    # Returns true if the cursor is still open.
    def open?
      @open
    end

    # Returns true if there are any more rows to fetch.
    def has_more?
      !@done || !@row_buffer.empty?
    end

    # @return the result of the service's GetRuntimeProfile call for this
    #   cursor's handle
    def runtime_profile
      @service.GetRuntimeProfile(@handle)
    end

    private

    # Result metadata, requested from the service once and then cached.
    def metadata
      @metadata ||= @service.get_results_metadata(@handle)
    end

    # Fetches batches until the buffer holds BUFFER_SIZE rows or the result
    # set is exhausted.
    def fetch_more
      fetch_batch until @done || @row_buffer.size >= BUFFER_SIZE
    end

    # Fetches one batch, appends its parsed rows to the buffer and closes the
    # cursor when the service reports that no more rows follow.
    # @raise [CursorError] if the cursor is closed or the fetch fails with a
    #   BeeswaxException
    def fetch_batch
      raise CursorError, "Cursor has expired or been closed" unless @open

      begin
        res = @service.fetch(@handle, false, BUFFER_SIZE)
      rescue Protocol::Beeswax::BeeswaxException
        @open = false
        raise CursorError, "Cursor has expired or been closed"
      end

      @row_buffer.concat(res.data.map { |raw| parse_row(raw) })

      unless res.has_more
        @done = true
        close
      end
    end

    # Converts one delimited row string into a hash keyed by column name.
    # NULL fields and fields missing from the row become nil; other fields
    # go through the column's typecast callable when one is registered.
    def parse_row(raw)
      # The negative limit keeps trailing empty fields; a plain split would
      # drop them and turn a trailing empty string into nil.
      fields = raw.split(metadata.delim, -1)

      row_convertor.each_with_object({}) do |(column, caster, index), row|
        value = fields[index]
        row[column] =
          if value.nil? || value == NULL
            nil
          elsif caster
            caster.call(value)
          else
            value
          end
      end
    end

    # Cached [column name, typecast callable or nil, field index] triples.
    # Built on first use, so typecast_map changes made after the first row
    # has been parsed are not picked up.
    def row_convertor
      @row_convertor ||= begin
        casters = metadata.schema.fieldSchemas.map { |field| typecast_map[field.type] }
        columns.each_with_index.map { |column, index| [column, casters[index], index] }
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'impala/protocol/impala_service'
2
+
3
+ module Impala
4
+ # Taken as a whole, this module contains all the thrift-generated stuff that
5
+ # defines the Impala protocol.
6
+ module Protocol
7
+ end
8
+ end
@@ -0,0 +1,15 @@
1
+ #
2
+ # Autogenerated by Thrift Compiler (0.9.1)
3
+ #
4
+ # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5
+ #
6
+
7
+ require 'thrift'
8
+ require 'beeswax_types'
9
+
10
+ module Impala
11
+ module Protocol
12
+ module Beeswax
13
+ end
14
+ end
15
+ end