sequel_impala 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +50 -0
  3. data/LICENSE +463 -0
  4. data/README.md +45 -0
  5. data/Rakefile +39 -0
  6. data/lib/driver/commons-collections-3.2.1.jar +0 -0
  7. data/lib/driver/commons-configuration-1.10.jar +0 -0
  8. data/lib/driver/commons-logging-1.2.jar +0 -0
  9. data/lib/driver/hadoop-auth-2.9.0.jar +0 -0
  10. data/lib/driver/hadoop-common-2.9.0.jar +0 -0
  11. data/lib/driver/hadoop-core-2.6.0.jar +0 -0
  12. data/lib/driver/hive-exec-1.1.0.jar +0 -0
  13. data/lib/driver/hive-jdbc-1.1.0.jar +0 -0
  14. data/lib/driver/hive-metastore-1.1.0.jar +0 -0
  15. data/lib/driver/hive-service-1.1.0.jar +0 -0
  16. data/lib/driver/httpclient-4.3.jar +0 -0
  17. data/lib/driver/httpcore-4.3.jar +0 -0
  18. data/lib/driver/libfb303-0.9.0.jar +0 -0
  19. data/lib/driver/log4j-1.2.17.jar +0 -0
  20. data/lib/driver/slf4j-api-1.7.5.jar +0 -0
  21. data/lib/driver/stax2-api-3.1.4.jar +0 -0
  22. data/lib/driver/woodstox-core-asl-4.4.1.jar +0 -0
  23. data/lib/impala.rb +55 -0
  24. data/lib/impala/connection.rb +180 -0
  25. data/lib/impala/cursor.rb +200 -0
  26. data/lib/impala/progress_reporter.rb +40 -0
  27. data/lib/impala/protocol.rb +8 -0
  28. data/lib/impala/protocol/beeswax_constants.rb +15 -0
  29. data/lib/impala/protocol/beeswax_service.rb +747 -0
  30. data/lib/impala/protocol/beeswax_types.rb +193 -0
  31. data/lib/impala/protocol/exec_stats_constants.rb +13 -0
  32. data/lib/impala/protocol/exec_stats_types.rb +133 -0
  33. data/lib/impala/protocol/facebook_service.rb +706 -0
  34. data/lib/impala/protocol/fb303_constants.rb +15 -0
  35. data/lib/impala/protocol/fb303_types.rb +25 -0
  36. data/lib/impala/protocol/hive_metastore_constants.rb +53 -0
  37. data/lib/impala/protocol/hive_metastore_types.rb +698 -0
  38. data/lib/impala/protocol/impala_hive_server2_service.rb +137 -0
  39. data/lib/impala/protocol/impala_service.rb +443 -0
  40. data/lib/impala/protocol/impala_service_constants.rb +13 -0
  41. data/lib/impala/protocol/impala_service_types.rb +192 -0
  42. data/lib/impala/protocol/status_constants.rb +13 -0
  43. data/lib/impala/protocol/status_types.rb +46 -0
  44. data/lib/impala/protocol/t_c_l_i_service.rb +1108 -0
  45. data/lib/impala/protocol/t_c_l_i_service_constants.rb +72 -0
  46. data/lib/impala/protocol/t_c_l_i_service_types.rb +1802 -0
  47. data/lib/impala/protocol/thrift_hive_metastore.rb +4707 -0
  48. data/lib/impala/protocol/types_constants.rb +13 -0
  49. data/lib/impala/protocol/types_types.rb +332 -0
  50. data/lib/impala/sasl_transport.rb +117 -0
  51. data/lib/impala/thrift_patch.rb +31 -0
  52. data/lib/impala/version.rb +3 -0
  53. data/lib/jdbc/hive2.rb +52 -0
  54. data/lib/jdbc/impala.rb +50 -0
  55. data/lib/rbhive.rb +8 -0
  56. data/lib/rbhive/connection.rb +150 -0
  57. data/lib/rbhive/explain_result.rb +46 -0
  58. data/lib/rbhive/result_set.rb +37 -0
  59. data/lib/rbhive/schema_definition.rb +86 -0
  60. data/lib/rbhive/t_c_l_i_connection.rb +466 -0
  61. data/lib/rbhive/t_c_l_i_result_set.rb +3 -0
  62. data/lib/rbhive/t_c_l_i_schema_definition.rb +87 -0
  63. data/lib/rbhive/table_schema.rb +122 -0
  64. data/lib/rbhive/version.rb +3 -0
  65. data/lib/sequel/adapters/impala.rb +220 -0
  66. data/lib/sequel/adapters/jdbc/hive2.rb +36 -0
  67. data/lib/sequel/adapters/jdbc/impala.rb +38 -0
  68. data/lib/sequel/adapters/rbhive.rb +177 -0
  69. data/lib/sequel/adapters/shared/impala.rb +808 -0
  70. data/lib/sequel/extensions/csv_to_parquet.rb +166 -0
  71. data/lib/thrift/facebook_service.rb +700 -0
  72. data/lib/thrift/fb303_constants.rb +9 -0
  73. data/lib/thrift/fb303_types.rb +19 -0
  74. data/lib/thrift/hive_metastore_constants.rb +41 -0
  75. data/lib/thrift/hive_metastore_types.rb +630 -0
  76. data/lib/thrift/hive_service_constants.rb +13 -0
  77. data/lib/thrift/hive_service_types.rb +72 -0
  78. data/lib/thrift/queryplan_constants.rb +13 -0
  79. data/lib/thrift/queryplan_types.rb +261 -0
  80. data/lib/thrift/sasl_client_transport.rb +161 -0
  81. data/lib/thrift/serde_constants.rb +92 -0
  82. data/lib/thrift/serde_types.rb +7 -0
  83. data/lib/thrift/t_c_l_i_service.rb +1054 -0
  84. data/lib/thrift/t_c_l_i_service_constants.rb +72 -0
  85. data/lib/thrift/t_c_l_i_service_types.rb +1768 -0
  86. data/lib/thrift/thrift_hive.rb +508 -0
  87. data/lib/thrift/thrift_hive_metastore.rb +3856 -0
  88. data/spec/database_test.rb +56 -0
  89. data/spec/dataset_test.rb +1268 -0
  90. data/spec/files/bad_down_migration/001_create_alt_basic.rb +4 -0
  91. data/spec/files/bad_down_migration/002_create_alt_advanced.rb +4 -0
  92. data/spec/files/bad_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  93. data/spec/files/bad_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  94. data/spec/files/bad_timestamped_migrations/1273253853_3_create_users.rb +3 -0
  95. data/spec/files/bad_up_migration/001_create_alt_basic.rb +4 -0
  96. data/spec/files/bad_up_migration/002_create_alt_advanced.rb +3 -0
  97. data/spec/files/convert_to_timestamp_migrations/001_create_sessions.rb +9 -0
  98. data/spec/files/convert_to_timestamp_migrations/002_create_nodes.rb +9 -0
  99. data/spec/files/convert_to_timestamp_migrations/003_3_create_users.rb +4 -0
  100. data/spec/files/convert_to_timestamp_migrations/1273253850_create_artists.rb +9 -0
  101. data/spec/files/convert_to_timestamp_migrations/1273253852_create_albums.rb +9 -0
  102. data/spec/files/duplicate_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  103. data/spec/files/duplicate_timestamped_migrations/1273253853_create_nodes.rb +9 -0
  104. data/spec/files/duplicate_timestamped_migrations/1273253853_create_users.rb +4 -0
  105. data/spec/files/integer_migrations/001_create_sessions.rb +9 -0
  106. data/spec/files/integer_migrations/002_create_nodes.rb +9 -0
  107. data/spec/files/integer_migrations/003_3_create_users.rb +4 -0
  108. data/spec/files/interleaved_timestamped_migrations/1273253849_create_sessions.rb +9 -0
  109. data/spec/files/interleaved_timestamped_migrations/1273253850_create_artists.rb +9 -0
  110. data/spec/files/interleaved_timestamped_migrations/1273253851_create_nodes.rb +9 -0
  111. data/spec/files/interleaved_timestamped_migrations/1273253852_create_albums.rb +9 -0
  112. data/spec/files/interleaved_timestamped_migrations/1273253853_3_create_users.rb +4 -0
  113. data/spec/files/reversible_migrations/001_reversible.rb +5 -0
  114. data/spec/files/reversible_migrations/002_reversible.rb +5 -0
  115. data/spec/files/reversible_migrations/003_reversible.rb +5 -0
  116. data/spec/files/reversible_migrations/004_reversible.rb +5 -0
  117. data/spec/files/reversible_migrations/005_reversible.rb +10 -0
  118. data/spec/files/timestamped_migrations/1273253849_create_sessions.rb +9 -0
  119. data/spec/files/timestamped_migrations/1273253851_create_nodes.rb +9 -0
  120. data/spec/files/timestamped_migrations/1273253853_3_create_users.rb +4 -0
  121. data/spec/impala_test.rb +290 -0
  122. data/spec/migrator_test.rb +240 -0
  123. data/spec/plugin_test.rb +91 -0
  124. data/spec/prepared_statement_test.rb +327 -0
  125. data/spec/schema_test.rb +356 -0
  126. data/spec/spec_helper.rb +19 -0
  127. data/spec/timezone_test.rb +86 -0
  128. data/spec/type_test.rb +99 -0
  129. metadata +294 -0
@@ -0,0 +1,38 @@
1
+ require 'sequel/adapters/shared/impala'
2
+
3
+ Sequel::JDBC.load_driver('com.cloudera.impala.jdbc41.Driver', :Impala)
4
+
5
+ module Sequel
6
+ module JDBC
7
+ Sequel.synchronize do
8
+ DATABASE_SETUP[:impala] = proc do |db|
9
+ db.extend(Sequel::JDBC::Impala::DatabaseMethods)
10
+ db.extend_datasets(Sequel::Impala::DatasetMethods)
11
+
12
+ # Explicitly disconnect at exit, which can fix issues where
13
+ # existing without disconnecting causes problems.
14
+ at_exit{db.disconnect}
15
+
16
+ com.cloudera.impala.jdbc41.Driver
17
+ end
18
+ end
19
+
20
+ module Impala
21
+ module DatabaseMethods
22
+ include Sequel::Impala::DatabaseMethods
23
+
24
+ # Recognize wrapped and unwrapped java.net.SocketExceptions as disconnect errors
25
+ def disconnect_error?(exception, opts)
26
+ super || exception.message =~ /\A(Java::JavaSql::SQLException: )?org\.apache\.thrift\.transport\.TTransportException: java\.net\.SocketException/
27
+ end
28
+
29
+ def disconnect_connection(c)
30
+ super
31
+ rescue java.sql.SQLException
32
+ nil
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+
@@ -0,0 +1,177 @@
1
require 'rbhive'
require 'sequel/adapters/shared/impala'
require 'bigdecimal'
require 'date'

module Sequel
  module Rbhive
    class Database < Sequel::Database
      include Impala::DatabaseMethods

      # Logger that discards all messages, used when no :hive_logger
      # option is given.
      NullLogger = Object.new
      def NullLogger.info(str)
        nil
      end

      # Conversion procs for result values, indexed by column type number.
      # A nil entry means the raw value is used without conversion.
      # NOTE(review): the trailing comments label index 14 as type 15
      # (DECIMAL) and so on -- the original array appears to be missing a
      # slot for type 14, so indexes and type numbers are offset by one
      # from DECIMAL onward. Preserved as-is; confirm against rbhive's
      # column type numbering before changing.
      CONVERSION_PROCS = [
        nil, # 0 => %q"BOOLEAN",
        nil, # 1 => %q"TINYINT",
        nil, # 2 => %q"SMALLINT",
        nil, # 3 => %q"INT",
        nil, # 4 => %q"BIGINT",
        nil, # 5 => %q"FLOAT",
        nil, # 6 => %q"DOUBLE",
        nil, # 7 => %q"STRING",
        nil, # 8 => %q"TIMESTAMP",
        nil, # 9 => %q"BINARY",
        nil, # 10 => %q"ARRAY",
        nil, # 11 => %q"MAP",
        nil, # 12 => %q"STRUCT",
        nil, # 13 => %q"UNIONTYPE",
        # BigDecimal.new was removed in Ruby 2.7; use the Kernel
        # BigDecimal() conversion method instead.
        lambda{|v| BigDecimal(v)}, # 15 => %q"DECIMAL",
        nil, # 16 => %q"NULL",
        # Date.new requires Integer arguments, so the year/month/day
        # substrings must be converted before construction.
        lambda{|v| Date.new(*v[0...10].split('-').map(&:to_i))}, # 17 => %q"DATE",
        nil, # 18 => %q"VARCHAR",
        nil, # 19 => %q"CHAR",
      ].freeze

      # Per-Database copy of the conversion procs (see adapter_initialize).
      attr_reader :conversion_procs

      # Exception classes used by Impala.
      RbhiveExceptions = [
        RBHive::TCLIConnectionError,
        ::Thrift::TransportException,
        IOError
      ].freeze

      # Subset of exceptions that indicate the connection is unusable.
      DisconnectExceptions = [
        ::Thrift::TransportException,
        IOError
      ].freeze

      set_adapter_scheme :rbhive

      # Connect to the Impala server. Currently, only the :host and :port options
      # are respected, and they default to 'localhost' and 21050, respectively.
      def connect(server)
        opts = server_opts(server)
        opts[:hive_version] ||= 12
        conn = RBHive::TCLIConnection.new(opts[:host]||'localhost', opts[:port]||21050, opts, opts[:hive_logger] || NullLogger)
        conn.open
        conn.open_session
        force_database(conn, opts[:database])
      end

      # Exception classes Sequel should translate to DatabaseError.
      def database_error_classes
        RbhiveExceptions
      end

      # Close the session and connection, ignoring exceptions that
      # indicate the connection is already gone.
      def disconnect_connection(connection)
        connection.close_session if connection.session
        connection.close
      rescue *DisconnectExceptions
      end

      # Execute the SQL on a connection from the pool, yielding the
      # connection and the raw result if a block is given.
      def execute(sql, opts=OPTS)
        synchronize(opts[:server]) do |c|
          begin
            r = log_connection_yield(sql, c){c.execute(sql)}
            yield(c, r) if block_given?
            nil
          rescue *RbhiveExceptions => e
            raise_error(e)
          end
        end
      end

      private

      # Set up the per-database conversion procs, using the application
      # timestamp conversion for TIMESTAMP (type 8) columns.
      def adapter_initialize
        @conversion_procs = CONVERSION_PROCS.dup
        @conversion_procs[8] = method(:to_application_timestamp)
      end

      def connection_execute_method
        :execute
      end

      def dataset_class_default
        Dataset
      end

      # Impala raises IOError if it detects a problem on the connection, and
      # in most cases that results in an unusable connection, so treat it as a
      # disconnect error so Sequel will reconnect.
      def disconnect_error?(exception, opts)
        case exception
        when *DisconnectExceptions
          true
        else
          super
        end
      end

      # Use DESCRIBE to get the column names and types for the table.
      def schema_parse_table(table_name, opts)
        m = output_identifier_meth(opts[:dataset])

        table = if opts[:schema]
          Sequel.qualify(opts[:schema], table_name)
        else
          Sequel.identifier(table_name)
        end

        describe(table, opts).map do |row|
          row[:db_type] = row[:type]
          row[:type] = schema_column_type(row[:db_type])
          row[:default] = nil
          row[:primary_key] = false
          [m.call(row.delete(:name)), row]
        end
      end
    end

    class Dataset < Sequel::Dataset
      include Impala::DatasetMethods

      APOS = "'".freeze

      # Characters that must be escaped inside Impala string literals.
      STRING_ESCAPES = {
        "\\" => "\\\\".freeze,
        "'" => "\\'".freeze,
        "\n" => "\\n".freeze,
        "\r" => "\\r".freeze,
        "\0" => "\\0".freeze,
        "\b" => "\\b".freeze,
        "\04" => "\\Z".freeze,
        # Impala is supposed to support this, but using it
        # breaks things to the point of returning bad data.
        # If you don't do this, the tabs in the input
        # get converted to spaces, but that's better than the
        # alternative.
        # "\t" => "\\t".freeze,
      }.freeze
      STRING_ESCAPE_RE = /(#{Regexp.union(STRING_ESCAPES.keys)})/

      # Run the SQL, look up the column names and type numbers for the
      # result, and yield each row as a hash with converted values.
      def fetch_rows(sql)
        execute(sql) do |conn, result|
          op_handle = result.operationHandle
          columns, type_nums = conn.get_column_info(op_handle)
          self.columns = columns.map!{|c| output_identifier(c)}
          conversion_procs = db.conversion_procs
          convertors = conversion_procs.values_at(*type_nums)
          conn.yield_hash_rows(op_handle, columns, convertors) do |row|
            yield row
          end
        end
      end

      private

      # Append the string as a literal, escaping characters Impala
      # treats specially.
      def literal_string_append(sql, s)
        sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE){|m| STRING_ESCAPES[m]} << APOS
      end
    end
  end
end
177
+
@@ -0,0 +1,808 @@
1
+ require 'sequel/adapters/utils/unmodified_identifiers'
2
+
3
+ module Sequel
4
+ module Impala
5
+ Sequel::Database.set_shared_adapter_scheme :impala, self
6
+
7
module DatabaseMethods
  include UnmodifiedIdentifiers::DatabaseMethods

  # Do not use a composite primary key, foreign keys, or an
  # index when creating a join table, as Impala doesn't support those.
  def create_join_table(hash, options=OPTS)
    keys = hash.keys.sort_by(&:to_s)
    create_table(join_table_name(hash, options), options) do
      keys.each do |key|
        Integer key
      end
    end
  end

  # Issue a REFRESH statement for the given table.
  def refresh(table_name)
    run(refresh_sql(table_name))
  end

  # Issue a COMPUTE STATS statement for the given table.
  def compute_stats(table_name)
    run(compute_stats_sql(table_name))
  end

  # Create a database/schema in Impala.
  #
  # Options:
  # :if_not_exists :: Don't raise an error if the schema already exists.
  # :location :: Set the file system location to store the data for tables
  #              in the created schema.
  #
  # Examples:
  #
  #   create_schema(:s)
  #   # CREATE SCHEMA `s`
  #
  #   create_schema(:s, :if_not_exists=>true)
  #   # CREATE SCHEMA IF NOT EXISTS `s`
  #
  #   create_schema(:s, :location=>'/a/b')
  #   # CREATE SCHEMA `s` LOCATION '/a/b'
  def create_schema(schema, options=OPTS)
    run(create_schema_sql(schema, options))
  end

  # Support an :invalidate_metadata option to invalidate the metadata
  # for the created table (or all tables if the value is :all).
  def create_table(name, options=OPTS)
    super
    if im = options[:invalidate_metadata]
      invalidate_metadata((name unless im == :all))
    end
  end

  # Set the database_type for this database to :impala.
  def database_type
    :impala
  end

  # Return the DESCRIBE output for the table, showing table
  # columns, types, and comments. If the :formatted option
  # is given, use DESCRIBE FORMATTED and return a lot more
  # information about the table. Both of these return arrays
  # of hashes.
  #
  # Examples:
  #
  #   describe(:t)
  #   # DESCRIBE `t`
  #
  #   describe(:t, :formatted=>true)
  #   # DESCRIBE FORMATTED `t`
  def describe(table, opts=OPTS)
    if ds = opts[:dataset]
      ds = ds.naked
    else
      ds = dataset
    end
    ds.with_sql("DESCRIBE #{'FORMATTED ' if opts[:formatted]}?", table).all
  end

  # Drop a database/schema from Impala.
  #
  # Options:
  # :if_exists :: Don't raise an error if the schema doesn't exist.
  # :cascade :: Also drop the tables in the schema.
  #
  # Examples:
  #
  #   drop_schema(:s)
  #   # DROP SCHEMA `s`
  #
  #   drop_schema(:s, :if_exists=>true)
  #   # DROP SCHEMA IF EXISTS `s`
  def drop_schema(schema, options=OPTS)
    run(drop_schema_sql(schema, options))
  end

  # CASCADE isn't a supported option in Impala, so remove it
  # before passing the options on.
  def drop_table(*names)
    if names.last.is_a?(Hash)
      names.last.delete(:cascade)
    end
    super
  end

  # Implicitly qualify the table if using the :search_path option.
  # This will look at all of the tables and views in the schemas,
  # and if an unqualified table is used and appears in one of the
  # schemas, it will be implicitly qualified with the given schema
  # name.
  def implicit_qualify(table)
    return table unless opts[:search_path]

    case table
    when Symbol
      s, t, a = Sequel.split_symbol(table)
      if s
        return table
      end
      t = implicit_qualify(t)
      a ? Sequel.as(t, a) : t
    when String
      if schema = search_path_table_schemas[table]
        Sequel.qualify(schema, table)
      else
        # Unknown table: refresh the cached table->schema map once in
        # case the table was created after the cache was built.
        invalidate_table_schemas
        if schema = search_path_table_schemas[table]
          Sequel.qualify(schema, table)
        else
          Sequel.identifier(table)
        end
      end
    when SQL::Identifier
      implicit_qualify(table.value.to_s)
    when SQL::AliasedExpression
      SQL::AliasedExpression.new(implicit_qualify(table.expression), table.alias)
    else
      table
    end
  end

  # Invalidate the metadata for the given table, or for all tables if
  # no argument is given.
  def invalidate_metadata(identifier=nil)
    run("INVALIDATE METADATA #{quote_schema_table(identifier) if identifier}")
  end

  # Load data from HDFS into Impala.
  #
  # Options:
  # :overwrite :: Overwrite the existing table instead of appending to it.
  #
  # Examples:
  #
  #   load_data('/user/foo', :bar)
  #   # LOAD DATA INPATH '/user/foo' INTO TABLE `bar`
  #
  #   load_data('/user/foo', :bar, :overwrite=>true)
  #   # LOAD DATA INPATH '/user/foo' OVERWRITE INTO TABLE `bar`
  def load_data(path, table, options=OPTS)
    run(load_data_sql(path, table, options))
  end

  # Don't use PRIMARY KEY or AUTOINCREMENT on Impala, as Impala doesn't
  # support either.
  def serial_primary_key_options
    {:type=>Integer}
  end

  # Impala supports CREATE TABLE IF NOT EXISTS.
  def supports_create_table_if_not_exists?
    true
  end

  # Impala does not support foreign keys.
  def supports_foreign_key_parsing?
    false
  end

  # Impala does not support indexes.
  def supports_index_parsing?
    false
  end

  # Check that the tables returned by the JDBC driver are actually valid
  # tables and not views. The Hive2 JDBC driver returns views when listing
  # tables and nothing when listing views.
  def tables(opts=OPTS)
    _tables(opts).select{|t| is_valid_table?(t, opts)}
  end

  # Impala doesn't support transactions, so instead of issuing a
  # transaction, just checkout a connection. This ensures the same
  # connection is used for the transaction block, but as Impala
  # doesn't support transactions, you can't rollback.
  def transaction(opts=OPTS)
    synchronize(opts[:server]) do |c|
      yield c
    end
  end

  # Determine the available views by listing all tables via JDBC (which
  # includes both tables and views), and removing all valid tables.
  def views(opts=OPTS)
    _tables(opts).reject{|t| is_valid_table?(t, opts)}
  end

  # Creates a dataset that uses the VALUES clause:
  #
  #   DB.values([[1, 2], [3, 4]])
  #   # VALUES ((1, 2), (3, 4))
  def values(v)
    @default_dataset.clone(:values=>v)
  end

  # Drop the cached table->schema map used by implicit_qualify.
  def invalidate_table_schemas
    @search_path_table_schemas = nil
  end

  # Sets options in the current db connection for each key/value pair
  def set(opts)
    set_sql(opts).each do |sql|
      run(sql)
    end
  end

  private

  # List tables (and views) via SHOW TABLES, optionally restricted to
  # a given schema.
  def _tables(opts)
    m = output_identifier_meth
    metadata_dataset.with_sql("SHOW TABLES#{" IN #{quote_identifier(opts[:schema])}" if opts[:schema]}").
      select_map(:name).map do |table|
        m.call(table)
      end
  end

  # Impala uses ADD COLUMNS instead of ADD COLUMN. As its use of
  # ADD COLUMNS implies, it supports adding multiple columns at once,
  # but this adapter doesn't offer an API for that.
  def alter_table_add_column_sql(table, op)
    "ADD COLUMNS (#{column_definition_sql(op)})"
  end

  # Impala uses CHANGE instead of having separate RENAME syntax
  # for renaming columns. As CHANGE requires a type, look up the
  # type from the database schema.
  def alter_table_rename_column_sql(table, op)
    old_name = op[:name]
    opts = schema(table).find{|x| x.first == old_name}
    opts = opts ? opts.last : {}
    unless opts[:db_type]
      raise Error, "cannot determine database type to use for CHANGE COLUMN operation"
    end
    new_col = op.merge(:type=>opts[:db_type], :name=>op[:new_name])
    "CHANGE #{quote_identifier(old_name)} #{column_definition_sql(new_col)}"
  end

  def alter_table_set_column_type_sql(table, op)
    "CHANGE #{quote_identifier(op[:name])} #{column_definition_sql(op)}"
  end

  # Add COMMENT when defining the column, if :comment is present.
  def column_definition_comment_sql(sql, column)
    sql << " COMMENT #{literal(column[:comment])}" if column[:comment]
  end

  def column_definition_order
    [:comment]
  end

  def create_schema_sql(schema, options)
    "CREATE SCHEMA #{'IF NOT EXISTS ' if options[:if_not_exists]}#{quote_identifier(schema)}#{" LOCATION #{literal(options[:location])}" if options[:location]}"
  end

  # Support using table parameters for CREATE TABLE AS, necessary for
  # creating parquet files from datasets.
  def create_table_as_sql(name, sql, options)
    "#{create_table_prefix_sql(name, options)}#{create_table_parameters_sql(options)} AS #{sql}"
  end

  def create_table_prefix_sql(name, options)
    "CREATE #{'EXTERNAL ' if options[:external]}TABLE#{' IF NOT EXISTS' if options[:if_not_exists]} #{quote_schema_table(name)}"
  end

  def create_table_sql(name, generator, options)
    sql = super
    sql += create_table_parameters_sql(options)
    sql
  end

  # Build the trailing table parameters clause (COMMENT, ROW FORMAT,
  # STORED AS, LOCATION) from the given options.
  def create_table_parameters_sql(options)
    sql = String.new
    sql << " COMMENT #{literal(options[:comment])}" if options[:comment]
    if options[:field_term] || options[:line_term]
      sql << " ROW FORMAT DELIMITED"
      if options[:field_term]
        sql << " FIELDS TERMINATED BY #{literal(options[:field_term])}"
        sql << " ESCAPED BY #{literal(options[:field_escape])}" if options[:field_escape]
      end
      if options[:line_term]
        sql << " LINES TERMINATED BY #{literal(options[:line_term])}"
      end
    end
    sql << " STORED AS #{options[:stored_as]}" if options[:stored_as]
    sql << " LOCATION #{literal(options[:location])}" if options[:location]
    sql
  end

  def refresh_sql(table_name)
    "REFRESH #{quote_schema_table(table_name)}"
  end

  def compute_stats_sql(table_name)
    "COMPUTE STATS #{quote_schema_table(table_name)}"
  end

  def drop_schema_sql(schema, options)
    "DROP SCHEMA #{'IF EXISTS ' if options[:if_exists]}#{quote_identifier(schema)}#{' CASCADE' if options[:cascade]}"
  end

  # Cached map of unqualified table name => schema name, built by
  # scanning each schema in the :search_path (later entries in the
  # path are overridden by earlier ones via reverse_each).
  def search_path_table_schemas
    @search_path_table_schemas ||= begin
      search_path = opts[:search_path]
      search_path = search_path.split(',') if search_path.is_a?(String)
      table_schemas = {}
      search_path.reverse_each do |schema|
        _tables(:schema=>schema).each do |table|
          table_schemas[table.to_s] = schema.to_s
        end
      end
      table_schemas
    end
  end

  # Use DESCRIBE FORMATTED output to differentiate tables from views.
  def is_valid_table?(t, opts=OPTS)
    t = Sequel.qualify(opts[:schema], t) if opts[:schema]
    rows = describe(t, :formatted=>true)
    if row = rows.find{|r| r[:name].to_s.strip == 'Table Type:'}
      row[:type].to_s.strip !~ /VIEW/
    end
  rescue Sequel::DatabaseError
    # This can be raised for Hive tables that Impala returns via SHOW TABLES,
    # but which it raises an exception when you try to DESCRIBE them.
    false
  end

  def load_data_sql(path, table, options)
    "LOAD DATA INPATH #{literal(path)}#{' OVERWRITE' if options[:overwrite]} INTO TABLE #{literal(table)}"
  end

  # Metadata queries on JDBC use uppercase keys, so set the identifier
  # output method to downcase so that metadata queries work correctly.
  def _metadata_dataset
    super.with_extend do
      def output_identifier(v)
        v.downcase.to_sym
      end
    end
  end

  # Impala doesn't like the word "integer"
  def type_literal_generic_integer(column)
    :int
  end

  # Impala doesn't like the word "biginteger"
  def type_literal_generic_bignum_symbol(column)
    :bigint
  end

  # Impala doesn't like the word "biginteger"
  def type_literal_generic_bignum(column)
    :bigint
  end

  # Impala doesn't support date columns yet, so use timestamp until date
  # is natively supported.
  def type_literal_generic_date(column)
    :timestamp
  end

  # Impala uses double instead of "double precision" for floating point
  # values.
  def type_literal_generic_float(column)
    :double
  end

  # Impala uses decimal instead of numeric for arbitrary precision
  # numeric values.
  def type_literal_generic_numeric(column)
    column[:size] ? "decimal(#{Array(column[:size]).join(', ')})" : :decimal
  end

  # Use char or varchar if given a size, otherwise use string.
  # Using a size is not recommended, as Impala doesn't implicitly
  # cast string values to char or varchar, and doesn't implicitly
  # cast from different sizes of varchar.
  def type_literal_generic_string(column)
    if size = column[:size]
      "#{'var' unless column[:fixed]}char(#{size})"
    else
      :string
    end
  end

  # SQL statements (one per pair) for the #set method.
  def set_sql(opts)
    opts.map { |k, v| "SET #{k}=#{v}" }
  end

  # Switch the connection to the given database, if any, returning
  # the connection.
  def force_database(conn, database)
    if database
      log_connection_execute(conn, "USE #{database}")
    end
    conn
  end
end
421
+
422
+ module DatasetMethods
423
+ include UnmodifiedIdentifiers::DatasetMethods
424
+
425
+ BACKTICK = '`'.freeze
426
+ APOS = "'".freeze
427
+ STRING_ESCAPE_RE = /([\\'])/
428
+ STRING_ESCAPE_REPLACE = '\\\\\1'.freeze
429
+ BOOL_TRUE = 'true'.freeze
430
+ BOOL_FALSE = 'false'.freeze
431
+ CONSTANT_LITERAL_MAP = {:CURRENT_TIMESTAMP=>'now()'.freeze}.freeze
432
+ PAREN_OPEN = '('.freeze
433
+ PAREN_CLOSE = ')'.freeze
434
+ SPACE = ' '.freeze
435
+ NOT = 'NOT '.freeze
436
+ REGEXP = ' REGEXP '.freeze
437
+ EXCEPT_SOURCE_COLUMN = :__source__
438
+ EXCEPT_STRATEGIES = [:not_exists, :not_in, :left_join, :group_by].freeze
439
+ SELECT_VALUES = 'VALUES '.freeze
440
+
441
+ Dataset.def_sql_method(self, :select, [['if opts[:values]', %w'values'], ['else', %w'with select distinct columns from join where group having compounds order limit']])
442
+
443
# Handle string concatenation using the concat string function.
# Don't use the ESCAPE syntax when using LIKE/NOT LIKE, as
# Impala doesn't support escaping LIKE metacharacters.
# Support regexps on Impala using the REGEXP operator.
# For case-insensitive regexps, upcase both sides first.
def complex_expression_sql_append(sql, op, args)
  case op
  when :'||'
    # Impala has no || operator, so emit concat(...) instead.
    literal_append(sql, Sequel.function(:concat, *args))
  when :LIKE, :'NOT LIKE'
    sql << PAREN_OPEN
    literal_append(sql, args.at(0))
    sql << SPACE << op.to_s << SPACE
    literal_append(sql, args.at(1))
    sql << PAREN_CLOSE
  when :~, :'!~', :'~*', :'!~*'
    negated = (op == :'!~' || op == :'!~*')
    if op == :'~*' || op == :'!~*'
      # Emulate case insensitivity by uppercasing both operands.
      args = args.map{|a| Sequel.function(:upper, a)}
    end
    sql << NOT if negated
    sql << PAREN_OPEN
    literal_append(sql, args.at(0))
    sql << REGEXP
    literal_append(sql, args.at(1))
    sql << PAREN_CLOSE
  else
    super
  end
end
472
+
473
# Use now() for current timestamp, as Impala doesn't support
# CURRENT_TIMESTAMP. Other constants are emitted as-is.
def constant_sql_append(sql, constant)
  mapped = CONSTANT_LITERAL_MAP.fetch(constant, constant.to_s)
  sql << mapped
end
478
+
479
# Use the addition operator combined with interval types to
# handle date arithmetic when using the date_arithmetic
# extension.
def date_add_sql_append(sql, da)
  parts = []
  each_valid_interval_unit(da.interval, Sequel::SQL::DateAdd::DatasetMethods::DEF_DURATION_UNITS) do |value, sql_unit|
    parts << Sequel.lit("INTERVAL #{value} #{sql_unit}")
  end
  casted_expr = Sequel.cast(da.expr, Time)
  if parts.empty?
    # No interval components: just cast the expression to a timestamp.
    literal_append(sql, casted_expr)
  else
    complex_expression_sql_append(sql, :+, parts.unshift(casted_expr))
  end
end
496
+
497
# DELETE is emulated on Impala and doesn't return the number of
# modified rows, so always return nil.
def delete
  super
  nil
end
503
+
504
# Emulate DELETE using INSERT OVERWRITE selecting all columns from
# the table, with a reversed condition used for WHERE.
def delete_sql
  return @opts[:prepared_sql] if @opts[:prepared_sql]

  sql = @opts[:append_sql] || sql_string_origin
  sql << "INSERT OVERWRITE "
  source_list_append(sql, opts[:from])
  sql << " SELECT * FROM "
  source_list_append(sql, opts[:from])
  cond = opts[:where]
  if cond
    # Keep only the rows that do NOT match the delete condition.
    sql << " WHERE NOT ("
    literal_append(sql, cond)
    sql << ")"
  else
    # Unfiltered delete: keep nothing.
    sql << " WHERE false"
  end
  sql
end
522
+
523
# Implicitly qualify tables if using the :search_path database option.
def from(*)
  ds = super
  qualified = ds.opts[:from].map{|t| db.implicit_qualify(t)}
  ds.clone(:from => qualified)
end
528
+
529
# Implicitly qualify tables if using the :search_path database option.
def join_table(type, table, expr=nil, options=OPTS, &block)
  qualified_table = db.implicit_qualify(table)
  super(type, qualified_table, expr, options, &block)
end
533
+
534
# Emulate TRUNCATE by using INSERT OVERWRITE selecting all columns
# from the table, with WHERE false (via the unfiltered delete emulation).
def truncate_sql
  unfiltered.delete_sql
end
539
+
540
# Don't remove an order, because that breaks things when offsets
# are used, as Impala requires an order when using an offset.
def empty?
  marker = get(Sequel::SQL::AliasedExpression.new(1, :one))
  marker.nil?
end
545
+
546
# Emulate EXCEPT using the configured strategy (see #except_strategy),
# checking for values present only in the first table.
def except(other, opts=OPTS)
  raise(InvalidOperation, "EXCEPT ALL not supported") if opts[:all]
  raise(InvalidOperation, "The :from_self=>false option to except is not supported") if opts[:from_self] == false

  strategy, *keys = @opts[:except_strategy]
  ds = from_self(:alias=>:t1)

  ds = case strategy
  when :not_exists
    # Anti-join via NOT EXISTS on the given key columns.
    ds.exclude(other.
      from_self(:alias=>:t2).
      where(keys.map{|key| [Sequel.qualify(:t1, key), Sequel.qualify(:t2, key)]}).
      select(nil).
      exists)
  when :not_in
    raise Sequel::Error, ":not_in EXCEPT strategy only supports a single key" unless keys.length == 1
    key = keys.first
    ds.exclude(Sequel.qualify(:t1, key)=>other.from_self(:alias=>:t2).select(key))
  when :left_join
    # Anti-join via LEFT JOIN, keeping rows with no match on the right.
    ds.left_join(other.from_self(:alias=>:t2).as(:t2), keys.map{|key| [key, key]}).
      where(Sequel.or(keys.map{|key| [Sequel.qualify(:t2, key), nil]})).
      select_all(:t1)
  else
    # Default GROUP BY strategy: tag each side with a source marker,
    # union, and keep groups appearing exactly once from the left side.
    cols = columns
    rhs = other.from_self.select_group(*other.columns).select_append(Sequel.expr(2).as(EXCEPT_SOURCE_COLUMN))
    ds.select_group(*cols).
      select_append(Sequel.expr(1).as(EXCEPT_SOURCE_COLUMN)).
      union(rhs, all: true).
      select_group(*cols).
      having{{count.function.* => 1, min(EXCEPT_SOURCE_COLUMN) => 1}}
  end

  ds.from_self(opts)
end
581
+
582
# The strategy to use for EXCEPT emulation. By default, uses a GROUP BY emulation,
# as that doesn't require you provide a key column, but you can use this to choose
# a NOT EXISTS, NOT IN, or LEFT JOIN emulation, providing the unique key column.
def except_strategy(strategy, *keys)
  unless EXCEPT_STRATEGIES.include?(strategy)
    raise Sequel::Error, "invalid EXCEPT strategy: #{strategy.inspect}"
  end
  clone(:except_strategy=>[strategy, *keys])
end
589
+
590
# Use INSERT OVERWRITE instead of INSERT INTO when inserting into this dataset:
#
#   DB[:table].insert_overwrite.insert(DB[:other])
#   # INSERT OVERWRITE table SELECT * FROM other
#
# The flag is read by #insert_into_sql when building the statement.
def insert_overwrite
  clone(:insert_overwrite=>true)
end
597
+
598
# Impala does not support INSERT DEFAULT VALUES; empty inserts are
# handled via #insert_empty_columns_values instead.
def insert_supports_empty_values?
  false
end
602
+
603
# Emulate INTERSECT using a join and checking for values in both tables.
# Columns are paired positionally, and the join condition treats two
# NULLs as equal so NULL rows intersect correctly.
def intersect(other, opts=OPTS)
  raise(InvalidOperation, "INTERSECT ALL not supported") if opts[:all]
  raise(InvalidOperation, "The :from_self=>false option to intersect is not supported") if opts[:from_self] == false
  raise(Error, "Attempt to INTERSECT on dataset with no columns: #{inspect}") if columns.empty?
  raise(Error, "Attempt to INTERSECT other dataset with no columns: #{other.inspect}") if other.columns.empty?

  # Pair this dataset's columns with other's columns by position.
  cols = columns.zip(other.columns)
  # NOTE(review): in Sequel's join block the first argument is the alias
  # of the table being joined and the second the previous table's alias,
  # so +lj+ apparently refers to +other+ and +j+ to :l — the names look
  # swapped relative to their usage; verify before renaming anything.
  from_self(alias: :l)
    .join(other){|lj, j, _| Sequel.&(*cols.map{|c1,c2| Sequel.expr(Sequel.qualify(lj, c2)=>Sequel.qualify(j, c1)) | {Sequel.qualify(lj, c2)=>nil, Sequel.qualify(j, c1)=>nil}})}
    .select_all(:l)
    .distinct
    .from_self(opts)
end
617
+
618
# Impala supports non-recursive common table expressions (WITH).
def supports_cte?(type=:select)
  true
end
622
+
623
# Impala allows WITH clauses inside subqueries.
def supports_cte_in_subqueries?
  true
end
626
+
627
# Impala doesn't support derived column lists (e.g. "tbl AS t(a, b)")
# when aliasing tables.
def supports_derived_column_lists?
  false
end
632
+
633
# Impala doesn't support EXCEPT or INTERSECT, but support is emulated
# for them (see #except and #intersect).  However, EXCEPT ALL and
# INTERSECT ALL are not emulated.
def supports_intersect_except_all?
  false
end
638
+
639
# Impala only supports IS NULL, not IS TRUE or IS FALSE.
def supports_is_true?
  false
end
643
+
644
# Impala doesn't support IN when used with multiple columns,
# e.g. "(a, b) IN ((1, 2))".
def supports_multiple_column_in?
  false
end
648
+
649
# Impala supports regexps using the REGEXP operator.
def supports_regexp?
  true
end
653
+
654
# Impala supports window (analytic) functions.
def supports_window_functions?
  true
end
658
+
659
# Create a parquet file from this dataset. +table+ should
# be the table name to create. To specify a path for the
# parquet file, use the :location option.
#
# Examples:
#
#   DB[:t].to_parquet(:p)
#   # CREATE TABLE `p` STORED AS parquet AS
#   # SELECT * FROM `t`
#
#   DB[:t].to_parquet(:p, :location=>'/a/b')
#   # CREATE TABLE `p` STORED AS parquet LOCATION '/a/b'
#   # SELECT * FROM `t`
def to_parquet(table, options=OPTS)
  create_options = options.merge(:as=>self, :stored_as=>:parquet)
  db.create_table(table, create_options)
end
675
+
676
# UPDATE is emulated on Impala (see #update_sql), and returns nil
# instead of the number of modified rows, since that count is not
# available from the emulated statement.
def update(values=OPTS)
  super
  nil
end
682
+
683
# Emulate UPDATE using INSERT OVERWRITE AS SELECT. For all columns used
# in the given +values+, use a CASE statement. In the CASE statement,
# set the value to the new value if the row matches WHERE conditions of
# the current dataset, otherwise use the existing value.
def update_sql(values)
  sql = String.new
  sql << "INSERT OVERWRITE "
  source_list_append(sql, opts[:from])
  sql << " SELECT "
  comma = false

  # Literalize the filter up front so the same SQL fragment can be
  # embedded in each column's CASE expression; with no filter, every
  # row gets the new values.
  if where = opts[:where]
    where = Sequel.lit(literal(where))
  else
    where = true
  end

  select_all.columns.each do |c|
    # First iteration only sets the separator; later iterations
    # append it before the column expression.
    if comma
      sql << comma
    else
      comma = ', '
    end

    if values.has_key?(c)
      new_value = values[c]
      # CASE WHEN <filter> THEN <new value> ELSE <existing column> END AS <column>
      literal_append(sql, Sequel.case({where=>new_value}, c).as(c))
    else
      # Column not being updated: select it unchanged.
      quote_identifier_append(sql, c)
    end
  end
  sql << " FROM "
  source_list_append(sql, opts[:from])
  sql
end
718
+
719
# Add a CTE to this dataset.  If the dataset being used for the CTE
# itself contains a WITH clause, hoist that inner CTE up into the
# current dataset instead of nesting it (NOTE(review): presumably
# because Impala rejects WITH inside a CTE definition — verify).
#
# Uses the shared frozen OPTS hash as the options default, matching
# the other methods in this file and avoiding a per-call allocation.
def with(name, dataset, opts=OPTS)
  if has_cte?(dataset)
    s, ds = hoist_cte(dataset)
    s.with(name, ds, opts)
  else
    super
  end
end
727
+
728
# Add a recursive CTE to this dataset.  If either the nonrecursive or
# the recursive part itself contains a WITH clause, hoist that inner
# CTE up into the current dataset instead of nesting it.
#
# Uses the shared frozen OPTS hash as the options default, matching
# the other methods in this file and avoiding a per-call allocation.
def with_recursive(name, nonrecursive, recursive, opts=OPTS)
  if has_cte?(nonrecursive)
    s, ds = hoist_cte(nonrecursive)
    s.with_recursive(name, ds, recursive, opts)
  elsif has_cte?(recursive)
    s, ds = hoist_cte(recursive)
    s.with_recursive(name, nonrecursive, ds, opts)
  else
    super
  end
end
739
+
740
+ protected
741
+
742
# Add the dataset to the list of compounds, hoisting any WITH clause
# on the compounded dataset up into the receiver first.
def compound_clone(type, dataset, opts)
  return super unless has_cte?(dataset)
  hoisted, inner = hoist_cte(dataset)
  hoisted.compound_clone(type, inner, opts)
end
751
+
752
+ private
753
+
754
# Whether +ds+ is a dataset that itself carries a WITH clause.
def has_cte?(ds)
  return false unless ds.is_a?(Dataset)
  ds.opts[:with]
end
757
+
758
# Impala doesn't handle the DEFAULT keyword used in inserts, as all default
# values in Impala are NULL, so just use a NULL value for the last column.
def insert_empty_columns_values
  [[columns.last], [nil]]
end
763
+
764
# SQL fragment used to literalize true (shared adapter constant).
def literal_true
  BOOL_TRUE
end
767
+
768
# SQL fragment used to literalize false (shared adapter constant).
def literal_false
  BOOL_FALSE
end
771
+
772
# Append the INTO/OVERWRITE clause of an INSERT, honoring the
# :insert_overwrite flag set by #insert_overwrite.
def insert_into_sql(sql)
  if @opts[:insert_overwrite]
    sql << ' OVERWRITE '
  else
    sql << ' INTO '
  end
  identifier_append(sql, unaliased_identifier(@opts[:from].first))
end
776
+
777
# Double backslashes in all strings, and escape all apostrophes with
# backslashes.  APOS, STRING_ESCAPE_RE, and STRING_ESCAPE_REPLACE are
# shared constants defined elsewhere in this adapter.
def literal_string_append(sql, s)
  sql << APOS << s.to_s.gsub(STRING_ESCAPE_RE, STRING_ESCAPE_REPLACE) << APOS
end
782
+
783
# Insert multiple rows in a single statement using a VALUES list.
def multi_insert_sql_strategy
  :values
end
786
+
787
# Impala doesn't support escaping of identifiers, so you can't use a
# backtick in an identifier name.
def quoted_identifier_append(sql, name)
  sql << BACKTICK << name.to_s << BACKTICK
end
792
+
793
# Don't include a LIMIT clause if there is no FROM clause. In general,
# such queries can only return 1 row.
def select_limit_sql(sql)
  super if opts[:from]
end
799
+
800
+
801
# Support VALUES clause instead of the SELECT clause to return rows,
# using the expressions stored in opts[:values].
def select_values_sql(sql)
  sql << SELECT_VALUES
  expression_list_append(sql, opts[:values])
end
806
+ end
807
+ end
808
+ end