dwh 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 899fb3e403f0362cb21132d9af44f622e870b339776aac7bd4d3e7ed333d59e4
4
- data.tar.gz: a3d536eb6da5885d4817be013d80e6b5a5ed422bce1466ac582c2b478724e278
3
+ metadata.gz: e70f914cc994c4be7a9d76b0d72d170ae6bf4d895427ec90adcdf9e3099774fe
4
+ data.tar.gz: 3ef66bc3d9a326bbae4b51d2bb3ec6af45425971617aee3a3131c7d496cf9127
5
5
  SHA512:
6
- metadata.gz: 5703c4ee14bf336b92f15740bfd0a6bce086ac9b636d67bcf269ea337a550a80bdd6a2f9d4265dd0a476d8d693a313dd2e6c8fa2fc587ea0f4653f6e0b528860
7
- data.tar.gz: fa72a56f3eac7c5b4eae524a70b002f93b23e54513209d182de92e119aa43ea57a08d31d0f183b19d7df36601472045d9d2a001b26dda8bd56ee886556d4b21e
6
+ metadata.gz: d05640e86dc5a6df2135173dd7a513df0805b4035fa5c4c5fa182659b1286fb6fd78f67ff2e259899f3d4ecab19b44bf5e7bfb427acfa5daa6065d71fb2fa18d
7
+ data.tar.gz: 28e5c623c8401dea1d222a1b318325543d4c209e083944d2fd43367ae6721c2ebe1c2b7e69190462a19887770d722edb7f2d1d851f90e5cc53b4c51cf2b65953
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.3.0] - 2026-04-22
4
+
5
+ ### Changed
6
+
7
+ - Added Databricks Adapter
8
+
9
+ ## [0.2.1] - 2026-01-27
10
+
11
+ ### Changed
12
+
13
+ - **Adapter missing-gem error messages** (Athena, DuckDB, MySQL, PostgreSQL, SQL Server, Trino): replace platform-specific system library install instructions with links to official documentation. Messages now include `gem install` and a single link for system libraries.
14
+
3
15
  ## [0.2.0] - 2025-10-12
4
16
 
5
17
  ### Added
@@ -202,8 +202,15 @@ module DWH
202
202
  def valid_config?
203
203
  super
204
204
  require 'aws-sdk-athena'
205
+ require 'aws-sdk-s3'
205
206
  rescue LoadError
206
- raise ConfigError, "Required 'aws-sdk-athena' and 'aws-sdk-s3' gems missing. Please add them to your Gemfile."
207
+ raise ConfigError, <<~MSG
208
+ Athena adapter requires the 'aws-sdk-athena' and 'aws-sdk-s3' gems.
209
+
210
+ Install with: gem install aws-sdk-athena aws-sdk-s3
211
+
212
+ No system libraries required (pure Ruby).
213
+ MSG
207
214
  end
208
215
 
209
216
  private
@@ -0,0 +1,328 @@
1
+ require 'csv'
2
+ require 'base64'
3
+
4
module DWH
  module Adapters
    # Databricks adapter for executing SQL queries against Databricks SQL warehouses
    # through the SQL Statement Execution REST API (see STATEMENTS_API).
    #
    # Supports OAuth M2M (service principal) authentication only.
    #
    # @example Connection with OAuth (service principal)
    #   DWH.create(:databricks, {
    #     host: 'adb-1234567890123456.7.azuredatabricks.net',
    #     warehouse: 'abc123def456',
    #     oauth_client_id: 'service-principal-app-id',
    #     oauth_client_secret: 'your-oauth-secret-here',
    #     catalog: 'main',
    #     schema: 'default'
    #   })
    class Databricks < Adapter
      config :host, String, required: true, message: 'Databricks workspace host (e.g., adb-xxx.databricks.cloud.com)'
      config :oauth_client_id, String, required: true, message: 'OAuth client ID (service principal application ID)'
      config :oauth_client_secret, String, required: true, message: 'OAuth client secret'
      config :client_name, String, required: false, default: 'Ruby DWH Gem', message: 'Client name sent to Databricks'
      config :query_timeout, Integer, required: false, default: 3600, message: 'Query execution timeout in seconds'
      config :warehouse, String, required: true, message: 'Databricks SQL warehouse ID to use for query execution'
      config :catalog, String, required: false, message: 'Default catalog (Unity Catalog)'
      config :schema, String, required: false, message: 'Default schema'

      # First sleep (seconds) between status polls; doubled after every poll.
      DEFAULT_POLL_INTERVAL = 0.25
      # Upper bound (seconds) for the sleep between status polls.
      MAX_POLL_INTERVAL = 30

      STATEMENTS_API = '/api/2.0/sql/statements'.freeze

      # Statement states that will never turn into SUCCEEDED.
      FAILED_STATES = %w[FAILED CANCELED CLOSED].freeze

      # @param config [Hash] adapter configuration (see the +config+ declarations above)
      # @raise [ConfigError] when the OAuth client id or secret is missing
      def initialize(config)
        super
        validate_auth_config
      end

      # Returns the memoized Faraday connection, rebuilding it (and fetching a
      # fresh OAuth token) whenever the current token has expired.
      #
      # @return [Faraday::Connection]
      def connection
        return @connection if @connection && !token_expired?

        reset_connection if token_expired?
        @connection = Faraday.new(
          url: "https://#{workspace_host}",
          headers: {
            'Content-Type' => 'application/json',
            'Authorization' => "Bearer #{auth_token}",
            'User-Agent' => config[:client_name]
          },
          request: {
            timeout: config[:query_timeout]
          }.merge(extra_connection_params)
        )
      end

      # Runs +SELECT 1+ to verify that the warehouse is reachable.
      #
      # @param raise_exception [Boolean] re-raise failures as ConnectionError
      #   instead of logging them and returning false
      # @return [Boolean]
      # @raise [ConnectionError] when the check fails and +raise_exception+ is true
      def test_connection(raise_exception: false)
        execute('SELECT 1')
        true
      rescue StandardError => e
        raise ConnectionError, "Failed to connect to Databricks: #{e.message}" if raise_exception

        logger.error "Connection test failed: #{e.message}"
        false
      end

      # (see Adapter#execute)
      def execute(sql, format: :array, retries: 0)
        result = with_retry(retries + 1) do
          with_debug(sql) do
            response = submit_query(sql)
            fetch_data(handle_query_response(response))
          end
        end

        format_result(result, format)
      end

      # Executes +sql+ and writes the result to +io+ as CSV (header row first).
      #
      # @param sql [String] SQL query to execute
      # @param io [IO] destination; rewound before it is returned
      # @param stats [#<<, nil] optional collector that receives every row
      # @param retries [Integer] forwarded to +with_retry+
      # @return [IO] the same +io+, rewound
      def execute_stream(sql, io, stats: nil, retries: 0)
        # NOTE(review): #execute passes `retries + 1` to with_retry while this
        # method passes `retries`. with_retry is defined outside this file, so
        # the intended semantics could not be confirmed; behavior is unchanged.
        with_retry(retries) do
          with_debug(sql) do
            response = submit_query(sql)
            fetch_data(handle_query_response(response), io: io, stats: stats)
          end
        end

        io.rewind
        io
      end

      # Execute SQL query and yield streamed results
      # @param sql [String] SQL query to execute
      # @yield [chunk] yields each row of data as it's processed
      def stream(sql, &block)
        with_debug(sql) do
          response = submit_query(sql)
          fetch_data(handle_query_response(response), proc: block)
        end
      end

      # Lists table names from the catalog's information_schema.
      #
      # @param qualifiers [Hash] optional +:catalog+ and +:schema+ overrides;
      #   both fall back to the adapter config
      # @return [Array<String>] table names
      # @raise [ConfigError] when no catalog is available
      def tables(**qualifiers)
        catalog = qualifiers[:catalog] || config[:catalog]
        schema = qualifiers[:schema] || config[:schema]

        raise ConfigError, 'catalog is required for Databricks tables query' unless catalog

        sql = "SELECT table_name FROM #{catalog}.information_schema.tables"
        sql += " WHERE table_schema = #{escape_sql_literal(schema)}" if schema

        execute(sql).flatten
      end

      # Loads column metadata for +table+ from the catalog's information_schema.
      #
      # @param table [String] table name
      # @param qualifiers [Hash] optional +:catalog+ and +:schema+ overrides;
      #   both fall back to the adapter config
      # @return [Table] table populated with one Column per result row
      # @raise [ConfigError] when no catalog is available
      def metadata(table, **qualifiers)
        catalog = qualifiers[:catalog] || config[:catalog]
        schema = qualifiers[:schema] || config[:schema]

        raise ConfigError, 'catalog is required for Databricks metadata query' unless catalog

        db_table = Table.new(table, schema: schema, catalog: catalog)

        sql = <<~SQL
          SELECT column_name, data_type, numeric_precision, numeric_scale, character_maximum_length
          FROM #{catalog}.information_schema.columns
          WHERE table_name = #{escape_sql_literal(db_table.physical_name)}
        SQL
        sql += " AND table_schema = #{escape_sql_literal(db_table.schema)}" if db_table.schema

        execute(sql).each do |col|
          db_table << Column.new(
            name: col[0]&.downcase,
            data_type: col[1]&.downcase,
            precision: col[2],
            scale: col[3],
            max_char_length: col[4]
          )
        end

        db_table
      end

      # Computes the row count and, optionally, the min/max of a date column.
      #
      # @param table [String] table expression placed verbatim after FROM
      # @param date_column [String, nil] column whose MIN/MAX become the date range
      # @return [TableStats]
      def stats(table, date_column: nil)
        date_fields = if date_column
                        ", MIN(#{date_column}) AS date_start, MAX(#{date_column}) AS date_end"
                      else
                        ', NULL AS date_start, NULL AS date_end'
                      end

        data = execute("SELECT COUNT(*) AS row_count#{date_fields} FROM #{table}")
        cols = data.first

        TableStats.new(
          row_count: cols[0],
          date_start: cols[1],
          date_end: cols[2]
        )
      end

      private

      # @raise [ConfigError] when either OAuth credential is absent
      def validate_auth_config
        raise ConfigError, 'oauth_client_id is required' unless config[:oauth_client_id]
        raise ConfigError, 'oauth_client_secret is required' unless config[:oauth_client_secret]
      end

      # True when no token has been fetched yet or the recorded expiry has passed.
      #
      # NOTE(review): this predicate is called by #connection and #auth_token but
      # no definition is visible in this file; it is defined here so those calls
      # resolve. Remove it if the Adapter base class already supplies an
      # equivalent implementation.
      def token_expired?
        @token_expires_at.nil? || Time.now >= @token_expires_at
      end

      # @return [String] a currently valid OAuth access token
      def auth_token
        return @oauth_access_token if @oauth_access_token && !token_expired?

        request_oauth_access_token!
        @oauth_access_token
      end

      # Requests a token with the client-credentials grant and records its expiry.
      #
      # @raise [AuthenticationError] on a non-200 response or a response that
      #   carries no access token
      def request_oauth_access_token!
        credentials = Base64.strict_encode64("#{config[:oauth_client_id]}:#{config[:oauth_client_secret]}")
        response = Faraday.post(
          "https://#{workspace_host}/oidc/v1/token",
          'grant_type=client_credentials&scope=all-apis',
          'Authorization' => "Basic #{credentials}",
          'Content-Type' => 'application/x-www-form-urlencoded'
        )

        raise AuthenticationError, "OAuth M2M token request failed (#{response.status}): #{response.body}" unless response.status == 200

        data = JSON.parse(response.body)
        token = data['access_token']
        # Without this check a nil token would be sent as "Bearer " on every request.
        raise AuthenticationError, 'OAuth M2M token response did not include an access_token' if token.to_s.empty?

        @oauth_access_token = token
        expires_in = (data['expires_in'] || 3600).to_i
        # Refresh one minute early, but never treat a token as valid for less than a minute.
        @token_expires_at = Time.now + [expires_in - 60, 60].max
      end

      # Drops the cached token and closes the current connection.
      def reset_connection
        @oauth_access_token = nil
        @token_expires_at = nil
        close
      end

      # Submits +sql+ to the statements endpoint. Keys whose config value is nil
      # (catalog, schema) are omitted from the request body.
      #
      # @return [Faraday::Response]
      def submit_query(sql)
        connection.post(STATEMENTS_API) do |req|
          req.body = {
            statement: sql,
            warehouse_id: config[:warehouse],
            catalog: config[:catalog],
            schema: config[:schema],
            wait_timeout: '30s',
            on_wait_timeout: 'CONTINUE',
            format: 'JSON_ARRAY',
            disposition: 'INLINE'
          }.compact.merge(extra_query_params).to_json
        end
      end

      # Turns the submit response into a finished statement body, polling when
      # the statement has not completed yet.
      #
      # @return [Hash] statement body in state SUCCEEDED
      # @raise [ExecutionError] on an HTTP error or a failed/canceled statement
      def handle_query_response(response)
        body = parse_body(response)

        case response.status
        when 200
          state = body.dig('status', 'state')
          return body if state == 'SUCCEEDED'
          # Already terminal: report it now instead of issuing another request.
          raise_failed_statement(body, state) if FAILED_STATES.include?(state)

          poll(body['statement_id'])
        when 202
          poll(body['statement_id'])
        else
          error_message = body['message'] || body['error_code'] || response.body
          raise ExecutionError, "Databricks query failed (#{response.status}): #{error_message}"
        end
      end

      # Polls the statement until it succeeds, fails, or +query_timeout+ elapses.
      #
      # The sleep starts at DEFAULT_POLL_INTERVAL and doubles after each poll up
      # to MAX_POLL_INTERVAL; once the maximum has been used it starts over from
      # DEFAULT_POLL_INTERVAL.
      #
      # @param statement_id [String]
      # @return [Hash] statement body in state SUCCEEDED
      # @raise [ExecutionError] on failure, a non-retryable HTTP error, or timeout
      def poll(statement_id)
        sleep_interval = DEFAULT_POLL_INTERVAL
        timeout = config[:query_timeout].to_i
        deadline = timeout.positive? ? monotonic_now + timeout : nil

        logger.debug "Polling for query completion: #{statement_id}"

        loop do
          response = connection.get("#{STATEMENTS_API}/#{statement_id}")
          body = parse_body(response)

          # A client error other than rate limiting will not fix itself by waiting.
          if response.status.between?(400, 499) && response.status != 429
            error_message = body['message'] || body['error_code'] || response.body
            raise ExecutionError, "Databricks statement status check failed (#{response.status}): #{error_message}"
          end

          state = body.dig('status', 'state')

          case state
          when 'SUCCEEDED'
            return body
          when *FAILED_STATES
            raise_failed_statement(body, state)
          else
            if deadline && monotonic_now >= deadline
              raise ExecutionError,
                    "Databricks query #{statement_id} did not complete within #{timeout}s (last state: #{state.inspect})"
            end

            logger.debug "Query still running (state: #{state}). Sleeping #{sleep_interval}s..."
            sleep(sleep_interval)
            sleep_interval = sleep_interval == MAX_POLL_INTERVAL ? DEFAULT_POLL_INTERVAL : sleep_interval
            sleep_interval = [sleep_interval * 2, MAX_POLL_INTERVAL].min
          end
        end
      end

      # Collects the inline first chunk and then every remaining chunk listed in
      # the manifest, routing rows to +io+, +proc+, or the in-memory collector.
      #
      # @param result [Hash] statement body in state SUCCEEDED
      # @param io [IO, nil] CSV destination
      # @param stats [#<<, nil] receives each row when +io+ is given
      # @param proc [Proc, nil] receives each row when +io+ is not given
      # @return [Hash] collector with +:columns+ and +:data+
      # @raise [ExecutionError] when a chunk cannot be fetched
      def fetch_data(result, io: nil, stats: nil, proc: nil)
        columns = result.dig('manifest', 'schema', 'columns')&.map { |col| col['name'] } || []
        chunks = result.dig('manifest', 'chunks') || []
        collector = {
          columns: columns,
          data: [],
          io: io,
          stats: stats,
          wrote_header: false
        }

        write_data(result.dig('result', 'data_array') || [], collector, io, stats, proc)

        return collector unless chunks.size > 1

        statement_id = result['statement_id']
        # The first chunk arrived inline with the statement body; fetch the rest.
        chunks[1..].each do |chunk|
          chunk_index = chunk['chunk_index']
          logger.debug "Fetching chunk #{chunk_index} of #{chunks.size} for statement: #{statement_id}"

          resp = connection.get("#{STATEMENTS_API}/#{statement_id}/result/chunks/#{chunk_index}")
          raise ExecutionError, "Failed to fetch chunk #{chunk_index}: #{resp.body}" unless resp.status == 200

          chunk_data = parse_body(resp)
          write_data(chunk_data['data_array'] || [], collector, io, stats, proc)
        end

        collector
      end

      # Routes +data+ rows to exactly one sink: +io+ (as CSV, header written
      # once), else +proc+, else the collector's +:data+ array.
      #
      # @return [Hash] the collector
      def write_data(data, collector, io = nil, stats = nil, proc = nil)
        if io
          unless collector[:wrote_header]
            io << CSV.generate_line(collector[:columns])
            collector[:wrote_header] = true
          end

          data.each do |row|
            stats << row if stats
            io << CSV.generate_line(row)
          end
        elsif proc
          data.each { |row| proc.call(row) }
        else
          data.each { |row| collector[:data] << row }
        end

        collector
      end

      # Converts a collector into the requested result format.
      #
      # @param result [Hash] collector returned by #fetch_data
      # @param format [Symbol] :array, :object, :csv or :native
      # @raise [UnsupportedCapability] for any other format
      def format_result(result, format)
        data = result[:data]
        columns = result[:columns]

        case format
        when :array
          data
        when :object
          data.map { |row| columns.zip(row).to_h }
        when :csv
          CSV.generate do |csv|
            csv << columns
            data.each { |row| csv << row }
          end
        when :native
          result
        else
          raise UnsupportedCapability, "Unknown result format: #{format}"
        end
      end

      # @return [String] configured host without scheme or trailing slashes
      def workspace_host
        config[:host].to_s.gsub(%r{\Ahttps?://}, '').gsub(%r{/+\z}, '')
      end

      # Parses a response body that is expected to be a JSON object.
      #
      # @return [Hash]
      # @raise [ExecutionError] when the body is not a JSON object (for example
      #   an HTML error page returned by a proxy)
      def parse_body(response)
        parsed = begin
          JSON.parse(response.body.to_s)
        rescue JSON::ParserError
          nil
        end
        return parsed if parsed.is_a?(Hash)

        raise ExecutionError, "Databricks returned an unexpected response (#{response.status}): #{response.body}"
      end

      # @raise [ExecutionError] always, with the statement's error message when present
      def raise_failed_statement(body, state)
        error_msg = body.dig('status', 'error', 'message') || state
        raise ExecutionError, "Databricks query #{state}: #{error_msg}"
      end

      # Wraps +value+ in single quotes, backslash-escaping backslashes and single
      # quotes so the value cannot terminate the literal early.
      # NOTE(review): assumes the default backslash-escape handling of string
      # literals on the warehouse - confirm if escapedStringLiterals is enabled.
      def escape_sql_literal(value)
        escaped = value.to_s.gsub(/[\\']/) { |char| "\\#{char}" }
        "'#{escaped}'"
      end

      # Monotonic clock reading in seconds; unaffected by wall-clock changes.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end
    end
  end
end
@@ -209,7 +209,13 @@ module DWH
209
209
  super
210
210
  require 'duckdb'
211
211
  rescue LoadError
212
- raise ConfigError, "Required 'duckdb' gem missing. Please add it to your Gemfile."
212
+ raise ConfigError, <<~MSG
213
+ DuckDB adapter requires the 'duckdb' gem.
214
+
215
+ Install with: gem install duckdb
216
+
217
+ See https://github.com/suketa/ruby-duckdb for installation details.
218
+ MSG
213
219
  end
214
220
 
215
221
  private
@@ -219,7 +219,13 @@ module DWH
219
219
  super
220
220
  require 'mysql2'
221
221
  rescue LoadError
222
- raise ConfigError, "Required 'MySql2' gem missing. Please add it to your Gemfile."
222
+ raise ConfigError, <<~MSG
223
+ MySQL adapter requires the 'mysql2' gem.
224
+
225
+ Install with: gem install mysql2
226
+
227
+ System libraries: https://dev.mysql.com/downloads/
228
+ MSG
223
229
  end
224
230
 
225
231
  def result_to_csv(result)
@@ -221,7 +221,13 @@ module DWH
221
221
  super
222
222
  require 'pg'
223
223
  rescue LoadError
224
- raise ConfigError, "Required 'pg' gem missing. Please add it to your Gemfile."
224
+ raise ConfigError, <<~MSG
225
+ PostgreSQL adapter requires the 'pg' gem.
226
+
227
+ Install with: gem install pg
228
+
229
+ System libraries: https://www.postgresql.org/download/
230
+ MSG
225
231
  end
226
232
 
227
233
  private
@@ -234,7 +234,13 @@ module DWH
234
234
  super
235
235
  require 'tiny_tds'
236
236
  rescue LoadError
237
- raise ConfigError, "Required 'tiny_tds' gem missing. Please add it to your Gemfile."
237
+ raise ConfigError, <<~MSG
238
+ SQL Server adapter requires the 'tiny_tds' gem.
239
+
240
+ Install with: gem install tiny_tds
241
+
242
+ System libraries (FreeTDS): https://www.freetds.org/
243
+ MSG
238
244
  end
239
245
 
240
246
  private
@@ -194,7 +194,13 @@ module DWH
194
194
  super
195
195
  require 'trino-client'
196
196
  rescue LoadError
197
- raise ConfigError, "Required 'trino-client' gem missing. Please add it to your Gemfile."
197
+ raise ConfigError, <<~MSG
198
+ Trino adapter requires the 'trino-client' gem.
199
+
200
+ Install with: gem install trino-client
201
+
202
+ No system libraries required (pure Ruby).
203
+ MSG
198
204
  end
199
205
 
200
206
  private
@@ -1,6 +1,5 @@
1
-
2
1
  # quotes and string lit
3
- quote: "\"@exp\""
2
+ quote: "`@exp`"
4
3
  string_literal: "'@exp'"
5
4
 
6
5
  # Date Literal Formats
data/lib/dwh/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DWH
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
data/lib/dwh.rb CHANGED
@@ -18,6 +18,7 @@ require_relative 'dwh/adapters/duck_db'
18
18
  require_relative 'dwh/adapters/sqlite'
19
19
  require_relative 'dwh/adapters/athena'
20
20
  require_relative 'dwh/adapters/redshift'
21
+ require_relative 'dwh/adapters/databricks'
21
22
 
22
23
  # DWH encapsulates the full functionality of this gem.
23
24
  #
@@ -49,6 +50,7 @@ module DWH
49
50
  register(:sqlite, Adapters::Sqlite)
50
51
  register(:athena, Adapters::Athena)
51
52
  register(:redshift, Adapters::Redshift)
53
+ register(:databricks, Adapters::Databricks)
52
54
 
53
55
  # start_reaper
54
56
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ajo Abraham
@@ -154,6 +154,7 @@ files:
154
154
  - lib/dwh.rb
155
155
  - lib/dwh/adapters.rb
156
156
  - lib/dwh/adapters/athena.rb
157
+ - lib/dwh/adapters/databricks.rb
157
158
  - lib/dwh/adapters/druid.rb
158
159
  - lib/dwh/adapters/duck_db.rb
159
160
  - lib/dwh/adapters/my_sql.rb