dwh 0.4.0 → 0.4.2

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e782fe9e0167d10f1672d0ce2b1601445cd0484c2e9b68b46df11be0cd0f20ca
4
- data.tar.gz: 795f5cb0173413e2a475b216f824aed8b0de975a4abb39e30fa27f7cccb9f779
3
+ metadata.gz: 9a67d957a140e258a8bef00efbae509bb7169e9de98dae911945316e7f24afb1
4
+ data.tar.gz: 0ae9acadd0a1a8e0e508f8a264c1534118d323202706c0488f3ae17ebdd98e9e
5
5
  SHA512:
6
- metadata.gz: f42b78511e879191933ff87b4441041b63541413c37f3f308a4854564f278ef4e8ff91e7547a573f03104e37e4fb068e187ee5d3cbab9fe834dbf129b27f68ac
7
- data.tar.gz: ef6f624c3e0f7f2dfd9deff755af97d28798aed6c8c8ddea474899fbb02f079bfc4a80772ddb2048e9d0777da4f5a508a51ede9669159082758f8019e523bb08
6
+ metadata.gz: 1aff93e7071cd35b748b43e174c3beeea2fb4c748a97bbf81d407e95e978c411e160b53ebe5cdef331ed685233407143485a0be7775b6b62db73538aebf05718
7
+ data.tar.gz: d9ad974f9e7a4edf0211b40bb0be33f730f8cd6abace42df05aa4339dcfc53babeea6c495c9d6d4d4b91519fa30f03852e99f6a0a0e14912914daf91bd62fb00
data/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.4.2] - 2026-05-22
4
+
5
+ ### Fixed
6
+
7
+ - **DuckDB adapter**: Pass connection options with `config:` when opening a database so initialization uses the correct DuckDB API parameter.
8
+
9
+ ## [0.4.1] - 2026-04-29
10
+
11
+ ### Added
12
+
13
+ - Databricks `execute_stream` support for `EXTERNAL_LINKS` result delivery using CSV downloads
14
+
15
+ ### Changed
16
+
17
+ - Databricks now uses method-specific result delivery defaults: `execute` uses `INLINE` + `JSON_ARRAY`, and `execute_stream` uses `EXTERNAL_LINKS` + `CSV`.
18
+
3
19
  ## [0.4.0] - 2026-04-28
4
20
 
5
21
  ### Added
@@ -268,6 +268,33 @@ Set `auth_mode: 'oauth_u2m'` and provide `oauth_redirect_uri` in config, then ru
268
268
 
269
269
  When U2M is active, PKCE is applied automatically by the adapter.
270
270
 
271
+ ### Large result sets (>25MiB)
272
+
273
+ Databricks `INLINE` result disposition is limited for large payloads. The adapter uses
274
+ method-driven defaults:
275
+
276
+ - `execute` uses `INLINE` + `JSON_ARRAY` (in-memory/object style results)
277
+ - `execute_stream` uses `EXTERNAL_LINKS` + `CSV` (large export path)
278
+
279
+ ```ruby
280
+ adapter = DWH.create(:databricks, {
281
+ host: 'workspace.cloud.databricks.com',
282
+ auth_mode: 'oauth_m2m',
283
+ warehouse: 'warehouse_id',
284
+ oauth_client_id: '<CLIENT_ID>',
285
+ oauth_client_secret: '<CLIENT_SECRET>'
286
+ })
287
+
288
+ File.open('export.csv', 'w') do |io|
289
+ adapter.execute_stream('SELECT * FROM big_table', io)
290
+ end
291
+ ```
292
+
293
+ For low-memory exports, prefer `File`/`Tempfile`/pipes as the IO target. Avoid `StringIO`
294
+ for very large result sets, because it keeps output bytes in memory.
295
+ When using `execute_stream` with EXTERNAL_LINKS CSV, streaming stats/row counts are tracked
296
+ consistently with other adapters (data rows only, header excluded).
297
+
271
298
  ### Migration note
272
299
 
273
300
  Databricks now requires explicit `auth_mode`.
@@ -78,7 +78,7 @@ module DWH
78
78
  def execute(sql, format: :array, retries: 0)
79
79
  result = with_retry(retries + 1) do
80
80
  with_debug(sql) do
81
- response = submit_query(sql)
81
+ response = submit_query_for_execute(sql)
82
82
  fetch_data(handle_query_response(response))
83
83
  end
84
84
  end
@@ -89,7 +89,7 @@ module DWH
89
89
  def execute_stream(sql, io, stats: nil, retries: 0)
90
90
  with_retry(retries) do
91
91
  with_debug(sql) do
92
- response = submit_query(sql)
92
+ response = submit_query_for_execute_stream(sql)
93
93
  fetch_data(handle_query_response(response), io: io, stats: stats)
94
94
  end
95
95
  end
@@ -103,7 +103,7 @@ module DWH
103
103
  # @yield [chunk] yields each chunk of data as it's processed
104
104
  def stream(sql, &block)
105
105
  with_debug(sql) do
106
- response = submit_query(sql)
106
+ response = submit_query_for_execute(sql)
107
107
  fetch_data(handle_query_response(response), proc: block)
108
108
  end
109
109
  end
@@ -177,7 +177,7 @@ module DWH
177
177
  close
178
178
  end
179
179
 
180
- def submit_query(sql)
180
+ def submit_query(sql, disposition: 'INLINE', format: 'JSON_ARRAY')
181
181
  connection.post(STATEMENTS_API) do |req|
182
182
  req.body = {
183
183
  statement: sql,
@@ -186,12 +186,20 @@ module DWH
186
186
  schema: config[:schema],
187
187
  wait_timeout: '30s',
188
188
  on_wait_timeout: 'CONTINUE',
189
- format: 'JSON_ARRAY',
190
- disposition: 'INLINE'
189
+ format:,
190
+ disposition:
191
191
  }.compact.merge(extra_query_params).to_json
192
192
  end
193
193
  end
194
194
 
195
+ def submit_query_for_execute(sql)
196
+ submit_query(sql, disposition: 'INLINE', format: 'JSON_ARRAY')
197
+ end
198
+
199
+ def submit_query_for_execute_stream(sql)
200
+ submit_query(sql, disposition: 'EXTERNAL_LINKS', format: 'CSV')
201
+ end
202
+
195
203
  def handle_query_response(response)
196
204
  body = JSON.parse(response.body)
197
205
 
@@ -233,6 +241,8 @@ module DWH
233
241
  end
234
242
 
235
243
  def fetch_data(result, io: nil, stats: nil, proc: nil)
244
+ return fetch_external_links_data(result, io:, stats:, proc:) if result.dig('result', 'external_links')
245
+
236
246
  columns = result.dig('manifest', 'schema', 'columns')&.map { |col| col['name'] } || []
237
247
  chunks = result.dig('manifest', 'chunks') || []
238
248
  collector = {
@@ -262,6 +272,60 @@ module DWH
262
272
  collector
263
273
  end
264
274
 
275
+ def fetch_external_links_data(result, io:, proc:, stats: nil)
276
+ if io.nil?
277
+ raise UnsupportedCapability,
278
+ "Databricks EXTERNAL_LINKS is supported only for execute_stream. Use result_format: 'CSV' with execute_stream."
279
+ end
280
+ raise UnsupportedCapability, 'Databricks EXTERNAL_LINKS does not support stream/yield. Use execute_stream.' if proc
281
+
282
+ csv_buffer = +''
283
+ header_skipped = false
284
+ current = result
285
+ loop do
286
+ links = current.dig('result', 'external_links') || []
287
+ links.each do |link|
288
+ url = link['external_link']
289
+ raise ExecutionError, 'Databricks external link missing external_link URL' if url.to_s.strip.empty?
290
+
291
+ response = external_link_http_client.get(url)
292
+ raise ExecutionError, "Failed to download Databricks external link: #{response.status}" unless response.status == 200
293
+
294
+ body = response.body.to_s
295
+ io << body
296
+ header_skipped = append_csv_stats(stats, body, csv_buffer, header_skipped:) if stats
297
+ end
298
+
299
+ next_chunk_internal_link = links.first&.dig('next_chunk_internal_link')
300
+ break if next_chunk_internal_link.to_s.strip.empty?
301
+
302
+ current = JSON.parse(connection.get(next_chunk_internal_link).body)
303
+ end
304
+
305
+ io.rewind
306
+ { columns: [], data: [], io: io, stats: stats, wrote_header: true }
307
+ end
308
+
309
+ def append_csv_stats(stats, chunk, csv_buffer, header_skipped:)
310
+ return if stats.nil?
311
+
312
+ csv_buffer << chunk
313
+ rows = CSV.parse(csv_buffer, skip_blanks: true)
314
+ rows.each_with_index do |row, index|
315
+ if !header_skipped && index.zero?
316
+ header_skipped = true
317
+ next
318
+ end
319
+
320
+ stats << row
321
+ end
322
+ csv_buffer.clear
323
+ header_skipped
324
+ rescue CSV::MalformedCSVError
325
+ logger.debug("Unparseable:\n #{chunk}")
326
+ header_skipped
327
+ end
328
+
265
329
  def write_data(data, collector, io = nil, stats = nil, proc = nil)
266
330
  if io
267
331
  unless collector[:wrote_header]
@@ -307,6 +371,10 @@ module DWH
307
371
  config[:host].to_s
308
372
  end
309
373
 
374
+ def external_link_http_client
375
+ @external_link_http_client ||= Faraday.new
376
+ end
377
+
310
378
  def oauth_supports_authorization_code_flow?
311
379
  auth_mode == 'oauth_u2m'
312
380
  end
@@ -32,7 +32,7 @@ module DWH
32
32
  ducked_config[key.to_s] = val
33
33
  end
34
34
  end
35
- @db = DuckDB::Database.open(config[:file], ducked_config)
35
+ @db = DuckDB::Database.open(config[:file], config: ducked_config)
36
36
  self.class.databases[config[:file]] = @db
37
37
  end
38
38
 
data/lib/dwh/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DWH
4
- VERSION = '0.4.0'
4
+ VERSION = '0.4.2'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ajo Abraham