dwh 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/docs/guides/adapters.md +27 -0
- data/lib/dwh/adapters/databricks.rb +74 -6
- data/lib/dwh/adapters/duck_db.rb +1 -1
- data/lib/dwh/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a67d957a140e258a8bef00efbae509bb7169e9de98dae911945316e7f24afb1
|
|
4
|
+
data.tar.gz: 0ae9acadd0a1a8e0e508f8a264c1534118d323202706c0488f3ae17ebdd98e9e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1aff93e7071cd35b748b43e174c3beeea2fb4c748a97bbf81d407e95e978c411e160b53ebe5cdef331ed685233407143485a0be7775b6b62db73538aebf05718
|
|
7
|
+
data.tar.gz: d9ad974f9e7a4edf0211b40bb0be33f730f8cd6abace42df05aa4339dcfc53babeea6c495c9d6d4d4b91519fa30f03852e99f6a0a0e14912914daf91bd62fb00
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.4.2] - 2026-05-22
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
|
|
7
|
+
- **DuckDB adapter**: Pass connection options with `config:` when opening a database so initialization uses the correct DuckDB API parameter.
|
|
8
|
+
|
|
9
|
+
## [0.4.1] - 2026-04-29
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- Databricks `execute_stream` support for `EXTERNAL_LINKS` result delivery using CSV downloads
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
|
|
17
|
+
- Databricks now uses method-specific result delivery defaults: `execute` uses `INLINE` + `JSON_ARRAY`, and `execute_stream` uses `EXTERNAL_LINKS` + `CSV`.
|
|
18
|
+
|
|
3
19
|
## [0.4.0] - 2026-04-28
|
|
4
20
|
|
|
5
21
|
### Added
|
data/docs/guides/adapters.md
CHANGED
|
@@ -268,6 +268,33 @@ Set `auth_mode: 'oauth_u2m'` and provide `oauth_redirect_uri` in config, then ru
|
|
|
268
268
|
|
|
269
269
|
When U2M is active, PKCE is applied automatically by the adapter.
|
|
270
270
|
|
|
271
|
+
### Large result sets (>25MiB)
|
|
272
|
+
|
|
273
|
+
Databricks `INLINE` result disposition is limited for large payloads. The adapter uses
|
|
274
|
+
method-driven defaults:
|
|
275
|
+
|
|
276
|
+
- `execute` uses `INLINE` + `JSON_ARRAY` (in-memory/object style results)
|
|
277
|
+
- `execute_stream` uses `EXTERNAL_LINKS` + `CSV` (large export path)
|
|
278
|
+
|
|
279
|
+
```ruby
|
|
280
|
+
adapter = DWH.create(:databricks, {
|
|
281
|
+
host: 'workspace.cloud.databricks.com',
|
|
282
|
+
auth_mode: 'oauth_m2m',
|
|
283
|
+
warehouse: 'warehouse_id',
|
|
284
|
+
oauth_client_id: '<CLIENT_ID>',
|
|
285
|
+
oauth_client_secret: '<CLIENT_SECRET>'
|
|
286
|
+
})
|
|
287
|
+
|
|
288
|
+
File.open('export.csv', 'w') do |io|
|
|
289
|
+
adapter.execute_stream('SELECT * FROM big_table', io)
|
|
290
|
+
end
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
For low-memory exports, prefer `File`/`Tempfile`/pipes as the IO target. Avoid `StringIO`
|
|
294
|
+
for very large result sets, because it keeps output bytes in memory.
|
|
295
|
+
When using `execute_stream` with EXTERNAL_LINKS CSV, streaming stats/row counts are tracked
|
|
296
|
+
consistently with other adapters (data rows only, header excluded).
|
|
297
|
+
|
|
271
298
|
### Migration note
|
|
272
299
|
|
|
273
300
|
Databricks now requires explicit `auth_mode`.
|
|
@@ -78,7 +78,7 @@ module DWH
|
|
|
78
78
|
def execute(sql, format: :array, retries: 0)
|
|
79
79
|
result = with_retry(retries + 1) do
|
|
80
80
|
with_debug(sql) do
|
|
81
|
-
response =
|
|
81
|
+
response = submit_query_for_execute(sql)
|
|
82
82
|
fetch_data(handle_query_response(response))
|
|
83
83
|
end
|
|
84
84
|
end
|
|
@@ -89,7 +89,7 @@ module DWH
|
|
|
89
89
|
def execute_stream(sql, io, stats: nil, retries: 0)
|
|
90
90
|
with_retry(retries) do
|
|
91
91
|
with_debug(sql) do
|
|
92
|
-
response =
|
|
92
|
+
response = submit_query_for_execute_stream(sql)
|
|
93
93
|
fetch_data(handle_query_response(response), io: io, stats: stats)
|
|
94
94
|
end
|
|
95
95
|
end
|
|
@@ -103,7 +103,7 @@ module DWH
|
|
|
103
103
|
# @yield [chunk] yields each chunk of data as it's processed
|
|
104
104
|
def stream(sql, &block)
|
|
105
105
|
with_debug(sql) do
|
|
106
|
-
response =
|
|
106
|
+
response = submit_query_for_execute(sql)
|
|
107
107
|
fetch_data(handle_query_response(response), proc: block)
|
|
108
108
|
end
|
|
109
109
|
end
|
|
@@ -177,7 +177,7 @@ module DWH
|
|
|
177
177
|
close
|
|
178
178
|
end
|
|
179
179
|
|
|
180
|
-
def submit_query(sql)
|
|
180
|
+
def submit_query(sql, disposition: 'INLINE', format: 'JSON_ARRAY')
|
|
181
181
|
connection.post(STATEMENTS_API) do |req|
|
|
182
182
|
req.body = {
|
|
183
183
|
statement: sql,
|
|
@@ -186,12 +186,20 @@ module DWH
|
|
|
186
186
|
schema: config[:schema],
|
|
187
187
|
wait_timeout: '30s',
|
|
188
188
|
on_wait_timeout: 'CONTINUE',
|
|
189
|
-
format
|
|
190
|
-
disposition:
|
|
189
|
+
format:,
|
|
190
|
+
disposition:
|
|
191
191
|
}.compact.merge(extra_query_params).to_json
|
|
192
192
|
end
|
|
193
193
|
end
|
|
194
194
|
|
|
195
|
+
# Submits +sql+ through #submit_query using the result delivery settings chosen for
# in-memory reads: INLINE disposition with JSON_ARRAY format.
#
# @param sql [String] statement to run
# @return whatever #submit_query returns for the request
def submit_query_for_execute(sql)
  delivery = { disposition: 'INLINE', format: 'JSON_ARRAY' }
  submit_query(sql, **delivery)
end
|
|
198
|
+
|
|
199
|
+
# Submits +sql+ through #submit_query using the result delivery settings chosen for
# streaming exports: EXTERNAL_LINKS disposition with CSV format.
#
# @param sql [String] statement to run
# @return whatever #submit_query returns for the request
def submit_query_for_execute_stream(sql)
  delivery = { disposition: 'EXTERNAL_LINKS', format: 'CSV' }
  submit_query(sql, **delivery)
end
|
|
202
|
+
|
|
195
203
|
def handle_query_response(response)
|
|
196
204
|
body = JSON.parse(response.body)
|
|
197
205
|
|
|
@@ -233,6 +241,8 @@ module DWH
|
|
|
233
241
|
end
|
|
234
242
|
|
|
235
243
|
def fetch_data(result, io: nil, stats: nil, proc: nil)
|
|
244
|
+
return fetch_external_links_data(result, io:, stats:, proc:) if result.dig('result', 'external_links')
|
|
245
|
+
|
|
236
246
|
columns = result.dig('manifest', 'schema', 'columns')&.map { |col| col['name'] } || []
|
|
237
247
|
chunks = result.dig('manifest', 'chunks') || []
|
|
238
248
|
collector = {
|
|
@@ -262,6 +272,60 @@ module DWH
|
|
|
262
272
|
collector
|
|
263
273
|
end
|
|
264
274
|
|
|
275
|
+
# Downloads an EXTERNAL_LINKS result set and writes the raw CSV bytes to +io+.
#
# Every link on the current result page is downloaded with #external_link_http_client and
# appended to +io+ in order. When the page advertises a +next_chunk_internal_link+, that
# page is fetched through the API +connection+ and processed the same way, until no further
# page is advertised.
#
# @param result [Hash] parsed statement response containing result.external_links
# @param io [#<<] destination for the CSV bytes; required
# @param proc [Proc, nil] must be nil; block streaming is not supported on this path
# @param stats [#<<, nil] optional collector fed via #append_csv_stats
# @return [Hash] collector-shaped hash with empty :columns/:data, the +io+ and +stats+
# @raise [UnsupportedCapability] when +io+ is nil or +proc+ is given
# @raise [ExecutionError] when a link has no URL, a download fails, or the next page fetch fails
def fetch_external_links_data(result, io:, proc:, stats: nil)
  if io.nil?
    raise UnsupportedCapability,
          "Databricks EXTERNAL_LINKS is supported only for execute_stream. Use result_format: 'CSV' with execute_stream."
  end
  raise UnsupportedCapability, 'Databricks EXTERNAL_LINKS does not support stream/yield. Use execute_stream.' if proc

  csv_buffer = +''
  header_skipped = false
  current = result
  loop do
    links = current.dig('result', 'external_links') || []
    links.each do |link|
      url = link['external_link']
      raise ExecutionError, 'Databricks external link missing external_link URL' if url.to_s.strip.empty?

      response = external_link_http_client.get(url)
      raise ExecutionError, "Failed to download Databricks external link: #{response.status}" unless response.status == 200

      body = response.body.to_s
      io << body
      header_skipped = append_csv_stats(stats, body, csv_buffer, header_skipped:) if stats
    end

    # Follow the pointer of the LAST link on the page: when a page carries several links,
    # the first link's pointer leads back to a chunk that was just downloaded above.
    next_chunk_internal_link = links.last&.dig('next_chunk_internal_link')
    break if next_chunk_internal_link.to_s.strip.empty?

    page = connection.get(next_chunk_internal_link)
    # Fail with a clear error instead of a JSON parse error on an error payload.
    raise ExecutionError, "Failed to fetch next Databricks result chunk: #{page.status}" unless page.status == 200

    current = JSON.parse(page.body)
  end

  begin
    io.rewind if io.respond_to?(:rewind)
  rescue Errno::ESPIPE
    # Pipes and other non-seekable targets cannot be rewound; the data is already written.
  end
  { columns: [], data: [], io: io, stats: stats, wrote_header: true }
end
|
|
308
|
+
|
|
309
|
+
# Feeds the data rows of a downloaded CSV chunk into the streaming stats collector.
#
# The chunk is appended to +csv_buffer+ and the whole buffer is parsed. On success every
# parsed row except the first row of the stream (the header) is pushed onto +stats+ and the
# buffer is emptied. When the buffer cannot be parsed yet, it is kept so that the next chunk
# can complete it, and the chunk is written to the debug log.
#
# @param stats [#<<, nil] collector receiving each data row
# @param chunk [String] CSV text that was just downloaded
# @param csv_buffer [String] mutable carry-over buffer shared between calls
# @param header_skipped [Boolean] whether the header row has already been dropped
# @return [Boolean] the updated header flag, to be passed into the next call
def append_csv_stats(stats, chunk, csv_buffer, header_skipped:)
  # Hand the flag back unchanged: the caller assigns this return value, so a bare
  # `return` (nil) would make it forget that the header was already skipped.
  return header_skipped if stats.nil?

  csv_buffer << chunk
  rows = CSV.parse(csv_buffer, skip_blanks: true)
  unless header_skipped || rows.empty?
    rows.shift # drop the header row exactly once per stream
    header_skipped = true
  end

  rows.each { |row| stats << row }
  csv_buffer.clear
  header_skipped
rescue CSV::MalformedCSVError
  logger.debug("Unparseable:\n #{chunk}")
  header_skipped
end
|
|
328
|
+
|
|
265
329
|
def write_data(data, collector, io = nil, stats = nil, proc = nil)
|
|
266
330
|
if io
|
|
267
331
|
unless collector[:wrote_header]
|
|
@@ -307,6 +371,10 @@ module DWH
|
|
|
307
371
|
config[:host].to_s
|
|
308
372
|
end
|
|
309
373
|
|
|
374
|
+
# Returns the HTTP client used to download external-link result files.
#
# The client is a Faraday connection created with no arguments and cached on the
# instance, so repeated downloads reuse the same object.
#
# @return [Faraday::Connection]
def external_link_http_client
  return @external_link_http_client if @external_link_http_client

  @external_link_http_client = Faraday.new
end
|
|
377
|
+
|
|
310
378
|
def oauth_supports_authorization_code_flow?
|
|
311
379
|
auth_mode == 'oauth_u2m'
|
|
312
380
|
end
|
data/lib/dwh/adapters/duck_db.rb
CHANGED
data/lib/dwh/version.rb
CHANGED