RubyGems - dwh - Versions diffs - 0.4.0 → 0.4.2 - Mend

dwh 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/docs/guides/adapters.md +27 -0
data/lib/dwh/adapters/databricks.rb +74 -6
data/lib/dwh/adapters/duck_db.rb +1 -1
data/lib/dwh/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e782fe9e0167d10f1672d0ce2b1601445cd0484c2e9b68b46df11be0cd0f20ca
-  data.tar.gz: 795f5cb0173413e2a475b216f824aed8b0de975a4abb39e30fa27f7cccb9f779
+  metadata.gz: 9a67d957a140e258a8bef00efbae509bb7169e9de98dae911945316e7f24afb1
+  data.tar.gz: 0ae9acadd0a1a8e0e508f8a264c1534118d323202706c0488f3ae17ebdd98e9e
 SHA512:
-  metadata.gz: f42b78511e879191933ff87b4441041b63541413c37f3f308a4854564f278ef4e8ff91e7547a573f03104e37e4fb068e187ee5d3cbab9fe834dbf129b27f68ac
-  data.tar.gz: ef6f624c3e0f7f2dfd9deff755af97d28798aed6c8c8ddea474899fbb02f079bfc4a80772ddb2048e9d0777da4f5a508a51ede9669159082758f8019e523bb08
+  metadata.gz: 1aff93e7071cd35b748b43e174c3beeea2fb4c748a97bbf81d407e95e978c411e160b53ebe5cdef331ed685233407143485a0be7775b6b62db73538aebf05718
+  data.tar.gz: d9ad974f9e7a4edf0211b40bb0be33f730f8cd6abace42df05aa4339dcfc53babeea6c495c9d6d4d4b91519fa30f03852e99f6a0a0e14912914daf91bd62fb00

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,21 @@
 ## [Unreleased]
+## [0.4.2] - 2026-05-22
+### Fixed
+- **DuckDB adapter**: Pass connection options with `config:` when opening a database so initialization uses the correct DuckDB API parameter.
+## [0.4.1] - 2026-04-29
+### Added
+- Databricks `execute_stream` support for `EXTERNAL_LINKS` result delivery using CSV downloads
+### Changed
+- Databricks now uses method-specific result delivery defaults: `execute` uses `INLINE` + `JSON_ARRAY`, and `execute_stream` uses `EXTERNAL_LINKS` + `CSV`.
 ## [0.4.0] - 2026-04-28
 ### Added

data/docs/guides/adapters.md CHANGED Viewed

@@ -268,6 +268,33 @@ Set `auth_mode: 'oauth_u2m'` and provide `oauth_redirect_uri` in config, then ru
 When U2M is active, PKCE is applied automatically by the adapter.
+### Large result sets (>25MiB)
+Databricks `INLINE` result disposition is limited for large payloads. The adapter uses
+method-driven defaults:
+- `execute` uses `INLINE` + `JSON_ARRAY` (in-memory/object style results)
+- `execute_stream` uses `EXTERNAL_LINKS` + `CSV` (large export path)
+```ruby
+adapter = DWH.create(:databricks, {
+  host: 'workspace.cloud.databricks.com',
+  auth_mode: 'oauth_m2m',
+  warehouse: 'warehouse_id',
+  oauth_client_id: '<CLIENT_ID>',
+  oauth_client_secret: '<CLIENT_SECRET>'
+})
+File.open('export.csv', 'w') do |io|
+  adapter.execute_stream('SELECT * FROM big_table', io)
+end
+```
+For low-memory exports, prefer `File`/`Tempfile`/pipes as the IO target. Avoid `StringIO`
+for very large result sets, because it keeps output bytes in memory.
+When using `execute_stream` with EXTERNAL_LINKS CSV, streaming stats/row counts are tracked
+consistently with other adapters (data rows only, header excluded).
 ### Migration note
 Databricks now requires explicit `auth_mode`.

data/lib/dwh/adapters/databricks.rb CHANGED Viewed

@@ -78,7 +78,7 @@ module DWH
       def execute(sql, format: :array, retries: 0)
         result = with_retry(retries + 1) do
           with_debug(sql) do
-            response = submit_query(sql)
+            response = submit_query_for_execute(sql)
             fetch_data(handle_query_response(response))
           end
         end
@@ -89,7 +89,7 @@ module DWH
       def execute_stream(sql, io, stats: nil, retries: 0)
         with_retry(retries) do
           with_debug(sql) do
-            response = submit_query(sql)
+            response = submit_query_for_execute_stream(sql)
             fetch_data(handle_query_response(response), io: io, stats: stats)
           end
         end
@@ -103,7 +103,7 @@ module DWH
       # @yield [chunk] yields each chunk of data as it's processed
       def stream(sql, &block)
         with_debug(sql) do
-          response = submit_query(sql)
+          response = submit_query_for_execute(sql)
           fetch_data(handle_query_response(response), proc: block)
         end
       end
@@ -177,7 +177,7 @@ module DWH
         close
       end
-      def submit_query(sql)
+      def submit_query(sql, disposition: 'INLINE', format: 'JSON_ARRAY')
         connection.post(STATEMENTS_API) do |req|
           req.body = {
             statement: sql,
@@ -186,12 +186,20 @@ module DWH
             schema: config[:schema],
             wait_timeout: '30s',
             on_wait_timeout: 'CONTINUE',
-            format: 'JSON_ARRAY',
-            disposition: 'INLINE'
+            format:,
+            disposition:
           }.compact.merge(extra_query_params).to_json
         end
       end
+      def submit_query_for_execute(sql)
+        submit_query(sql, disposition: 'INLINE', format: 'JSON_ARRAY')
+      end
+      def submit_query_for_execute_stream(sql)
+        submit_query(sql, disposition: 'EXTERNAL_LINKS', format: 'CSV')
+      end
       def handle_query_response(response)
         body = JSON.parse(response.body)
@@ -233,6 +241,8 @@ module DWH
       end
       def fetch_data(result, io: nil, stats: nil, proc: nil)
+        return fetch_external_links_data(result, io:, stats:, proc:) if result.dig('result', 'external_links')
         columns = result.dig('manifest', 'schema', 'columns')&.map { |col| col['name'] } || []
         chunks = result.dig('manifest', 'chunks') || []
         collector = {
@@ -262,6 +272,60 @@ module DWH
         collector
       end
+      def fetch_external_links_data(result, io:, proc:, stats: nil)
+        if io.nil?
+          raise UnsupportedCapability,
+                "Databricks EXTERNAL_LINKS is supported only for execute_stream. Use result_format: 'CSV' with execute_stream."
+        end
+        raise UnsupportedCapability, 'Databricks EXTERNAL_LINKS does not support stream/yield. Use execute_stream.' if proc
+        csv_buffer = +''
+        header_skipped = false
+        current = result
+        loop do
+          links = current.dig('result', 'external_links') || []
+          links.each do |link|
+            url = link['external_link']
+            raise ExecutionError, 'Databricks external link missing external_link URL' if url.to_s.strip.empty?
+            response = external_link_http_client.get(url)
+            raise ExecutionError, "Failed to download Databricks external link: #{response.status}" unless response.status == 200
+            body = response.body.to_s
+            io << body
+            header_skipped = append_csv_stats(stats, body, csv_buffer, header_skipped:) if stats
+          end
+          next_chunk_internal_link = links.first&.dig('next_chunk_internal_link')
+          break if next_chunk_internal_link.to_s.strip.empty?
+          current = JSON.parse(connection.get(next_chunk_internal_link).body)
+        end
+        io.rewind
+        { columns: [], data: [], io: io, stats: stats, wrote_header: true }
+      end
+      def append_csv_stats(stats, chunk, csv_buffer, header_skipped:)
+        return if stats.nil?
+        csv_buffer << chunk
+        rows = CSV.parse(csv_buffer, skip_blanks: true)
+        rows.each_with_index do |row, index|
+          if !header_skipped && index.zero?
+            header_skipped = true
+            next
+          end
+          stats << row
+        end
+        csv_buffer.clear
+        header_skipped
+      rescue CSV::MalformedCSVError
+        logger.debug("Unparseable:\n #{chunk}")
+        header_skipped
+      end
       def write_data(data, collector, io = nil, stats = nil, proc = nil)
         if io
           unless collector[:wrote_header]
@@ -307,6 +371,10 @@ module DWH
         config[:host].to_s
       end
+      def external_link_http_client
+        @external_link_http_client ||= Faraday.new
+      end
       def oauth_supports_authorization_code_flow?
         auth_mode == 'oauth_u2m'
       end

data/lib/dwh/adapters/duck_db.rb CHANGED Viewed

@@ -32,7 +32,7 @@ module DWH
               ducked_config[key.to_s] = val
             end
           end
-          @db = DuckDB::Database.open(config[:file], ducked_config)
+          @db = DuckDB::Database.open(config[:file], config: ducked_config)
           self.class.databases[config[:file]] = @db
         end

data/lib/dwh/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module DWH
-  VERSION = '0.4.0'
+  VERSION = '0.4.2'
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dwh
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.2
 platform: ruby
 authors:
 - Ajo Abraham