dwh 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +43 -0
- data/README.md +10 -1
- data/docs/guides/adapters.md +158 -0
- data/docs/guides/getting-started.md +6 -1
- data/docs/guides/usage.md +33 -1
- data/lib/dwh/adapters/athena.rb +8 -1
- data/lib/dwh/adapters/databricks.rb +328 -0
- data/lib/dwh/adapters/duck_db.rb +8 -2
- data/lib/dwh/adapters/my_sql.rb +7 -1
- data/lib/dwh/adapters/postgres.rb +11 -5
- data/lib/dwh/adapters/redshift.rb +48 -0
- data/lib/dwh/adapters/sql_server.rb +8 -2
- data/lib/dwh/adapters/sqlite.rb +364 -0
- data/lib/dwh/adapters/trino.rb +7 -1
- data/lib/dwh/adapters.rb +3 -3
- data/lib/dwh/column.rb +12 -1
- data/lib/dwh/functions/dates.rb +15 -0
- data/lib/dwh/settings/databricks.yml +14 -15
- data/lib/dwh/settings/druid.yml +3 -3
- data/lib/dwh/settings/duckdb.yml +2 -2
- data/lib/dwh/settings/mysql.yml +2 -2
- data/lib/dwh/settings/postgres.yml +11 -11
- data/lib/dwh/settings/redshift.yml +15 -24
- data/lib/dwh/settings/snowflake.yml +15 -15
- data/lib/dwh/settings/sqlite.yml +42 -0
- data/lib/dwh/settings.rb +6 -2
- data/lib/dwh/table.rb +18 -10
- data/lib/dwh/version.rb +1 -1
- data/lib/dwh.rb +6 -4
- metadata +6 -16
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e70f914cc994c4be7a9d76b0d72d170ae6bf4d895427ec90adcdf9e3099774fe
|
|
4
|
+
data.tar.gz: 3ef66bc3d9a326bbae4b51d2bb3ec6af45425971617aee3a3131c7d496cf9127
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d05640e86dc5a6df2135173dd7a513df0805b4035fa5c4c5fa182659b1286fb6fd78f67ff2e259899f3d4ecab19b44bf5e7bfb427acfa5daa6065d71fb2fa18d
|
|
7
|
+
data.tar.gz: 28e5c623c8401dea1d222a1b318325543d4c209e083944d2fd43367ae6721c2ebe1c2b7e69190462a19887770d722edb7f2d1d851f90e5cc53b4c51cf2b65953
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,48 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.3.0] - 2026-04-22
|
|
4
|
+
|
|
5
|
+
### Changed
|
|
6
|
+
|
|
7
|
+
- Added Databricks Adapter
|
|
8
|
+
|
|
9
|
+
## [0.2.1] - 2026-01-27
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
|
|
13
|
+
- **Adapter missing-gem error messages** (Athena, DuckDB, MySQL, PostgreSQL, SQL Server, Trino): replace platform-specific system library install instructions with links to official documentation. Messages now include `gem install` and a single link for system libraries.
|
|
14
|
+
|
|
15
|
+
## [0.2.0] - 2025-10-12
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
|
|
19
|
+
- **SQLite adapter** with performance optimizations
|
|
20
|
+
- WAL (Write-Ahead Logging) mode enabled by default for concurrent reads
|
|
21
|
+
- Performance-tuned pragmas: cache_size, mmap_size, temp_store, synchronous
|
|
22
|
+
- Custom date truncation for year, quarter, month, week, day, hour, minute, second
|
|
23
|
+
- Custom day/month name extraction via CASE statements (SQLite lacks strftime %A/%B support)
|
|
24
|
+
- Proper date casting using `date()` function
|
|
25
|
+
- Comprehensive test suite and documentation
|
|
26
|
+
- **Redshift adapter** for AWS data warehouse
|
|
27
|
+
- Native Redshift SQL function support
|
|
28
|
+
- Full metadata and table introspection
|
|
29
|
+
- `date_time_literal` method for creating timestamp literals
|
|
30
|
+
- `date_lit` method for creating date literals
|
|
31
|
+
|
|
32
|
+
### Changed
|
|
33
|
+
|
|
34
|
+
- Removed ActiveSupport dependency
|
|
35
|
+
- Replaced `symbolize_keys` with `transform_keys(&:to_sym)`
|
|
36
|
+
- Replaced `demodulize` with `split('::').last.downcase`
|
|
37
|
+
- Removed core extensions
|
|
38
|
+
- Standardized all SQL function names in settings to UPPERCASE for consistency
|
|
39
|
+
|
|
40
|
+
### Fixed
|
|
41
|
+
|
|
42
|
+
- Config defaults now properly set even when config key is passed with nil value
|
|
43
|
+
- Table instantiation issues resolved
|
|
44
|
+
- Test suite no longer requires Trino gem for default tests
|
|
45
|
+
|
|
3
46
|
## [0.1.0] - 2025-07-03
|
|
4
47
|
|
|
5
48
|
- Initial release
|
data/README.md
CHANGED
|
@@ -25,16 +25,17 @@ The adapter only has 5 core methods (6 including the connection method). A YAML
|
|
|
25
25
|
|
|
26
26
|
- **Snowflake** - High performance cloud warehouse
|
|
27
27
|
- **Trino** (formerly Presto) - Distributed SQL query engine
|
|
28
|
+
- **Redshift** - AWS data warehouse platform
|
|
28
29
|
- **AWS Athena** - AWS big data warehouse
|
|
29
30
|
- **Apache Druid** - Real-time analytics database
|
|
30
31
|
- **DuckDB** - In-process analytical database
|
|
32
|
+
- **SQLite** - Lightweight embedded database
|
|
31
33
|
- **PostgreSQL** - Full-featured RDBMS with advanced SQL support
|
|
32
34
|
- **MySQL** - Popular open-source database
|
|
33
35
|
- **SQL Server** - Microsoft's enterprise database
|
|
34
36
|
|
|
35
37
|
## Integrations Coming Soon
|
|
36
38
|
|
|
37
|
-
- **Redshift** - AWS data warehouse platform
|
|
38
39
|
- **ClickHouse** - High performance analytical db
|
|
39
40
|
- **Databricks** - Big data compute engine
|
|
40
41
|
- **MotherDuck** - Hosted DuckDB service
|
|
@@ -61,6 +62,14 @@ druid = DWH.create(:druid, {
|
|
|
61
62
|
|
|
62
63
|
# basic query execution
|
|
63
64
|
results = druid.execute("SELECT * FROM web_sales", format: :csv)
|
|
65
|
+
|
|
66
|
+
# Connect to SQLite for local analytics
|
|
67
|
+
sqlite = DWH.create(:sqlite, {
|
|
68
|
+
file: 'path/to/analytics.db'
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
# Query with optimized WAL mode enabled by default
|
|
72
|
+
results = sqlite.execute("SELECT * FROM sales_data", format: :array)
|
|
64
73
|
```
|
|
65
74
|
|
|
66
75
|
## Core API
|
data/docs/guides/adapters.md
CHANGED
|
@@ -70,6 +70,71 @@ postgres = DWH.create(:postgres, {
|
|
|
70
70
|
})
|
|
71
71
|
```
|
|
72
72
|
|
|
73
|
+
## Redshift Adapter
|
|
74
|
+
|
|
75
|
+
The Redshift adapter uses the `pg` gem and provides full-featured RDBMS support.
|
|
76
|
+
|
|
77
|
+
### Basic Configuration
|
|
78
|
+
|
|
79
|
+
```ruby
|
|
80
|
+
redshift = DWH.create(:redshift, {
|
|
81
|
+
host: 'localhost',
|
|
82
|
+
port: 5432, # Default: 5432
|
|
83
|
+
database: 'mydb',
|
|
84
|
+
schema: 'public', # Default: 'public'
|
|
85
|
+
username: 'user',
|
|
86
|
+
password: 'password',
|
|
87
|
+
client_name: 'My Application' # Default: 'DWH Ruby Gem'
|
|
88
|
+
})
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### SSL Configuration
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
# Basic SSL
|
|
95
|
+
redshift = DWH.create(:redshift, {
|
|
96
|
+
host: 'localhost',
|
|
97
|
+
database: 'mydb',
|
|
98
|
+
username: 'user',
|
|
99
|
+
password: 'password',
|
|
100
|
+
ssl: true,
|
|
101
|
+
extra_connection_params: {
|
|
102
|
+
sslmode: 'require' # disable, prefer, require, verify-ca, verify-full
|
|
103
|
+
}
|
|
104
|
+
})
|
|
105
|
+
|
|
106
|
+
# Certificate-based SSL
|
|
107
|
+
redshift = DWH.create(:redshift, {
|
|
108
|
+
host: 'localhost',
|
|
109
|
+
database: 'mydb',
|
|
110
|
+
username: 'user',
|
|
111
|
+
ssl: true,
|
|
112
|
+
extra_connection_params: {
|
|
113
|
+
sslmode: 'verify-full',
|
|
114
|
+
sslrootcert: '/path/to/ca-cert.pem',
|
|
115
|
+
sslcert: '/path/to/client-cert.pem',
|
|
116
|
+
sslkey: '/path/to/client-key.pem'
|
|
117
|
+
}
|
|
118
|
+
})
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Advanced Configuration
|
|
122
|
+
|
|
123
|
+
```ruby
|
|
124
|
+
redshift = DWH.create(:redshift, {
|
|
125
|
+
host: 'localhost',
|
|
126
|
+
database: 'mydb',
|
|
127
|
+
username: 'user',
|
|
128
|
+
password: 'password',
|
|
129
|
+
query_timeout: 3600, # seconds, default: 3600
|
|
130
|
+
extra_connection_params: {
|
|
131
|
+
application_name: 'Data Analysis Tool',
|
|
132
|
+
connect_timeout: 10,
|
|
133
|
+
options: '-c maintenance_work_mem=256MB'
|
|
134
|
+
}
|
|
135
|
+
})
|
|
136
|
+
```
|
|
137
|
+
|
|
73
138
|
## Snowflake
|
|
74
139
|
|
|
75
140
|
The Snowflake adapter uses the REST APIs (HTTPS) to connect and query. This adapter also supports Multi-Database
|
|
@@ -287,6 +352,99 @@ duckdb = DWH.create(:duckdb, {
|
|
|
287
352
|
})
|
|
288
353
|
```
|
|
289
354
|
|
|
355
|
+
## SQLite Adapter
|
|
356
|
+
|
|
357
|
+
The SQLite adapter uses the `sqlite3` gem for lightweight embedded database analytics. It's optimized for analytical workloads with WAL mode enabled by default for better concurrent read performance.
|
|
358
|
+
|
|
359
|
+
### Basic Configuration
|
|
360
|
+
|
|
361
|
+
```ruby
|
|
362
|
+
# File-based database
|
|
363
|
+
sqlite = DWH.create(:sqlite, {
|
|
364
|
+
file: '/path/to/my/database.sqlite'
|
|
365
|
+
})
|
|
366
|
+
|
|
367
|
+
# In-memory database
|
|
368
|
+
sqlite = DWH.create(:sqlite, {
|
|
369
|
+
file: ':memory:'
|
|
370
|
+
})
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Read-Only Mode
|
|
374
|
+
|
|
375
|
+
```ruby
|
|
376
|
+
sqlite = DWH.create(:sqlite, {
|
|
377
|
+
file: '/path/to/readonly/database.sqlite',
|
|
378
|
+
readonly: true
|
|
379
|
+
})
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
### Performance Optimization
|
|
383
|
+
|
|
384
|
+
The adapter includes default optimizations for analytical workloads:
|
|
385
|
+
- WAL mode enabled by default for concurrent reads
|
|
386
|
+
- 64MB cache size
|
|
387
|
+
- Memory-mapped I/O (128MB)
|
|
388
|
+
- Temp tables stored in memory
|
|
389
|
+
|
|
390
|
+
```ruby
|
|
391
|
+
# Customize performance settings
|
|
392
|
+
sqlite = DWH.create(:sqlite, {
|
|
393
|
+
file: '/path/to/my/database.sqlite',
|
|
394
|
+
timeout: 5000, # busy timeout in milliseconds, default: 5000
|
|
395
|
+
pragmas: {
|
|
396
|
+
cache_size: -128000, # 128MB cache (negative means KB)
|
|
397
|
+
mmap_size: 268435456, # 256MB memory-mapped I/O
|
|
398
|
+
temp_store: 'MEMORY', # Store temp tables in memory
|
|
399
|
+
synchronous: 'NORMAL' # Faster than FULL, safe with WAL
|
|
400
|
+
}
|
|
401
|
+
})
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
### Disable WAL Mode
|
|
405
|
+
|
|
406
|
+
```ruby
|
|
407
|
+
# Disable WAL mode if needed (e.g., for NFS or network filesystems)
|
|
408
|
+
sqlite = DWH.create(:sqlite, {
|
|
409
|
+
file: '/path/to/my/database.sqlite',
|
|
410
|
+
enable_wal: false
|
|
411
|
+
})
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
### Advanced Configuration
|
|
415
|
+
|
|
416
|
+
```ruby
|
|
417
|
+
sqlite = DWH.create(:sqlite, {
|
|
418
|
+
file: '/path/to/analytics.sqlite',
|
|
419
|
+
readonly: false,
|
|
420
|
+
enable_wal: true, # Default: true
|
|
421
|
+
timeout: 10000, # 10 second busy timeout
|
|
422
|
+
pragmas: {
|
|
423
|
+
journal_mode: 'WAL', # Explicitly set WAL (done by default)
|
|
424
|
+
cache_size: -256000, # 256MB cache
|
|
425
|
+
page_size: 8192, # Larger page size for analytics
|
|
426
|
+
mmap_size: 536870912, # 512MB memory-mapped I/O
|
|
427
|
+
temp_store: 'MEMORY', # Keep temp data in memory
|
|
428
|
+
synchronous: 'NORMAL', # Balance between safety and speed
|
|
429
|
+
locking_mode: 'NORMAL' # Allow multiple connections
|
|
430
|
+
}
|
|
431
|
+
})
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
### Multiple Connections
|
|
435
|
+
|
|
436
|
+
Unlike DuckDB, SQLite allows multiple independent connections to the same database file:
|
|
437
|
+
|
|
438
|
+
```ruby
|
|
439
|
+
# Multiple readers/writers to the same file
|
|
440
|
+
reader = DWH.create(:sqlite, { file: '/path/to/data.sqlite', readonly: true })
|
|
441
|
+
writer = DWH.create(:sqlite, { file: '/path/to/data.sqlite' })
|
|
442
|
+
|
|
443
|
+
# Both can operate concurrently with WAL mode enabled
|
|
444
|
+
data = reader.execute('SELECT * FROM sales')
|
|
445
|
+
writer.execute('INSERT INTO sales VALUES (...)')
|
|
446
|
+
```
|
|
447
|
+
|
|
290
448
|
## Trino Adapter
|
|
291
449
|
|
|
292
450
|
The Trino adapter requires the `trino-client-ruby` gem and works with both Trino and Presto.
|
|
@@ -40,9 +40,14 @@ postgres = DWH.create(:postgres, {
|
|
|
40
40
|
password: 'password'
|
|
41
41
|
})
|
|
42
42
|
|
|
43
|
+
# Connect to SQLite (lightweight, embedded)
|
|
44
|
+
sqlite = DWH.create(:sqlite, {
|
|
45
|
+
file: '/path/to/analytics.db'
|
|
46
|
+
})
|
|
47
|
+
|
|
43
48
|
# Connect to DuckDB (in-memory)
|
|
44
49
|
duckdb = DWH.create(:duckdb, {
|
|
45
|
-
|
|
50
|
+
file: ':memory:'
|
|
46
51
|
})
|
|
47
52
|
```
|
|
48
53
|
|
data/docs/guides/usage.md
CHANGED
|
@@ -293,7 +293,7 @@ native = adapter.execute(sql, format: :native) # Database's native format
|
|
|
293
293
|
# Use streaming for large result sets
|
|
294
294
|
def export_large_table(adapter, table_name, output_file)
|
|
295
295
|
query = "SELECT * FROM #{table_name}"
|
|
296
|
-
|
|
296
|
+
|
|
297
297
|
File.open(output_file, 'w') do |file|
|
|
298
298
|
adapter.execute_stream(query, file)
|
|
299
299
|
end
|
|
@@ -309,6 +309,38 @@ def process_large_dataset(adapter, query)
|
|
|
309
309
|
end
|
|
310
310
|
```
|
|
311
311
|
|
|
312
|
+
### SQLite Performance Tuning
|
|
313
|
+
|
|
314
|
+
The SQLite adapter comes with optimized defaults for analytical workloads, but it can be tuned further:
|
|
315
|
+
|
|
316
|
+
```ruby
|
|
317
|
+
# High-performance SQLite configuration for analytics
|
|
318
|
+
sqlite = DWH.create(:sqlite, {
|
|
319
|
+
file: '/path/to/large_analytics.db',
|
|
320
|
+
enable_wal: true, # WAL mode for concurrent reads (default: true)
|
|
321
|
+
timeout: 30000, # 30 second busy timeout for heavy writes
|
|
322
|
+
pragmas: {
|
|
323
|
+
cache_size: -512000, # 512MB cache for large datasets
|
|
324
|
+
page_size: 8192, # Larger pages for sequential scans
|
|
325
|
+
mmap_size: 1073741824, # 1GB memory-mapped I/O
|
|
326
|
+
temp_store: 'MEMORY', # Keep temp tables in RAM
|
|
327
|
+
synchronous: 'NORMAL', # Balance safety/speed (safe with WAL)
|
|
328
|
+
journal_size_limit: 67108864 # 64MB journal limit
|
|
329
|
+
}
|
|
330
|
+
})
|
|
331
|
+
|
|
332
|
+
# Read-only analytics queries with maximum performance
|
|
333
|
+
readonly_analytics = DWH.create(:sqlite, {
|
|
334
|
+
file: '/path/to/data.db',
|
|
335
|
+
readonly: true, # Read-only for maximum concurrency
|
|
336
|
+
pragmas: {
|
|
337
|
+
cache_size: -256000, # 256MB cache
|
|
338
|
+
mmap_size: 2147483648, # 2GB memory mapping for large files
|
|
339
|
+
temp_store: 'MEMORY' # Fast temp operations
|
|
340
|
+
}
|
|
341
|
+
})
|
|
342
|
+
```
|
|
343
|
+
|
|
312
344
|
## Error Handling and Debugging
|
|
313
345
|
|
|
314
346
|
### Comprehensive Error Handling
|
data/lib/dwh/adapters/athena.rb
CHANGED
|
@@ -202,8 +202,15 @@ module DWH
|
|
|
202
202
|
def valid_config?
|
|
203
203
|
super
|
|
204
204
|
require 'aws-sdk-athena'
|
|
205
|
+
require 'aws-sdk-s3'
|
|
205
206
|
rescue LoadError
|
|
206
|
-
raise ConfigError,
|
|
207
|
+
raise ConfigError, <<~MSG
|
|
208
|
+
Athena adapter requires the 'aws-sdk-athena' and 'aws-sdk-s3' gems.
|
|
209
|
+
|
|
210
|
+
Install with: gem install aws-sdk-athena aws-sdk-s3
|
|
211
|
+
|
|
212
|
+
No system libraries required (pure Ruby).
|
|
213
|
+
MSG
|
|
207
214
|
end
|
|
208
215
|
|
|
209
216
|
private
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
require 'base64'
|
|
3
|
+
|
|
4
|
+
module DWH
|
|
5
|
+
module Adapters
|
|
6
|
+
# Databricks adapter for executing SQL queries against Databricks SQL warehouses.
|
|
7
|
+
#
|
|
8
|
+
# Supports OAuth M2M (service principal) authentication only.
|
|
9
|
+
#
|
|
10
|
+
# @example Connection with OAuth (service principal)
|
|
11
|
+
# DWH.create(:databricks, {
|
|
12
|
+
# host: 'adb-1234567890123456.7.azuredatabricks.net',
|
|
13
|
+
# warehouse: 'abc123def456',
|
|
14
|
+
# oauth_client_id: 'service-principal-app-id',
|
|
15
|
+
# oauth_client_secret: 'your-oauth-secret-here',
|
|
16
|
+
# catalog: 'main',
|
|
17
|
+
# schema: 'default'
|
|
18
|
+
# })
|
|
19
|
+
class Databricks < Adapter
|
|
20
|
+
config :host, String, required: true, message: 'Databricks workspace host (e.g., adb-xxx.databricks.cloud.com)'
|
|
21
|
+
config :oauth_client_id, String, required: true, message: 'OAuth client ID (service principal application ID)'
|
|
22
|
+
config :oauth_client_secret, String, required: true, message: 'OAuth client secret'
|
|
23
|
+
config :client_name, String, required: false, default: 'Ruby DWH Gem', message: 'Client name sent to Databricks'
|
|
24
|
+
config :query_timeout, Integer, required: false, default: 3600, message: 'Query execution timeout in seconds'
|
|
25
|
+
config :warehouse, String, required: true, message: 'Databricks SQL warehouse ID to use for query execution'
|
|
26
|
+
config :catalog, String, required: false, message: 'Default catalog (Unity Catalog)'
|
|
27
|
+
config :schema, String, required: false, message: 'Default schema'
|
|
28
|
+
|
|
29
|
+
DEFAULT_POLL_INTERVAL = 0.25
|
|
30
|
+
MAX_POLL_INTERVAL = 30
|
|
31
|
+
|
|
32
|
+
STATEMENTS_API = '/api/2.0/sql/statements'.freeze
|
|
33
|
+
|
|
34
|
+
def initialize(config)
|
|
35
|
+
super
|
|
36
|
+
validate_auth_config
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def connection
|
|
40
|
+
return @connection if @connection && !token_expired?
|
|
41
|
+
|
|
42
|
+
reset_connection if token_expired?
|
|
43
|
+
@connection = Faraday.new(
|
|
44
|
+
url: "https://#{workspace_host}",
|
|
45
|
+
headers: {
|
|
46
|
+
'Content-Type' => 'application/json',
|
|
47
|
+
'Authorization' => "Bearer #{auth_token}",
|
|
48
|
+
'User-Agent' => config[:client_name]
|
|
49
|
+
},
|
|
50
|
+
request: {
|
|
51
|
+
timeout: config[:query_timeout]
|
|
52
|
+
}.merge(extra_connection_params)
|
|
53
|
+
)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def test_connection(raise_exception: false)
|
|
57
|
+
execute('SELECT 1')
|
|
58
|
+
true
|
|
59
|
+
rescue StandardError => e
|
|
60
|
+
raise ConnectionError, "Failed to connect to Databricks: #{e.message}" if raise_exception
|
|
61
|
+
|
|
62
|
+
logger.error "Connection test failed: #{e.message}"
|
|
63
|
+
false
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# (see Adapter#execute)
|
|
67
|
+
def execute(sql, format: :array, retries: 0)
|
|
68
|
+
result = with_retry(retries + 1) do
|
|
69
|
+
with_debug(sql) do
|
|
70
|
+
response = submit_query(sql)
|
|
71
|
+
fetch_data(handle_query_response(response))
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
format_result(result, format)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def execute_stream(sql, io, stats: nil, retries: 0)
|
|
79
|
+
with_retry(retries) do
|
|
80
|
+
with_debug(sql) do
|
|
81
|
+
response = submit_query(sql)
|
|
82
|
+
fetch_data(handle_query_response(response), io: io, stats: stats)
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
io.rewind
|
|
87
|
+
io
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Execute SQL query and yield streamed results
|
|
91
|
+
# @param sql [String] SQL query to execute
|
|
92
|
+
# @yield [row] yields each result row as it is processed
|
|
93
|
+
def stream(sql, &block)
|
|
94
|
+
with_debug(sql) do
|
|
95
|
+
response = submit_query(sql)
|
|
96
|
+
fetch_data(handle_query_response(response), proc: block)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
def tables(**qualifiers)
|
|
101
|
+
catalog = qualifiers[:catalog] || config[:catalog]
|
|
102
|
+
schema = qualifiers[:schema] || config[:schema]
|
|
103
|
+
|
|
104
|
+
raise ConfigError, 'catalog is required for Databricks tables query' unless catalog
|
|
105
|
+
|
|
106
|
+
sql = "SELECT table_name FROM #{catalog}.information_schema.tables"
|
|
107
|
+
sql += " WHERE table_schema = '#{schema}'" if schema
|
|
108
|
+
|
|
109
|
+
result = execute(sql)
|
|
110
|
+
result.flatten
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def metadata(table, **qualifiers)
|
|
114
|
+
catalog = qualifiers[:catalog] || config[:catalog]
|
|
115
|
+
schema = qualifiers[:schema] || config[:schema]
|
|
116
|
+
|
|
117
|
+
raise ConfigError, 'catalog is required for Databricks metadata query' unless catalog
|
|
118
|
+
|
|
119
|
+
db_table = Table.new(table, schema: schema, catalog: catalog)
|
|
120
|
+
|
|
121
|
+
sql = <<~SQL
|
|
122
|
+
SELECT column_name, data_type, numeric_precision, numeric_scale, character_maximum_length
|
|
123
|
+
FROM #{catalog}.information_schema.columns
|
|
124
|
+
WHERE table_name = '#{db_table.physical_name}'
|
|
125
|
+
SQL
|
|
126
|
+
sql += " AND table_schema = '#{db_table.schema}'" if db_table.schema
|
|
127
|
+
|
|
128
|
+
columns = execute(sql)
|
|
129
|
+
|
|
130
|
+
columns.each do |col|
|
|
131
|
+
db_table << Column.new(
|
|
132
|
+
name: col[0]&.downcase,
|
|
133
|
+
data_type: col[1]&.downcase,
|
|
134
|
+
precision: col[2],
|
|
135
|
+
scale: col[3],
|
|
136
|
+
max_char_length: col[4]
|
|
137
|
+
)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
db_table
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def stats(table, date_column: nil)
|
|
144
|
+
date_fields = if date_column
|
|
145
|
+
", MIN(#{date_column}) AS date_start, MAX(#{date_column}) AS date_end"
|
|
146
|
+
else
|
|
147
|
+
', NULL AS date_start, NULL AS date_end'
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
data = execute("SELECT COUNT(*) AS row_count#{date_fields} FROM #{table}")
|
|
151
|
+
cols = data.first
|
|
152
|
+
|
|
153
|
+
TableStats.new(
|
|
154
|
+
row_count: cols[0],
|
|
155
|
+
date_start: cols[1],
|
|
156
|
+
date_end: cols[2]
|
|
157
|
+
)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
private
|
|
161
|
+
|
|
162
|
+
def validate_auth_config
|
|
163
|
+
raise ConfigError, 'oauth_client_id is required' unless config[:oauth_client_id]
|
|
164
|
+
raise ConfigError, 'oauth_client_secret is required' unless config[:oauth_client_secret]
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def auth_token
|
|
168
|
+
return @oauth_access_token if @oauth_access_token && !token_expired?
|
|
169
|
+
|
|
170
|
+
request_oauth_access_token!
|
|
171
|
+
@oauth_access_token
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
def request_oauth_access_token!
|
|
175
|
+
credentials = Base64.strict_encode64("#{config[:oauth_client_id]}:#{config[:oauth_client_secret]}")
|
|
176
|
+
response = Faraday.post(
|
|
177
|
+
"https://#{workspace_host}/oidc/v1/token",
|
|
178
|
+
'grant_type=client_credentials&scope=all-apis',
|
|
179
|
+
'Authorization' => "Basic #{credentials}",
|
|
180
|
+
'Content-Type' => 'application/x-www-form-urlencoded'
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
raise AuthenticationError, "OAuth M2M token request failed (#{response.status}): #{response.body}" unless response.status == 200
|
|
184
|
+
|
|
185
|
+
data = JSON.parse(response.body)
|
|
186
|
+
@oauth_access_token = data['access_token']
|
|
187
|
+
expires_in = data['expires_in'] || 3600
|
|
188
|
+
@token_expires_at = Time.now + [expires_in - 60, 60].max
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
def reset_connection
|
|
192
|
+
@oauth_access_token = nil
|
|
193
|
+
@token_expires_at = nil
|
|
194
|
+
close
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
def submit_query(sql)
|
|
198
|
+
connection.post(STATEMENTS_API) do |req|
|
|
199
|
+
req.body = {
|
|
200
|
+
statement: sql,
|
|
201
|
+
warehouse_id: config[:warehouse],
|
|
202
|
+
catalog: config[:catalog],
|
|
203
|
+
schema: config[:schema],
|
|
204
|
+
wait_timeout: '30s',
|
|
205
|
+
on_wait_timeout: 'CONTINUE',
|
|
206
|
+
format: 'JSON_ARRAY',
|
|
207
|
+
disposition: 'INLINE'
|
|
208
|
+
}.compact.merge(extra_query_params).to_json
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
def handle_query_response(response)
|
|
213
|
+
body = JSON.parse(response.body)
|
|
214
|
+
|
|
215
|
+
case response.status
|
|
216
|
+
when 200
|
|
217
|
+
state = body.dig('status', 'state')
|
|
218
|
+
state == 'SUCCEEDED' ? body : poll(body['statement_id'])
|
|
219
|
+
when 202
|
|
220
|
+
poll(body['statement_id'])
|
|
221
|
+
else
|
|
222
|
+
error_message = body['message'] || body['error_code'] || response.body
|
|
223
|
+
raise ExecutionError, "Databricks query failed (#{response.status}): #{error_message}"
|
|
224
|
+
end
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
def poll(statement_id)
|
|
228
|
+
sleep_interval = DEFAULT_POLL_INTERVAL
|
|
229
|
+
|
|
230
|
+
logger.debug "Polling for query completion: #{statement_id}"
|
|
231
|
+
|
|
232
|
+
loop do
|
|
233
|
+
response = connection.get("#{STATEMENTS_API}/#{statement_id}")
|
|
234
|
+
body = JSON.parse(response.body)
|
|
235
|
+
state = body.dig('status', 'state')
|
|
236
|
+
|
|
237
|
+
case state
|
|
238
|
+
when 'SUCCEEDED'
|
|
239
|
+
return body
|
|
240
|
+
when 'FAILED', 'CANCELED', 'CLOSED'
|
|
241
|
+
error_msg = body.dig('status', 'error', 'message') || state
|
|
242
|
+
raise ExecutionError, "Databricks query #{state}: #{error_msg}"
|
|
243
|
+
else
|
|
244
|
+
logger.debug "Query still running (state: #{state}). Sleeping #{sleep_interval}s..."
|
|
245
|
+
sleep(sleep_interval)
|
|
246
|
+
sleep_interval = sleep_interval == MAX_POLL_INTERVAL ? DEFAULT_POLL_INTERVAL : sleep_interval
|
|
247
|
+
sleep_interval = [sleep_interval * 2, MAX_POLL_INTERVAL].min
|
|
248
|
+
end
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
def fetch_data(result, io: nil, stats: nil, proc: nil)
|
|
253
|
+
columns = result.dig('manifest', 'schema', 'columns')&.map { |col| col['name'] } || []
|
|
254
|
+
chunks = result.dig('manifest', 'chunks') || []
|
|
255
|
+
collector = {
|
|
256
|
+
columns: columns,
|
|
257
|
+
data: [],
|
|
258
|
+
io: io,
|
|
259
|
+
stats: stats,
|
|
260
|
+
wrote_header: false
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
write_data(result.dig('result', 'data_array') || [], collector, io, stats, proc)
|
|
264
|
+
|
|
265
|
+
return collector unless chunks.size > 1
|
|
266
|
+
|
|
267
|
+
statement_id = result['statement_id']
|
|
268
|
+
chunks[1..].each do |chunk|
|
|
269
|
+
chunk_index = chunk['chunk_index']
|
|
270
|
+
logger.debug "Fetching chunk #{chunk_index} of #{chunks.size} for statement: #{statement_id}"
|
|
271
|
+
|
|
272
|
+
resp = connection.get("#{STATEMENTS_API}/#{statement_id}/result/chunks/#{chunk_index}")
|
|
273
|
+
raise ExecutionError, "Failed to fetch chunk #{chunk_index}: #{resp.body}" unless resp.status == 200
|
|
274
|
+
|
|
275
|
+
chunk_data = JSON.parse(resp.body)
|
|
276
|
+
write_data(chunk_data['data_array'] || [], collector, io, stats, proc)
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
collector
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
def write_data(data, collector, io = nil, stats = nil, proc = nil)
|
|
283
|
+
if io
|
|
284
|
+
unless collector[:wrote_header]
|
|
285
|
+
io << CSV.generate_line(collector[:columns])
|
|
286
|
+
collector[:wrote_header] = true
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
data.each do |row|
|
|
290
|
+
stats << row if stats
|
|
291
|
+
io << CSV.generate_line(row)
|
|
292
|
+
end
|
|
293
|
+
elsif proc
|
|
294
|
+
data.each { proc.call(it) }
|
|
295
|
+
else
|
|
296
|
+
data.each { collector[:data] << it }
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
collector
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
def format_result(result, format)
|
|
303
|
+
data = result[:data]
|
|
304
|
+
columns = result[:columns]
|
|
305
|
+
|
|
306
|
+
case format
|
|
307
|
+
when :array
|
|
308
|
+
data
|
|
309
|
+
when :object
|
|
310
|
+
data.map { |row| columns.zip(row).to_h }
|
|
311
|
+
when :csv
|
|
312
|
+
CSV.generate do |csv|
|
|
313
|
+
csv << columns
|
|
314
|
+
data.each { |row| csv << row }
|
|
315
|
+
end
|
|
316
|
+
when :native
|
|
317
|
+
result
|
|
318
|
+
else
|
|
319
|
+
raise UnsupportedCapability, "Unknown result format: #{format}"
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
def workspace_host
|
|
324
|
+
config[:host].to_s.gsub(%r{\Ahttps?://}, '').gsub(%r{/+\z}, '')
|
|
325
|
+
end
|
|
326
|
+
end
|
|
327
|
+
end
|
|
328
|
+
end
|
data/lib/dwh/adapters/duck_db.rb
CHANGED
|
@@ -150,7 +150,7 @@ module DWH
|
|
|
150
150
|
|
|
151
151
|
# True if the configuration was setup with a schema.
|
|
152
152
|
def schema?
|
|
153
|
-
config[:schema].
|
|
153
|
+
!config[:schema].nil? && !config[:schema]&.strip&.empty?
|
|
154
154
|
end
|
|
155
155
|
|
|
156
156
|
# (see Adapter#execute)
|
|
@@ -209,7 +209,13 @@ module DWH
|
|
|
209
209
|
super
|
|
210
210
|
require 'duckdb'
|
|
211
211
|
rescue LoadError
|
|
212
|
-
raise ConfigError,
|
|
212
|
+
raise ConfigError, <<~MSG
|
|
213
|
+
DuckDB adapter requires the 'duckdb' gem.
|
|
214
|
+
|
|
215
|
+
Install with: gem install duckdb
|
|
216
|
+
|
|
217
|
+
See https://github.com/suketa/ruby-duckdb for installation details.
|
|
218
|
+
MSG
|
|
213
219
|
end
|
|
214
220
|
|
|
215
221
|
private
|