dwh 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE +21 -0
  5. data/README.md +130 -0
  6. data/Rakefile +42 -0
  7. data/docs/DWH/Adapters/Adapter.html +3053 -0
  8. data/docs/DWH/Adapters/Athena.html +1704 -0
  9. data/docs/DWH/Adapters/Boolean.html +121 -0
  10. data/docs/DWH/Adapters/Druid.html +1626 -0
  11. data/docs/DWH/Adapters/DuckDb.html +2012 -0
  12. data/docs/DWH/Adapters/MySql.html +1704 -0
  13. data/docs/DWH/Adapters/OpenAuthorizable/ClassMethods.html +265 -0
  14. data/docs/DWH/Adapters/OpenAuthorizable.html +1102 -0
  15. data/docs/DWH/Adapters/Postgres.html +2000 -0
  16. data/docs/DWH/Adapters/Snowflake.html +1662 -0
  17. data/docs/DWH/Adapters/SqlServer.html +2084 -0
  18. data/docs/DWH/Adapters/Trino.html +1835 -0
  19. data/docs/DWH/Adapters.html +129 -0
  20. data/docs/DWH/AuthenticationError.html +142 -0
  21. data/docs/DWH/Behaviors.html +767 -0
  22. data/docs/DWH/Capabilities.html +748 -0
  23. data/docs/DWH/Column.html +1115 -0
  24. data/docs/DWH/ConfigError.html +143 -0
  25. data/docs/DWH/ConnectionError.html +143 -0
  26. data/docs/DWH/DWHError.html +138 -0
  27. data/docs/DWH/ExecutionError.html +143 -0
  28. data/docs/DWH/Factory.html +1133 -0
  29. data/docs/DWH/Functions/Arrays.html +505 -0
  30. data/docs/DWH/Functions/Dates.html +1644 -0
  31. data/docs/DWH/Functions/ExtractDatePart.html +804 -0
  32. data/docs/DWH/Functions/Nulls.html +377 -0
  33. data/docs/DWH/Functions.html +846 -0
  34. data/docs/DWH/Logger.html +258 -0
  35. data/docs/DWH/OAuthError.html +138 -0
  36. data/docs/DWH/Settings.html +658 -0
  37. data/docs/DWH/StreamingStats.html +804 -0
  38. data/docs/DWH/Table.html +1260 -0
  39. data/docs/DWH/TableStats.html +583 -0
  40. data/docs/DWH/TokenExpiredError.html +142 -0
  41. data/docs/DWH/UnsupportedCapability.html +135 -0
  42. data/docs/DWH.html +220 -0
  43. data/docs/_index.html +471 -0
  44. data/docs/class_list.html +54 -0
  45. data/docs/css/common.css +1 -0
  46. data/docs/css/full_list.css +58 -0
  47. data/docs/css/style.css +503 -0
  48. data/docs/file.README.html +210 -0
  49. data/docs/file.adapters.html +514 -0
  50. data/docs/file.creating-adapters.html +497 -0
  51. data/docs/file.getting-started.html +288 -0
  52. data/docs/file.usage.html +446 -0
  53. data/docs/file_list.html +79 -0
  54. data/docs/frames.html +22 -0
  55. data/docs/guides/adapters.md +445 -0
  56. data/docs/guides/creating-adapters.md +430 -0
  57. data/docs/guides/getting-started.md +225 -0
  58. data/docs/guides/usage.md +378 -0
  59. data/docs/index.html +210 -0
  60. data/docs/js/app.js +344 -0
  61. data/docs/js/full_list.js +242 -0
  62. data/docs/js/jquery.js +4 -0
  63. data/docs/method_list.html +2038 -0
  64. data/docs/top-level-namespace.html +110 -0
  65. data/lib/dwh/adapters/athena.rb +359 -0
  66. data/lib/dwh/adapters/druid.rb +267 -0
  67. data/lib/dwh/adapters/duck_db.rb +235 -0
  68. data/lib/dwh/adapters/my_sql.rb +235 -0
  69. data/lib/dwh/adapters/open_authorizable.rb +215 -0
  70. data/lib/dwh/adapters/postgres.rb +250 -0
  71. data/lib/dwh/adapters/snowflake.rb +489 -0
  72. data/lib/dwh/adapters/sql_server.rb +257 -0
  73. data/lib/dwh/adapters/trino.rb +213 -0
  74. data/lib/dwh/adapters.rb +363 -0
  75. data/lib/dwh/behaviors.rb +67 -0
  76. data/lib/dwh/capabilities.rb +39 -0
  77. data/lib/dwh/column.rb +79 -0
  78. data/lib/dwh/errors.rb +29 -0
  79. data/lib/dwh/factory.rb +125 -0
  80. data/lib/dwh/functions/arrays.rb +42 -0
  81. data/lib/dwh/functions/dates.rb +162 -0
  82. data/lib/dwh/functions/extract_date_part.rb +70 -0
  83. data/lib/dwh/functions/nulls.rb +31 -0
  84. data/lib/dwh/functions.rb +86 -0
  85. data/lib/dwh/logger.rb +50 -0
  86. data/lib/dwh/settings/athena.yml +77 -0
  87. data/lib/dwh/settings/base.yml +81 -0
  88. data/lib/dwh/settings/databricks.yml +51 -0
  89. data/lib/dwh/settings/druid.yml +59 -0
  90. data/lib/dwh/settings/duckdb.yml +44 -0
  91. data/lib/dwh/settings/mysql.yml +67 -0
  92. data/lib/dwh/settings/postgres.yml +30 -0
  93. data/lib/dwh/settings/redshift.yml +52 -0
  94. data/lib/dwh/settings/snowflake.yml +45 -0
  95. data/lib/dwh/settings/sqlserver.yml +80 -0
  96. data/lib/dwh/settings/trino.yml +77 -0
  97. data/lib/dwh/settings.rb +79 -0
  98. data/lib/dwh/streaming_stats.rb +69 -0
  99. data/lib/dwh/table.rb +105 -0
  100. data/lib/dwh/table_stats.rb +51 -0
  101. data/lib/dwh/version.rb +5 -0
  102. data/lib/dwh.rb +54 -0
  103. data/sig/dwh.rbs +4 -0
  104. metadata +231 -0
@@ -0,0 +1,489 @@
1
+ require 'jwt'
2
+ require 'csv'
3
+ require 'base64'
4
+ require 'digest'
5
+ require_relative 'open_authorizable'
6
+
7
+ module DWH
8
+ module Adapters
9
+ # Snowflake adapter for executing SQL queries against Snowflake databases.
10
+ #
11
+ # Supports two authentication modes:
12
+ # - Personal Access Token (pat)
13
+ # - Key Pair Authentication (kp)
14
+ # - OAuth 2.0 (oauth)
15
+ #
16
+ # @example Basic connection with Personal Access Token
17
+ # DWH.create(:snowflake, {
18
+ # auth_mode: 'pat',
19
+ # account_identifier: 'myorg-myaccount',
20
+ # personal_access_token: 'your-token-here',
21
+ # warehouse: 'COMPUTE_WH',
22
+ # database: 'ANALYTICS',
23
+ # schema: 'PUBLIC'
24
+ # })
25
+ #
26
+ # @example Connection with Key Pair Authentication
27
+ # DWH.create(:snowflake, {
28
+ # auth_mode: 'kp',
29
+ # account_identifier: 'myorg-myaccount.us-east-1',
30
+ # username: 'john_doe',
31
+ # private_key: '/path/to/private_key.pem',
32
+ # warehouse: 'COMPUTE_WH',
33
+ # database: 'ANALYTICS'
34
+ # })
35
+ #
36
+ # @example Connecting with OAuth
37
+ # DWH.create(:snowflake, {
38
+ # auth_mode: 'oauth',
39
+ # account_identifier: 'myorg-myaccount.us-east-1',
40
+ # oauth_client_id: '<YOUR_CLIENT_ID>',
41
+ # oauth_client_secret: '<YOUR_CLIENT_SECRET>',
42
+ # oauth_redirect_url: 'https://localhost:3030/some/path',
43
+ # database: 'ANALYTICS'
44
+ # })
45
+ #
46
+ # # This sill only work if you setup an OAuth security integration
47
+ # # and grant it to the correct users.
48
+ #
49
+ # # Use this url to get auth code
50
+ # adapter.authorization_url
51
+ #
52
+ # # Pass the code to generate oauth tokens
53
+ # adapter.generate_oauth_tokens(authorization_code)
54
+ #
55
+ # # Apply previously created tokens for new connections
56
+ # adapter.apply_oauth_tokens(access_token: token, refresh_token: token, expires_at: Time.now)
57
+ class Snowflake < Adapter
58
+ include OpenAuthorizable
59
+
60
+ # OAuth setup
61
+ oauth_with authorize: ->(adapter) { "https://#{adapter.account_identifier}.snowflakecomputing.com/oauth/authorize" },
62
+ tokenize: ->(adapter) { "https://#{adapter.account_identifier}.snowflakecomputing.com/oauth/token-request" },
63
+ default_scope: 'refresh_token'
64
+
65
+ # Authentication configuration
66
+ config :auth_mode, String, required: true, allowed: %w[pat kp oauth],
67
+ message: 'Authentication mode: "pat" (Personal Access Token) or "kp" (Key Pair)'
68
+
69
+ config :account_identifier, String,
70
+ required: true, message: 'Snowflake account identifier (e.g., myorg-myaccount or myorg-myaccount.region)'
71
+
72
+ # Personal Access Token authentication
73
+ config :personal_access_token, String,
74
+ required: false, message: 'Personal access token (required when auth_mode is "pat")'
75
+
76
+ # Key Pair authentication
77
+ config :username, String,
78
+ required: false, message: 'Username (required when auth_mode is "kp")'
79
+
80
+ config :private_key, String,
81
+ required: false, message: 'Private key file path or private key content (required when auth_mode is "kp")'
82
+
83
+ config :public_key_fp, String,
84
+ required: false, message: 'Public key fingerprint (optional, will be derived if not provided)'
85
+
86
+ # Connection configuration
87
+ config :client_name, String,
88
+ required: false, default: 'Ruby DWH Gem', message: 'Client name sent to Snowflake'
89
+
90
+ config :query_timeout, Integer,
91
+ required: false, default: 3600, message: 'Query execution timeout in seconds'
92
+
93
+ # Database configuration
94
+ config :role, String,
95
+ required: false, message: 'Snowflake role to assume'
96
+
97
+ config :warehouse, String,
98
+ required: false, message: 'Snowflake warehouse to use'
99
+
100
+ config :database, String,
101
+ required: true, message: 'Specific database to connect to.'
102
+
103
+ config :schema, String,
104
+ required: false, message: 'Default schema'
105
+
106
+ # Constants
107
+ AUTH_TOKEN_TYPES = {
108
+ pat: 'PROGRAMMATIC_ACCESS_TOKEN',
109
+ kp: 'KEYPAIR_JWT',
110
+ oauth: 'OAUTH'
111
+ }.freeze
112
+
113
+ API_ENDPOINTS = {
114
+ statements: '/api/v2/statements'
115
+ }.freeze
116
+
117
+ DEFAULT_PARAMETERS = {
118
+ DATE_OUTPUT_FORMAT: 'YYYY-MM-DD',
119
+ TIMESTAMP_OUTPUT_FORMAT: 'YYYY-MM-DD HH24:MI:SS',
120
+ TIMESTAMP_TZ_OUTPUT_FORMAT: 'YYYY-MM-DD HH24:MI:SS TZH',
121
+ TIMESTAMP_NTZ_OUTPUT_FORMAT: 'YYYY-MM-DD HH24:MI:SS',
122
+ TIMESTAMP_LTZ_OUTPUT_FORMAT: 'YYYY-MM-DD HH24:MI:SS TZH',
123
+ TIME_OUTPUT_FORMAT: 'HH24:MI:SS'
124
+ }.freeze
125
+
126
+ DEFAULT_POLL_INTERVAL = 0.25
127
+ MAX_POLL_INTERVAL = 30
128
+ TOKEN_VALIDITY_SECONDS = 3600
129
+
130
+ def initialize(config)
131
+ super
132
+ validate_auth_config
133
+ end
134
+
135
+ # (see Adapter#connection)
136
+ def connection
137
+ return @connection if @connection && !token_expired?
138
+
139
+ reset_connection if token_expired?
140
+ @token_expires_at ||= Time.now + TOKEN_VALIDITY_SECONDS
141
+
142
+ @connection = Faraday.new(
143
+ url: "https://#{config[:account_identifier]}.snowflakecomputing.com",
144
+ headers: {
145
+ 'Content-Type' => 'application/json',
146
+ 'Authorization' => "Bearer #{auth_token}",
147
+ 'X-Snowflake-Authorization-Token-Type' => auth_token_type,
148
+ 'User-Agent' => config[:client_name]
149
+ },
150
+ request: {
151
+ timeout: config[:query_timeout]
152
+ }.merge(extra_connection_params)
153
+ )
154
+ end
155
+
156
+ # (see Adapter#test_connection)
157
+ def test_connection(raise_exception: false)
158
+ execute('SELECT 1')
159
+ true
160
+ rescue StandardError => e
161
+ raise ConnectionError, "Failed to connect to Snowflake: #{e.message}" if raise_exception
162
+
163
+ logger.error "Connection test failed: #{e.message}"
164
+ false
165
+ end
166
+
167
+ # (see Adapter#execute)
168
+ def execute(sql, format: :array, retries: 0)
169
+ result = with_retry(retries + 1) do
170
+ with_debug(sql) do
171
+ response = submit_query(sql)
172
+ fetch_data(handle_query_response(response))
173
+ end
174
+ end
175
+
176
+ format_result(result, format)
177
+ end
178
+
179
+ # (see Adapter#execute)
180
+ def execute_stream(sql, io, stats: nil, retries: 0)
181
+ with_retry(retries) do
182
+ with_debug(sql) do
183
+ response = submit_query(sql)
184
+ fetch_data(handle_query_response(response), io: io, stats: stats)
185
+ end
186
+ end
187
+
188
+ io.rewind
189
+ io
190
+ end
191
+
192
+ # Execute SQL query and yield streamed results
193
+ # @param sql [String] SQL query to execute
194
+ # @yield [chunk] yields each chunk of data as it's processed
195
+ def stream(sql, &block)
196
+ with_debug(sql) do
197
+ response = submit_query(sql)
198
+ fetch_data(handle_query_response(response), proc: block)
199
+ end
200
+ end
201
+
202
+ # (see Adapter#tables)
203
+ # For metadata queries table_catalog and database are
204
+ # the same in the Snowflake information_schema.
205
+ #
206
+ # However, we need to prefix the information_schema table with
207
+ # the db name to correctly constrain to the target db.
208
+ #
209
+ # @return [Array<String>] list of table names
210
+ def tables(**qualifiers)
211
+ catalog, schema = qualifiers.values_at(:catalog, :schema)
212
+
213
+ db = catalog || config[:database]
214
+ sql = "SELECT table_name FROM #{db}.information_schema.tables"
215
+ conditions = []
216
+
217
+ conditions << "table_schema = '#{schema.upcase}'" if schema
218
+
219
+ sql += " WHERE #{conditions.join(' AND ')}" if conditions.any?
220
+
221
+ result = execute(sql)
222
+ result.flatten
223
+ end
224
+
225
+ # (see Adapter#tables)
226
+ def metadata(table, **qualifiers)
227
+ catalog, schema = qualifiers.values_at(:catalog, :schema)
228
+ db_table = Table.new(table, schema: schema, catalog: catalog)
229
+ db = db_table.catalog || config[:database]
230
+ sql = <<~SQL
231
+ SELECT column_name, data_type, numeric_precision, numeric_scale, character_maximum_length
232
+ FROM #{db}.information_schema.columns
233
+ SQL
234
+
235
+ conditions = ["table_name = '#{db_table.physical_name.upcase}'"]
236
+ conditions << "table_schema = '#{db_table.schema.upcase}'" if db_table.schema
237
+
238
+ columns = execute("#{sql} WHERE #{conditions.join(' AND ')}")
239
+
240
+ columns.each do |col|
241
+ db_table << Column.new(
242
+ name: col[0]&.downcase,
243
+ data_type: col[1]&.downcase,
244
+ precision: col[2],
245
+ scale: col[3],
246
+ max_char_length: col[4]
247
+ )
248
+ end
249
+
250
+ db_table
251
+ end
252
+
253
+ # (see Adapter#stats)
254
+ def stats(table, date_column: nil)
255
+ date_fields = if date_column
256
+ ", MIN(#{date_column}) AS date_start, MAX(#{date_column}) AS date_end"
257
+ else
258
+ ', NULL AS date_start, NULL AS date_end'
259
+ end
260
+
261
+ data = execute("SELECT COUNT(*) AS row_count#{date_fields} FROM #{table}")
262
+ cols = data.first
263
+
264
+ TableStats.new(
265
+ row_count: cols[0],
266
+ date_start: cols[1],
267
+ date_end: cols[2]
268
+ )
269
+ end
270
+
271
+ private
272
+
273
+ # Validation and Setup Methods
274
+ def validate_auth_config
275
+ case auth_mode.downcase.to_sym
276
+ when :pat
277
+ return if config[:personal_access_token]
278
+
279
+ raise ConfigError, "personal_access_token is required when auth_mode is 'pat'"
280
+ when :kp
281
+ raise ConfigError, "username is required when auth_mode is 'kp'" unless config[:username]
282
+ return if config[:private_key]
283
+
284
+ raise ConfigError, "private_key is required when auth_mode is 'kp'"
285
+ when :oauth
286
+ validate_oauth_config
287
+ else
288
+ raise ConfigError, "Invalid auth_mode: #{config[:auth_mode]}"
289
+ end
290
+ end
291
+
292
+ def reset_connection
293
+ @token_expires_at = nil unless oauth_mode? # here we keep the set expiration time
294
+ @jwt_token = nil
295
+ close
296
+ end
297
+
298
+ # Authentication
299
+ def auth_token
300
+ case auth_mode.downcase.to_sym
301
+ when :pat
302
+ config[:personal_access_token]
303
+ when :kp
304
+ jwt_token
305
+ when :oauth
306
+ oauth_access_token
307
+ else
308
+ raise ConfigError, "Invalid auth_mode: #{config[:auth_mode]}"
309
+ end
310
+ end
311
+
312
+ # Translate auth mode to Snowflake auth token type
313
+ def auth_token_type
314
+ AUTH_TOKEN_TYPES[config[:auth_mode].to_sym]
315
+ end
316
+
317
+ def personal_access_token_mode?
318
+ config[:auth_mode] == 'pat'
319
+ end
320
+
321
+ def key_pair_mode?
322
+ config[:auth_mode] == 'kp'
323
+ end
324
+
325
+ def oauth_mode?
326
+ config[:auth_mode] == 'oauth'
327
+ end
328
+
329
+ def jwt_token
330
+ @jwt_token ||= JWT.encode(
331
+ {
332
+ iss: "#{qualified_username}.SHA256:#{public_key_fingerprint}",
333
+ sub: qualified_username,
334
+ iat: Time.now.to_i,
335
+ exp: @token_expires_at.to_i
336
+ },
337
+ private_key_object, 'RS256'
338
+ )
339
+ end
340
+
341
+ def qualified_username
342
+ "#{account_identifier.upcase}.#{config[:username].upcase}"
343
+ end
344
+
345
+ def private_key_object
346
+ @private_key_object ||= OpenSSL::PKey.read(
347
+ if File.exist?(config[:private_key])
348
+ File.read(config[:private_key])
349
+ else
350
+ config[:private_key]
351
+ end
352
+ )
353
+ end
354
+
355
+ def public_key_fingerprint
356
+ @public_key_fingerprint ||=
357
+ config[:public_key_fp] || Base64.strict_encode64(
358
+ Digest::SHA256.digest(private_key_object.public_key.to_der)
359
+ )
360
+ end
361
+
362
+ def submit_query(sql)
363
+ connection.post(API_ENDPOINTS[:statements]) do |req|
364
+ req.body =
365
+ {
366
+ statement: sql,
367
+ timeout: config[:query_timeout],
368
+ warehouse: config[:warehouse]&.upcase,
369
+ database: config[:database]&.upcase,
370
+ schema: config[:schema]&.upcase,
371
+ role: config[:role]&.upcase,
372
+ parameters: DEFAULT_PARAMETERS
373
+ }.compact.merge(extra_query_params)
374
+ .to_json
375
+ end
376
+ end
377
+
378
+ def handle_query_response(response)
379
+ case response.status
380
+ when 200
381
+ JSON.parse(response.body)
382
+ when 202
383
+ poll(JSON.parse(response.body))
384
+ else
385
+ error_info = begin
386
+ JSON.parse(response.body)
387
+ rescue StandardError
388
+ response.body
389
+ end
390
+ message = error_info.is_a?(Hash) ? error_info['message'] : error_info
391
+ raise ExecutionError, "Snowflake query failed: #{message}"
392
+ end
393
+ end
394
+
395
+ def poll(initial_result)
396
+ statement_handle = initial_result['statementHandle']
397
+ sleep_interval = DEFAULT_POLL_INTERVAL
398
+
399
+ logger.debug "Polling for query completion: #{statement_handle}"
400
+
401
+ loop do
402
+ response = connection.get("#{API_ENDPOINTS[:statements]}/#{statement_handle}")
403
+ result = JSON.parse(response.body)
404
+
405
+ case response.status
406
+ when 200
407
+ return result
408
+ when 202
409
+ logger.debug "Query still running. Sleeping #{sleep_interval}s..."
410
+ sleep(sleep_interval)
411
+ # once we hit one max interval lets restart
412
+ # the cycle.
413
+ sleep_interval = sleep_interval == MAX_POLL_INTERVAL ? DEFAULT_POLL_INTERVAL : sleep_interval
414
+ sleep_interval = [sleep_interval * 2, MAX_POLL_INTERVAL].min
415
+ else
416
+ message = result['message'] || result
417
+ raise ExecutionError, "Polling failed: #{message}"
418
+ end
419
+ end
420
+ end
421
+
422
+ # Result Processing
423
+ def format_result(result, format)
424
+ data = result[:data]
425
+ columns = result[:columns]
426
+
427
+ case format
428
+ when :array
429
+ data
430
+ when :object
431
+ data.map { |row| columns.zip(row).to_h }
432
+ when :csv
433
+ CSV.generate do |csv|
434
+ csv << columns
435
+ data.each { |row| csv << row }
436
+ end
437
+ when :native
438
+ result
439
+ else
440
+ raise UnsupportedCapability, "Unknown result format: #{format}"
441
+ end
442
+ end
443
+
444
+ def fetch_data(result, io: nil, stats: nil, proc: nil)
445
+ collector = {
446
+ columns: result.dig('resultSetMetaData', 'rowType')&.map { |col| col['name'] } || [],
447
+ data: [], io: io, stats: stats, wrote_header: false
448
+ }
449
+
450
+ partitions = result.dig('resultSetMetaData', 'partitionInfo')
451
+ write_data(result['data'], collector, io, stats, proc)
452
+ return collector unless partitions.size > 1
453
+
454
+ url = "#{API_ENDPOINTS[:statements]}/#{result['statementHandle']}?partition="
455
+ partitions[1..].each.with_index(1) do |_, index|
456
+ logger.debug "Fetching partition #{index} of #{partitions.length - 1} for statement handle: #{result['statementHandle']}"
457
+ resp = connection.get(url + index.to_s)
458
+ raise ExecutionError, "Could not data partitions from Snowflake: #{resp.body}" unless resp.status == 200
459
+
460
+ part_res = JSON.parse(resp.body)
461
+
462
+ write_data(part_res['data'], collector, io, stats, proc)
463
+ end
464
+
465
+ collector
466
+ end
467
+
468
+ def write_data(data, collector, io = nil, stats = nil, proc = nil)
469
+ if io
470
+ unless collector[:wrote_header]
471
+ io << CSV.generate_line(collector[:columns])
472
+ collector[:wrote_header] = true
473
+ end
474
+
475
+ data.each do |row|
476
+ stats << row if stats
477
+ io << CSV.generate_line(row)
478
+ end
479
+ elsif proc
480
+ data.each { proc.call(it) }
481
+ else
482
+ data.each { collector[:data] << it }
483
+ end
484
+
485
+ collector
486
+ end
487
+ end
488
+ end
489
+ end