dwh 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE +21 -0
  5. data/README.md +130 -0
  6. data/Rakefile +42 -0
  7. data/docs/DWH/Adapters/Adapter.html +3053 -0
  8. data/docs/DWH/Adapters/Athena.html +1704 -0
  9. data/docs/DWH/Adapters/Boolean.html +121 -0
  10. data/docs/DWH/Adapters/Druid.html +1626 -0
  11. data/docs/DWH/Adapters/DuckDb.html +2012 -0
  12. data/docs/DWH/Adapters/MySql.html +1704 -0
  13. data/docs/DWH/Adapters/OpenAuthorizable/ClassMethods.html +265 -0
  14. data/docs/DWH/Adapters/OpenAuthorizable.html +1102 -0
  15. data/docs/DWH/Adapters/Postgres.html +2000 -0
  16. data/docs/DWH/Adapters/Snowflake.html +1662 -0
  17. data/docs/DWH/Adapters/SqlServer.html +2084 -0
  18. data/docs/DWH/Adapters/Trino.html +1835 -0
  19. data/docs/DWH/Adapters.html +129 -0
  20. data/docs/DWH/AuthenticationError.html +142 -0
  21. data/docs/DWH/Behaviors.html +767 -0
  22. data/docs/DWH/Capabilities.html +748 -0
  23. data/docs/DWH/Column.html +1115 -0
  24. data/docs/DWH/ConfigError.html +143 -0
  25. data/docs/DWH/ConnectionError.html +143 -0
  26. data/docs/DWH/DWHError.html +138 -0
  27. data/docs/DWH/ExecutionError.html +143 -0
  28. data/docs/DWH/Factory.html +1133 -0
  29. data/docs/DWH/Functions/Arrays.html +505 -0
  30. data/docs/DWH/Functions/Dates.html +1644 -0
  31. data/docs/DWH/Functions/ExtractDatePart.html +804 -0
  32. data/docs/DWH/Functions/Nulls.html +377 -0
  33. data/docs/DWH/Functions.html +846 -0
  34. data/docs/DWH/Logger.html +258 -0
  35. data/docs/DWH/OAuthError.html +138 -0
  36. data/docs/DWH/Settings.html +658 -0
  37. data/docs/DWH/StreamingStats.html +804 -0
  38. data/docs/DWH/Table.html +1260 -0
  39. data/docs/DWH/TableStats.html +583 -0
  40. data/docs/DWH/TokenExpiredError.html +142 -0
  41. data/docs/DWH/UnsupportedCapability.html +135 -0
  42. data/docs/DWH.html +220 -0
  43. data/docs/_index.html +471 -0
  44. data/docs/class_list.html +54 -0
  45. data/docs/css/common.css +1 -0
  46. data/docs/css/full_list.css +58 -0
  47. data/docs/css/style.css +503 -0
  48. data/docs/file.README.html +210 -0
  49. data/docs/file.adapters.html +514 -0
  50. data/docs/file.creating-adapters.html +497 -0
  51. data/docs/file.getting-started.html +288 -0
  52. data/docs/file.usage.html +446 -0
  53. data/docs/file_list.html +79 -0
  54. data/docs/frames.html +22 -0
  55. data/docs/guides/adapters.md +445 -0
  56. data/docs/guides/creating-adapters.md +430 -0
  57. data/docs/guides/getting-started.md +225 -0
  58. data/docs/guides/usage.md +378 -0
  59. data/docs/index.html +210 -0
  60. data/docs/js/app.js +344 -0
  61. data/docs/js/full_list.js +242 -0
  62. data/docs/js/jquery.js +4 -0
  63. data/docs/method_list.html +2038 -0
  64. data/docs/top-level-namespace.html +110 -0
  65. data/lib/dwh/adapters/athena.rb +359 -0
  66. data/lib/dwh/adapters/druid.rb +267 -0
  67. data/lib/dwh/adapters/duck_db.rb +235 -0
  68. data/lib/dwh/adapters/my_sql.rb +235 -0
  69. data/lib/dwh/adapters/open_authorizable.rb +215 -0
  70. data/lib/dwh/adapters/postgres.rb +250 -0
  71. data/lib/dwh/adapters/snowflake.rb +489 -0
  72. data/lib/dwh/adapters/sql_server.rb +257 -0
  73. data/lib/dwh/adapters/trino.rb +213 -0
  74. data/lib/dwh/adapters.rb +363 -0
  75. data/lib/dwh/behaviors.rb +67 -0
  76. data/lib/dwh/capabilities.rb +39 -0
  77. data/lib/dwh/column.rb +79 -0
  78. data/lib/dwh/errors.rb +29 -0
  79. data/lib/dwh/factory.rb +125 -0
  80. data/lib/dwh/functions/arrays.rb +42 -0
  81. data/lib/dwh/functions/dates.rb +162 -0
  82. data/lib/dwh/functions/extract_date_part.rb +70 -0
  83. data/lib/dwh/functions/nulls.rb +31 -0
  84. data/lib/dwh/functions.rb +86 -0
  85. data/lib/dwh/logger.rb +50 -0
  86. data/lib/dwh/settings/athena.yml +77 -0
  87. data/lib/dwh/settings/base.yml +81 -0
  88. data/lib/dwh/settings/databricks.yml +51 -0
  89. data/lib/dwh/settings/druid.yml +59 -0
  90. data/lib/dwh/settings/duckdb.yml +44 -0
  91. data/lib/dwh/settings/mysql.yml +67 -0
  92. data/lib/dwh/settings/postgres.yml +30 -0
  93. data/lib/dwh/settings/redshift.yml +52 -0
  94. data/lib/dwh/settings/snowflake.yml +45 -0
  95. data/lib/dwh/settings/sqlserver.yml +80 -0
  96. data/lib/dwh/settings/trino.yml +77 -0
  97. data/lib/dwh/settings.rb +79 -0
  98. data/lib/dwh/streaming_stats.rb +69 -0
  99. data/lib/dwh/table.rb +105 -0
  100. data/lib/dwh/table_stats.rb +51 -0
  101. data/lib/dwh/version.rb +5 -0
  102. data/lib/dwh.rb +54 -0
  103. data/sig/dwh.rbs +4 -0
  104. metadata +231 -0
@@ -0,0 +1,110 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>
7
+ Top Level Namespace
8
+
9
+ &mdash; Documentation by YARD 0.9.37
10
+
11
+ </title>
12
+
13
+ <link rel="stylesheet" href="css/style.css" type="text/css" />
14
+
15
+ <link rel="stylesheet" href="css/common.css" type="text/css" />
16
+
17
+ <script type="text/javascript">
18
+ pathId = "";
19
+ relpath = '';
20
+ </script>
21
+
22
+
23
+ <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
+
25
+ <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
+
27
+
28
+ </head>
29
+ <body>
30
+ <div class="nav_wrap">
31
+ <iframe id="nav" src="class_list.html?1"></iframe>
32
+ <div id="resizer"></div>
33
+ </div>
34
+
35
+ <div id="main" tabindex="-1">
36
+ <div id="header">
37
+ <div id="menu">
38
+
39
+ <a href="_index.html">Index</a> &raquo;
40
+
41
+
42
+ <span class="title">Top Level Namespace</span>
43
+
44
+ </div>
45
+
46
+ <div id="search">
47
+
48
+ <a class="full_list_link" id="class_list_link"
49
+ href="class_list.html">
50
+
51
+ <svg width="24" height="24">
52
+ <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
+ <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
+ <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
+ </svg>
56
+ </a>
57
+
58
+ </div>
59
+ <div class="clear"></div>
60
+ </div>
61
+
62
+ <div id="content"><h1>Top Level Namespace
63
+
64
+
65
+
66
+ </h1>
67
+ <div class="box_info">
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ </div>
80
+
81
+ <h2>Defined Under Namespace</h2>
82
+ <p class="children">
83
+
84
+
85
+ <strong class="modules">Modules:</strong> <span class='object_link'><a href="DWH.html" title="DWH (module)">DWH</a></span>
86
+
87
+
88
+
89
+
90
+ </p>
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ </div>
101
+
102
+ <div id="footer">
103
+ Generated on Fri Aug 22 08:31:21 2025 by
104
+ <a href="https://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
+ 0.9.37 (ruby-3.4.4).
106
+ </div>
107
+
108
+ </div>
109
+ </body>
110
+ </html>
@@ -0,0 +1,359 @@
1
+ require 'csv'
2
+
3
+ module DWH
4
+ module Adapters
5
+ # AWS Athena adapter. Please ensure the aws-sdk-athena and aws-sdk-s3 gems are available before using this adapter.
6
+ # Generally, adapters should be created using {DWH::Factory#create DWH.create}, where a configuration
7
+ # is passed in as an options hash or argument list.
8
+ #
9
+ # @example Basic connection with required options
10
+ # DWH.create(:athena, {
11
+ # region: 'us-east-1',
12
+ # database: 'default',
13
+ # s3_output_location: 's3://my-athena-results-bucket/queries/',
14
+ # access_key_id: 'AKIAIOSFODNN7EXAMPLE',
15
+ # secret_access_key: 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
16
+ # })
17
+ #
18
+ # @example Connection with IAM role (recommended)
19
+ # DWH.create(:athena, {
20
+ # region: 'us-east-1',
21
+ # database: 'default',
22
+ # s3_output_location: 's3://my-athena-results-bucket/queries/'
23
+ # })
24
+ #
25
+ # @example Connection with workgroup
26
+ # DWH.create(:athena, {
27
+ # region: 'us-east-1',
28
+ # database: 'default',
29
+ # s3_output_location: 's3://my-athena-results-bucket/queries/',
30
+ # workgroup: 'my-workgroup'
31
+ # })
32
+ class Athena < Adapter
33
+ config :region, String, required: true, message: 'AWS region (e.g., us-east-1)'
34
+ config :catalog, String, required: true, message: 'defaults to awsdatacatalog', default: 'awsdatacatalog'
35
+ config :database, String, required: true, message: 'Athena database/schema name. defaults to default.', default: 'default'
36
+ config :s3_output_location, String, required: true, message: 'S3 location for query results (e.g., s3://bucket/path/)'
37
+ config :access_key_id, String, required: false, default: nil, message: 'AWS access key ID (optional if using IAM role)'
38
+ config :secret_access_key, String, required: false, default: nil, message: 'AWS secret access key (optional if using IAM role)'
39
+ config :workgroup, String, required: false, message: 'Athena workgroup name'
40
+ config :query_timeout, Integer, required: false, default: 300, message: 'query execution timeout in seconds'
41
+ config :poll_interval, Integer, required: false, default: 2, message: 'polling interval in seconds for query status'
42
+
43
# (see Adapter#connection)
#
# Lazily builds and memoizes the Athena client. Only client-level options are
# handed to the SDK constructors: the region, optional static credentials and
# whatever +extra_connection_params+ returns. The workgroup is deliberately
# left out because it is not a client constructor option; it is sent with each
# query as +work_group+ by +start_query_execution+.
#
# @return [Aws::Athena::Client] the memoized Athena client
# @raise [ConfigError] when either AWS client cannot be constructed
def connection
  return @connection if @connection

  aws_config = { region: config[:region] }.compact

  # Add credentials if provided, otherwise rely on IAM role or environment
  if config[:access_key_id] && config[:secret_access_key]
    aws_config[:credentials] = Aws::Credentials.new(
      config[:access_key_id],
      config[:secret_access_key]
    )
  end

  # Merge any extra connection params
  aws_config.merge!(extra_connection_params)

  @connection = Aws::Athena::Client.new(aws_config)
  # NOTE(review): despite its name this ivar holds an S3 client, not a location
  # string. The name is kept unchanged in case other code reads it - confirm.
  @s3_output_location = Aws::S3::Client.new(aws_config)

  @connection
rescue StandardError => e
  raise ConfigError, "Failed to connect to Athena: #{e.message}"
end
70
+
71
# (see Adapter#test_connection)
#
# Probes the service by requesting a single workgroup through the client.
#
# @param raise_exception [Boolean] when truthy, a failed probe raises instead of returning false
# @return [Boolean] true when the probe succeeds, false when it fails and +raise_exception+ is falsy
# @raise [ConnectionError] when the probe fails and +raise_exception+ is truthy
def test_connection(raise_exception: false)
  begin
    connection.list_work_groups(max_results: 1)
  rescue StandardError => e
    return false unless raise_exception

    raise ConnectionError, "Athena connection test failed: #{e.message}"
  end

  true
end
81
+
82
# (see Adapter#tables)
#
# Lists table names from information_schema.tables, filtered by catalog and
# schema. The +:database+ qualifier wins over +:schema+; both fall back to the
# configured database. The catalog falls back to the configured catalog.
#
# @param qualifiers [Hash] optional +:catalog+, +:database+ and +:schema+ overrides
# @return [Array] flattened result rows (one table name per row)
def tables(**qualifiers)
  schema_name = qualifiers[:database] || qualifiers[:schema] || config[:database]
  catalog_name = qualifiers[:catalog] || config[:catalog]

  filters = [
    'WHERE 1=1',
    "table_catalog = '#{catalog_name}'",
    "table_schema = '#{schema_name}'"
  ]
  query = "SELECT table_name FROM information_schema.tables #{filters.join(' AND ')}"

  execute(query, format: :array).flatten
end
95
+
96
# (see Adapter#stats)
#
# Counts the rows of a table and, when +date_column+ is given, also selects the
# minimum and maximum of that column. The table is qualified with the
# +:database+ qualifier, then the +:schema+ qualifier, then the configured
# database - the same precedence +tables+ and +metadata+ use.
#
# @param table [String] table name
# @param date_column [String, nil] column used for the date_start/date_end aggregates
# @param qualifiers [Hash] optional +:database+ or +:schema+ override
# @return [TableStats] built from the first result row (all fields nil when no row is returned)
def stats(table, date_column: nil, **qualifiers)
  schema_name = qualifiers[:database] || qualifiers[:schema] || config[:database]

  select_list = ['COUNT(*) as row_count']
  if date_column
    select_list << "MIN(#{date_column}) as date_start"
    select_list << "MAX(#{date_column}) as date_end"
  end

  sql = "SELECT #{select_list.join(', ')} FROM #{schema_name}.#{table}"
  first_row = execute(sql, format: :object).first || {}

  TableStats.new(
    row_count: first_row['row_count'],
    date_start: first_row['date_start'],
    date_end: first_row['date_end']
  )
end
119
+
120
# (see Adapter#metadata)
#
# Builds a Table and fills it with one Column per row returned from
# information_schema.columns for that table.
#
# @param table [String] table name
# @param qualifiers [Hash] optional +:catalog+, +:database+ and +:schema+ overrides
# @return [Table] the table with its columns appended
def metadata(table, **qualifiers)
  schema_name = qualifiers[:database] || qualifiers[:schema] || config[:database]
  catalog_name = qualifiers[:catalog] || config[:catalog]
  db_table = Table.new table, schema: schema_name, catalog: catalog_name

  conditions = ["WHERE table_name = '#{db_table.physical_name}'"]
  conditions << "table_schema = '#{db_table.schema}'" if db_table.schema
  conditions << "table_catalog = '#{db_table.catalog}'" if db_table.catalog
  query = "SELECT * FROM information_schema.columns \n #{conditions.join(' AND ')}"

  execute(query, format: :object).each_with_object(db_table) do |col, target|
    type_name = col['data_type']

    # Type strings may carry sizes (e.g., "varchar(255)", "decimal(10,2)")
    precision, scale = parse_data_type_precision(type_name)

    target << Column.new(
      # a 'col_name' key is preferred when present, otherwise 'column_name'
      name: col['col_name'] || col['column_name'],
      data_type: type_name,
      precision: precision,
      scale: scale,
      max_char_length: parse_char_length(type_name)
    )
  end
end
153
+
154
# (see Adapter#execute)
#
# Runs the query and shapes the result according to +format+.
#
# @param sql [String] statement to run
# @param format [Symbol, String] :array, :object, :csv or :native (strings are downcased first)
# @param retries [Integer] passed to +with_retry+
# @return [Array, String, Hash] rows, row hashes, CSV text or the raw headers/rows hash
# @raise [ExecutionError] when the query fails (an ExecutionError is passed through unchanged)
# @raise [UnsupportedCapability] for any other format
def execute(sql, format: :array, retries: 0)
  result_data =
    begin
      with_debug(sql) { with_retry(retries) { execute_query(sql) } }
    rescue ExecutionError
      raise
    rescue StandardError => e
      raise ExecutionError, "Athena query failed: #{e.message}"
    end

  output = format.is_a?(String) ? format.downcase : format

  case output.to_sym
  when :native then result_data
  when :array then result_data[:rows]
  when :csv then rows_to_csv(result_data[:headers], result_data[:rows])
  when :object
    columns = result_data[:headers]
    result_data[:rows].map { |record| columns.zip(record).to_h }
  else
    raise UnsupportedCapability, "Unsupported format: #{output} for Athena adapter"
  end
end
179
+
180
# (see Adapter#execute_stream)
#
# Runs the query and writes the result to +io+ via +execute_query+.
#
# @param sql [String] statement to run
# @param io [IO] destination handed to +execute_query+
# @param stats [Object, nil] optional collector handed to +execute_query+
# @param retries [Integer] passed to +with_retry+
# @return [Object] whatever the +with_debug+ block yields back
# @raise [ExecutionError] when the query fails
def execute_stream(sql, io, stats: nil, retries: 0)
  with_debug(sql) do
    with_retry(retries) do
      execute_query(sql, io: io, stats: stats)
    end
  end
rescue ExecutionError
  # Already carries a specific message (timeout, failed state, ...); re-raise
  # untouched, as #execute does, instead of wrapping it a second time.
  raise
rescue StandardError => e
  raise ExecutionError, "Athena streaming query failed: #{e.message}"
end
190
+
191
# (see Adapter#stream)
#
# Runs the query, then hands each result row to the given block in order.
#
# @param sql [String] statement to run
# @yieldparam row [Array] one result row
def stream(sql, &block)
  with_debug(sql) do
    execute_query(sql)[:rows].each { |record| block.call(record) }
  end
end
201
+
202
# Runs the parent validation, then makes sure both AWS SDK gems this adapter
# uses can be loaded. +connection+ builds an Aws::Athena::Client and an
# Aws::S3::Client, so both gems are required here.
#
# @return [true] when validation passed and both gems are loadable
# @raise [ConfigError] when either gem cannot be loaded
def valid_config?
  super
  require 'aws-sdk-athena'
  require 'aws-sdk-s3'
  # require returns false for an already-loaded feature; report success explicitly.
  true
rescue LoadError
  raise ConfigError, "Required 'aws-sdk-athena' and 'aws-sdk-s3' gems missing. Please add them to your Gemfile."
end
208
+
209
+ private
210
+
211
# Starts the query, waits for it to finish, then collects the results: written
# to +io+ when one is given, otherwise returned as parsed headers and rows.
#
# @param sql [String] statement to run
# @param io [IO, nil] optional destination for the results
# @param stats [Object, nil] optional collector, only used together with +io+
# @return [Object] result of the fetch helper that was used
def execute_query(sql, io: nil, stats: nil)
  execution_id = start_query_execution(sql)
  wait_for_query_completion(execution_id)
  return fetch_query_results(execution_id) unless io

  fetch_query_results_to_io(execution_id, io, stats: stats)
end
221
+
222
# Submits the statement through the client, using the configured catalog,
# database, S3 output location and workgroup.
#
# @param sql [String] statement to run
# @return [Object] the query_execution_id reported by the client response
def start_query_execution(sql)
  context = { catalog: config[:catalog], database: config[:database] }
  destination = { output_location: config[:s3_output_location] }

  request = {
    query_string: sql,
    query_execution_context: context,
    result_configuration: destination,
    work_group: config[:workgroup]
  }

  connection.start_query_execution(request).query_execution_id
end
239
+
240
# Polls the query state until it reaches a terminal state or the configured
# timeout elapses. Elapsed time is measured with the monotonic clock so that
# wall-clock adjustments cannot trigger or delay the timeout.
#
# @param query_execution_id [String] id returned when the query was started
# @return [true] once the query state is SUCCEEDED
# @raise [ExecutionError] on timeout, on FAILED/CANCELLED, or on an unknown state
def wait_for_query_completion(query_execution_id)
  timeout = config[:query_timeout]
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  loop do
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
    raise ExecutionError, "Query timeout after #{timeout} seconds" if elapsed > timeout

    response = connection.get_query_execution(
      query_execution_id: query_execution_id
    )
    status = response.query_execution.status
    state = status.state

    case state
    when 'SUCCEEDED'
      return true
    when 'FAILED', 'CANCELLED'
      raise ExecutionError, "Query #{state.downcase}: #{status.state_change_reason}"
    when 'QUEUED', 'RUNNING'
      # still in progress: pause before polling again
      sleep(config[:poll_interval])
    else
      raise ExecutionError, "Unknown query state: #{state}"
    end
  end
end
268
+
269
# Pages through the results of a finished query and collects them in memory.
# Column names come from the first page that reports result-set metadata.
#
# @param query_execution_id [String] id of the finished query
# @return [Hash] +{ headers: [...], rows: [[...], ...] }+ with cell values taken from var_char_value
def fetch_query_results(query_execution_id)
  column_names = []
  records = []
  page_token = nil

  loop do
    request = { query_execution_id: query_execution_id }
    request[:next_token] = page_token if page_token

    page = connection.get_query_results(request)
    result_set = page.result_set

    if column_names.empty? && result_set.result_set_metadata
      column_names = result_set.result_set_metadata.column_info.map(&:name)
    end

    # first row on the first page is headers, so it is not data
    skip_header_row = page_token.nil?

    result_set.rows.each_with_index do |row, position|
      next if column_names.empty?
      next if skip_header_row && position.zero?

      values = row.data.map(&:var_char_value)
      next if values.all?(&:nil?) # skip empty rows

      records << values
    end

    page_token = page.next_token
    break unless page_token
  end

  { headers: column_names, rows: records }
end
297
+
298
# Pages through the results of a finished query and writes them to +io+ as CSV.
# A header line is written once, as soon as a page reports a non-empty column
# list. Every data row is written as one CSV line and, when +stats+ is given,
# also appended to it.
#
# @param query_execution_id [String] id of the finished query
# @param io [IO] destination; it is rewound before being returned
# @param stats [#<<, nil] optional collector that receives each data row
# @return [IO] the same +io+, rewound
def fetch_query_results_to_io(query_execution_id, io, stats: nil)
  headers = []
  next_token = nil

  loop do
    params = { query_execution_id: query_execution_id }
    params[:next_token] = next_token if next_token

    response = connection.get_query_results(params)
    result_set = response.result_set

    if headers.empty? && result_set.result_set_metadata
      headers = result_set.result_set_metadata.column_info.map(&:name)
      # This branch is only re-entered while headers are still empty, so the
      # header line is written exactly once without needing a separate flag.
      io.write(CSV.generate_line(headers)) unless headers.empty?
    end

    result_set.rows.each_with_index do |row, index|
      # first row on the first page is headers, so it is not data
      next if headers.empty? || (next_token.nil? && index.zero?)

      row_data = row.data.map(&:var_char_value)
      stats << row_data unless stats.nil?
      io.write(CSV.generate_line(row_data))
    end

    next_token = response.next_token
    break unless next_token
  end

  io.rewind
  io
end
331
+
332
# Extracts the numbers inside the parentheses of a type string such as
# "decimal(10,2)" or "varchar(255)".
#
# @param data_type [String, nil] type string
# @return [Array(Integer, Integer), Array(Integer, nil), Array(nil, nil)]
#   precision and scale; scale is nil when only one number is present and both
#   are nil when there is no parenthesised number
def parse_data_type_precision(data_type)
  captured = data_type && data_type.match(/\((\d+)(?:,\s*(\d+))?\)/)
  return [nil, nil] unless captured

  [captured[1].to_i, captured[2]&.to_i]
end
342
+
343
# Extracts the declared length from a char/varchar type string (case-insensitive).
#
# @param data_type [String, nil] type string such as "varchar(255)"
# @return [Integer, nil] the length, or nil when the type is not char(n)/varchar(n)
def parse_char_length(data_type)
  return unless data_type

  found = data_type.match(/(?:var)?char\((\d+)\)/i)
  found[1].to_i if found
end
349
+
350
# Serializes a header line followed by every row into a single CSV string.
#
# @param headers [Array] column names, written as the first line
# @param rows [Array<Array>] data rows, written in order
# @return [String] the CSV text
def rows_to_csv(headers, rows)
  CSV.generate do |out|
    # CSV#<< returns the writer, so it can be threaded through the fold
    rows.inject(out << headers) { |writer, record| writer << record }
  end
end
357
+ end
358
+ end
359
+ end