dwh 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE +21 -0
  5. data/README.md +130 -0
  6. data/Rakefile +42 -0
  7. data/docs/DWH/Adapters/Adapter.html +3053 -0
  8. data/docs/DWH/Adapters/Athena.html +1704 -0
  9. data/docs/DWH/Adapters/Boolean.html +121 -0
  10. data/docs/DWH/Adapters/Druid.html +1626 -0
  11. data/docs/DWH/Adapters/DuckDb.html +2012 -0
  12. data/docs/DWH/Adapters/MySql.html +1704 -0
  13. data/docs/DWH/Adapters/OpenAuthorizable/ClassMethods.html +265 -0
  14. data/docs/DWH/Adapters/OpenAuthorizable.html +1102 -0
  15. data/docs/DWH/Adapters/Postgres.html +2000 -0
  16. data/docs/DWH/Adapters/Snowflake.html +1662 -0
  17. data/docs/DWH/Adapters/SqlServer.html +2084 -0
  18. data/docs/DWH/Adapters/Trino.html +1835 -0
  19. data/docs/DWH/Adapters.html +129 -0
  20. data/docs/DWH/AuthenticationError.html +142 -0
  21. data/docs/DWH/Behaviors.html +767 -0
  22. data/docs/DWH/Capabilities.html +748 -0
  23. data/docs/DWH/Column.html +1115 -0
  24. data/docs/DWH/ConfigError.html +143 -0
  25. data/docs/DWH/ConnectionError.html +143 -0
  26. data/docs/DWH/DWHError.html +138 -0
  27. data/docs/DWH/ExecutionError.html +143 -0
  28. data/docs/DWH/Factory.html +1133 -0
  29. data/docs/DWH/Functions/Arrays.html +505 -0
  30. data/docs/DWH/Functions/Dates.html +1644 -0
  31. data/docs/DWH/Functions/ExtractDatePart.html +804 -0
  32. data/docs/DWH/Functions/Nulls.html +377 -0
  33. data/docs/DWH/Functions.html +846 -0
  34. data/docs/DWH/Logger.html +258 -0
  35. data/docs/DWH/OAuthError.html +138 -0
  36. data/docs/DWH/Settings.html +658 -0
  37. data/docs/DWH/StreamingStats.html +804 -0
  38. data/docs/DWH/Table.html +1260 -0
  39. data/docs/DWH/TableStats.html +583 -0
  40. data/docs/DWH/TokenExpiredError.html +142 -0
  41. data/docs/DWH/UnsupportedCapability.html +135 -0
  42. data/docs/DWH.html +220 -0
  43. data/docs/_index.html +471 -0
  44. data/docs/class_list.html +54 -0
  45. data/docs/css/common.css +1 -0
  46. data/docs/css/full_list.css +58 -0
  47. data/docs/css/style.css +503 -0
  48. data/docs/file.README.html +210 -0
  49. data/docs/file.adapters.html +514 -0
  50. data/docs/file.creating-adapters.html +497 -0
  51. data/docs/file.getting-started.html +288 -0
  52. data/docs/file.usage.html +446 -0
  53. data/docs/file_list.html +79 -0
  54. data/docs/frames.html +22 -0
  55. data/docs/guides/adapters.md +445 -0
  56. data/docs/guides/creating-adapters.md +430 -0
  57. data/docs/guides/getting-started.md +225 -0
  58. data/docs/guides/usage.md +378 -0
  59. data/docs/index.html +210 -0
  60. data/docs/js/app.js +344 -0
  61. data/docs/js/full_list.js +242 -0
  62. data/docs/js/jquery.js +4 -0
  63. data/docs/method_list.html +2038 -0
  64. data/docs/top-level-namespace.html +110 -0
  65. data/lib/dwh/adapters/athena.rb +359 -0
  66. data/lib/dwh/adapters/druid.rb +267 -0
  67. data/lib/dwh/adapters/duck_db.rb +235 -0
  68. data/lib/dwh/adapters/my_sql.rb +235 -0
  69. data/lib/dwh/adapters/open_authorizable.rb +215 -0
  70. data/lib/dwh/adapters/postgres.rb +250 -0
  71. data/lib/dwh/adapters/snowflake.rb +489 -0
  72. data/lib/dwh/adapters/sql_server.rb +257 -0
  73. data/lib/dwh/adapters/trino.rb +213 -0
  74. data/lib/dwh/adapters.rb +363 -0
  75. data/lib/dwh/behaviors.rb +67 -0
  76. data/lib/dwh/capabilities.rb +39 -0
  77. data/lib/dwh/column.rb +79 -0
  78. data/lib/dwh/errors.rb +29 -0
  79. data/lib/dwh/factory.rb +125 -0
  80. data/lib/dwh/functions/arrays.rb +42 -0
  81. data/lib/dwh/functions/dates.rb +162 -0
  82. data/lib/dwh/functions/extract_date_part.rb +70 -0
  83. data/lib/dwh/functions/nulls.rb +31 -0
  84. data/lib/dwh/functions.rb +86 -0
  85. data/lib/dwh/logger.rb +50 -0
  86. data/lib/dwh/settings/athena.yml +77 -0
  87. data/lib/dwh/settings/base.yml +81 -0
  88. data/lib/dwh/settings/databricks.yml +51 -0
  89. data/lib/dwh/settings/druid.yml +59 -0
  90. data/lib/dwh/settings/duckdb.yml +44 -0
  91. data/lib/dwh/settings/mysql.yml +67 -0
  92. data/lib/dwh/settings/postgres.yml +30 -0
  93. data/lib/dwh/settings/redshift.yml +52 -0
  94. data/lib/dwh/settings/snowflake.yml +45 -0
  95. data/lib/dwh/settings/sqlserver.yml +80 -0
  96. data/lib/dwh/settings/trino.yml +77 -0
  97. data/lib/dwh/settings.rb +79 -0
  98. data/lib/dwh/streaming_stats.rb +69 -0
  99. data/lib/dwh/table.rb +105 -0
  100. data/lib/dwh/table_stats.rb +51 -0
  101. data/lib/dwh/version.rb +5 -0
  102. data/lib/dwh.rb +54 -0
  103. data/sig/dwh.rbs +4 -0
  104. metadata +231 -0
data/lib/dwh/adapters/druid.rb
@@ -0,0 +1,267 @@
+ module DWH
+   module Adapters
+     # Druid adapter.
+     #
+     # Generally, adapters should be created using {DWH::Factory#create DWH.create}, where a
+     # configuration is passed in as an options hash or argument list.
+     #
+     # @example Basic connection with only the required options
+     #   DWH.create(:druid, { host: 'localhost', port: 8080, protocol: 'http' })
+     #
+     # @example Connect with SSL and basic authorization
+     #   DWH.create(:druid, { host: 'localhost', port: 8080, protocol: 'https',
+     #     basic_auth: 'BASE_64 encoded authorization key'
+     #   })
+     #
+     # @example Sending custom client name and user information
+     #   DWH.create(:druid, { host: 'localhost', port: 8080,
+     #     client_name: 'Strata CLI', extra_connection_params: {
+     #       context: {
+     #         user: 'Ajo',
+     #         team: 'Engineering'
+     #       }
+     #     }})
+     class Druid < Adapter
+       DRUID_STATUS = '/status'.freeze
+       DRUID_DATASOURCES = '/druid/coordinator/v1/datasources'.freeze
+       DRUID_SQL = '/druid/v2/sql/'.freeze
+       COLUMNS_FOR_TABLE = '"COLUMN_NAME", "DATA_TYPE", "NUMERIC_PRECISION", "NUMERIC_SCALE", "CHARACTER_MAXIMUM_LENGTH"'.freeze
+
+       config :protocol, String, required: true, default: 'http', message: 'must be http or https', allowed: %w[http https]
+       config :host, String, required: true, message: 'server host ip address or domain name'
+       config :port, Integer, required: true, default: 8081, message: 'port to connect to'
+       config :query_timeout, Integer, required: false, default: 600, message: 'query execution timeout in seconds'
+       config :open_timeout, Integer, required: false, default: nil, message: 'how long to wait to connect'
+       config :client_name, String, default: 'DWH Ruby Gem', message: 'client_name will be passed in the context object'
+       config :basic_auth, String, required: false, message: 'authorization key sent in the header'
+
+       # (see Adapter#connection)
+       def connection
+         return @connection if @connection
+
+         @connection = Faraday.new(
+           url: "#{config[:protocol]}://#{config[:host]}:#{config[:port]}",
+           headers: {
+             'Content-Type' => 'application/json',
+             **(config[:basic_auth] ? { 'Authorization' => "Basic #{config[:basic_auth]}" } : {})
+           },
+           request: {
+             timeout: config[:query_timeout],
+             open_timeout: config[:open_timeout],
+             context: {
+               client_name: config[:client_name]
+             }
+           }.merge(extra_connection_params)
+         )
+
+         @connection
+       end
+
+       # (see Adapter#test_connection)
+       def test_connection(raise_exception: false)
+         res = connection.get(DRUID_STATUS)
+         unless res.success?
+           raise ConnectionError, res.body if raise_exception
+
+           # return, so a failed status check reports false instead of true
+           return false
+         end
+
+         true
+       rescue Faraday::ConnectionFailed => e
+         raise ConnectionError, e.message if raise_exception
+
+         false
+       end
+
+       # (see Adapter#tables)
+       def tables
+         resp = connection.get(DRUID_DATASOURCES) do |req|
+           req.options.timeout = 30
+         end
+         JSON.parse resp.body
+       end
+
+       # The date column defaults to __time. If the datasource
+       # does not have a date column, set it to nil.
+       # @param table [String] table name
+       # @param date_column [String] optional date column
+       # @see Adapter#stats
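+       # @example Row counts for a datasource without a date column (illustrative sketch; `adapter` and the datasource name are assumed)
+       #   adapter.stats('wikipedia', date_column: nil)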
+       def stats(table, date_column: '__time')
+         sql = <<-SQL
+           SELECT
+             count(*) ROW_COUNT
+             #{date_column.nil? ? nil : ", min(#{date_column}) DATE_START"}
+             #{date_column.nil? ? nil : ", max(#{date_column}) DATE_END"}
+           FROM "#{table}"
+         SQL
+
+         result = execute(sql)
+
+         TableStats.new(
+           row_count: result[0][0],
+           date_start: result[0][1],
+           date_end: result[0][2]
+         )
+       end
+
+       # Marks segments of a datasource/table within the given interval as unused.
+       # @param table [String] datasource/table name
+       # @param interval [String] date interval in the format of from_date/to_date
+       #   as valid ISO timestamps
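+       # @example Mark a month of segments as unused (illustrative values, not from the source)
+       #   adapter.drop_unused_segments('wikipedia', '2024-01-01/2024-02-01')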
+       def drop_unused_segments(table, interval)
+         url = "/druid/coordinator/v1/datasources/#{table}/markUnused"
+
+         logger.debug '=== Dropping Segments ==='
+
+         response = connection.post(url) do |req|
+           req.headers['Content-Type'] = 'application/json'
+           req.body = { interval: interval }.to_json
+         end
+
+         logger.debug response.status
+       end
+
+       # (see Adapter#metadata)
+       def metadata(table)
+         sql = <<-SQL
+           SELECT #{COLUMNS_FOR_TABLE} FROM INFORMATION_SCHEMA.COLUMNS
+           WHERE TABLE_SCHEMA = 'druid' AND TABLE_NAME = '#{table}'
+         SQL
+
+         stats = stats(table)
+         db_table = Table.new table, table_stats: stats
+         cols = execute(sql, format: :object)
+         st = table_druid_schema_types(table, stats.date_end)
+
+         cols.each do |col|
+           db_table << Column.new(
+             name: col['COLUMN_NAME'],
+             schema_type: st[:metrics].include?(col['COLUMN_NAME']) ? 'measure' : 'dimension',
+             data_type: col['DATA_TYPE'],
+             precision: col['NUMERIC_PRECISION'],
+             scale: col['NUMERIC_SCALE'],
+             max_char_length: col['CHARACTER_MAXIMUM_LENGTH']
+           )
+         end
+
+         db_table
+       end
+
+       # (see Adapter#execute)
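+       # @example Result formats (illustrative sketch; behavior per the body below)
+       #   adapter.execute('SELECT 1')                  # parsed JSON rows as arrays
+       #   adapter.execute('SELECT 1', format: :object) # parsed JSON rows as hashes
+       #   adapter.execute('SELECT 1', format: :csv)    # raw CSV response body
+       #   adapter.execute('SELECT 1', format: :native) # the Faraday response itself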
+       def execute(sql, format: :array, retries: 0)
+         format = format.to_sym
+         result_format = format == :native ? 'array' : format.to_s
+         resp = with_debug(sql) do
+           with_retry(retries) do
+             connection.post(DRUID_SQL) do |req|
+               req.headers['Content-Type'] = 'application/json'
+               req.body = {
+                 query: sql,
+                 resultFormat: result_format,
+                 context: { sqlTimeZone: 'Etc/UTC' }
+               }.merge(extra_query_params).to_json
+             end
+           end
+         end
+
+         raise ExecutionError, "Could not execute #{sql}: \n #{resp.body}" if resp.status != 200
+
+         if format == :native
+           resp
+         else
+           format == :csv ? resp.body : JSON.parse(resp.body)
+         end
+       end
+
+       # (see Adapter#execute_stream)
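+       # @example Stream results into a StringIO (illustrative sketch; `adapter` and the query are assumed)
+       #   io = adapter.execute_stream('SELECT * FROM wikipedia', StringIO.new)
+       #   io.read # => CSV with a header row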
+       def execute_stream(sql, io, stats: nil, retries: 0)
+         resp = with_debug(sql) do
+           with_retry(retries) do
+             connection.post(DRUID_SQL) do |req|
+               req.headers['Content-Type'] = 'application/json'
+               req.body = {
+                 query: sql,
+                 resultFormat: 'csv',
+                 header: true
+                 # sqlTimeZone was added to the context due to a druid bug
+                 # where date sub query joins failed without it.
+                 # context: { sqlTimeZone: 'Etc/UTC' }
+               }.merge(extra_query_params).to_json
+
+               parseable_row = ''
+               req.options.on_data = proc do |chunk, _|
+                 handle_streaming_chunk(io, chunk, stats, parseable_row)
+               end
+             end
+           end
+         end
+
+         io.rewind
+         # Raise exception on failed runs
+         raise ExecutionError, io.read unless resp.success?
+
+         io
+       end
+
+       # (see Adapter#stream)
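+       # @example Process CSV chunks as they arrive (illustrative sketch)
+       #   adapter.stream('SELECT * FROM wikipedia') { |chunk| print chunk }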
+       def stream(sql, &block)
+         on_data_calls = 0
+         with_debug(sql) do
+           connection.post(DRUID_SQL) do |req|
+             req.headers['Content-Type'] = 'application/json'
+             req.body = { query: sql, resultFormat: 'csv' }.to_json
+             req.options.on_data = proc do |chunk, _chunk_size|
+               block.call chunk.force_encoding('utf-8')
+               on_data_calls += 1
+             end
+           end
+         end
+
+         on_data_calls
+       end
+
+       protected
+
+       def table_druid_schema_types(table, last_interval_start_date)
+         end_date = last_interval_start_date + 1
+         start_date = last_interval_start_date
+         url_friendly_interval = "#{start_date.strftime('%Y-%m-%d')}_#{end_date.strftime('%Y-%m-%d')}"
+         url = "/druid/coordinator/v1/datasources/#{table}/intervals/#{url_friendly_interval}?full"
+
+         resp = connection.get(url) do |req|
+           req.options.timeout = 30
+         end
+
+         raise ExecutionError, "Could not fetch druid schema types: \n #{resp.body}" if resp.status != 200
+
+         res = JSON.parse(resp.body)
+         meta = res.flatten[1].flatten(4)[1]['metadata']
+         {
+           dimensions: meta['dimensions'].split(','),
+           metrics: meta['metrics'].split(',')
+         }
+       end
+
+       def handle_streaming_chunk(io, chunk, stats, parseable_row)
+         io.write chunk.rstrip.force_encoding('utf-8')
+
+         # append in place (<<) so the caller's buffer accumulates
+         # across chunks; += would only rebind the local variable
+         parseable_row << chunk
+         process_streaming_rows(parseable_row, chunk, stats)
+       end
+
+       def process_streaming_rows(parseable_row, chunk, stats)
+         return if stats.nil? || stats&.limit_reached?
+
+         rows = CSV.parse(parseable_row, skip_blanks: true)
+         rows.each_with_index do |row, index|
+           # skip header rows in stats collector
+           stats << row unless index.zero? && stats.total_rows.zero?
+         end
+         parseable_row.clear
+       rescue CSV::MalformedCSVError
+         logger.debug("Unparseable:\n #{chunk}")
+       end
+     end
+   end
+ end
data/lib/dwh/adapters/duck_db.rb
@@ -0,0 +1,235 @@
+ module DWH
+   module Adapters
+     # DuckDb adapter.
+     #
+     # This requires the {https://github.com/suketa/ruby-duckdb ruby-duckdb} gem. Installation
+     # is a bit complex. Please follow the guide on the gem's page to make sure
+     # you have DuckDb installed as required before installing the gem.
+     #
+     # Generally, adapters should be created using {DWH::Factory#create DWH.create}, where a
+     # configuration is passed in as an options hash or argument list.
+     #
+     # @example Basic connection with only the required options
+     #   DWH.create(:duckdb, { file: 'path/to/my/duckdb' })
+     #
+     # @example Open in read-only mode ({https://duckdb.org/docs/stable/configuration/overview#configuration-reference config docs})
+     #   DWH.create(:duckdb, { file: 'path/to/my/duckdb', duck_config: { access_mode: 'READ_ONLY' } })
+     class DuckDb < Adapter
+       config :file, String, required: true, message: 'path/to/duckdb/db'
+       config :schema, String, required: false, default: 'main', message: 'schema defaults to main'
+       config :duck_config, Hash, required: false, message: 'hash of valid DuckDb configuration options'
+
+       # (see Adapter#connection)
+       def connection
+         return @connection if @connection
+
+         if self.class.databases.key?(config[:file])
+           @db = self.class.databases[config[:file]]
+         else
+           ducked_config = DuckDB::Config.new
+           if config.key?(:duck_config)
+             config[:duck_config].each do |key, val|
+               ducked_config[key.to_s] = val
+             end
+           end
+           @db = DuckDB::Database.open(config[:file], ducked_config)
+           self.class.databases[config[:file]] = @db
+         end
+
+         @connection = @db.connect
+
+         @connection
+       rescue StandardError => e
+         raise ConfigError, e.message
+       end
+
+       def self.databases
+         @databases ||= {}
+       end
+
+       def self.open_databases
+         databases.size
+       end
+
+       # DuckDB is an in-process database, so we don't want to
+       # open multiple instances of the same db in memory. Rather,
+       # we open one instance but many connections. Use this
+       # method to close them all.
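+       # @example Close every open database (illustrative sketch)
+       #   DWH::Adapters::DuckDb.close_all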
+       def self.close_all
+         databases.each do |key, db|
+           db.close
+           databases.delete(key)
+         end
+       end
+
+       # This disconnects the current connection, but
+       # the db is still in process and can be reconnected to.
+       #
+       # (see Adapter#close)
+       def close
+         connection.disconnect
+         @connection = nil
+       end
+
+       # (see Adapter#test_connection)
+       def test_connection(raise_exception: false)
+         connection
+         true
+       rescue StandardError => e
+         raise ConnectionError, e.message if raise_exception
+
+         false
+       end
+
+       # (see Adapter#tables)
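+       # @example List tables in a schema (illustrative sketch; qualifier values are assumed)
+       #   adapter.tables(schema: 'main')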
+       def tables(**qualifiers)
+         catalog, schema = qualifiers.values_at(:catalog, :schema)
+         sql = 'SELECT table_name FROM duckdb_tables'
+
+         where = []
+         where << "database_name = '#{catalog}'" if catalog
+
+         where << if schema
+                    "schema_name = '#{schema}'"
+                  else
+                    "schema_name = '#{config[:schema]}'"
+                  end
+
+         res = execute("#{sql} WHERE #{where.join(' AND ')}")
+         res.flatten
+       end
+
+       # (see Adapter#stats)
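+       # @example Stats with a date column (illustrative sketch; table and column names are assumed)
+       #   adapter.stats('orders', date_column: 'created_at', schema: 'main')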
+       def stats(table, date_column: nil, **qualifiers)
+         qualifiers[:schema] = config[:schema] unless qualifiers[:schema]
+         db_table = Table.new table, **qualifiers
+
+         sql = <<-SQL
+           SELECT count(*) ROW_COUNT
+           #{date_column.nil? ? nil : ", min(#{date_column}) DATE_START"}
+           #{date_column.nil? ? nil : ", max(#{date_column}) DATE_END"}
+           FROM #{db_table.fully_qualified_table_name}
+         SQL
+
+         result = execute(sql)
+         TableStats.new(
+           row_count: result.first[0],
+           date_start: result.first[1],
+           date_end: result.first[2]
+         )
+       end
+
+       # (see Adapter#metadata)
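+       # @example Fetch table structure (illustrative sketch; table name is assumed)
+       #   adapter.metadata('orders', schema: 'main')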
+       def metadata(table, **qualifiers)
+         db_table = Table.new table, **qualifiers
+         sql = 'SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale FROM duckdb_columns'
+
+         where = ["table_name = '#{db_table.physical_name}'"]
+         where << "database_name = '#{db_table.catalog}'" if db_table.catalog
+
+         where << if db_table.schema
+                    "schema_name = '#{db_table.schema}'"
+                  else
+                    "schema_name = '#{config[:schema]}'"
+                  end
+
+         cols = execute("#{sql} WHERE #{where.join(' AND ')}")
+         cols.each do |col|
+           db_table << Column.new(
+             name: col[0],
+             data_type: col[1],
+             precision: col[3],
+             scale: col[4],
+             max_char_length: col[2]
+           )
+         end
+
+         db_table
+       end
+
+       # True if the configuration was set up with a schema.
+       def schema?
+         config[:schema].present?
+       end
+
+       # (see Adapter#execute)
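+       # @example Result formats (illustrative sketch; behavior per the body below)
+       #   adapter.execute('SELECT 42')                  # nested arrays
+       #   adapter.execute('SELECT 42', format: :object) # array of hashes
+       #   adapter.execute('SELECT 42', format: :csv)    # CSV string
+       #   adapter.execute('SELECT 42', format: :native) # the raw DuckDB result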
+       def execute(sql, format: :array, retries: 0)
+         begin
+           result = with_debug(sql) { with_retry(retries) { connection.query(sql) } }
+         rescue StandardError => e
+           raise ExecutionError, e.message
+         end
+
+         format = format.downcase if format.is_a?(String)
+         case format.to_sym
+         when :array
+           result.to_a
+         when :object
+           result_to_hash(result)
+         when :csv
+           result_to_csv(result)
+         when :native
+           result
+         else
+           raise UnsupportedCapability, "Unsupported format: #{format} for this #{name}"
+         end
+       end
+
+       # (see Adapter#execute_stream)
+       def execute_stream(sql, io, stats: nil, retries: 0)
+         with_debug(sql) do
+           with_retry(retries) do
+             result = connection.query(sql)
+             io.write(CSV.generate_line(result.columns.map(&:name)))
+             result.each do |row|
+               stats << row unless stats.nil?
+               io.write(CSV.generate_line(row))
+             end
+           end
+         end
+
+         io.rewind
+         io
+       rescue StandardError => e
+         raise ExecutionError, e.message
+       end
+
+       # (see Adapter#stream)
+       def stream(sql, &block)
+         with_debug(sql) do
+           result = connection.query(sql)
+           result.each do |row|
+             block.call(row)
+           end
+         end
+       end
+
+       def valid_config?
+         super
+         require 'duckdb'
+       rescue LoadError
+         raise ConfigError, "Required 'duckdb' gem missing. Please add it to your Gemfile."
+       end
+
+       private
+
+       def result_to_hash(result)
+         columns = result.columns.map(&:name)
+
+         result.each.map do |row|
+           columns.zip(row).to_h
+         end
+       end
+
+       def result_to_csv(result)
+         CSV.generate do |csv|
+           csv << result.columns.map(&:name)
+           result.each do |row|
+             csv << row
+           end
+         end
+       end
+     end
+   end
+ end