sequelizer 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +64 -0
- data/.devcontainer/.p10k.zsh +1713 -0
- data/.devcontainer/.zshrc +29 -0
- data/.devcontainer/Dockerfile +137 -0
- data/.devcontainer/copy-claude-credentials.sh +32 -0
- data/.devcontainer/devcontainer.json +102 -0
- data/.devcontainer/init-firewall.sh +123 -0
- data/.devcontainer/setup-credentials.sh +95 -0
- data/.github/workflows/test.yml +1 -1
- data/.gitignore +6 -1
- data/.overcommit.yml +73 -0
- data/.rubocop.yml +167 -0
- data/CHANGELOG.md +24 -0
- data/CLAUDE.md +219 -0
- data/Gemfile +6 -2
- data/Gemfile.lock +158 -0
- data/Guardfile +1 -1
- data/Rakefile +28 -3
- data/lib/sequel/extensions/cold_col.rb +436 -0
- data/lib/sequel/extensions/db_opts.rb +65 -4
- data/lib/sequel/extensions/make_readyable.rb +148 -30
- data/lib/sequel/extensions/more_sql.rb +76 -0
- data/lib/sequel/extensions/settable.rb +64 -0
- data/lib/sequel/extensions/sql_recorder.rb +85 -0
- data/lib/sequel/extensions/unionize.rb +169 -0
- data/lib/sequel/extensions/usable.rb +30 -1
- data/lib/sequelizer/cli.rb +61 -18
- data/lib/sequelizer/connection_maker.rb +54 -72
- data/lib/sequelizer/env_config.rb +6 -6
- data/lib/sequelizer/gemfile_modifier.rb +23 -21
- data/lib/sequelizer/monkey_patches/database_in_after_connect.rb +7 -5
- data/lib/sequelizer/options.rb +97 -18
- data/lib/sequelizer/options_hash.rb +2 -0
- data/lib/sequelizer/version.rb +3 -1
- data/lib/sequelizer/yaml_config.rb +9 -3
- data/lib/sequelizer.rb +65 -9
- data/sequelizer.gemspec +12 -7
- data/test/lib/sequel/extensions/test_cold_col.rb +251 -0
- data/test/lib/sequel/extensions/test_db_opts.rb +10 -8
- data/test/lib/sequel/extensions/test_make_readyable.rb +199 -28
- data/test/lib/sequel/extensions/test_more_sql.rb +132 -0
- data/test/lib/sequel/extensions/test_settable.rb +109 -0
- data/test/lib/sequel/extensions/test_sql_recorder.rb +231 -0
- data/test/lib/sequel/extensions/test_unionize.rb +76 -0
- data/test/lib/sequel/extensions/test_usable.rb +5 -2
- data/test/lib/sequelizer/test_connection_maker.rb +21 -17
- data/test/lib/sequelizer/test_env_config.rb +5 -2
- data/test/lib/sequelizer/test_gemfile_modifier.rb +7 -6
- data/test/lib/sequelizer/test_options.rb +14 -9
- data/test/lib/sequelizer/test_yaml_config.rb +13 -12
- data/test/test_helper.rb +36 -8
- metadata +107 -28
- data/lib/sequel/extensions/sqls.rb +0 -31
@@ -1,60 +1,101 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
1
3
|
module Sequel
|
4
|
+
|
5
|
+
# = MakeReadyable
|
6
|
+
#
|
7
|
+
# Sequel extension that provides database readiness functionality,
|
8
|
+
# primarily geared towards Spark SQL-based databases. This extension
|
9
|
+
# allows setting up temporary views and schema configurations to prepare
|
10
|
+
# a database for use.
|
11
|
+
#
|
12
|
+
# @example Basic schema usage
|
13
|
+
# db.extension :make_readyable
|
14
|
+
# db.make_ready(use_schema: :my_schema)
|
15
|
+
#
|
16
|
+
# @example Search path with schema precedence
|
17
|
+
# db.make_ready(search_path: [:schema1, :schema2])
|
18
|
+
#
|
19
|
+
# @example External file sources
|
20
|
+
# db.make_ready(search_path: [Pathname.new('data.parquet')])
|
2
21
|
module MakeReadyable
|
3
|
-
|
4
|
-
#
|
22
|
+
|
23
|
+
# Prepares the database by setting up schemas, views, and external data sources.
|
5
24
|
#
|
25
|
+
# This method is primarily geared towards Spark SQL-based databases.
|
6
26
|
# Given some options, prepares a set of views to represent a set
|
7
27
|
# of tables across a collection of different schemas and external,
|
8
28
|
# unmanaged tables.
|
9
29
|
#
|
30
|
+
# @param opts [Hash] the options used to prepare the database
|
31
|
+
# @option opts [Symbol] :use_schema The schema to be used as the primary schema
|
32
|
+
# @option opts [Array] :search_path A set of symbols (schemas) or Pathnames (external files)
|
33
|
+
# @option opts [Array] :only Limit view creation to these tables only
|
34
|
+
# @option opts [Array] :except Skip view creation for these tables
|
35
|
+
#
|
36
|
+
# @example Set primary schema
|
10
37
|
# DB.make_ready(use_schema: :schema)
|
11
38
|
# # => USE `schema`
|
12
39
|
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
# the PATH variable for programs.
|
16
|
-
#
|
17
|
-
# Assuming the following tables: schema1.a, schema2.a, schema2.b
|
18
|
-
#
|
40
|
+
# @example Search path with precedence
|
41
|
+
# # Assuming tables: schema1.a, schema2.a, schema2.b
|
19
42
|
# DB.make_ready(search_path: [:schema1, :schema2])
|
20
43
|
# # => CREATE TEMPORARY VIEW `a` AS SELECT * FROM `schema1`.`a`;
|
21
44
|
# # => CREATE TEMPORARY VIEW `b` AS SELECT * FROM `schema2`.`b`;
|
22
45
|
#
|
23
|
-
#
|
24
|
-
# to try to read from the file.
|
25
|
-
#
|
46
|
+
# @example External file sources
|
26
47
|
# DB.make_ready(search_path: [Pathname.new("c.parquet"), Pathname.new("d.orc")])
|
27
48
|
# # => CREATE TEMPORARY VIEW `c` USING parquet OPTIONS ('path'='c.parquet')
|
28
49
|
# # => CREATE TEMPORARY VIEW `d` USING orc OPTIONS ('path'='d.orc')
|
29
|
-
#
|
30
|
-
# @param [Hash] opts the options used to prepare the database
|
31
|
-
# @option opts [String] :use_schema The schema to be used as the primary schema
|
32
|
-
# @option opts [Array] :search_path A set of sympbols (to represent schemas) or Pathnames (to represent externally managed data files)
|
33
50
|
def make_ready(opts = {})
  # All of the work is delegated to a ReadyMaker bound to this database;
  # whatever its #run returns is returned to the caller.
  maker = ReadyMaker.new(self, opts)
  maker.run
end
|
53
|
+
|
36
54
|
end
|
37
55
|
|
38
|
-
|
56
|
+
# = ReadyMaker
|
57
|
+
#
|
58
|
+
# Internal class that handles the actual database preparation logic.
|
59
|
+
# This class processes the make_ready options and executes the necessary
|
60
|
+
# SQL statements to set up schemas, views, and external data sources.
|
39
61
|
class ReadyMaker
|
62
|
+
|
63
|
+
# @!attribute [r] db
|
64
|
+
# @return [Sequel::Database] the database instance
|
65
|
+
# @!attribute [r] opts
|
66
|
+
# @return [Hash] the preparation options
|
40
67
|
attr_reader :db, :opts
|
41
68
|
|
69
|
+
# Builds a ReadyMaker that will prepare +db+ according to +opts+.
#
# Nothing is executed here; the arguments are only stored for #run.
#
# @param db [Sequel::Database] the database to prepare
# @param opts [Hash] the preparation options
def initialize(db, opts)
  @opts = opts
  @db = db
end
|
46
|
-
|
77
|
+
|
78
|
+
# Executes the database preparation process.
|
79
|
+
#
|
80
|
+
# This method handles:
|
81
|
+
# 1. Setting the primary schema if specified
|
82
|
+
# 2. Processing the search path to create views
|
83
|
+
# 3. Handling table filtering (only/except options)
|
47
84
|
def run
|
48
85
|
if opts[:use_schema]
|
49
86
|
db.extension :usable
|
50
87
|
db.use(opts[:use_schema])
|
51
88
|
end
|
52
89
|
only_tables = Array(opts[:only])
|
53
|
-
created_views =
|
54
|
-
(opts[:search_path] || []).each do |schema|
|
55
|
-
schema = schema.is_a?(Pathname)
|
90
|
+
created_views = Array(opts[:except]) || []
|
91
|
+
(opts[:search_path] || []).flatten.each do |schema|
|
92
|
+
schema = schema.to_sym unless schema.is_a?(Pathname)
|
56
93
|
source = get_source(db, schema)
|
57
|
-
tables =
|
94
|
+
tables = if schema.is_a?(Pathname)
|
95
|
+
source.tables - created_views
|
96
|
+
else
|
97
|
+
source.tables(schema: schema) - created_views
|
98
|
+
end
|
58
99
|
tables &= only_tables unless only_tables.empty?
|
59
100
|
tables.each do |table|
|
60
101
|
create_view(source, table, schema)
|
@@ -63,46 +104,123 @@ module Sequel
|
|
63
104
|
end
|
64
105
|
end
|
65
106
|
|
107
|
+
# Creates a temporary view for the given table.
#
# A file-backed source (FileSourcerer) defines the view from the file
# itself, so only the view name and the +temp+ flag are passed. For a
# schema-backed source (the database) the view selects from the
# schema-qualified table.
#
# The decision is made on the type of +source+ rather than by inspecting
# +schema+ again, so this method always agrees with whatever #get_source
# returned.
#
# @param source [Sequel::Database, FileSourcerer] the source returned by #get_source
# @param table [Symbol] the table name
# @param schema [Symbol, Pathname] the schema or file path
def create_view(source, table, schema)
  if source.is_a?(FileSourcerer)
    source.create_view(table, temp: true)
  else
    # For schema-based tables, just create temporary views.
    # This extension is primarily for Spark SQL-based databases.
    source.create_view(table, db[Sequel.qualify(schema, table)], temp: true)
  end
end

# Gets the appropriate source handler for the schema.
#
# Any Pathname is treated as an external file, including a relative one
# with no directory part such as <tt>Pathname.new('c.parquet')</tt>.
# Other values are treated as a file only when their string form
# contains a "/"; everything else is a schema served by the database.
#
# @param db [Sequel::Database] the database instance
# @param schema [Symbol, Pathname] the schema or file path
# @return [Sequel::Database, FileSourcerer] the source handler
def get_source(db, schema)
  file_source = schema.is_a?(Pathname) || schema.to_s.include?('/')
  return db unless file_source

  FileSourcerer.new(db, Pathname.new(schema.to_s))
end
|
81
134
|
|
135
|
+
# = FileSourcerer
#
# Source handler for external data files (Parquet, ORC, etc.) named in a
# make_ready search path. It offers the same +tables+ / +create_view+ pair
# that ReadyMaker calls on a database, but backs the view with a file.
class FileSourcerer
  # @!attribute [r] db
  #   @return [Sequel::Database] the database instance
  # @!attribute [r] schema
  #   @return [Pathname] the file path
  attr_reader :db, :schema

  # Stores the database and the file path this source reads from.
  #
  # @param db [Sequel::Database] the database instance
  # @param schema [Pathname] the file path
  def initialize(db, schema)
    @schema = schema
    @db = db
  end

  # Lists the single "table" this file provides: the file name with its
  # extension removed.
  #
  # @param _opts [Hash] ignored; accepted so the call shape matches the database's
  # @return [Array<Symbol>] one-element array holding the table name
  def tables(_opts = {})
    stem = schema.basename(schema.extname)
    [stem.to_s.to_sym]
  end

  # Creates a view named +table+ that reads from the external file.
  #
  # @param table [Symbol] the table/view name
  # @param opts [Hash] extra options; merged over the defaults on Spark,
  #   passed to (and ignored by) the DuckDB helper
  # @raise [Sequel::Error] when the database type is neither :spark nor :duckdb
  def create_view(table, opts = {})
    adapter = db.database_type
    if adapter == :spark
      # Spark gets the format via :using and the expanded path via :options.
      defaults = { temp: true, using: format, options: { path: schema.expand_path } }
      db.create_view(table, defaults.merge(opts))
    elsif adapter == :duckdb
      # DuckDB reads the file directly through a read_* table function.
      create_duckdb_view(table, opts)
    else
      raise Sequel::Error, "External file sources are not supported on #{db.database_type}"
    end
  end

  private

  # Creates a DuckDB view that selects from the matching read_* function.
  #
  # When the path is a directory, every file below it that carries the
  # directory's own extension is read via a "**/*.<format>" glob.
  #
  # @param table [Symbol] the table/view name
  # @param _opts [Hash] currently unused for DuckDB
  # @raise [Sequel::Error] when the extension is not parquet, csv or json
  def create_duckdb_view(table, _opts)
    absolute = schema.expand_path
    target = schema.directory? ? absolute.join('**').join("*.#{format}") : absolute
    file_path = target.to_s

    readers = { 'parquet' => :read_parquet, 'csv' => :read_csv_auto, 'json' => :read_json_auto }
    read_function = readers.fetch(format) do
      raise Sequel::Error, "Unsupported file format '#{format}' for DuckDB"
    end

    # A regular (non-temporary) view is created here. The original author's
    # note says DuckDB lacks TEMPORARY views -- TODO confirm against the
    # DuckDB version in use.
    db.create_view(table, db.from(Sequel.function(read_function, file_path)))
  end

  # Returns the file format, i.e. the extension without its leading dot.
  #
  # @return [String, nil] e.g. 'parquet' or 'orc'; nil when the path has no extension
  def format
    extension = schema.extname
    extension[1..]
  end
end
|
221
|
+
|
105
222
|
end
|
106
223
|
|
107
224
|
Database.register_extension(:make_readyable, MakeReadyable)
|
225
|
+
|
108
226
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Sequel
|
4
|
+
|
5
|
+
# Provides additional SQL helper methods for database operations.
|
6
|
+
#
|
7
|
+
# The more_sql extension adds convenience methods for SQL operations that
|
8
|
+
# aren't covered by Sequel's core functionality, particularly schema-related
|
9
|
+
# operations like CREATE SCHEMA.
|
10
|
+
#
|
11
|
+
# @example Load the extension
|
12
|
+
# DB.extension :more_sql
|
13
|
+
#
|
14
|
+
# @example Create a schema
|
15
|
+
# DB.create_schema(:analytics)
|
16
|
+
# # Executes: CREATE SCHEMA "analytics"
|
17
|
+
#
|
18
|
+
# @example Create schema if it doesn't exist
|
19
|
+
# DB.create_schema(:staging, if_not_exists: true)
|
20
|
+
# # Executes: CREATE SCHEMA IF NOT EXISTS "staging"
|
21
|
+
module MoreSql
|
22
|
+
|
23
|
+
# Creates a database schema.
#
# Builds a CREATE SCHEMA statement (optionally with IF NOT EXISTS, which
# makes the call safe to repeat) and executes it with +run+.
#
# @param schema_name [Symbol, String] The name of the schema to create
# @param opts [Hash] Options for schema creation
# @option opts [Boolean] :if_not_exists (false) Only create the schema if it doesn't already exist
#
# @return [Object] whatever +run+ returns
#
# @example Basic schema creation
#   DB.create_schema(:reports)
#   # Executes: CREATE SCHEMA "reports"
#
# @example Idempotent schema creation
#   DB.create_schema(:analytics, if_not_exists: true)
#   # Executes: CREATE SCHEMA IF NOT EXISTS "analytics"
def create_schema(schema_name, opts = {})
  statement = create_schema_sql(schema_name, opts)
  run(statement)
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# Generates the SQL for creating a schema.
#
# Builds a CREATE SCHEMA statement with the schema name rendered as an
# identifier and an optional IF NOT EXISTS clause.
#
# A plain String name is wrapped in Sequel.identifier first. Without that,
# +literal+ would render it as a quoted string value ('name') rather than
# as an identifier. Symbols, literal strings (Sequel.lit) and other
# Sequel expression objects are passed to +literal+ unchanged.
#
# @param schema_name [Symbol, String] The name of the schema to create
# @param opts [Hash] Options for schema creation
# @option opts [Boolean] :if_not_exists (false) Include IF NOT EXISTS clause
#
# @return [String] The CREATE SCHEMA SQL statement
#
# @example
#   create_schema_sql(:test, if_not_exists: true)
#   # => 'CREATE SCHEMA IF NOT EXISTS "test"'
#
# @example String names are identifiers too
#   create_schema_sql('user_data', {})
#   # => 'CREATE SCHEMA "user_data"'
def create_schema_sql(schema_name, opts)
  plain_string = schema_name.is_a?(String) && !schema_name.is_a?(Sequel::LiteralString)
  identifier = plain_string ? Sequel.identifier(schema_name) : schema_name

  parts = ['CREATE SCHEMA']
  parts << 'IF NOT EXISTS' if opts[:if_not_exists]
  parts << literal(identifier)
  parts.join(' ')
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
Database.register_extension(:more_sql, MoreSql)
|
75
|
+
|
76
|
+
end
|
@@ -1,5 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# The settable extension adds a convenient +set+ method to database connections
|
5
|
+
# for executing SET statements with key-value pairs. This is particularly useful
|
6
|
+
# for configuring database session parameters.
|
7
|
+
#
|
8
|
+
# DB.extension :settable
|
9
|
+
# DB.set(search_path: 'public', timezone: 'UTC')
|
10
|
+
# # Executes: SET search_path=public
|
11
|
+
# # SET timezone=UTC
|
12
|
+
#
|
13
|
+
# DB.set(work_mem: '256MB')
|
14
|
+
# # Executes: SET work_mem=256MB
|
15
|
+
#
|
16
|
+
# The extension works with any database adapter and supports various value types
|
17
|
+
# including strings, numbers, booleans, and nil values.
|
18
|
+
#
|
19
|
+
# Related module: Sequel::Settable
|
20
|
+
|
1
21
|
module Sequel
|
22
|
+
|
23
|
+
# The Settable module provides database configuration functionality through
|
24
|
+
# SET statements. When loaded as an extension, it adds the +set+ method to
|
25
|
+
# database connections.
|
2
26
|
module Settable
|
27
|
+
|
28
|
+
# Execute SET statements for the given options hash.
|
29
|
+
#
|
30
|
+
# Each key-value pair in the options hash is converted to a SET statement
|
31
|
+
# and executed against the database. Multiple options result in multiple
|
32
|
+
# SET statements being executed in sequence.
|
33
|
+
#
|
34
|
+
# @param opts [Hash] Hash of configuration options to set
|
35
|
+
# @option opts [Object] key The configuration parameter name
|
36
|
+
# @option opts [Object] value The value to set for the parameter
|
37
|
+
#
|
38
|
+
# @example Set a single parameter
|
39
|
+
# DB.set(timezone: 'UTC')
|
40
|
+
# # Executes: SET timezone=UTC
|
41
|
+
#
|
42
|
+
# @example Set multiple parameters
|
43
|
+
# DB.set(search_path: 'public', work_mem: '256MB')
|
44
|
+
# # Executes: SET search_path=public
|
45
|
+
# # SET work_mem=256MB
|
46
|
+
#
|
47
|
+
# @example Different value types
|
48
|
+
# DB.set(port: 5432, autocommit: true, custom_setting: nil)
|
49
|
+
# # Executes: SET port=5432
|
50
|
+
# # SET autocommit=true
|
51
|
+
# # SET custom_setting=
|
52
|
+
#
|
53
|
+
# @return [void]
|
3
54
|
def set(opts = {})
|
4
55
|
set_sql(opts).each do |sql|
|
5
56
|
run(sql)
|
@@ -8,10 +59,23 @@ module Sequel
|
|
8
59
|
|
9
60
|
private
|
10
61
|
|
62
|
+
# Builds one SET statement per entry of the options hash.
#
# Keys and values are interpolated as-is (via their string form); no
# quoting or escaping is applied. Used internally by +set+.
#
# @param opts [Hash] Hash of options to convert to SET statements
# @return [Array<String>] Array of SET SQL statement strings, in hash order
#
# @example
#   set_sql(timezone: 'UTC', port: 5432)
#   # => ["SET timezone=UTC", "SET port=5432"]
def set_sql(opts)
  opts.each_with_object([]) do |(setting, value), statements|
    statements << "SET #{setting}=#{value}"
  end
end
|
76
|
+
|
14
77
|
end
|
15
78
|
|
16
79
|
Database.register_extension(:settable, Settable)
|
80
|
+
|
17
81
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# == Overview
|
4
|
+
#
|
5
|
+
# The sql_recorder extension records each SQL statement sent to the database
|
6
|
+
# in a thread-safe array accessible via the +sql_recorder+ method.
|
7
|
+
#
|
8
|
+
# == Usage
|
9
|
+
#
|
10
|
+
# DB.extension :sql_recorder
|
11
|
+
# DB[:users].all
|
12
|
+
# DB[:posts].where(id: 1).first
|
13
|
+
#
|
14
|
+
# # Access recorded SQL statements
|
15
|
+
# DB.sql_recorder
|
16
|
+
# # => ["SELECT * FROM users", "SELECT * FROM posts WHERE (id = 1) LIMIT 1"]
|
17
|
+
#
|
18
|
+
# # Clear the recorded statements
|
19
|
+
# DB.sql_recorder.clear
|
20
|
+
#
|
21
|
+
# == Thread Safety
|
22
|
+
#
|
23
|
+
# The extension is thread-safe and uses a mutex to synchronize access to the
|
24
|
+
# SQL recording array when multiple threads are executing queries simultaneously.
|
25
|
+
#
|
26
|
+
# == Compatibility
|
27
|
+
#
|
28
|
+
# This extension is designed to work alongside mock databases and other SQL
|
29
|
+
# recording mechanisms. It uses the method name +sql_recorder+ to avoid
|
30
|
+
# conflicts with existing +sqls+ methods that may be present in test frameworks.
|
31
|
+
#
|
32
|
+
# Related module: Sequel::SqlRecorder
|
33
|
+
|
34
|
+
module Sequel
|
35
|
+
|
36
|
+
# Extension module that adds SQL recording capabilities to Sequel databases.
|
37
|
+
# When included, it provides a +sql_recorder+ method that returns an array
|
38
|
+
# of all SQL statements executed against the database.
|
39
|
+
module SqlRecorder
|
40
|
+
|
41
|
+
# Returns the array of recorded SQL statements.
|
42
|
+
#
|
43
|
+
# The array accumulates all SQL statements sent to the database since the
|
44
|
+
# extension was loaded or since the last time +clear+ was called on the array.
|
45
|
+
#
|
46
|
+
# @return [Array<String>] array of SQL statement strings
|
47
|
+
# @example
|
48
|
+
# DB.extension :sql_recorder
|
49
|
+
# DB[:users].all
|
50
|
+
# DB.sql_recorder #=> ["SELECT * FROM users"]
|
51
|
+
attr_reader :sql_recorder
|
52
|
+
|
53
|
+
# Records the SQL statement, then hands off to the inherited
# +log_connection_yield+.
#
# The push onto the recording array happens inside the recorder mutex.
# The bare +super+ forwards the same arguments (and block) unchanged.
#
# @param sql [String] the SQL statement being executed
# @param conn [Object] the database connection object
# @param args [Object] additional arguments (optional)
# @return [Object] result from the parent +log_connection_yield+ method
def log_connection_yield(sql, conn, args = nil)
  @sql_recorder_mutex.synchronize do
    sql_recorder << sql
  end
  super
end
|
66
|
+
|
67
|
+
# Prepares a database for SQL recording when it is extended with this module.
#
# Gives the database a mutex and an empty recording array, keeping any
# that are already present (so loading the extension twice does not drop
# previously recorded statements).
#
# @param db [Sequel::Database] the database instance being extended
# @return [Array] the database's recording array
def self.extended(db)
  unless db.instance_variable_get(:@sql_recorder_mutex)
    db.instance_variable_set(:@sql_recorder_mutex, Mutex.new)
  end
  db.instance_variable_get(:@sql_recorder) || db.instance_variable_set(:@sql_recorder, [])
end
|
80
|
+
|
81
|
+
end
|
82
|
+
|
83
|
+
Database.register_extension(:sql_recorder, SqlRecorder)
|
84
|
+
|
85
|
+
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'digest'
|
4
|
+
|
5
|
+
module Sequel
|
6
|
+
|
7
|
+
# Provides efficient handling of large UNION operations.
|
8
|
+
#
|
9
|
+
# The unionize extension allows combining many datasets through UNION operations
|
10
|
+
# by chunking them into manageable temporary tables or views. This is particularly
|
11
|
+
# useful when dealing with databases that have limitations on the number of UNION
|
12
|
+
# operations in a single query (e.g., Spark SQL, DuckDB).
|
13
|
+
#
|
14
|
+
# @example Load the extension
|
15
|
+
# DB.extension :unionize
|
16
|
+
#
|
17
|
+
# @example Basic usage
|
18
|
+
# DB.unionize([dataset1, dataset2, dataset3, dataset4])
|
19
|
+
#
|
20
|
+
# @example With options
|
21
|
+
# DB.unionize(datasets, chunk_size: 50, all: true, temp_table_prefix: 'my_union')
|
22
|
+
module Unionize
|
23
|
+
|
24
|
+
# Handles the chunking and union of multiple datasets.
|
25
|
+
#
|
26
|
+
# This class manages the process of splitting a large collection of datasets
|
27
|
+
# into smaller chunks, creating temporary tables/views for each chunk, and
|
28
|
+
# then recursively combining them until a single unified dataset is produced.
|
29
|
+
class Unionizer
|
30
|
+
|
31
|
+
# Default number of datasets to combine in each chunk
|
32
|
+
DEFAULT_CHUNK_SIZE = 100
|
33
|
+
|
34
|
+
# One slice of the datasets being unionized.
#
# A chunk combines its datasets into a single UNION dataset, derives a
# name for it from the generated SQL, and can materialise it as a
# temporary view or table.
class Chunk
  # @!attribute [r] db
  #   @return [Sequel::Database] The database connection
  # @!attribute [r] dses
  #   @return [Array<Sequel::Dataset>] The datasets in this chunk
  # @!attribute [r] opts
  #   @return [Hash] Options for the union operation
  attr_reader :db, :dses, :opts

  # Stores the connection, the datasets of this chunk and the options.
  #
  # @param db [Sequel::Database] The database connection
  # @param dses [Array<Sequel::Dataset>] The datasets to combine
  # @param opts [Hash] Options for the union operation
  def initialize(db, dses, opts)
    @opts = opts
    @dses = dses
    @db = db
  end

  # Returns all datasets of this chunk folded together, left to right,
  # with Dataset#union. The :all and :from_self options are passed
  # through on every step, and the result is memoized.
  #
  # @return [Sequel::Dataset] The combined dataset
  def union
    @union ||= dses.reduce do |combined, dataset|
      combined.union(dataset, all: opts[:all], from_self: opts[:from_self])
    end
  end

  # Returns the name for this chunk's temporary table/view: the
  # configured prefix, an underscore, and the SHA1 hex digest of the
  # union's SQL. Identical SQL therefore always maps to the same name.
  #
  # @return [Symbol] The temporary table/view name
  def name
    @name ||= begin
      digest = Digest::SHA1.hexdigest(union.sql)
      :"#{opts[:temp_table_prefix]}_#{digest}"
    end
  end

  # Materialises this chunk's union under #name.
  #
  # - Spark: creates a temporary view
  # - DuckDB: creates a temporary table
  #
  # @raise [RuntimeError] If the database type is not supported
  # @return [Object] whatever the database's create call returns
  def create
    case db.database_type
    when :spark
      db.create_view(name, union, temp: true)
    when :duckdb
      db.create_table(name, temp: true, as: union)
    else
      raise "Unsupported database type: #{db.database_type}"
    end
  end
end
|
95
|
+
|
96
|
+
# @!attribute [r] db
|
97
|
+
# @return [Sequel::Database] The database connection
|
98
|
+
attr_reader :db
|
99
|
+
|
100
|
+
# Creates a new Unionizer instance.
#
# Defaults are filled into a private copy of +opts+, so the caller's hash
# is never modified and may be frozen.
#
# @param db [Sequel::Database] The database connection
# @param ds_set [Array<Sequel::Dataset>] The datasets to combine
# @param opts [Hash] Options for the union operation
# @option opts [Integer] :chunk_size (100) Number of datasets per chunk
# @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary tables
# @option opts [Boolean] :all (false) Use UNION ALL instead of UNION
# @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
def initialize(db, ds_set, opts = {})
  @db = db
  @ds_set = ds_set
  @opts = opts.dup
  # ||= means an explicit nil (or false) falls back to the default as well.
  @opts[:chunk_size] ||= DEFAULT_CHUNK_SIZE
  @opts[:temp_table_prefix] ||= 'temp_union'
  @opts[:all] ||= false
  # fetch keeps an explicit false/nil for :from_self; only a missing key becomes true.
  @opts[:from_self] = @opts.fetch(:from_self, true)
end
|
118
|
+
|
119
|
+
# Performs the unionization of datasets.
#
# The datasets are sliced into chunks of :chunk_size. A single chunk is
# returned directly as its union. Otherwise every chunk is materialised
# as a temporary table/view, and the method recurses over datasets that
# select from those, until one chunk remains.
#
# @param dses [Array<Sequel::Dataset>] The datasets to combine (defaults to @ds_set)
# @return [Sequel::Dataset] The final combined dataset
# @raise [ArgumentError] if there are no datasets to combine, or if more
#   than one dataset is given with a :chunk_size below 2
def unionize(dses = @ds_set)
  chunk_size = @opts[:chunk_size]
  chunks = dses.each_slice(chunk_size).map do |chunk_of_dses|
    Chunk.new(db, chunk_of_dses, @opts)
  end

  # With no chunks the recursion below would never reach its base case.
  raise ArgumentError, 'cannot unionize an empty set of datasets' if chunks.empty?
  return chunks.first.union if chunks.size == 1

  # One dataset per chunk gives back as many datasets as came in, so the
  # recursion would never shrink the set.
  raise ArgumentError, "chunk_size must be at least 2 (got #{chunk_size})" if chunk_size < 2

  unionize(chunks.each(&:create).map { |chunk| db[chunk.name] })
end
|
135
|
+
|
136
|
+
end
|
137
|
+
|
138
|
+
# Combines many datasets into one via UNION, chunking as needed.
#
# Delegates to a Unionizer bound to this database. The Unionizer groups
# the datasets into chunks, materialises intermediate results as
# temporary tables/views, and recurses until one dataset remains.
#
# @param ds_set [Array<Sequel::Dataset>] The datasets to combine via UNION
# @param opts [Hash] Options for the union operation
# @option opts [Integer] :chunk_size (100) Number of datasets to combine in each chunk
# @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary table names
# @option opts [Boolean] :all (false) Use UNION ALL instead of UNION (keeps duplicates)
# @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
#
# @return [Sequel::Dataset] The combined dataset
#
# @example Basic union of datasets
#   db.unionize([ds1, ds2, ds3, ds4])
#
# @example Union all with custom chunk size
#   db.unionize(datasets, all: true, chunk_size: 50)
#
# @example Custom temporary table prefix
#   db.unionize(datasets, temp_table_prefix: 'my_union_batch')
def unionize(ds_set, opts = {})
  unionizer = Unionizer.new(self, ds_set, opts)
  unionizer.unionize
end
|
164
|
+
|
165
|
+
end
|
166
|
+
|
167
|
+
Database.register_extension(:unionize, Unionize)
|
168
|
+
|
169
|
+
end
|