sequelizer 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.beads/.gitignore +54 -0
  3. data/.beads/.jsonl.lock +0 -0
  4. data/.beads/.migration-hint-ts +1 -0
  5. data/.beads/README.md +81 -0
  6. data/.beads/config.yaml +42 -0
  7. data/.beads/issues.jsonl +20 -0
  8. data/.beads/metadata.json +7 -0
  9. data/.coderabbit.yaml +94 -0
  10. data/.devcontainer/.p10k.zsh +1713 -0
  11. data/.devcontainer/.zshrc +29 -0
  12. data/.devcontainer/Dockerfile +137 -0
  13. data/.devcontainer/copy-claude-credentials.sh +32 -0
  14. data/.devcontainer/devcontainer.json +102 -0
  15. data/.devcontainer/init-firewall.sh +123 -0
  16. data/.devcontainer/setup-credentials.sh +95 -0
  17. data/.github/dependabot.yml +18 -0
  18. data/.github/workflows/dependabot-auto-merge.yml +36 -0
  19. data/.github/workflows/test.yml +44 -9
  20. data/.gitignore +6 -1
  21. data/.overcommit.yml +73 -0
  22. data/.rubocop.yml +167 -0
  23. data/AGENTS.md +126 -0
  24. data/CHANGELOG.md +41 -0
  25. data/CLAUDE.md +230 -0
  26. data/Gemfile +6 -2
  27. data/Gemfile.lock +189 -0
  28. data/Guardfile +1 -1
  29. data/Rakefile +28 -3
  30. data/config/platforms/base.csv +5 -0
  31. data/config/platforms/rdbms/athena.csv +4 -0
  32. data/config/platforms/rdbms/postgres.csv +3 -0
  33. data/config/platforms/rdbms/snowflake.csv +1 -0
  34. data/config/platforms/rdbms/spark.csv +3 -0
  35. data/lib/sequel/extensions/cold_col.rb +436 -0
  36. data/lib/sequel/extensions/db_opts.rb +65 -4
  37. data/lib/sequel/extensions/funky.rb +136 -0
  38. data/lib/sequel/extensions/make_readyable.rb +146 -30
  39. data/lib/sequel/extensions/more_sql.rb +76 -0
  40. data/lib/sequel/extensions/platform.rb +301 -0
  41. data/lib/sequel/extensions/settable.rb +64 -0
  42. data/lib/sequel/extensions/sql_recorder.rb +85 -0
  43. data/lib/sequel/extensions/unionize.rb +169 -0
  44. data/lib/sequel/extensions/usable.rb +30 -1
  45. data/lib/sequelizer/cli.rb +61 -18
  46. data/lib/sequelizer/connection_maker.rb +54 -72
  47. data/lib/sequelizer/env_config.rb +6 -6
  48. data/lib/sequelizer/gemfile_modifier.rb +23 -21
  49. data/lib/sequelizer/monkey_patches/database_in_after_connect.rb +7 -5
  50. data/lib/sequelizer/options.rb +102 -19
  51. data/lib/sequelizer/options_hash.rb +2 -0
  52. data/lib/sequelizer/version.rb +3 -1
  53. data/lib/sequelizer/yaml_config.rb +9 -4
  54. data/lib/sequelizer.rb +65 -9
  55. data/sequelizer.gemspec +20 -12
  56. data/test/lib/sequel/extensions/test_cold_col.rb +251 -0
  57. data/test/lib/sequel/extensions/test_db_opts.rb +10 -8
  58. data/test/lib/sequel/extensions/test_make_readyable.rb +198 -28
  59. data/test/lib/sequel/extensions/test_more_sql.rb +132 -0
  60. data/test/lib/sequel/extensions/test_platform.rb +222 -0
  61. data/test/lib/sequel/extensions/test_settable.rb +109 -0
  62. data/test/lib/sequel/extensions/test_sql_recorder.rb +231 -0
  63. data/test/lib/sequel/extensions/test_unionize.rb +76 -0
  64. data/test/lib/sequel/extensions/test_usable.rb +5 -2
  65. data/test/lib/sequelizer/test_connection_maker.rb +21 -17
  66. data/test/lib/sequelizer/test_env_config.rb +5 -2
  67. data/test/lib/sequelizer/test_gemfile_modifier.rb +7 -6
  68. data/test/lib/sequelizer/test_options.rb +42 -9
  69. data/test/lib/sequelizer/test_yaml_config.rb +13 -12
  70. data/test/test_helper.rb +37 -8
  71. metadata +196 -39
  72. data/lib/sequel/extensions/sqls.rb +0 -31
@@ -0,0 +1,301 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # The platform extension provides a unified interface for platform-specific
5
+ # database behavior. It uses KVCSV configuration files to define capabilities
6
+ # and preferences, with Ruby classes only for function translations.
7
+ #
8
+ # DB.extension :platform
9
+ # DB.platform.supports?(:cte) # => true (from config)
10
+ # DB.platform.prefers?(:cte) # => true (from config)
11
+ # DB.platform[:interim_schema] # => nil (from config)
12
+ # DB.platform.date_diff(from, to) # => Sequel expression (from code)
13
+ #
14
+ # Configuration is loaded from CSV files in config/platforms/:
15
+ # - base.csv: conservative defaults
16
+ # - rdbms/<adapter>.csv: RDBMS-specific overrides
17
+ # - Additional configs can be stacked via platform_configs option
18
+ #
19
+ # Related module: Sequel::Platform
20
+
21
+ require 'kvcsv'
22
+
23
+ module Sequel
24
+
25
+ # The Platform module provides database platform abstraction through
26
+ # configuration-driven capabilities and code-driven function translations.
27
+ module Platform
28
+
# Default platform implementation.
#
# Holds the database handle plus a configuration object and exposes capability
# lookups (+supports?+, +prefers?+), raw config access (+[]+, +fetch+) and
# generic SQL function translations. Platform subclasses override only the
# translation methods whose SQL differs.
class Base

  # @!attribute [r] db
  #   @return [Sequel::Database] the database passed to the constructor
  # @!attribute [r] config
  #   @return [Hash, KVCSV::Settings] an empty Hash when no config paths were
  #     given, otherwise the settings object built from those paths
  attr_reader :db, :config

  # Initialize platform with database connection and config paths.
  #
  # @param db [Sequel::Database] The database connection
  # @param config_paths [Array<String>] Paths to CSV config files, handed to
  #   KVCSV::Settings in the order given
  def initialize(db, *config_paths)
    @db = db
    @config = config_paths.empty? ? {} : KVCSV::Settings.new(*config_paths)
  end

  # Check if the platform supports a feature.
  #
  # Looks up the +supports_<feature>+ config key.
  #
  # @param feature [Symbol] Feature name (e.g., :cte, :temp_tables)
  # @return [Object, false] the configured value when truthy, otherwise false
  #
  # @example
  #   platform.supports?(:cte)         # checks supports_cte in config
  #   platform.supports?(:temp_tables) # checks supports_temp_tables in config
  def supports?(feature)
    config[:"supports_#{feature}"] || false
  end

  # Check if the platform prefers a feature (it may support but not prefer it).
  #
  # Looks up the +prefers_<feature>+ config key.
  #
  # @param feature [Symbol] Feature name (e.g., :cte, :parquet)
  # @return [Object, false] the configured value when truthy, otherwise false
  #
  # @example
  #   platform.prefers?(:cte)     # checks prefers_cte in config
  #   platform.prefers?(:parquet) # checks prefers_parquet in config
  def prefers?(feature)
    config[:"prefers_#{feature}"] || false
  end

  # Access arbitrary config values.
  #
  # @param key [Symbol] Config key
  # @return [Object] Whatever the config object returns for +key+
  #
  # @example
  #   platform[:interim_schema]
  #   platform[:schema_switching_method]
  def [](key)
    config[key]
  end

  # Fetch config value with default.
  #
  # Delegates to the config object's own +fetch+ when it has one. Otherwise
  # the value is read with +[]+ and the default is used only when that value
  # is nil, so a stored +false+ is returned as-is instead of being replaced.
  #
  # @param key [Symbol] Config key
  # @param default [Object] Default value if key not found
  # @return [Object] Config value or default
  def fetch(key, default = nil)
    return config.fetch(key, default) if config.respond_to?(:fetch)

    value = config[key]
    value.nil? ? default : value
  end

  # ---- Function translations (override in subclasses) ----

  # Calculate date difference between two dates.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::SQL::Expression] +datediff(from, to)+ function call
  def date_diff(from, to)
    Sequel.function(:datediff, from, to)
  end

  # Cast expression to date type.
  #
  # @param expr [Object] Expression to cast
  # @return [Sequel::SQL::Cast] Cast expression
  def cast_date(expr)
    Sequel.cast(expr, Date)
  end

  # Parse string to date with format.
  #
  # @param value [Object] String value to parse
  # @param format [String] Date format string
  # @return [Sequel::SQL::Expression] +to_date(value, format)+ function call
  def str_to_date(value, format)
    Sequel.function(:to_date, value, format)
  end

  # Calculate days between two dates.
  #
  # Delegates to +date_diff+, so a subclass that only overrides +date_diff+
  # changes this method too.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::SQL::Expression] Days between expression
  def days_between(from, to)
    date_diff(from, to)
  end

end
127
+
# PostgreSQL platform: expresses date differences as plain subtraction.
class Postgres < Base

  # Difference between two dates, written as +(to - from)+.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::LiteralString] literal SQL with both operands bound as placeholders
  def date_diff(from, to)
    subtraction = '(? - ?)'
    Sequel.lit(subtraction, to, from)
  end

  # Days between two dates; on this platform it is the very same subtraction
  # expression that +date_diff+ produces.
  alias days_between date_diff

end
142
+
# Spark SQL platform: overrides the date translations whose argument
# conventions differ from the base implementation.
class Spark < Base

  # Difference between two dates.
  #
  # The arguments are passed as (end, start), the reverse of the base class.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::SQL::Expression] +datediff(to, from)+ function call
  def date_diff(from, to)
    end_date = to
    start_date = from
    Sequel.function(:datediff, end_date, start_date)
  end

  # Parse a value into a date using +format+.
  #
  # The value is cast to a string before being handed to +to_date+.
  #
  # @param value [Object] Value to parse
  # @param format [String] Date format string
  # @return [Sequel::SQL::Expression] +to_date(CAST(value AS string), format)+ function call
  def str_to_date(value, format)
    as_string = Sequel.cast_string(value)
    Sequel.function(:to_date, as_string, format)
  end

end
156
+
# Snowflake platform: +datediff+ takes the unit as its first argument.
class Snowflake < Base

  # Difference between two dates in days.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::SQL::Expression] +datediff('day', from, to)+ function call
  def date_diff(from, to)
    unit = 'day'
    Sequel.function(:datediff, unit, from, to)
  end

  # Days between two dates; identical to +date_diff+ because that method
  # already fixes the unit to days.
  alias days_between date_diff

end
170
+
# Athena platform (also selected for the presto and trino schemes):
# uses the +date_diff+ SQL function with an explicit unit.
class Athena < Base

  # Difference between two dates in days.
  #
  # @param from [Symbol, Sequel::SQL::Expression] Start date
  # @param to [Symbol, Sequel::SQL::Expression] End date
  # @return [Sequel::SQL::Expression] +date_diff('day', from, to)+ function call
  def date_diff(from, to)
    unit = 'day'
    Sequel.function(:date_diff, unit, from, to)
  end

  # Days between two dates; identical to +date_diff+ because that method
  # already fixes the unit to days.
  alias days_between date_diff

end
184
+
# Adapter scheme => platform class. Written as one row per platform class
# with every scheme that selects it; schemes not listed here are expected to
# be handled by the caller's own default.
PLATFORM_CLASSES = [
  [Postgres, %i[postgres postgresql]],
  [Spark, %i[spark]],
  [Athena, %i[athena presto trino]],
  [Snowflake, %i[snowflake]],
].flat_map { |platform_class, schemes| schemes.map { |scheme| [scheme, platform_class] } }.to_h.freeze
195
+
# Adapter scheme => base name of its config file. Written as one row per
# config name with every scheme that shares it.
ADAPTER_CONFIG_NAMES = [
  ['postgres', %i[postgres postgresql]],
  ['spark', %i[spark]],
  ['athena', %i[athena presto trino]],
  ['snowflake', %i[snowflake]],
].flat_map { |config_name, schemes| schemes.map { |scheme| [scheme, config_name] } }.to_h.freeze
206
+
class << self

  # Lets callers (e.g. tests) point the module at a specific config directory.
  attr_writer :config_dir

  # Directory holding the platform CSV files.
  #
  # The result of the search is remembered once a directory has been found
  # or assigned; while it is still nil/false the search runs again on each call.
  #
  # @return [String, nil] config directory, or nil when none could be found
  def config_dir
    @config_dir = find_config_dir unless @config_dir
    @config_dir
  end

  private

  # Locate the config directory: first config/platforms three levels above
  # this file's directory, then config/platforms under the current working
  # directory.
  #
  # @return [String, nil] first candidate that is a directory, otherwise nil
  def find_config_dir
    bundled = File.expand_path('../../../config/platforms', __dir__)
    if File.directory?(bundled)
      bundled
    else
      # Only consult the working directory when the bundled path is absent.
      project_local = File.join(Dir.pwd, 'config/platforms')
      project_local if File.directory?(project_local)
    end
  end

end
232
+
# Build config paths for the given adapter.
#
# The result lists, in stacking order: base.csv, then rdbms/<name>.csv for
# adapters that have a config name, then the extra configs. Every entry is
# included only if the file exists; nothing is added from the config
# directory when no config directory is available.
#
# @param adapter_scheme [Symbol] Database adapter scheme
# @param extra_configs [Array<String>, String, nil] Additional config path(s)
#   to stack; a single path or nil is accepted as well as an array
# @return [Array<String>] Ordered config paths
def self.config_paths_for(adapter_scheme, extra_configs = [])
  paths = []

  dir = config_dir
  if dir
    candidates = [File.join(dir, 'base.csv')]
    adapter_name = ADAPTER_CONFIG_NAMES[adapter_scheme]
    candidates << File.join(dir, 'rdbms', "#{adapter_name}.csv") if adapter_name
    paths.concat(candidates.select { |path| File.exist?(path) })
  end

  # Array() lets a lone String (or nil) stand in for a list of paths.
  paths.concat(Array(extra_configs).select { |path| File.exist?(path) })
end
255
+
# Build the platform instance for a database.
#
# Resolves the effective adapter, picks the matching platform class (Base
# when the adapter has no dedicated class) and instantiates it with the
# config paths computed for that adapter.
#
# @param db [Sequel::Database] Database connection
# @param extra_configs [Array<String>] Additional config paths
# @return [Platform::Base] Platform instance
def self.build_platform(db, extra_configs = [])
  adapter = effective_adapter(db)

  PLATFORM_CLASSES
    .fetch(adapter, Base)
    .new(db, *config_paths_for(adapter, extra_configs))
end
267
+
# Detect the effective adapter for platform selection.
#
# Non-mock connections are identified by +adapter_scheme+. Mock connections
# report +:mock+ there, so the method falls back to +database_type+ and then
# to the +:host+ connection option. The host is converted to a Symbol because
# the adapter lookup tables are keyed by Symbol, while the option may be
# supplied as a String.
# NOTE(review): confirm how callers build mock connections (URL vs. options hash).
#
# @param db [Sequel::Database] Database connection
# @return [Symbol, nil] Effective adapter type; nil for a mock database that
#   has neither a specific database type nor a host option
def self.effective_adapter(db)
  scheme = db.adapter_scheme
  return scheme unless scheme == :mock

  # Mock databases: try database_type first (set for known types like postgres, spark)
  db_type = db.database_type
  return db_type if db_type && db_type != :mock

  # Fall back to host option (for unknown types like snowflake, athena)
  db.opts[:host]&.to_sym
end
283
+
# Extension hook, run when a database is extended with this module.
#
# Builds the platform for +db+, stacking any paths given in the
# +:platform_configs+ connection option, and stores it in the database's
# +@platform+ instance variable.
#
# @param db [Sequel::Database] the database being extended
def self.extended(db)
  instance = build_platform(db, db.opts[:platform_configs] || [])
  db.instance_variable_set(:@platform, instance)
end
289
+
# The platform instance stored on this database when the extension was loaded.
#
# @return [Platform::Base, nil] Platform instance for this database; nil if
#   none has been stored
def platform
  instance_variable_get(:@platform)
end
296
+
297
+ end
298
+
299
+ Database.register_extension(:platform, Platform)
300
+
301
+ end
@@ -1,5 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # The settable extension adds a convenient +set+ method to database connections
5
+ # for executing SET statements with key-value pairs. This is particularly useful
6
+ # for configuring database session parameters.
7
+ #
8
+ # DB.extension :settable
9
+ # DB.set(search_path: 'public', timezone: 'UTC')
10
+ # # Executes: SET search_path=public
11
+ # # SET timezone=UTC
12
+ #
13
+ # DB.set(work_mem: '256MB')
14
+ # # Executes: SET work_mem=256MB
15
+ #
16
+ # The extension works with any database adapter and supports various value types
17
+ # including strings, numbers, booleans, and nil values.
18
+ #
19
+ # Related module: Sequel::Settable
20
+
1
21
  module Sequel
22
+
23
+ # The Settable module provides database configuration functionality through
24
+ # SET statements. When loaded as an extension, it adds the +set+ method to
25
+ # database connections.
2
26
  module Settable
27
+
28
+ # Execute SET statements for the given options hash.
29
+ #
30
+ # Each key-value pair in the options hash is converted to a SET statement
31
+ # and executed against the database. Multiple options result in multiple
32
+ # SET statements being executed in sequence.
33
+ #
34
+ # @param opts [Hash] Hash of configuration options to set
35
+ # @option opts [Object] key The configuration parameter name
36
+ # @option opts [Object] value The value to set for the parameter
37
+ #
38
+ # @example Set a single parameter
39
+ # DB.set(timezone: 'UTC')
40
+ # # Executes: SET timezone=UTC
41
+ #
42
+ # @example Set multiple parameters
43
+ # DB.set(search_path: 'public', work_mem: '256MB')
44
+ # # Executes: SET search_path=public
45
+ # # SET work_mem=256MB
46
+ #
47
+ # @example Different value types
48
+ # DB.set(port: 5432, autocommit: true, custom_setting: nil)
49
+ # # Executes: SET port=5432
50
+ # # SET autocommit=true
51
+ # # SET custom_setting=
52
+ #
53
+ # @return [void]
3
54
  def set(opts = {})
4
55
  set_sql(opts).each do |sql|
5
56
  run(sql)
@@ -8,10 +59,23 @@ module Sequel
8
59
 
9
60
  private
10
61
 
# Build one SET statement per option.
#
# Keys and values are interpolated into the statement verbatim (no quoting
# or escaping is applied here), so a nil value yields an empty right-hand
# side. Private helper for +set+.
#
# @param opts [Hash] Options to convert to SET statements
# @return [Array<String>] SET statements, in the hash's iteration order
#
# @example
#   set_sql(timezone: 'UTC', port: 5432)
#   # => ["SET timezone=UTC", "SET port=5432"]
def set_sql(opts)
  opts.each_with_object([]) do |(setting, value), statements|
    statements << "SET #{setting}=#{value}"
  end
end
76
+
14
77
  end
15
78
 
16
79
  Database.register_extension(:settable, Settable)
80
+
17
81
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ # == Overview
4
+ #
5
+ # The sql_recorder extension records each SQL statement sent to the database
6
+ # in an array exposed via the +sql_recorder+ method; appends are serialized with a mutex.
7
+ #
8
+ # == Usage
9
+ #
10
+ # DB.extension :sql_recorder
11
+ # DB[:users].all
12
+ # DB[:posts].where(id: 1).first
13
+ #
14
+ # # Access recorded SQL statements
15
+ # DB.sql_recorder
16
+ # # => ["SELECT * FROM users", "SELECT * FROM posts WHERE (id = 1) LIMIT 1"]
17
+ #
18
+ # # Clear the recorded statements
19
+ # DB.sql_recorder.clear
20
+ #
21
+ # == Thread Safety
22
+ #
23
+ # The extension is thread-safe and uses a mutex to synchronize access to the
24
+ # SQL recording array when multiple threads are executing queries simultaneously.
25
+ #
26
+ # == Compatibility
27
+ #
28
+ # This extension is designed to work alongside mock databases and other SQL
29
+ # recording mechanisms. It uses the method name +sql_recorder+ to avoid
30
+ # conflicts with existing +sqls+ methods that may be present in test frameworks.
31
+ #
32
+ # Related module: Sequel::SqlRecorder
33
+
34
+ module Sequel
35
+
# Database extension that keeps a running list of the SQL statements passed
# through +log_connection_yield+, readable via +sql_recorder+.
module SqlRecorder

  # Prepares a database for recording when it is extended with this module.
  #
  # Creates the mutex and the (initially empty) statement array unless the
  # database already has them, so existing recordings are kept.
  #
  # @param db [Sequel::Database] the database instance being extended
  def self.extended(db)
    db.instance_variable_set(:@sql_recorder_mutex, Mutex.new) unless db.instance_variable_get(:@sql_recorder_mutex)
    db.instance_variable_set(:@sql_recorder, []) unless db.instance_variable_get(:@sql_recorder)
  end

  # The recorded SQL statements, oldest first.
  #
  # This is the live array that new statements are appended to; calling
  # +clear+ on it discards what has been recorded so far.
  #
  # @return [Array<String>] array of SQL statement strings
  # @example
  #   DB.extension :sql_recorder
  #   DB[:users].all
  #   DB.sql_recorder #=> ["SELECT * FROM users"]
  def sql_recorder
    @sql_recorder
  end

  # Records +sql+, then hands off to the next +log_connection_yield+ in the
  # ancestor chain with the same arguments and block.
  #
  # Only the append is guarded by the mutex; the statement is recorded before
  # the parent implementation runs.
  #
  # @param sql [String] the SQL statement being executed
  # @param conn [Object] the database connection object
  # @param args [Object] additional arguments (optional)
  # @return [Object] result from the parent +log_connection_yield+ method
  def log_connection_yield(sql, conn, args = nil)
    @sql_recorder_mutex.synchronize do
      sql_recorder << sql
    end
    super
  end

end
82
+
83
+ Database.register_extension(:sql_recorder, SqlRecorder)
84
+
85
+ end
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+
5
+ module Sequel
6
+
7
+ # Provides efficient handling of large UNION operations.
8
+ #
9
+ # The unionize extension allows combining many datasets through UNION operations
10
+ # by chunking them into manageable temporary tables or views. This is particularly
11
+ # useful when dealing with databases that have limitations on the number of UNION
12
+ # operations in a single query (e.g., Spark SQL, DuckDB).
13
+ #
14
+ # @example Load the extension
15
+ # DB.extension :unionize
16
+ #
17
+ # @example Basic usage
18
+ # DB.unionize([dataset1, dataset2, dataset3, dataset4])
19
+ #
20
+ # @example With options
21
+ # DB.unionize(datasets, chunk_size: 50, all: true, temp_table_prefix: 'my_union')
22
+ module Unionize
23
+
# Handles the chunking and union of multiple datasets.
#
# The datasets are sliced into groups of +:chunk_size+. When that produces a
# single group, its union is returned directly. Otherwise each group's union
# is materialized as a temporary table/view and the process repeats over
# those temporaries until a single union remains.
class Unionizer

  # Default number of datasets to combine in each chunk
  DEFAULT_CHUNK_SIZE = 100

  # Represents a chunk of datasets to be combined via UNION.
  #
  # Each chunk holds a subset of datasets, can create a temporary table/view
  # for the combined result, and exposes the combined dataset and its name.
  class Chunk

    # @!attribute [r] db
    #   @return [Sequel::Database] The database connection
    # @!attribute [r] dses
    #   @return [Array<Sequel::Dataset>] The datasets in this chunk
    # @!attribute [r] opts
    #   @return [Hash] Options for the union operation
    attr_reader :db, :dses, :opts

    # Creates a new chunk instance.
    #
    # @param db [Sequel::Database] The database connection
    # @param dses [Array<Sequel::Dataset>] The datasets to combine
    # @param opts [Hash] Options for the union operation
    def initialize(db, dses, opts)
      @db = db
      @dses = dses
      @opts = opts
    end

    # Returns the dataset created by combining all datasets in this chunk.
    #
    # Datasets are folded left to right with +union+, passing the +:all+ and
    # +:from_self+ options through; a chunk of one dataset returns that
    # dataset unchanged. The result is memoized.
    #
    # @return [Sequel::Dataset] The combined dataset
    def union
      @union ||= dses.reduce { |a, b| a.union(b, all: opts[:all], from_self: opts[:from_self]) }
    end

    # Name of the temporary table/view for this chunk.
    #
    # Built from the +:temp_table_prefix+ option and the SHA1 hex digest of
    # the union's SQL, so chunks with different SQL get different names and
    # identical SQL always maps to the same name.
    #
    # @return [Symbol] The temporary table/view name
    def name
      @name ||= :"#{opts[:temp_table_prefix]}_#{Digest::SHA1.hexdigest(union.sql)}"
    end

    # Creates a temporary table or view for this chunk's union result.
    #
    # The method used depends on the database type:
    # - Spark: Creates a temporary view
    # - DuckDB: Creates a temporary table
    #
    # @raise [RuntimeError] If the database type is not supported
    # @return [void]
    def create
      case db.database_type
      when :spark
        db.create_view(name, union, temp: true)
      when :duckdb
        db.create_table(name, temp: true, as: union)
      else
        raise "Unsupported database type: #{db.database_type}"
      end
    end

  end

  # @!attribute [r] db
  #   @return [Sequel::Database] The database connection
  attr_reader :db

  # Creates a new Unionizer instance.
  #
  # Defaults are filled into a private copy of +opts+; the caller's hash is
  # never modified, so frozen or shared hashes are safe to pass.
  #
  # @param db [Sequel::Database] The database connection
  # @param ds_set [Array<Sequel::Dataset>] The datasets to combine
  # @param opts [Hash] Options for the union operation
  # @option opts [Integer] :chunk_size (100) Number of datasets per chunk
  # @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary tables
  # @option opts [Boolean] :all (false) Use UNION ALL instead of UNION
  # @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
  def initialize(db, ds_set, opts = {})
    @db = db
    @ds_set = ds_set
    @opts = opts.dup
    @opts[:chunk_size] ||= DEFAULT_CHUNK_SIZE
    @opts[:temp_table_prefix] ||= 'temp_union'
    @opts[:all] ||= false
    # fetch (not ||=) so an explicit from_self: false is respected
    @opts[:from_self] = @opts.fetch(:from_self, true)
  end

  # Performs the unionization of datasets.
  #
  # Recursively chunks the datasets, creates temporary tables/views for each
  # chunk, and combines them until a single dataset remains. Invalid input is
  # rejected before any temporary table/view is created.
  #
  # @param dses [Array<Sequel::Dataset>] The datasets to combine (defaults to @ds_set)
  # @raise [ArgumentError] If there are no datasets, or if several datasets
  #   are given with a chunk size below 2
  # @return [Sequel::Dataset] The final combined dataset
  def unionize(dses = @ds_set)
    chunks = dses.each_slice(@opts[:chunk_size]).map do |chunk_of_dses|
      Chunk.new(db, chunk_of_dses, @opts)
    end

    # Without this guard an empty input would recurse forever.
    raise ArgumentError, 'unionize requires at least one dataset' if chunks.empty?
    return chunks.first.union if chunks.size == 1

    # A chunk size of 1 yields as many temporaries as inputs, so the
    # recursion would never shrink.
    if @opts[:chunk_size] < 2
      raise ArgumentError, ':chunk_size must be at least 2 to combine multiple datasets'
    end

    unionize(chunks.each(&:create).map { |chunk| db[chunk.name] })
  end

end
137
+
# Combines multiple datasets into one via UNION.
#
# Thin entry point: builds a Unionizer for this database with the given
# datasets and options and returns the result of its +unionize+ call, which
# is where chunking and temporary table/view creation take place.
#
# @param ds_set [Array<Sequel::Dataset>] The datasets to combine via UNION
# @param opts [Hash] Options for the union operation
# @option opts [Integer] :chunk_size (100) Number of datasets to combine in each chunk
# @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary table names
# @option opts [Boolean] :all (false) Use UNION ALL instead of UNION (keeps duplicates)
# @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
#
# @return [Sequel::Dataset] The combined dataset
#
# @example Basic union of datasets
#   db.unionize([ds1, ds2, ds3, ds4])
#
# @example Union all with custom chunk size
#   db.unionize(datasets, all: true, chunk_size: 50)
#
# @example Custom temporary table prefix
#   db.unionize(datasets, temp_table_prefix: 'my_union_batch')
def unionize(ds_set, opts = {})
  unionizer = Unionizer.new(self, ds_set, opts)
  unionizer.unionize
end
164
+
165
+ end
166
+
167
+ Database.register_extension(:unionize, Unionize)
168
+
169
+ end
@@ -1,15 +1,44 @@
1
1
  module Sequel
2
+
3
+ # = Usable
4
+ #
5
+ # Sequel extension that provides a convenient +use+ method for switching
6
+ # the current database/schema context. This is particularly useful for
7
+ # databases that support the USE statement like MySQL, SQL Server, and
8
+ # some big data engines.
9
+ #
10
+ # @example
11
+ # db.extension :usable
12
+ # db.use(:my_schema)
13
+ # # Executes: USE `my_schema`
2
14
  module Usable
15
+
# Switches to the specified database or schema.
#
# Asks the private +use_sql+ helper for the USE statement matching
# +schema_name+ and runs that statement on this database.
#
# @param schema_name [Symbol, String] the name of the schema/database to use
# @return [Object] whatever +run+ returns for the statement
# @example
#   db.use(:production_db)
#   db.use('test_schema')
def use(schema_name)
  statement = use_sql(schema_name)
  run(statement)
end
6
29
 
7
30
  private
8
31
 
# Generates the USE SQL statement for the given schema name.
#
# The name is rendered with the database's +literal+ method.
# NOTE(review): with Sequel's literalization a Symbol is expected to become a
# quoted identifier while a String would become a quoted SQL string — confirm
# that String names produce the intended statement.
#
# @param schema_name [Symbol, String] the schema name to use
# @return [String] the USE SQL statement
def use_sql(schema_name)
  rendered_name = literal(schema_name)
  "USE #{rendered_name}"
end
39
+
12
40
  end
13
41
 
14
42
  Database.register_extension(:usable, Usable)
43
+
15
44
  end