sequelizer 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.claude/settings.local.json +64 -0
  3. data/.devcontainer/.p10k.zsh +1713 -0
  4. data/.devcontainer/.zshrc +29 -0
  5. data/.devcontainer/Dockerfile +137 -0
  6. data/.devcontainer/copy-claude-credentials.sh +32 -0
  7. data/.devcontainer/devcontainer.json +102 -0
  8. data/.devcontainer/init-firewall.sh +123 -0
  9. data/.devcontainer/setup-credentials.sh +95 -0
  10. data/.github/workflows/test.yml +1 -1
  11. data/.gitignore +6 -1
  12. data/.overcommit.yml +73 -0
  13. data/.rubocop.yml +167 -0
  14. data/CHANGELOG.md +24 -0
  15. data/CLAUDE.md +219 -0
  16. data/Gemfile +6 -2
  17. data/Gemfile.lock +158 -0
  18. data/Guardfile +1 -1
  19. data/Rakefile +28 -3
  20. data/lib/sequel/extensions/cold_col.rb +436 -0
  21. data/lib/sequel/extensions/db_opts.rb +65 -4
  22. data/lib/sequel/extensions/make_readyable.rb +148 -30
  23. data/lib/sequel/extensions/more_sql.rb +76 -0
  24. data/lib/sequel/extensions/settable.rb +64 -0
  25. data/lib/sequel/extensions/sql_recorder.rb +85 -0
  26. data/lib/sequel/extensions/unionize.rb +169 -0
  27. data/lib/sequel/extensions/usable.rb +30 -1
  28. data/lib/sequelizer/cli.rb +61 -18
  29. data/lib/sequelizer/connection_maker.rb +54 -72
  30. data/lib/sequelizer/env_config.rb +6 -6
  31. data/lib/sequelizer/gemfile_modifier.rb +23 -21
  32. data/lib/sequelizer/monkey_patches/database_in_after_connect.rb +7 -5
  33. data/lib/sequelizer/options.rb +97 -18
  34. data/lib/sequelizer/options_hash.rb +2 -0
  35. data/lib/sequelizer/version.rb +3 -1
  36. data/lib/sequelizer/yaml_config.rb +9 -3
  37. data/lib/sequelizer.rb +65 -9
  38. data/sequelizer.gemspec +12 -7
  39. data/test/lib/sequel/extensions/test_cold_col.rb +251 -0
  40. data/test/lib/sequel/extensions/test_db_opts.rb +10 -8
  41. data/test/lib/sequel/extensions/test_make_readyable.rb +199 -28
  42. data/test/lib/sequel/extensions/test_more_sql.rb +132 -0
  43. data/test/lib/sequel/extensions/test_settable.rb +109 -0
  44. data/test/lib/sequel/extensions/test_sql_recorder.rb +231 -0
  45. data/test/lib/sequel/extensions/test_unionize.rb +76 -0
  46. data/test/lib/sequel/extensions/test_usable.rb +5 -2
  47. data/test/lib/sequelizer/test_connection_maker.rb +21 -17
  48. data/test/lib/sequelizer/test_env_config.rb +5 -2
  49. data/test/lib/sequelizer/test_gemfile_modifier.rb +7 -6
  50. data/test/lib/sequelizer/test_options.rb +14 -9
  51. data/test/lib/sequelizer/test_yaml_config.rb +13 -12
  52. data/test/test_helper.rb +36 -8
  53. metadata +107 -28
  54. data/lib/sequel/extensions/sqls.rb +0 -31
@@ -1,60 +1,101 @@
1
+ require 'pathname'
2
+
1
3
  module Sequel
4
+
5
+ # = MakeReadyable
6
+ #
7
+ # Sequel extension that provides database readiness functionality,
8
+ # primarily geared towards Spark SQL-based databases. This extension
9
+ # allows setting up temporary views and schema configurations to prepare
10
+ # a database for use.
11
+ #
12
+ # @example Basic schema usage
13
+ # db.extension :make_readyable
14
+ # db.make_ready(use_schema: :my_schema)
15
+ #
16
+ # @example Search path with schema precedence
17
+ # db.make_ready(search_path: [:schema1, :schema2])
18
+ #
19
+ # @example External file sources
20
+ # db.make_ready(search_path: [Pathname.new('data.parquet')])
2
21
  module MakeReadyable
3
- ##
4
- # This method is primarily geared towards Spark SQL-based databases.
22
+
23
+ # Prepares the database by setting up schemas, views, and external data sources.
5
24
  #
25
+ # This method is primarily geared towards Spark SQL-based databases.
6
26
  # Given some options, prepares a set of views to represent a set
7
27
  # of tables across a collection of different schemas and external,
8
28
  # unmanaged tables.
9
29
  #
30
+ # @param opts [Hash] the options used to prepare the database
31
+ # @option opts [Symbol] :use_schema The schema to be used as the primary schema
32
+ # @option opts [Array] :search_path A set of symbols (schemas) or Pathnames (external files)
33
+ # @option opts [Array] :only Limit view creation to these tables only
34
+ # @option opts [Array] :except Skip view creation for these tables
35
+ #
36
+ # @example Set primary schema
10
37
  # DB.make_ready(use_schema: :schema)
11
38
  # # => USE `schema`
12
39
  #
13
- # When using search_path, tables from previous schema override tables
14
- # from the next schema. This is analogous to the way Unix searches
15
- # the PATH variable for programs.
16
- #
17
- # Assuming the following tables: schema1.a, schema2.a, schema2.b
18
- #
40
+ # @example Search path with precedence
41
+ # # Assuming tables: schema1.a, schema2.a, schema2.b
19
42
  # DB.make_ready(search_path: [:schema1, :schema2])
20
43
  # # => CREATE TEMPORARY VIEW `a` AS SELECT * FROM `schema1`.`a`;
21
44
  # # => CREATE TEMPORARY VIEW `b` AS SELECT * FROM `schema2`.`b`;
22
45
  #
23
- # When using Pathnames, the extension on the file becomes the format
24
- # to try to read from the file.
25
- #
46
+ # @example External file sources
26
47
  # DB.make_ready(search_path: [Pathname.new("c.parquet"), Pathname.new("d.orc")])
27
48
  # # => CREATE TEMPORARY VIEW `c` USING parquet OPTIONS ('path'='c.parquet')
28
49
  # # => CREATE TEMPORARY VIEW `d` USING orc OPTIONS ('path'='d.orc')
29
- #
30
- # @param [Hash] opts the options used to prepare the database
31
- # @option opts [String] :use_schema The schema to be used as the primary schema
32
- # @option opts [Array] :search_path A set of sympbols (to represent schemas) or Pathnames (to represent externally managed data files)
33
50
  def make_ready(opts = {})
34
51
  ReadyMaker.new(self, opts).run
35
52
  end
53
+
36
54
  end
37
55
 
38
- private
56
+ # = ReadyMaker
57
+ #
58
+ # Internal class that handles the actual database preparation logic.
59
+ # This class processes the make_ready options and executes the necessary
60
+ # SQL statements to set up schemas, views, and external data sources.
39
61
  class ReadyMaker
62
+
63
+ # @!attribute [r] db
64
+ # @return [Sequel::Database] the database instance
65
+ # @!attribute [r] opts
66
+ # @return [Hash] the preparation options
40
67
  attr_reader :db, :opts
41
68
 
69
+ # Creates a new ReadyMaker instance.
70
+ #
71
+ # @param db [Sequel::Database] the database to prepare
72
+ # @param opts [Hash] the preparation options
42
73
  def initialize(db, opts)
43
74
  @db = db
44
75
  @opts = opts
45
76
  end
46
-
77
+
78
+ # Executes the database preparation process.
79
+ #
80
+ # This method handles:
81
+ # 1. Setting the primary schema if specified
82
+ # 2. Processing the search path to create views
83
+ # 3. Handling table filtering (only/except options)
47
84
  def run
48
85
  if opts[:use_schema]
49
86
  db.extension :usable
50
87
  db.use(opts[:use_schema])
51
88
  end
52
89
  only_tables = Array(opts[:only])
53
- created_views = (Array(opts[:except]) || [])
54
- (opts[:search_path] || []).each do |schema|
55
- schema = schema.is_a?(Pathname) ? schema : schema.to_sym
90
+ created_views = Array(opts[:except]) || []
91
+ (opts[:search_path] || []).flatten.each do |schema|
92
+ schema = schema.to_sym unless schema.is_a?(Pathname)
56
93
  source = get_source(db, schema)
57
- tables = source.tables(schema: schema) - created_views
94
+ tables = if schema.is_a?(Pathname)
95
+ source.tables - created_views
96
+ else
97
+ source.tables(schema: schema) - created_views
98
+ end
58
99
  tables &= only_tables unless only_tables.empty?
59
100
  tables.each do |table|
60
101
  create_view(source, table, schema)
@@ -63,46 +104,123 @@ module Sequel
63
104
  end
64
105
  end
65
106
 
107
+ # Creates a temporary view for the given table.
108
+ #
109
+ # @param source [Object] the source (database or FileSourcerer)
110
+ # @param table [Symbol] the table name
111
+ # @param schema [Symbol, Pathname] the schema or file path
66
112
  def create_view(source, table, schema)
67
113
  if schema.to_s =~ %r{/}
68
114
  source.create_view(table, temp: true)
69
115
  else
116
+ # For schema-based tables, just create temporary views
117
+ # This extension is primarily for Spark SQL-based databases
70
118
  source.create_view(table, db[Sequel.qualify(schema, table)], temp: true)
71
119
  end
72
120
  end
73
121
 
122
+ # Gets the appropriate source handler for the schema.
123
+ #
124
+ # @param db [Sequel::Database] the database instance
125
+ # @param schema [Symbol, Pathname] the schema or file path
126
+ # @return [Sequel::Database, FileSourcerer] the source handler
74
127
  def get_source(db, schema)
75
128
  if schema.to_s =~ %r{/}
76
- FileSourcerer.new(db, Pathname.new(schema))
129
+ FileSourcerer.new(db, Pathname.new(schema.to_s))
77
130
  else
78
131
  db
79
132
  end
80
133
  end
81
134
 
135
+ # = FileSourcerer
136
+ #
137
+ # Handles external file sources for the make_ready functionality.
138
+ # This class creates temporary views that read from external files
139
+ # like Parquet, ORC, etc.
82
140
  class FileSourcerer
141
+
142
+ # @!attribute [r] db
143
+ # @return [Sequel::Database] the database instance
144
+ # @!attribute [r] schema
145
+ # @return [Pathname] the file path
83
146
  attr_reader :db, :schema
147
+
148
+ # Creates a new FileSourcerer instance.
149
+ #
150
+ # @param db [Sequel::Database] the database instance
151
+ # @param schema [Pathname] the file path
84
152
  def initialize(db, schema)
85
153
  @db = db
86
154
  @schema = schema
87
155
  end
88
156
 
89
- def tables(opts = {})
90
- [schema.basename(".*").to_s.to_sym]
157
+ # Returns the table name derived from the file name.
158
+ #
159
+ # @param _opts [Hash] unused options parameter
160
+ # @return [Array<Symbol>] array containing the table name
161
+ def tables(_opts = {})
162
+ [schema.basename(schema.extname).to_s.to_sym]
91
163
  end
92
164
 
165
+ # Creates a temporary view that reads from the external file.
166
+ #
167
+ # @param table [Symbol] the table/view name
168
+ # @param opts [Hash] additional options to merge
93
169
  def create_view(table, opts = {})
94
- db.create_view(table, {
95
- temp: true,
96
- using: format,
97
- options: { path: schema.expand_path }
98
- }.merge(opts))
170
+ case db.database_type
171
+ when :spark
172
+ # Spark SQL uses USING clause for external tables
173
+ db.create_view(table, {
174
+ temp: true,
175
+ using: format,
176
+ options: { path: schema.expand_path },
177
+ }.merge(opts))
178
+ when :duckdb
179
+ # DuckDB uses direct file reading with read_* functions
180
+ create_duckdb_view(table, opts)
181
+ else
182
+ raise Sequel::Error, "External file sources are not supported on #{db.database_type}"
183
+ end
99
184
  end
100
185
 
186
+ private
187
+
188
+ # Creates a view for DuckDB to read external files
189
+ #
190
+ # @param table [Symbol] the table/view name
191
+ # @param _opts [Hash] additional options to merge (currently unused for DuckDB)
192
+ def create_duckdb_view(table, _opts)
193
+ file_path = if schema.directory?
194
+ schema.expand_path.join('**').join("*.#{format}").to_s
195
+ else
196
+ schema.expand_path.to_s
197
+ end
198
+ read_function = case format
199
+ when 'parquet'
200
+ :read_parquet
201
+ when 'csv'
202
+ :read_csv_auto
203
+ when 'json'
204
+ :read_json_auto
205
+ else
206
+ raise Sequel::Error, "Unsupported file format '#{format}' for DuckDB"
207
+ end
208
+
209
+ # DuckDB doesn't support TEMPORARY views, use regular CREATE VIEW
210
+ db.create_view(table, db.from(Sequel.function(read_function, file_path)))
211
+ end
212
+
213
+ # Returns the file format based on the file extension.
214
+ #
215
+ # @return [String] the file format (e.g., 'parquet', 'orc')
101
216
  def format
102
- schema.extname[1..-1]
217
+ schema.extname[1..]
103
218
  end
219
+
104
220
  end
221
+
105
222
  end
106
223
 
107
224
  Database.register_extension(:make_readyable, MakeReadyable)
225
+
108
226
  end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Sequel
4
+
5
+ # Provides additional SQL helper methods for database operations.
6
+ #
7
+ # The more_sql extension adds convenience methods for SQL operations that
8
+ # aren't covered by Sequel's core functionality, particularly schema-related
9
+ # operations like CREATE SCHEMA.
10
+ #
11
+ # @example Load the extension
12
+ # DB.extension :more_sql
13
+ #
14
+ # @example Create a schema
15
+ # DB.create_schema(:analytics)
16
+ # # Executes: CREATE SCHEMA "analytics"
17
+ #
18
+ # @example Create schema if it doesn't exist
19
+ # DB.create_schema(:staging, if_not_exists: true)
20
+ # # Executes: CREATE SCHEMA IF NOT EXISTS "staging"
21
+ module MoreSql
22
+
23
+ # Creates a database schema.
24
+ #
25
+ # Generates and executes a CREATE SCHEMA statement with optional
26
+ # IF NOT EXISTS clause for idempotent schema creation.
27
+ #
28
+ # @param schema_name [Symbol, String] The name of the schema to create
29
+ # @param opts [Hash] Options for schema creation
30
+ # @option opts [Boolean] :if_not_exists (false) Only create the schema if it doesn't already exist
31
+ #
32
+ # @return [nil]
33
+ #
34
+ # @example Basic schema creation
35
+ # DB.create_schema(:reports)
36
+ # # Executes: CREATE SCHEMA "reports"
37
+ #
38
+ # @example Idempotent schema creation
39
+ # DB.create_schema(:analytics, if_not_exists: true)
40
+ # # Executes: CREATE SCHEMA IF NOT EXISTS "analytics"
41
+ #
42
+ # @example With string schema name
43
+ # DB.create_schema('user_data')
44
+ # # Executes: CREATE SCHEMA "user_data"
45
+ def create_schema(schema_name, opts = {})
46
+ run(create_schema_sql(schema_name, opts))
47
+ end
48
+
49
+ private
50
+
51
+ # Generates the SQL for creating a schema.
52
+ #
53
+ # Builds a CREATE SCHEMA statement with proper identifier quoting
54
+ # and optional IF NOT EXISTS clause.
55
+ #
56
+ # @param schema_name [Symbol, String] The name of the schema to create
57
+ # @param opts [Hash] Options for schema creation
58
+ # @option opts [Boolean] :if_not_exists (false) Include IF NOT EXISTS clause
59
+ #
60
+ # @return [String] The CREATE SCHEMA SQL statement
61
+ #
62
+ # @example
63
+ # create_schema_sql(:test, if_not_exists: true)
64
+ # # => 'CREATE SCHEMA IF NOT EXISTS "test"'
65
+ def create_schema_sql(schema_name, opts)
66
+ sql = 'CREATE SCHEMA '
67
+ sql += 'IF NOT EXISTS ' if opts[:if_not_exists]
68
+ sql += literal(schema_name)
69
+ sql
70
+ end
71
+
72
+ end
73
+
74
+ Database.register_extension(:more_sql, MoreSql)
75
+
76
+ end
@@ -1,5 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # The settable extension adds a convenient +set+ method to database connections
5
+ # for executing SET statements with key-value pairs. This is particularly useful
6
+ # for configuring database session parameters.
7
+ #
8
+ # DB.extension :settable
9
+ # DB.set(search_path: 'public', timezone: 'UTC')
10
+ # # Executes: SET search_path=public
11
+ # # SET timezone=UTC
12
+ #
13
+ # DB.set(work_mem: '256MB')
14
+ # # Executes: SET work_mem=256MB
15
+ #
16
+ # The extension works with any database adapter and supports various value types
17
+ # including strings, numbers, booleans, and nil values.
18
+ #
19
+ # Related module: Sequel::Settable
20
+
1
21
  module Sequel
22
+
23
+ # The Settable module provides database configuration functionality through
24
+ # SET statements. When loaded as an extension, it adds the +set+ method to
25
+ # database connections.
2
26
  module Settable
27
+
28
+ # Execute SET statements for the given options hash.
29
+ #
30
+ # Each key-value pair in the options hash is converted to a SET statement
31
+ # and executed against the database. Multiple options result in multiple
32
+ # SET statements being executed in sequence.
33
+ #
34
+ # @param opts [Hash] Hash of configuration options to set
35
+ # @option opts [Object] key The configuration parameter name
36
+ # @option opts [Object] value The value to set for the parameter
37
+ #
38
+ # @example Set a single parameter
39
+ # DB.set(timezone: 'UTC')
40
+ # # Executes: SET timezone=UTC
41
+ #
42
+ # @example Set multiple parameters
43
+ # DB.set(search_path: 'public', work_mem: '256MB')
44
+ # # Executes: SET search_path=public
45
+ # # SET work_mem=256MB
46
+ #
47
+ # @example Different value types
48
+ # DB.set(port: 5432, autocommit: true, custom_setting: nil)
49
+ # # Executes: SET port=5432
50
+ # # SET autocommit=true
51
+ # # SET custom_setting=
52
+ #
53
+ # @return [void]
3
54
  def set(opts = {})
4
55
  set_sql(opts).each do |sql|
5
56
  run(sql)
@@ -8,10 +59,23 @@ module Sequel
8
59
 
9
60
  private
10
61
 
62
+ # Generate SET SQL statements from options hash.
63
+ #
64
+ # Converts each key-value pair in the options hash into a SET SQL statement
65
+ # string. This is a private helper method used internally by the +set+ method.
66
+ #
67
+ # @param opts [Hash] Hash of options to convert to SET statements
68
+ # @return [Array<String>] Array of SET SQL statement strings
69
+ #
70
+ # @example
71
+ # set_sql(timezone: 'UTC', port: 5432)
72
+ # # => ["SET timezone=UTC", "SET port=5432"]
11
73
  def set_sql(opts)
12
74
  opts.map { |k, v| "SET #{k}=#{v}" }
13
75
  end
76
+
14
77
  end
15
78
 
16
79
  Database.register_extension(:settable, Settable)
80
+
17
81
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ # == Overview
4
+ #
5
+ # The sql_recorder extension records each SQL statement sent to the database
6
+ # in an array (appends are mutex-synchronized) accessible via the +sql_recorder+ method.
7
+ #
8
+ # == Usage
9
+ #
10
+ # DB.extension :sql_recorder
11
+ # DB[:users].all
12
+ # DB[:posts].where(id: 1).first
13
+ #
14
+ # # Access recorded SQL statements
15
+ # DB.sql_recorder
16
+ # # => ["SELECT * FROM users", "SELECT * FROM posts WHERE (id = 1) LIMIT 1"]
17
+ #
18
+ # # Clear the recorded statements
19
+ # DB.sql_recorder.clear
20
+ #
21
+ # == Thread Safety
22
+ #
23
+ # The extension is thread-safe and uses a mutex to synchronize access to the
24
+ # SQL recording array when multiple threads are executing queries simultaneously.
25
+ #
26
+ # == Compatibility
27
+ #
28
+ # This extension is designed to work alongside mock databases and other SQL
29
+ # recording mechanisms. It uses the method name +sql_recorder+ to avoid
30
+ # conflicts with existing +sqls+ methods that may be present in test frameworks.
31
+ #
32
+ # Related module: Sequel::SqlRecorder
33
+
34
+ module Sequel
35
+
36
+ # Extension module that adds SQL recording capabilities to Sequel databases.
37
+ # When included, it provides a +sql_recorder+ method that returns an array
38
+ # of all SQL statements executed against the database.
39
+ module SqlRecorder
40
+
41
+ # Returns the array of recorded SQL statements.
42
+ #
43
+ # The array accumulates all SQL statements sent to the database since the
44
+ # extension was loaded or since the last time +clear+ was called on the array.
45
+ #
46
+ # @return [Array<String>] array of SQL statement strings
47
+ # @example
48
+ # DB.extension :sql_recorder
49
+ # DB[:users].all
50
+ # DB.sql_recorder #=> ["SELECT * FROM users"]
51
+ attr_reader :sql_recorder
52
+
53
+ # Intercepts SQL execution to record statements.
54
+ #
55
+ # This method overrides Sequel's +log_connection_yield+ to capture each SQL
56
+ # statement in a thread-safe manner before delegating to the parent implementation.
57
+ #
58
+ # @param sql [String] the SQL statement being executed
59
+ # @param conn [Object] the database connection object
60
+ # @param args [Object] additional arguments (optional)
61
+ # @return [Object] result from the parent +log_connection_yield+ method
62
+ def log_connection_yield(sql, conn, args = nil)
63
+ @sql_recorder_mutex.synchronize { sql_recorder.push(sql) }
64
+ super
65
+ end
66
+
67
+ # Initializes the SQL recording infrastructure when the extension is loaded.
68
+ #
69
+ # Sets up the mutex for thread-safe access and initializes the SQL recording
70
+ # array. This method is automatically called when the extension is loaded
71
+ # via +DB.extension :sql_recorder+.
72
+ #
73
+ # @param db [Sequel::Database] the database instance being extended
74
+ def self.extended(db)
75
+ db.instance_exec do
76
+ @sql_recorder_mutex ||= Mutex.new
77
+ @sql_recorder ||= []
78
+ end
79
+ end
80
+
81
+ end
82
+
83
+ Database.register_extension(:sql_recorder, SqlRecorder)
84
+
85
+ end
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+
5
+ module Sequel
6
+
7
+ # Provides efficient handling of large UNION operations.
8
+ #
9
+ # The unionize extension allows combining many datasets through UNION operations
10
+ # by chunking them into manageable temporary tables or views. This is particularly
11
+ # useful when dealing with databases that have limitations on the number of UNION
12
+ # operations in a single query (e.g., Spark SQL, DuckDB).
13
+ #
14
+ # @example Load the extension
15
+ # DB.extension :unionize
16
+ #
17
+ # @example Basic usage
18
+ # DB.unionize([dataset1, dataset2, dataset3, dataset4])
19
+ #
20
+ # @example With options
21
+ # DB.unionize(datasets, chunk_size: 50, all: true, temp_table_prefix: 'my_union')
22
+ module Unionize
23
+
24
+ # Handles the chunking and union of multiple datasets.
25
+ #
26
+ # This class manages the process of splitting a large collection of datasets
27
+ # into smaller chunks, creating temporary tables/views for each chunk, and
28
+ # then recursively combining them until a single unified dataset is produced.
29
+ class Unionizer
30
+
31
+ # Default number of datasets to combine in each chunk
32
+ DEFAULT_CHUNK_SIZE = 100
33
+
34
+ # Represents a chunk of datasets to be combined via UNION.
35
+ #
36
+ # Each chunk handles a subset of datasets, creates a temporary table/view
37
+ # for the combined result, and provides access to the unified dataset.
38
+ class Chunk
39
+
40
+ # @!attribute [r] db
41
+ # @return [Sequel::Database] The database connection
42
+ # @!attribute [r] dses
43
+ # @return [Array<Sequel::Dataset>] The datasets in this chunk
44
+ # @!attribute [r] opts
45
+ # @return [Hash] Options for the union operation
46
+ attr_reader :db, :dses, :opts
47
+
48
+ # Creates a new chunk instance.
49
+ #
50
+ # @param db [Sequel::Database] The database connection
51
+ # @param dses [Array<Sequel::Dataset>] The datasets to combine
52
+ # @param opts [Hash] Options for the union operation
53
+ def initialize(db, dses, opts)
54
+ @db = db
55
+ @dses = dses
56
+ @opts = opts
57
+ end
58
+
59
+ # Returns the unified dataset created by combining all datasets in this chunk.
60
+ #
61
+ # @return [Sequel::Dataset] The combined dataset
62
+ def union
63
+ @union ||= dses.reduce { |a, b| a.union(b, all: opts[:all], from_self: opts[:from_self]) }
64
+ end
65
+
66
+ # Generates a unique name for the temporary table/view.
67
+ #
68
+ # The name is based on a hash of the SQL query to ensure uniqueness
69
+ # and avoid collisions when multiple unionize operations are running.
70
+ #
71
+ # @return [Symbol] The temporary table/view name
72
+ def name
73
+ @name ||= :"#{opts[:temp_table_prefix]}_#{Digest::SHA1.hexdigest(union.sql)}"
74
+ end
75
+
76
+ # Creates a temporary table or view for this chunk's union result.
77
+ #
78
+ # The method used depends on the database type:
79
+ # - Spark: Creates a temporary view
80
+ # - DuckDB: Creates a temporary table
81
+ #
82
+ # @raise [RuntimeError] If the database type is not supported
83
+ # @return [void]
84
+ def create
85
+ if db.database_type == :spark
86
+ db.create_view(name, union, temp: true)
87
+ elsif db.database_type == :duckdb
88
+ db.create_table(name, temp: true, as: union)
89
+ else
90
+ raise "Unsupported database type: #{db.database_type}"
91
+ end
92
+ end
93
+
94
+ end
95
+
96
+ # @!attribute [r] db
97
+ # @return [Sequel::Database] The database connection
98
+ attr_reader :db
99
+
100
+ # Creates a new Unionizer instance.
101
+ #
102
+ # @param db [Sequel::Database] The database connection
103
+ # @param ds_set [Array<Sequel::Dataset>] The datasets to combine
104
+ # @param opts [Hash] Options for the union operation
105
+ # @option opts [Integer] :chunk_size (100) Number of datasets per chunk
106
+ # @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary tables
107
+ # @option opts [Boolean] :all (false) Use UNION ALL instead of UNION
108
+ # @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
109
+ def initialize(db, ds_set, opts = {})
110
+ @db = db
111
+ @ds_set = ds_set
112
+ @opts = opts
113
+ opts[:chunk_size] ||= DEFAULT_CHUNK_SIZE
114
+ opts[:temp_table_prefix] ||= 'temp_union'
115
+ opts[:all] ||= false
116
+ opts[:from_self] = opts.fetch(:from_self, true)
117
+ end
118
+
119
+ # Performs the unionization of datasets.
120
+ #
121
+ # This method recursively chunks the datasets, creates temporary tables/views
122
+ # for each chunk, and then combines them until a single dataset remains.
123
+ #
124
+ # @param dses [Array<Sequel::Dataset>] The datasets to combine (defaults to @ds_set)
125
+ # @return [Sequel::Dataset] The final combined dataset
126
+ def unionize(dses = @ds_set)
127
+ chunks = dses.each_slice(@opts[:chunk_size]).map do |chunk_of_dses|
128
+ Chunk.new(db, chunk_of_dses, @opts)
129
+ end
130
+
131
+ return chunks.first.union if chunks.size == 1
132
+
133
+ unionize(chunks.each(&:create).map { |chunk| db[chunk.name] })
134
+ end
135
+
136
+ end
137
+
138
+ # Efficiently combines multiple datasets using UNION operations.
139
+ #
140
+ # This method handles large numbers of datasets by chunking them into
141
+ # manageable groups, creating temporary tables/views for intermediate
142
+ # results, and recursively combining them until a single dataset is produced.
143
+ #
144
+ # @param ds_set [Array<Sequel::Dataset>] The datasets to combine via UNION
145
+ # @param opts [Hash] Options for the union operation
146
+ # @option opts [Integer] :chunk_size (100) Number of datasets to combine in each chunk
147
+ # @option opts [String] :temp_table_prefix ('temp_union') Prefix for temporary table names
148
+ # @option opts [Boolean] :all (false) Use UNION ALL instead of UNION (keeps duplicates)
149
+ # @option opts [Boolean] :from_self (true) Wrap individual datasets in subqueries
150
+ #
151
+ # @return [Sequel::Dataset] The combined dataset
152
+ #
153
+ # @example Basic union of datasets
154
+ # db.unionize([ds1, ds2, ds3, ds4])
155
+ #
156
+ # @example Union all with custom chunk size
157
+ # db.unionize(datasets, all: true, chunk_size: 50)
158
+ #
159
+ # @example Custom temporary table prefix
160
+ # db.unionize(datasets, temp_table_prefix: 'my_union_batch')
161
+ def unionize(ds_set, opts = {})
162
+ Unionizer.new(self, ds_set, opts).unionize
163
+ end
164
+
165
+ end
166
+
167
+ Database.register_extension(:unionize, Unionize)
168
+
169
+ end