exwiw 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,72 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class SqliteAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.execute(sql)`). It walks the result one row at a time
10
+ # via SQLite's statement cursor (Statement#each -> sqlite3_step) instead of
11
+ # buffering the whole result set, so the dump's dominant memory cost — a
12
+ # Ruby array as large as the table — never materializes. The Runner drives
13
+ # it exactly like the old Array: #size to skip empty tables and log the
14
+ # count, then a single streaming pass (SqlBulkInsert#write_inserts ->
15
+ # each_slice) to write the INSERT.
16
+ #
17
+ # Mirrors Mysql/PostgresqlAdapter::StreamingResult, with two SQLite
18
+ # specifics:
19
+ # - #size runs a separate `SELECT COUNT(*)` of the same query with the
20
+ # projection replaced by COUNT(*) (compile_ast(count_only: true)) —
21
+ # exact because exwiw's extraction queries have no DISTINCT/GROUP
22
+ # BY/LIMIT, so the row count is independent of the projection. (Unlike
23
+ # MySQL, SQLite tolerates a duplicate-column subquery wrap too, but the
24
+ # count_only form is shared with MySQL and avoids the extra subquery.)
25
+ # - SQLite is an embedded, single-connection engine that allows several
26
+ # active prepared statements at once, so the #size COUNT and the data
27
+ # cursor do not contend. The statement is closed in an ensure block so
28
+ # an abandoned mid-stream iteration still releases the cursor.
29
+ class StreamingResult
30
+ include Enumerable
31
+
32
+ def initialize(connection:, data_sql:, count_sql:)
33
+ @connection = connection
34
+ @data_sql = data_sql
35
+ @count_sql = count_sql
36
+ end
37
+
38
+ def size
39
+ @size ||= @connection.execute(@count_sql).dig(0, 0).to_i
40
+ end
41
+ alias length size
42
+
43
+ # Stream the result set row by row. Each row is an Array of values in
44
+ # SQLite's native type mapping — byte-identical to what
45
+ # `connection.execute(sql)` produced, so the generated INSERT is unchanged.
46
+ def each
47
+ return enum_for(:each) { size } unless block_given?
48
+
49
+ statement = @connection.prepare(@data_sql)
50
+ begin
51
+ statement.each { |row| yield row }
52
+ ensure
53
+ statement.close
54
+ end
55
+ self
56
+ end
57
+ end
58
+
6
59
  def build_query(table, dump_target, table_by_name)
7
60
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
61
  end
9
62
 
10
63
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
12
-
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.execute(sql)
64
+ data_sql = commented_sql(query_ast)
65
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
66
+ # the Runner can skip empty tables and log the row count without draining
67
+ # the cursor. See StreamingResult for why this is exact.
68
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
69
+
70
+ @logger.debug(" Executing SQL (cursor stream): \n#{data_sql}")
71
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
72
  end
16
73
 
17
74
  def explain(query_ast)
@@ -63,22 +120,16 @@ module Exwiw
63
120
  stmt.end_with?(';') ? stmt : "#{stmt};"
64
121
  end
65
122
 
66
- def to_bulk_insert(results, table)
123
+ # The INSERT header for this adapter. SQLite uses bare identifiers.
124
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
125
+ # and the trailing `;`.
126
+ private def insert_header(table)
67
127
  table_name = table.name
68
-
69
- value_list = results.map do |row|
70
- quoted_values = row.map do |value|
71
- escape_value(value)
72
- end
73
- "(" + quoted_values.join(', ') + ")"
74
- end
75
- values = value_list.join(",\n")
76
-
77
128
  if table.rails_managed?
78
- "INSERT INTO #{table_name} VALUES\n#{values};"
129
+ "INSERT INTO #{table_name} VALUES\n"
79
130
  else
80
131
  column_names = table.columns.map(&:name).join(', ')
81
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
132
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
82
133
  end
83
134
  end
84
135
 
@@ -140,11 +191,17 @@ module Exwiw
140
191
  sql
141
192
  end
142
193
 
143
- def compile_ast(query_ast)
194
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
195
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
196
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
197
+ # not depend on the projection.
198
+ def compile_ast(query_ast, count_only: false)
144
199
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
145
200
 
146
201
  sql = "SELECT "
147
- sql += if query_ast.select_all
202
+ sql += if count_only
203
+ "COUNT(*)"
204
+ elsif query_ast.select_all
148
205
  "*"
149
206
  else
150
207
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
data/lib/exwiw/adapter.rb CHANGED
@@ -113,6 +113,44 @@ module Exwiw
113
113
  raise NotImplementedError, "COPY format is not supported by #{self.class.name}"
114
114
  end
115
115
 
116
+ # Default bulk-insert chunk size when a table config does not set one.
117
+ # The Runner streams each chunk straight to the output file, so a non-nil
118
+ # value here bounds how much serialized output (and how many transient
119
+ # intermediate objects) live in memory at once. SQL adapters keep nil
120
+ # (one statement per table, as before); adapters whose output is large
121
+ # and built per-row (e.g. MongoDB JSONL) override with a positive value.
122
+ def default_bulk_insert_chunk_size
123
+ nil
124
+ end
125
+
126
+ # Write the bulk INSERT/JSONL output for `results` to the open `io`,
127
+ # returning the number of statements written. The Runner calls this once
128
+ # per table for the non-COPY path.
129
+ #
130
+ # Default: build each chunk's output as a full string via #to_bulk_insert
131
+ # and write it, separating statements with "\n" — exactly what the Runner
132
+ # used to inline. This keeps the dominant memory cost at one chunk's
133
+ # serialized string (bounded by `chunk_size`), which is why MongoDB sets a
134
+ # positive default chunk size. Adapters whose output is a single large
135
+ # statement (the SQL adapters, where chunk_size is nil) override this to
136
+ # stream the statement to `io` in bounded buffers instead of holding the
137
+ # whole thing in memory.
138
+ #
139
+ # @param io [IO] open output file
140
+ # @param results [Enumerable] rows/documents from #execute
141
+ # @param table the table/collection config
142
+ # @param chunk_size [Integer, nil] rows per statement (nil => one statement)
143
+ def write_inserts(io, results, table, chunk_size)
144
+ chunks = chunk_size ? results.each_slice(chunk_size) : [results]
145
+ statement_count = 0
146
+ chunks.each do |chunk_rows|
147
+ io.print("\n") if statement_count.positive?
148
+ io.print(to_bulk_insert(chunk_rows, table))
149
+ statement_count += 1
150
+ end
151
+ statement_count
152
+ end
153
+
116
154
  # Run the database-specific EXPLAIN for the given query and return the
117
155
  # output as a single string for `explain` subcommand to print.
118
156
  # SQL adapters override; MongodbAdapter currently raises.
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Exwiw
6
+ # MongoDB Relaxed Extended JSON encoder for a single dumped document.
7
+ #
8
+ # `encode` is the one entry point. When the optional native extension is compiled
9
+ # (the common case once `gem install exwiw` builds it), it emits the line in a
10
+ # single C tree-walk; otherwise it falls back to the pure-Ruby path. Both are
11
+ # byte-for-byte identical — the native path delegates every value it does not
12
+ # format itself back to `encode_fragment` (see ext/exwiw/ext_json/ext_json.c).
13
+ module ExtJson
14
+ module_function
15
+
16
+ # Pure-Ruby encoder for one value, identical to the historical
17
+ # `JSON.generate(doc.as_extended_json(mode: :relaxed))`. Used both as the
18
+ # whole-document fallback and as the native path's per-value delegate, so the
19
+ # two paths cannot diverge.
20
+ def encode_fragment(value)
21
+ JSON.generate(value.respond_to?(:as_extended_json) ? value.as_extended_json(mode: :relaxed) : value)
22
+ end
23
+
24
+ begin
25
+ require "exwiw/ext_json_native" # defines Exwiw::ExtJson.encode_native
26
+ def encode(doc) = encode_native(doc)
27
+ rescue LoadError
28
+ # No compiled extension (JRuby/TruffleRuby, or a host where the build
29
+ # failed): keep exwiw working as a pure-Ruby gem.
30
+ def encode(doc) = encode_fragment(doc)
31
+ end
32
+ end
33
+ end
data/lib/exwiw/runner.rb CHANGED
@@ -97,18 +97,30 @@ module Exwiw
97
97
  else
98
98
  phase = "generating INSERT statement"
99
99
  @logger.debug(" Generate INSERT statement...")
100
- chunk_size = table.bulk_insert_chunk_size
101
- chunks = chunk_size ? results.each_slice(chunk_size).to_a : [results]
102
- insert_sql = chunks.map { |chunk_rows| adapter.to_bulk_insert(chunk_rows, table) }.join("\n")
103
-
104
- @logger.info(" Generated INSERT statement for #{record_num} records (#{chunks.size} statement(s)).")
100
+ # Let the adapter write the INSERT/JSONL output straight to the file
101
+ # instead of building the whole table's output as one string first,
102
+ # so only a bounded amount of serialized text is resident at a time —
103
+ # important for large tables/collections whose one-shot output would
104
+ # otherwise be held in full alongside the already-large result set.
105
+ #
106
+ # The chunk size falls back to the adapter's default when the table
107
+ # config does not set one (SQL adapters: nil -> one statement, but
108
+ # streamed in bounded buffers; MongoDB: a positive default so the
109
+ # JSONL is chunked). #write_inserts emits bytes identical to the
110
+ # previous inline chunk loop and returns the statement count.
111
+ chunk_size = table.bulk_insert_chunk_size || adapter.default_bulk_insert_chunk_size
112
+
113
+ statement_count = 0
105
114
  File.open(File.join(@output_dir, "insert-#{insert_idx}-#{table_name}.#{adapter.output_extension}"), 'w') do |file|
106
115
  pre = adapter.pre_insert_sql(table)
107
116
  file.puts(pre) if pre
108
- file.puts(insert_sql)
117
+ statement_count = adapter.write_inserts(file, results, table, chunk_size)
118
+ file.print("\n")
109
119
  post = adapter.post_insert_sql(table)
110
120
  file.puts(post) if post
111
121
  end
122
+
123
+ @logger.info(" Generated INSERT statement for #{record_num} records (#{statement_count} statement(s)).")
112
124
  end
113
125
 
114
126
  if adapter.supports_bulk_delete? && !@insert_only && !(table.respond_to?(:rails_managed?) && table.rails_managed?)
data/lib/exwiw/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Exwiw
4
- VERSION = "0.5.2"
4
+ VERSION = "0.6.0"
5
5
  end
data/lib/exwiw.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "exwiw/version"
5
5
  require "json"
6
6
  require "serdes"
7
7
 
8
+ require_relative "exwiw/ext_json"
8
9
  require_relative "exwiw/belongs_to"
9
10
  require_relative "exwiw/table_column"
10
11
  require_relative "exwiw/table_config"
@@ -13,6 +14,7 @@ require_relative "exwiw/mongodb_field"
13
14
  require_relative "exwiw/mongodb_collection_config"
14
15
  require_relative "exwiw/ddl_postprocessor"
15
16
  require_relative "exwiw/adapter"
17
+ require_relative "exwiw/adapter/sql_bulk_insert"
16
18
  require_relative "exwiw/adapter/sqlite_adapter"
17
19
  require_relative "exwiw/adapter/mysql_client"
18
20
  require_relative "exwiw/adapter/mysql_adapter"
data/mise.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [env]
2
- # Prepend scenario/bin so `pg_dump` resolves to the wrapper that delegates to
2
+ # Prepend e2e/bin so `pg_dump` resolves to the wrapper that delegates to
3
3
  # the postgres container (compose.yml). exwiw's PostgreSQL adapter shells out
4
4
  # to pg_dump, which requires a server/client major-version match — the dev DB
5
5
  # is postgres:17 while host clients are often older (e.g. Homebrew pg14).
6
- _.path = ["./scenario/bin"]
6
+ _.path = ["./e2e/bin"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: exwiw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shia
@@ -29,25 +29,33 @@ email:
29
29
  - rise.shia@gmail.com
30
30
  executables:
31
31
  - exwiw
32
- extensions: []
32
+ extensions:
33
+ - ext/exwiw/ext_json/extconf.rb
33
34
  extra_rdoc_files: []
34
35
  files:
35
36
  - CHANGELOG.md
36
37
  - LICENSE.txt
37
38
  - README.md
39
+ - docs/optimization-notes.md
40
+ - docs/optimize-mongodb-export-with-native-ext.md
38
41
  - docs/plans/2026-05-15-insert-000-schema-file.md
39
42
  - docs/plans/2026-05-16-mongodb-from-clean-scenario.md
40
43
  - docs/plans/2026-05-22-after-insert-hook.md
41
44
  - docs/plans/2026-05-22-postgres-copy-mode-scenario-test.md
42
45
  - docs/plans/2026-05-29-rails-managed-tables.md
43
46
  - docs/plans/2026-05-31-ids-column-for-sql-adapters.md
47
+ - docs/plans/2026-06-19-mongodb-export-remove-parallelism-native-ext.md
48
+ - docs/sql-dump-optimization-notes.md
44
49
  - exe/exwiw
50
+ - ext/exwiw/ext_json/ext_json.c
51
+ - ext/exwiw/ext_json/extconf.rb
45
52
  - lib/exwiw.rb
46
53
  - lib/exwiw/adapter.rb
47
54
  - lib/exwiw/adapter/mongodb_adapter.rb
48
55
  - lib/exwiw/adapter/mysql_adapter.rb
49
56
  - lib/exwiw/adapter/mysql_client.rb
50
57
  - lib/exwiw/adapter/postgresql_adapter.rb
58
+ - lib/exwiw/adapter/sql_bulk_insert.rb
51
59
  - lib/exwiw/adapter/sqlite_adapter.rb
52
60
  - lib/exwiw/after_insert_hook.rb
53
61
  - lib/exwiw/belongs_to.rb
@@ -56,6 +64,7 @@ files:
56
64
  - lib/exwiw/determine_table_processing_order.rb
57
65
  - lib/exwiw/embedded_in.rb
58
66
  - lib/exwiw/explain_runner.rb
67
+ - lib/exwiw/ext_json.rb
59
68
  - lib/exwiw/mongo_query.rb
60
69
  - lib/exwiw/mongodb_collection_config.rb
61
70
  - lib/exwiw/mongodb_field.rb