exwiw 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,72 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class SqliteAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.execute(sql)`). It walks the result one row at a time
10
+ # via SQLite's statement cursor (Statement#each -> sqlite3_step) instead of
11
+ # buffering the whole result set, so the dump's dominant memory cost — a
12
+ # Ruby array as large as the table — never materializes. The Runner drives
13
+ # it exactly like the old Array: #size to skip empty tables and log the
14
+ # count, then a single streaming pass (SqlBulkInsert#write_inserts ->
15
+ # each_slice) to write the INSERT.
16
+ #
17
+ # Mirrors Mysql/PostgresqlAdapter::StreamingResult, with two SQLite
18
+ # specifics:
19
+ # - #size runs a separate `SELECT COUNT(*)` of the same query with the
20
+ # projection replaced by COUNT(*) (compile_ast(count_only: true)) —
21
+ # exact because exwiw's extraction queries have no DISTINCT/GROUP
22
+ # BY/LIMIT, so the row count is independent of the projection. (Unlike
23
+ # MySQL, SQLite would tolerate a duplicate-column subquery wrap, but the
24
+ # count_only form is shared with MySQL and avoids the extra subquery.)
25
+ # - SQLite is an embedded, single-connection engine that allows several
26
+ # active prepared statements at once, so the #size COUNT and the data
27
+ # cursor do not contend. The statement is closed in an ensure block so
28
+ # an abandoned mid-stream iteration still releases the cursor.
29
+ class StreamingResult
30
+ include Enumerable
31
+
32
+ def initialize(connection:, data_sql:, count_sql:)
33
+ @connection = connection
34
+ @data_sql = data_sql
35
+ @count_sql = count_sql
36
+ end
37
+
38
+ def size
39
+ @size ||= @connection.execute(@count_sql).dig(0, 0).to_i
40
+ end
41
+ alias length size
42
+
43
+ # Stream the result set row by row. Each row is an Array of values in
44
+ # SQLite's native type mapping — byte-identical to what
45
+ # `connection.execute(sql)` produced, so the generated INSERT is unchanged.
46
+ def each
47
+ return enum_for(:each) { size } unless block_given?
48
+
49
+ statement = @connection.prepare(@data_sql)
50
+ begin
51
+ statement.each { |row| yield row }
52
+ ensure
53
+ statement.close
54
+ end
55
+ self
56
+ end
57
+ end
58
+
6
59
  def build_query(table, dump_target, table_by_name)
7
60
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
61
  end
9
62
 
10
63
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
12
-
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.execute(sql)
64
+ data_sql = commented_sql(query_ast)
65
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
66
+ # the Runner can skip empty tables and log the row count without draining
67
+ # the cursor. See StreamingResult for why this is exact.
68
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
69
+
70
+ @logger.debug(" Executing SQL (cursor stream): \n#{data_sql}")
71
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
72
  end
16
73
 
17
74
  def explain(query_ast)
@@ -63,22 +120,16 @@ module Exwiw
63
120
  stmt.end_with?(';') ? stmt : "#{stmt};"
64
121
  end
65
122
 
66
- def to_bulk_insert(results, table)
123
+ # The INSERT header for this adapter. SQLite uses bare identifiers.
124
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
125
+ # and the trailing `;`.
126
+ private def insert_header(table)
67
127
  table_name = table.name
68
-
69
- value_list = results.map do |row|
70
- quoted_values = row.map do |value|
71
- escape_value(value)
72
- end
73
- "(" + quoted_values.join(', ') + ")"
74
- end
75
- values = value_list.join(",\n")
76
-
77
128
  if table.rails_managed?
78
- "INSERT INTO #{table_name} VALUES\n#{values};"
129
+ "INSERT INTO #{table_name} VALUES\n"
79
130
  else
80
131
  column_names = table.columns.map(&:name).join(', ')
81
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
132
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
82
133
  end
83
134
  end
84
135
 
@@ -140,11 +191,17 @@ module Exwiw
140
191
  sql
141
192
  end
142
193
 
143
- def compile_ast(query_ast)
194
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
195
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
196
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
197
+ # not depend on the projection.
198
+ def compile_ast(query_ast, count_only: false)
144
199
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
145
200
 
146
201
  sql = "SELECT "
147
- sql += if query_ast.select_all
202
+ sql += if count_only
203
+ "COUNT(*)"
204
+ elsif query_ast.select_all
148
205
  "*"
149
206
  else
150
207
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
data/lib/exwiw/adapter.rb CHANGED
@@ -123,6 +123,34 @@ module Exwiw
123
123
  nil
124
124
  end
125
125
 
126
+ # Write the bulk INSERT/JSONL output for `results` to the open `io`,
127
+ # returning the number of statements written. The Runner calls this once
128
+ # per table for the non-COPY path.
129
+ #
130
+ # Default: build each chunk's output as a full string via #to_bulk_insert
131
+ # and write it, separating statements with "\n" — exactly what the Runner
132
+ # used to inline. This keeps the dominant memory cost at one chunk's
133
+ # serialized string (bounded by `chunk_size`), which is why MongoDB sets a
134
+ # positive default chunk size. Adapters whose output is a single large
135
+ # statement (the SQL adapters, where chunk_size is nil) override this to
136
+ # stream the statement to `io` in bounded buffers instead of holding the
137
+ # whole thing in memory.
138
+ #
139
+ # @param io [IO] open output file
140
+ # @param results [Enumerable] rows/documents from #execute
141
+ # @param table the table/collection config
142
+ # @param chunk_size [Integer, nil] rows per statement (nil => one statement)
143
+ def write_inserts(io, results, table, chunk_size)
144
+ chunks = chunk_size ? results.each_slice(chunk_size) : [results]
145
+ statement_count = 0
146
+ chunks.each do |chunk_rows|
147
+ io.print("\n") if statement_count.positive?
148
+ io.print(to_bulk_insert(chunk_rows, table))
149
+ statement_count += 1
150
+ end
151
+ statement_count
152
+ end
153
+
126
154
  # Run the database-specific EXPLAIN for the given query and return the
127
155
  # output as a single string for `explain` subcommand to print.
128
156
  # SQL adapters override; MongodbAdapter currently raises.
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Exwiw
6
+ # MongoDB Relaxed Extended JSON encoder for a single dumped document.
7
+ #
8
+ # `encode` is the one entry point. When the optional native extension is compiled
9
+ # (the common case once `gem install exwiw` builds it), it emits the line in a
10
+ # single C tree-walk; otherwise it falls back to the pure-Ruby path. Both are
11
+ # byte-for-byte identical — the native path delegates every value it does not
12
+ # format itself back to `encode_fragment` (see ext/exwiw/ext_json/ext_json.c).
13
+ module ExtJson
14
+ module_function
15
+
16
+ # Pure-Ruby encoder for one value, identical to the historical
17
+ # `JSON.generate(doc.as_extended_json(mode: :relaxed))`. Used both as the
18
+ # whole-document fallback and as the native path's per-value delegate, so the
19
+ # two paths cannot diverge.
20
+ def encode_fragment(value)
21
+ JSON.generate(value.respond_to?(:as_extended_json) ? value.as_extended_json(mode: :relaxed) : value)
22
+ end
23
+
24
+ begin
25
+ require "exwiw/ext_json_native" # defines Exwiw::ExtJson.encode_native
26
+ def encode(doc) = encode_native(doc)
27
+ rescue LoadError
28
+ # No compiled extension (JRuby/TruffleRuby, or a host where the build
29
+ # failed): keep exwiw working as a pure-Ruby gem.
30
+ def encode(doc) = encode_fragment(doc)
31
+ end
32
+ end
33
+ end
data/lib/exwiw/runner.rb CHANGED
@@ -97,30 +97,24 @@ module Exwiw
97
97
  else
98
98
  phase = "generating INSERT statement"
99
99
  @logger.debug(" Generate INSERT statement...")
100
- # Stream each chunk straight to the file instead of building the whole
101
- # table's INSERT/JSONL output as one string first. This keeps only a
102
- # single chunk's serialized text (and its transient intermediate
103
- # objects) in memory at a time important for large MongoDB
104
- # collections, whose one-giant-chunk JSONL would otherwise be held in
105
- # full alongside the already-large in-memory result set.
100
+ # Let the adapter write the INSERT/JSONL output straight to the file
101
+ # instead of building the whole table's output as one string first,
102
+ # so only a bounded amount of serialized text is resident at a time —
103
+ # important for large tables/collections whose one-shot output would
104
+ # otherwise be held in full alongside the already-large result set.
106
105
  #
107
106
  # The chunk size falls back to the adapter's default when the table
108
- # config does not set one (SQL adapters: nil -> one statement, as
109
- # before; MongoDB: a positive default so the output is chunked). The
110
- # bytes written are identical to joining the chunks with "\n" and
111
- # appending a trailing newline, matching the previous `file.puts`.
107
+ # config does not set one (SQL adapters: nil -> one statement, but
108
+ # streamed in bounded buffers; MongoDB: a positive default so the
109
+ # JSONL is chunked). #write_inserts emits bytes identical to the
110
+ # previous inline chunk loop and returns the statement count.
112
111
  chunk_size = table.bulk_insert_chunk_size || adapter.default_bulk_insert_chunk_size
113
- chunks = chunk_size ? results.each_slice(chunk_size) : [results]
114
112
 
115
113
  statement_count = 0
116
114
  File.open(File.join(@output_dir, "insert-#{insert_idx}-#{table_name}.#{adapter.output_extension}"), 'w') do |file|
117
115
  pre = adapter.pre_insert_sql(table)
118
116
  file.puts(pre) if pre
119
- chunks.each do |chunk_rows|
120
- file.print("\n") if statement_count.positive?
121
- file.print(adapter.to_bulk_insert(chunk_rows, table))
122
- statement_count += 1
123
- end
117
+ statement_count = adapter.write_inserts(file, results, table, chunk_size)
124
118
  file.print("\n")
125
119
  post = adapter.post_insert_sql(table)
126
120
  file.puts(post) if post
data/lib/exwiw/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Exwiw
4
- VERSION = "0.5.3"
4
+ VERSION = "0.6.0"
5
5
  end
data/lib/exwiw.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "exwiw/version"
5
5
  require "json"
6
6
  require "serdes"
7
7
 
8
+ require_relative "exwiw/ext_json"
8
9
  require_relative "exwiw/belongs_to"
9
10
  require_relative "exwiw/table_column"
10
11
  require_relative "exwiw/table_config"
@@ -13,6 +14,7 @@ require_relative "exwiw/mongodb_field"
13
14
  require_relative "exwiw/mongodb_collection_config"
14
15
  require_relative "exwiw/ddl_postprocessor"
15
16
  require_relative "exwiw/adapter"
17
+ require_relative "exwiw/adapter/sql_bulk_insert"
16
18
  require_relative "exwiw/adapter/sqlite_adapter"
17
19
  require_relative "exwiw/adapter/mysql_client"
18
20
  require_relative "exwiw/adapter/mysql_adapter"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: exwiw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shia
@@ -29,7 +29,8 @@ email:
29
29
  - rise.shia@gmail.com
30
30
  executables:
31
31
  - exwiw
32
- extensions: []
32
+ extensions:
33
+ - ext/exwiw/ext_json/extconf.rb
33
34
  extra_rdoc_files: []
34
35
  files:
35
36
  - CHANGELOG.md
@@ -44,13 +45,17 @@ files:
44
45
  - docs/plans/2026-05-29-rails-managed-tables.md
45
46
  - docs/plans/2026-05-31-ids-column-for-sql-adapters.md
46
47
  - docs/plans/2026-06-19-mongodb-export-remove-parallelism-native-ext.md
48
+ - docs/sql-dump-optimization-notes.md
47
49
  - exe/exwiw
50
+ - ext/exwiw/ext_json/ext_json.c
51
+ - ext/exwiw/ext_json/extconf.rb
48
52
  - lib/exwiw.rb
49
53
  - lib/exwiw/adapter.rb
50
54
  - lib/exwiw/adapter/mongodb_adapter.rb
51
55
  - lib/exwiw/adapter/mysql_adapter.rb
52
56
  - lib/exwiw/adapter/mysql_client.rb
53
57
  - lib/exwiw/adapter/postgresql_adapter.rb
58
+ - lib/exwiw/adapter/sql_bulk_insert.rb
54
59
  - lib/exwiw/adapter/sqlite_adapter.rb
55
60
  - lib/exwiw/after_insert_hook.rb
56
61
  - lib/exwiw/belongs_to.rb
@@ -59,6 +64,7 @@ files:
59
64
  - lib/exwiw/determine_table_processing_order.rb
60
65
  - lib/exwiw/embedded_in.rb
61
66
  - lib/exwiw/explain_runner.rb
67
+ - lib/exwiw/ext_json.rb
62
68
  - lib/exwiw/mongo_query.rb
63
69
  - lib/exwiw/mongodb_collection_config.rb
64
70
  - lib/exwiw/mongodb_field.rb