exwiw 0.5.3 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,91 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class PostgresqlAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.exec(sql).values`). It pulls rows off the wire one
10
+ # at a time via libpq's single-row mode instead of buffering the whole
11
+ # result set, so the dump's dominant memory cost — a Ruby array as large
12
+ # as the table — never materializes. The Runner drives it exactly like the
13
+ # old Array: #size to skip empty tables and log the count, then a single
14
+ # streaming pass (SqlBulkInsert#write_inserts -> each_slice) to write the
15
+ # INSERT.
16
+ #
17
+ # Mirrors MongodbAdapter::StreamingResult; two SQL-specific differences:
18
+ # - #size cannot be answered cheaply from the cursor, so it runs a
19
+ # separate `SELECT COUNT(*)` of the same query. (MongoDB uses
20
+ # count_documents, an index-only walk; the SQL COUNT re-runs the query
21
+ # plan but transfers no row data — Postgres prunes the unused
22
+ # projection of the wrapped subquery.) This keeps the Runner contract
23
+ # unchanged, so MongoDB and the other SQL adapters are untouched.
24
+ # - the streaming pass ties up the connection until fully drained. The
25
+ # Runner always drains it (write_inserts) before issuing any further
26
+ # query (post_insert_sql / DELETE) on the same connection, so the
27
+ # ordering invariant holds.
28
+ class StreamingResult
29
+ include Enumerable
30
+
31
+ def initialize(connection:, data_sql:, count_sql:)
32
+ @connection = connection
33
+ @data_sql = data_sql
34
+ @count_sql = count_sql
35
+ end
36
+
37
+ def size
38
+ @size ||= @connection.exec(@count_sql).getvalue(0, 0).to_i
39
+ end
40
+ alias length size
41
+
42
+ # Stream the result set row by row. Each row is an Array of String|nil
43
+ # in libpq's text format — byte-identical to what `#exec(sql).values`
44
+ # produced, so the generated INSERT is unchanged.
45
+ def each
46
+ return enum_for(:each) { size } unless block_given?
47
+
48
+ @connection.send_query(@data_sql)
49
+ @connection.set_single_row_mode
50
+ begin
51
+ while (result = @connection.get_result)
52
+ begin
53
+ result.check
54
+ result.each_row { |row| yield row }
55
+ ensure
56
+ result.clear
57
+ end
58
+ end
59
+ rescue StandardError
60
+ # If iteration is abandoned mid-stream (a SQL error surfaced by
61
+ # #check, or the consumer raised), drain any results still queued so
62
+ # a later query on this same connection does not fail with "another
63
+ # command is already in progress".
64
+ drain
65
+ raise
66
+ end
67
+ self
68
+ end
69
+
70
+ private def drain
71
+ while (result = @connection.get_result)
72
+ result.clear
73
+ end
74
+ rescue PG::Error
75
+ # Connection already errored/clean; nothing left to drain.
76
+ end
77
+ end
78
+
6
79
  def build_query(table, dump_target, table_by_name)
7
80
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
81
  end
9
82
 
10
83
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
84
+ data_sql = commented_sql(query_ast)
85
+ # Count via the same query (wrapped as a subquery) so the Runner can
86
+ # skip empty tables and log the row count without draining the stream.
87
+ count_sql = "#{sql_query_comment(query_ast)} SELECT COUNT(*) FROM (#{compile_ast(query_ast)}) AS exwiw_count_src"
12
88
 
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.exec(sql).values
89
+ @logger.debug(" Executing SQL (single-row stream): \n#{data_sql}")
90
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
91
  end
16
92
 
17
93
  def explain(query_ast)
@@ -97,22 +173,16 @@ module Exwiw
97
173
  @logger.info(" Wrote schema for #{table_names.size} table(s) to #{output_path}.")
98
174
  end
99
175
 
100
- def to_bulk_insert(results, table)
176
+ # The INSERT header for this adapter. PostgreSQL uses bare identifiers.
177
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
178
+ # and the trailing `;`.
179
+ private def insert_header(table)
101
180
  table_name = table.name
102
-
103
- value_list = results.map do |row|
104
- quoted_values = row.map do |value|
105
- escape_value(value)
106
- end
107
- "(" + quoted_values.join(', ') + ")"
108
- end
109
- values = value_list.join(",\n")
110
-
111
181
  if table.rails_managed?
112
- "INSERT INTO #{table_name} VALUES\n#{values};"
182
+ "INSERT INTO #{table_name} VALUES\n"
113
183
  else
114
184
  column_names = table.columns.map(&:name).join(', ')
115
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
185
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
116
186
  end
117
187
  end
118
188
 
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Exwiw
4
+ module Adapter
5
+ # Shared bulk-INSERT construction for the SQL adapters (mysql / postgresql /
6
+ # sqlite). They produce the same `INSERT INTO ... VALUES (...),(...);` shape
7
+ # and differ only in the header's identifier quoting (see #insert_header) and
8
+ # in #escape_value, so both the in-memory builder (#to_bulk_insert) and the
9
+ # bounded-memory streaming writer (#write_inserts) live here.
10
+ #
11
+ # Each including adapter must provide two private methods:
12
+ # - insert_header(table) -> the "INSERT INTO ... VALUES\n" prefix
13
+ # - escape_value(value) -> the SQL literal for one column value
14
+ module SqlBulkInsert
15
+ # How many rows' tuples to build-and-flush at a time when streaming. Bounds
16
+ # peak memory to this many tuples (plus their joined string) instead of the
17
+ # whole table's INSERT string, while keeping each flush a single fast
18
+ # Array#map + Array#join (the same C-level path #to_bulk_insert uses) so it
19
+ # stays close to whole-string speed — far faster than a naive row-at-a-time
20
+ # IO#print (see script/bench_sql_dump.rb / docs/sql-dump-optimization-notes.md).
21
+ # Mirrors MongoDB's default chunk size: bounded work per flush, but the SQL
22
+ # adapters still emit ONE statement (byte-identical to the un-chunked build).
23
+ STREAM_FLUSH_ROWS = 2_000
24
+
25
+ # Build the whole INSERT statement as a single String. Kept for callers
26
+ # that want the string form (and as the readable definition of the exact
27
+ # bytes #write_inserts streams).
28
+ def to_bulk_insert(results, table)
29
+ value_list = results.map { |row| insert_tuple(row) }
30
+ "#{insert_header(table)}#{value_list.join(",\n")};"
31
+ end
32
+
33
+ # Stream the bulk INSERT(s) straight to `io` instead of materializing the
34
+ # whole statement string first. Byte-for-byte identical to writing
35
+ # #to_bulk_insert per chunk joined by "\n" (verified by
36
+ # insert_output_snapshot_spec), but only one ~STREAM_FLUSH_ROWS-row buffer is
37
+ # resident at a time rather than the entire table's INSERT string. Returns
38
+ # the number of statements written.
39
+ def write_inserts(io, results, table, chunk_size)
40
+ chunks = chunk_size ? results.each_slice(chunk_size) : [results]
41
+ statement_count = 0
42
+ chunks.each do |chunk_rows|
43
+ io.print("\n") if statement_count.positive?
44
+ stream_single_insert(io, chunk_rows, table)
45
+ statement_count += 1
46
+ end
47
+ statement_count
48
+ end
49
+
50
+ # Emit one `INSERT INTO ... VALUES <tuples>;` statement to `io`, building
51
+ # and flushing the value tuples STREAM_FLUSH_ROWS at a time so the full
52
+ # statement text is never held in memory at once. Each slice is one fast
53
+ # map+join; the ",\n" between slices reproduces the same separator
54
+ # #to_bulk_insert puts between every tuple, so the bytes are identical.
55
+ private def stream_single_insert(io, rows, table)
56
+ io.print(insert_header(table))
57
+ first = true
58
+ rows.each_slice(STREAM_FLUSH_ROWS) do |slice|
59
+ io.print(",\n") unless first
60
+ first = false
61
+ io.print(slice.map { |row| insert_tuple(row) }.join(",\n"))
62
+ end
63
+ io.print(";")
64
+ end
65
+
66
+ private def insert_tuple(row)
67
+ "(" + row.map { |value| escape_value(value) }.join(', ') + ")"
68
+ end
69
+ end
70
+ end
71
+ end
@@ -3,15 +3,72 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class SqliteAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.execute(sql)`). It walks the result one row at a time
10
+ # via SQLite's statement cursor (Statement#each -> sqlite3_step) instead of
11
+ # buffering the whole result set, so the dump's dominant memory cost — a
12
+ # Ruby array as large as the table — never materializes. The Runner drives
13
+ # it exactly like the old Array: #size to skip empty tables and log the
14
+ # count, then a single streaming pass (SqlBulkInsert#write_inserts ->
15
+ # each_slice) to write the INSERT.
16
+ #
17
+ # Mirrors Mysql/PostgresqlAdapter::StreamingResult, with two SQLite
18
+ # specifics:
19
+ # - #size runs a separate `SELECT COUNT(*)` of the same query with the
20
+ # projection replaced by COUNT(*) (compile_ast(count_only: true)) —
21
+ # exact because exwiw's extraction queries have no DISTINCT/GROUP
22
+ # BY/LIMIT, so the row count is independent of the projection. (Unlike
23
+ # MySQL, SQLite tolerates a duplicate-column subquery wrap too, but the
24
+ # count_only form is shared with MySQL and avoids the extra subquery.)
25
+ # - SQLite is an embedded, single-connection engine that allows several
26
+ # active prepared statements at once, so the #size COUNT and the data
27
+ # cursor do not contend. The statement is closed in an ensure block so
28
+ # an abandoned mid-stream iteration still releases the cursor.
29
+ class StreamingResult
30
+ include Enumerable
31
+
32
+ def initialize(connection:, data_sql:, count_sql:)
33
+ @connection = connection
34
+ @data_sql = data_sql
35
+ @count_sql = count_sql
36
+ end
37
+
38
+ def size
39
+ @size ||= @connection.execute(@count_sql).dig(0, 0).to_i
40
+ end
41
+ alias length size
42
+
43
+ # Stream the result set row by row. Each row is an Array of values in
44
+ # SQLite's native type mapping — byte-identical to what
45
+ # `connection.execute(sql)` produced, so the generated INSERT is unchanged.
46
+ def each
47
+ return enum_for(:each) { size } unless block_given?
48
+
49
+ statement = @connection.prepare(@data_sql)
50
+ begin
51
+ statement.each { |row| yield row }
52
+ ensure
53
+ statement.close
54
+ end
55
+ self
56
+ end
57
+ end
58
+
6
59
  def build_query(table, dump_target, table_by_name)
7
60
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
61
  end
9
62
 
10
63
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
12
-
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.execute(sql)
64
+ data_sql = commented_sql(query_ast)
65
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
66
+ # the Runner can skip empty tables and log the row count without draining
67
+ # the cursor. See StreamingResult for why this is exact.
68
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
69
+
70
+ @logger.debug(" Executing SQL (cursor stream): \n#{data_sql}")
71
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
72
  end
16
73
 
17
74
  def explain(query_ast)
@@ -63,22 +120,16 @@ module Exwiw
63
120
  stmt.end_with?(';') ? stmt : "#{stmt};"
64
121
  end
65
122
 
66
- def to_bulk_insert(results, table)
123
+ # The INSERT header for this adapter. SQLite uses bare identifiers.
124
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
125
+ # and the trailing `;`.
126
+ private def insert_header(table)
67
127
  table_name = table.name
68
-
69
- value_list = results.map do |row|
70
- quoted_values = row.map do |value|
71
- escape_value(value)
72
- end
73
- "(" + quoted_values.join(', ') + ")"
74
- end
75
- values = value_list.join(",\n")
76
-
77
128
  if table.rails_managed?
78
- "INSERT INTO #{table_name} VALUES\n#{values};"
129
+ "INSERT INTO #{table_name} VALUES\n"
79
130
  else
80
131
  column_names = table.columns.map(&:name).join(', ')
81
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
132
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
82
133
  end
83
134
  end
84
135
 
@@ -140,11 +191,17 @@ module Exwiw
140
191
  sql
141
192
  end
142
193
 
143
- def compile_ast(query_ast)
194
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
195
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
196
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
197
+ # not depend on the projection.
198
+ def compile_ast(query_ast, count_only: false)
144
199
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
145
200
 
146
201
  sql = "SELECT "
147
- sql += if query_ast.select_all
202
+ sql += if count_only
203
+ "COUNT(*)"
204
+ elsif query_ast.select_all
148
205
  "*"
149
206
  else
150
207
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
data/lib/exwiw/adapter.rb CHANGED
@@ -123,6 +123,34 @@ module Exwiw
123
123
  nil
124
124
  end
125
125
 
126
+ # Write the bulk INSERT/JSONL output for `results` to the open `io`,
127
+ # returning the number of statements written. The Runner calls this once
128
+ # per table for the non-COPY path.
129
+ #
130
+ # Default: build each chunk's output as a full string via #to_bulk_insert
131
+ # and write it, separating statements with "\n" — exactly what the Runner
132
+ # used to inline. This keeps the dominant memory cost at one chunk's
133
+ # serialized string (bounded by `chunk_size`), which is why MongoDB sets a
134
+ # positive default chunk size. Adapters whose output is a single large
135
+ # statement (the SQL adapters, where chunk_size is nil) override this to
136
+ # stream the statement to `io` in bounded buffers instead of holding the
137
+ # whole thing in memory.
138
+ #
139
+ # @param io [IO] open output file
140
+ # @param results [Enumerable] rows/documents from #execute
141
+ # @param table the table/collection config
142
+ # @param chunk_size [Integer, nil] rows per statement (nil => one statement)
143
+ def write_inserts(io, results, table, chunk_size)
144
+ chunks = chunk_size ? results.each_slice(chunk_size) : [results]
145
+ statement_count = 0
146
+ chunks.each do |chunk_rows|
147
+ io.print("\n") if statement_count.positive?
148
+ io.print(to_bulk_insert(chunk_rows, table))
149
+ statement_count += 1
150
+ end
151
+ statement_count
152
+ end
153
+
126
154
  # Run the database-specific EXPLAIN for the given query and return the
127
155
  # output as a single string for `explain` subcommand to print.
128
156
  # SQL adapters override; MongodbAdapter currently raises.
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Exwiw
6
+ # MongoDB Relaxed Extended JSON encoder for a single dumped document.
7
+ #
8
+ # `encode` is the one entry point. When the optional native extension compiled
9
+ # (the common case once `gem install exwiw` builds it), it emits the line in a
10
+ # single C tree-walk; otherwise it falls back to the pure-Ruby path. Both are
11
+ # byte-for-byte identical — the native path delegates every value it does not
12
+ # format itself back to `encode_fragment` (see ext/exwiw/ext_json/ext_json.c).
13
+ module ExtJson
14
+ module_function
15
+
16
+ # Pure-Ruby encoder for one value, identical to the historical
17
+ # `JSON.generate(doc.as_extended_json(mode: :relaxed))`. Used both as the
18
+ # whole-document fallback and as the native path's per-value delegate, so the
19
+ # two paths cannot diverge.
20
+ def encode_fragment(value)
21
+ JSON.generate(value.respond_to?(:as_extended_json) ? value.as_extended_json(mode: :relaxed) : value)
22
+ end
23
+
24
+ begin
25
+ require "exwiw/ext_json_native" # defines Exwiw::ExtJson.encode_native
26
+ def encode(doc) = encode_native(doc)
27
+ rescue LoadError
28
+ # No compiled extension (JRuby/TruffleRuby, or a host where the build
29
+ # failed): keep exwiw working as a pure-Ruby gem.
30
+ def encode(doc) = encode_fragment(doc)
31
+ end
32
+ end
33
+ end
data/lib/exwiw/runner.rb CHANGED
@@ -97,30 +97,24 @@ module Exwiw
97
97
  else
98
98
  phase = "generating INSERT statement"
99
99
  @logger.debug(" Generate INSERT statement...")
100
- # Stream each chunk straight to the file instead of building the whole
101
- # table's INSERT/JSONL output as one string first. This keeps only a
102
- # single chunk's serialized text (and its transient intermediate
103
- # objects) in memory at a time — important for large MongoDB
104
- # collections, whose one-giant-chunk JSONL would otherwise be held in
105
- # full alongside the already-large in-memory result set.
100
+ # Let the adapter write the INSERT/JSONL output straight to the file
101
+ # instead of building the whole table's output as one string first,
102
+ # so only a bounded amount of serialized text is resident at a time —
103
+ # important for large tables/collections whose one-shot output would
104
+ # otherwise be held in full alongside the already-large result set.
106
105
  #
107
106
  # The chunk size falls back to the adapter's default when the table
108
- # config does not set one (SQL adapters: nil -> one statement, as
109
- # before; MongoDB: a positive default so the output is chunked). The
110
- # bytes written are identical to joining the chunks with "\n" and
111
- # appending a trailing newline, matching the previous `file.puts`.
107
+ # config does not set one (SQL adapters: nil -> one statement, but
108
+ # streamed in bounded buffers; MongoDB: a positive default so the
109
+ # JSONL is chunked). #write_inserts emits bytes identical to the
110
+ # previous inline chunk loop and returns the statement count.
112
111
  chunk_size = table.bulk_insert_chunk_size || adapter.default_bulk_insert_chunk_size
113
- chunks = chunk_size ? results.each_slice(chunk_size) : [results]
114
112
 
115
113
  statement_count = 0
116
114
  File.open(File.join(@output_dir, "insert-#{insert_idx}-#{table_name}.#{adapter.output_extension}"), 'w') do |file|
117
115
  pre = adapter.pre_insert_sql(table)
118
116
  file.puts(pre) if pre
119
- chunks.each do |chunk_rows|
120
- file.print("\n") if statement_count.positive?
121
- file.print(adapter.to_bulk_insert(chunk_rows, table))
122
- statement_count += 1
123
- end
117
+ statement_count = adapter.write_inserts(file, results, table, chunk_size)
124
118
  file.print("\n")
125
119
  post = adapter.post_insert_sql(table)
126
120
  file.puts(post) if post
data/lib/exwiw/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Exwiw
4
- VERSION = "0.5.3"
4
+ VERSION = "0.6.1"
5
5
  end
data/lib/exwiw.rb CHANGED
@@ -5,6 +5,7 @@ require_relative "exwiw/version"
5
5
  require "json"
6
6
  require "serdes"
7
7
 
8
+ require_relative "exwiw/ext_json"
8
9
  require_relative "exwiw/belongs_to"
9
10
  require_relative "exwiw/table_column"
10
11
  require_relative "exwiw/table_config"
@@ -13,6 +14,7 @@ require_relative "exwiw/mongodb_field"
13
14
  require_relative "exwiw/mongodb_collection_config"
14
15
  require_relative "exwiw/ddl_postprocessor"
15
16
  require_relative "exwiw/adapter"
17
+ require_relative "exwiw/adapter/sql_bulk_insert"
16
18
  require_relative "exwiw/adapter/sqlite_adapter"
17
19
  require_relative "exwiw/adapter/mysql_client"
18
20
  require_relative "exwiw/adapter/mysql_adapter"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: exwiw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shia
@@ -29,12 +29,14 @@ email:
29
29
  - rise.shia@gmail.com
30
30
  executables:
31
31
  - exwiw
32
- extensions: []
32
+ extensions:
33
+ - ext/exwiw/ext_json/extconf.rb
33
34
  extra_rdoc_files: []
34
35
  files:
35
36
  - CHANGELOG.md
36
37
  - LICENSE.txt
37
38
  - README.md
39
+ - docs/mongodb-scoping-fullscan-notes.md
38
40
  - docs/optimization-notes.md
39
41
  - docs/optimize-mongodb-export-with-native-ext.md
40
42
  - docs/plans/2026-05-15-insert-000-schema-file.md
@@ -44,13 +46,17 @@ files:
44
46
  - docs/plans/2026-05-29-rails-managed-tables.md
45
47
  - docs/plans/2026-05-31-ids-column-for-sql-adapters.md
46
48
  - docs/plans/2026-06-19-mongodb-export-remove-parallelism-native-ext.md
49
+ - docs/sql-dump-optimization-notes.md
47
50
  - exe/exwiw
51
+ - ext/exwiw/ext_json/ext_json.c
52
+ - ext/exwiw/ext_json/extconf.rb
48
53
  - lib/exwiw.rb
49
54
  - lib/exwiw/adapter.rb
50
55
  - lib/exwiw/adapter/mongodb_adapter.rb
51
56
  - lib/exwiw/adapter/mysql_adapter.rb
52
57
  - lib/exwiw/adapter/mysql_client.rb
53
58
  - lib/exwiw/adapter/postgresql_adapter.rb
59
+ - lib/exwiw/adapter/sql_bulk_insert.rb
54
60
  - lib/exwiw/adapter/sqlite_adapter.rb
55
61
  - lib/exwiw/after_insert_hook.rb
56
62
  - lib/exwiw/belongs_to.rb
@@ -59,6 +65,7 @@ files:
59
65
  - lib/exwiw/determine_table_processing_order.rb
60
66
  - lib/exwiw/embedded_in.rb
61
67
  - lib/exwiw/explain_runner.rb
68
+ - lib/exwiw/ext_json.rb
62
69
  - lib/exwiw/mongo_query.rb
63
70
  - lib/exwiw/mongodb_collection_config.rb
64
71
  - lib/exwiw/mongodb_field.rb