exwiw 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +3 -2
- data/docs/optimization-notes.md +126 -0
- data/docs/optimize-mongodb-export-with-native-ext.md +249 -0
- data/docs/plans/2026-05-15-insert-000-schema-file.md +4 -4
- data/docs/plans/2026-05-16-mongodb-from-clean-scenario.md +8 -8
- data/docs/plans/2026-05-22-postgres-copy-mode-scenario-test.md +7 -7
- data/docs/plans/2026-05-31-ids-column-for-sql-adapters.md +1 -1
- data/docs/plans/2026-06-19-mongodb-export-remove-parallelism-native-ext.md +70 -0
- data/docs/sql-dump-optimization-notes.md +278 -0
- data/ext/exwiw/ext_json/ext_json.c +274 -0
- data/ext/exwiw/ext_json/extconf.rb +8 -0
- data/lib/exwiw/adapter/mongodb_adapter.rb +159 -40
- data/lib/exwiw/adapter/mysql_adapter.rb +70 -18
- data/lib/exwiw/adapter/mysql_client.rb +43 -0
- data/lib/exwiw/adapter/postgresql_adapter.rb +85 -15
- data/lib/exwiw/adapter/sql_bulk_insert.rb +71 -0
- data/lib/exwiw/adapter/sqlite_adapter.rb +75 -18
- data/lib/exwiw/adapter.rb +38 -0
- data/lib/exwiw/ext_json.rb +33 -0
- data/lib/exwiw/runner.rb +18 -6
- data/lib/exwiw/version.rb +1 -1
- data/lib/exwiw.rb +2 -0
- data/mise.toml +2 -2
- metadata +11 -2
|
@@ -14,6 +14,57 @@ module Exwiw
|
|
|
14
14
|
Exwiw::MongodbCollectionConfig
|
|
15
15
|
end
|
|
16
16
|
|
|
17
|
+
# A lazy, streaming stand-in for the materialized result array #execute
|
|
18
|
+
# used to return. Wrapping the Mongo cursor (instead of `.to_a`) keeps the
|
|
19
|
+
# dump's dominant memory cost — the full result set — off the heap: the
|
|
20
|
+
# Runner pulls documents through `each_slice`, so at most one chunk of
|
|
21
|
+
# documents (plus the small propagation-key arrays) is resident at a time,
|
|
22
|
+
# even for large or embed-heavy collections.
|
|
23
|
+
#
|
|
24
|
+
# It satisfies the two things the Runner asks of an execute result:
|
|
25
|
+
# - #size: the record count, used to skip empty collections and to log.
|
|
26
|
+
# Answered with a `count_documents` query (which, for an index-covered filter, only walks index
|
|
27
|
+
# entries, far cheaper than fetching every document) rather than by
|
|
28
|
+
# draining the cursor.
|
|
29
|
+
# - #each (via Enumerable / each_slice): a single streaming pass over the
|
|
30
|
+
# cursor. While streaming it captures — per propagation key, BEFORE
|
|
31
|
+
# handing the document to the caller's masking — the values downstream
|
|
32
|
+
# children will `$in`-match against, publishing them into @state once
|
|
33
|
+
# the pass completes.
|
|
34
|
+
#
|
|
35
|
+
# Contract note: unlike the old `.to_a` execute, which populated @state
|
|
36
|
+
# eagerly, this defers state capture until the result is consumed. The
|
|
37
|
+
# Runner always fully consumes a non-empty result before any child
|
|
38
|
+
# collection is processed, so propagation is unaffected; a caller that only
|
|
39
|
+
# needs @state must iterate the result (e.g. `.to_a`).
|
|
40
|
+
class StreamingResult
|
|
41
|
+
include Enumerable
|
|
42
|
+
|
|
43
|
+
def initialize(view:, collection:, keys:, state:)
|
|
44
|
+
@view = view
|
|
45
|
+
@collection = collection
|
|
46
|
+
@keys = keys
|
|
47
|
+
@state = state
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def size
|
|
51
|
+
@size ||= @view.count_documents
|
|
52
|
+
end
|
|
53
|
+
alias length size
|
|
54
|
+
|
|
55
|
+
def each
|
|
56
|
+
return enum_for(:each) { size } unless block_given?
|
|
57
|
+
|
|
58
|
+
captured = @keys.each_with_object({}) { |key, acc| acc[key] = [] }
|
|
59
|
+
@view.each do |doc|
|
|
60
|
+
@keys.each { |key| captured[key] << doc[key] }
|
|
61
|
+
yield doc
|
|
62
|
+
end
|
|
63
|
+
@state[@collection] = captured
|
|
64
|
+
self
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
17
68
|
def initialize(connection_config, logger)
|
|
18
69
|
super
|
|
19
70
|
@state = {}
|
|
@@ -86,22 +137,24 @@ module Exwiw
|
|
|
86
137
|
def execute(query)
|
|
87
138
|
@logger.debug(" Executing Mongo find on '#{query.collection}': filter=#{query.filter.inspect} projection=#{query.projection.inspect}")
|
|
88
139
|
|
|
89
|
-
|
|
140
|
+
view = db[query.collection]
|
|
90
141
|
.find(query.filter)
|
|
91
142
|
.projection(query.projection)
|
|
92
143
|
.comment(query_comment_text("collection=#{query.collection}"))
|
|
93
|
-
.to_a
|
|
94
144
|
|
|
95
|
-
#
|
|
96
|
-
#
|
|
145
|
+
# Per referenced field, the values children will `$in`-match against.
|
|
146
|
+
# @propagation_keys is set by the build_query call for this same
|
|
97
147
|
# collection; fall back to the primary key if execute is driven without a
|
|
98
148
|
# preceding build_query (e.g. in isolation from a test).
|
|
99
149
|
keys = @propagation_keys || [query.primary_key]
|
|
100
|
-
@state[query.collection] = keys.each_with_object({}) do |key, acc|
|
|
101
|
-
acc[key] = docs.map { |doc| doc[key] }
|
|
102
|
-
end
|
|
103
150
|
|
|
104
|
-
|
|
151
|
+
# Return a streaming view of the result set rather than `.to_a`-ing the
|
|
152
|
+
# whole collection into memory. The Runner pulls documents through
|
|
153
|
+
# `each_slice`, so only one chunk's worth is resident at a time even for
|
|
154
|
+
# large / embed-heavy collections — the dump's dominant memory cost. The
|
|
155
|
+
# propagation-key values are captured as the cursor streams and published
|
|
156
|
+
# into @state once the pass completes (see StreamingResult).
|
|
157
|
+
StreamingResult.new(view: view, collection: query.collection, keys: keys, state: @state)
|
|
105
158
|
end
|
|
106
159
|
|
|
107
160
|
# NOTE: relies on @embedded_children_by_parent set by a prior build_query
|
|
@@ -110,10 +163,10 @@ module Exwiw
|
|
|
110
163
|
# to_bulk_insert (SQL adapters don't need it). Safe in Runner, fragile in
|
|
111
164
|
# tests — call build_query first.
|
|
112
165
|
def to_bulk_insert(rows, config)
|
|
166
|
+
plan = mask_plan(config)
|
|
113
167
|
rows.map do |doc|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
JSON.generate(extended_json(doc))
|
|
168
|
+
apply_mask_plan!(doc, plan)
|
|
169
|
+
Exwiw::ExtJson.encode(doc)
|
|
117
170
|
end.join("\n")
|
|
118
171
|
end
|
|
119
172
|
|
|
@@ -135,6 +188,20 @@ module Exwiw
|
|
|
135
188
|
'jsonl'
|
|
136
189
|
end
|
|
137
190
|
|
|
191
|
+
# Bound how many documents are serialized at once when a collection config
|
|
192
|
+
# carries no explicit bulk_insert_chunk_size. A MongoDB dump is one JSONL
|
|
193
|
+
# line per document and, without chunking, the Runner would materialize the
|
|
194
|
+
# entire collection's output as a single giant string while the full
|
|
195
|
+
# in-memory result set is still alive — doubling peak memory on large or
|
|
196
|
+
# embed-heavy collections. Chunking lets the Runner stream each slice to the
|
|
197
|
+
# file and release its serialized string (and the transient extended-JSON
|
|
198
|
+
# trees) before building the next.
|
|
199
|
+
DEFAULT_BULK_INSERT_CHUNK_SIZE = 1_000
|
|
200
|
+
|
|
201
|
+
def default_bulk_insert_chunk_size
|
|
202
|
+
DEFAULT_BULK_INSERT_CHUNK_SIZE
|
|
203
|
+
end
|
|
204
|
+
|
|
138
205
|
def schema_output_extension
|
|
139
206
|
'js'
|
|
140
207
|
end
|
|
@@ -325,49 +392,101 @@ module Exwiw
|
|
|
325
392
|
parent_fields[reference_field]
|
|
326
393
|
end
|
|
327
394
|
|
|
328
|
-
|
|
329
|
-
|
|
395
|
+
# A masking plan compiled once per collection config and reused for every
|
|
396
|
+
# document of that collection. `masked_fields` is `[field_name,
|
|
397
|
+
# template_segments]` for each field carrying a `replace_with`;
|
|
398
|
+
# `embedded` is one EmbeddedMask per embedded child.
|
|
399
|
+
MaskPlan = Struct.new(:masked_fields, :embedded)
|
|
400
|
+
|
|
401
|
+
# A pre-resolved embedded-child mask: the parent path split once into
|
|
402
|
+
# `prefix` (the containers to descend into) and `last` (the field holding
|
|
403
|
+
# the subdocument(s)), plus the child's own MaskPlan.
|
|
404
|
+
EmbeddedMask = Struct.new(:prefix, :last, :plan)
|
|
405
|
+
|
|
406
|
+
# Build (or fetch) the cached MaskPlan for `config`. Masking runs over every
|
|
407
|
+
# document AND every embedded subdocument, so for an embed-heavy collection
|
|
408
|
+
# the same per-config decisions — which fields carry a `replace_with`, how
|
|
409
|
+
# each template splits into segments, where the embedded children live —
|
|
410
|
+
# were previously recomputed tens of times per document. Compiling them once
|
|
411
|
+
# per config lets #apply_mask_plan! do nothing but the work that actually
|
|
412
|
+
# varies per document (rendering templates, descending into subdocuments),
|
|
413
|
+
# so the savings grow with the number of embedded subdocuments per document.
|
|
414
|
+
#
|
|
415
|
+
# Cached by config name: names are unique within a run and the configs do
|
|
416
|
+
# not mutate mid-dump. Relies on @embedded_children_by_parent, set by the
|
|
417
|
+
# build_query call that always precedes to_bulk_insert (see #to_bulk_insert).
|
|
418
|
+
private def mask_plan(config)
|
|
419
|
+
(@mask_plans ||= {})[config.name] ||= build_mask_plan(config)
|
|
420
|
+
end
|
|
421
|
+
|
|
422
|
+
private def build_mask_plan(config)
|
|
423
|
+
masked_fields = config.fields.each_with_object([]) do |field, acc|
|
|
330
424
|
next unless field.replace_with
|
|
331
425
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
426
|
+
acc << [field.name, compile_template(field.replace_with)]
|
|
427
|
+
end
|
|
428
|
+
embedded = embedded_children_of(config).map do |child|
|
|
429
|
+
*prefix, last = child.embedded_in.path.split(".")
|
|
430
|
+
EmbeddedMask.new(prefix, last, build_mask_plan(child))
|
|
336
431
|
end
|
|
432
|
+
MaskPlan.new(masked_fields, embedded)
|
|
337
433
|
end
|
|
338
434
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
435
|
+
# Apply a precompiled MaskPlan to a document in place: render each masked
|
|
436
|
+
# field, then descend into each embedded child (recursing into its own
|
|
437
|
+
# plan). Equivalent to the old apply_replace_with! + apply_embedded_masking!
|
|
438
|
+
# pair, with all per-config lookups hoisted into the plan.
|
|
439
|
+
private def apply_mask_plan!(doc, plan)
|
|
440
|
+
plan.masked_fields.each do |name, segments|
|
|
441
|
+
doc[name] = render_template(segments, doc)
|
|
442
|
+
end
|
|
443
|
+
plan.embedded.each do |child|
|
|
444
|
+
container = child.prefix.reduce(doc) { |acc, seg| acc.is_a?(Hash) ? acc[seg] : nil }
|
|
445
|
+
next unless container.is_a?(Hash)
|
|
446
|
+
|
|
447
|
+
case (value = container[child.last])
|
|
448
|
+
when Array then value.each { |sub| apply_mask_plan!(sub, child.plan) if sub.is_a?(Hash) }
|
|
449
|
+
when Hash then apply_mask_plan!(value, child.plan)
|
|
344
450
|
end
|
|
345
451
|
end
|
|
346
452
|
end
|
|
347
453
|
|
|
348
|
-
|
|
349
|
-
|
|
454
|
+
PLACEHOLDER_PATTERN = /\{([^{}]+)\}/
|
|
455
|
+
|
|
456
|
+
# Split a `replace_with` template into a flat list of segments (called once
|
|
457
|
+
# per masked field at plan-build time, see #build_mask_plan). A segment is
|
|
458
|
+
# either a literal String or a 1-element Array `[ref]` marking a `{ref}`
|
|
459
|
+
# placeholder. #render_template then concatenates them, skipping the regex
|
|
460
|
+
# scan / block / `Regexp.last_match` a per-document `gsub` would repeat (~2.5x
|
|
461
|
+
# faster per field). The segment walk reproduces the old gsub byte-for-byte
|
|
462
|
+
# (missing keys render as "", literals pass through unchanged).
|
|
463
|
+
private def compile_template(template)
|
|
464
|
+
segments = []
|
|
465
|
+
pos = 0
|
|
466
|
+
while (md = PLACEHOLDER_PATTERN.match(template, pos))
|
|
467
|
+
segments << template[pos...md.begin(0)] if md.begin(0) > pos
|
|
468
|
+
segments << [md[1]]
|
|
469
|
+
pos = md.end(0)
|
|
470
|
+
end
|
|
471
|
+
segments << template[pos..] if pos < template.length
|
|
472
|
+
segments
|
|
350
473
|
end
|
|
351
474
|
|
|
352
|
-
private def
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
when Hash then yield value
|
|
475
|
+
private def render_template(segments, doc)
|
|
476
|
+
out = +''
|
|
477
|
+
segments.each do |seg|
|
|
478
|
+
if seg.is_a?(Array)
|
|
479
|
+
ref = seg[0]
|
|
480
|
+
out << (doc.key?(ref) ? doc[ref] : nil).to_s
|
|
481
|
+
else
|
|
482
|
+
out << seg
|
|
483
|
+
end
|
|
362
484
|
end
|
|
485
|
+
out
|
|
363
486
|
end
|
|
364
487
|
|
|
365
|
-
private def
|
|
366
|
-
|
|
367
|
-
doc.as_extended_json(mode: :relaxed)
|
|
368
|
-
else
|
|
369
|
-
doc
|
|
370
|
-
end
|
|
488
|
+
private def embedded_children_of(parent_config)
|
|
489
|
+
@embedded_children_by_parent.fetch(parent_config.name, [])
|
|
371
490
|
end
|
|
372
491
|
|
|
373
492
|
private def db
|
|
@@ -5,15 +5,67 @@ require 'open3'
|
|
|
5
5
|
module Exwiw
|
|
6
6
|
module Adapter
|
|
7
7
|
class MysqlAdapter < Base
|
|
8
|
+
include SqlBulkInsert
|
|
9
|
+
|
|
10
|
+
# A lazy, streaming stand-in for the materialized rows #execute used to
|
|
11
|
+
# return (`connection.query(sql).rows`). It pulls rows off the wire one at
|
|
12
|
+
# a time (mysql2 single-row stream) instead of buffering the whole result
|
|
13
|
+
# set, so the dump's dominant memory cost — a Ruby array as large as the
|
|
14
|
+
# table — never materializes. The Runner drives it exactly like the old
|
|
15
|
+
# Array: #size to skip empty tables and log the count, then a single
|
|
16
|
+
# streaming pass (SqlBulkInsert#write_inserts -> each_slice).
|
|
17
|
+
#
|
|
18
|
+
# Mirrors PostgresqlAdapter::StreamingResult, with two MySQL specifics:
|
|
19
|
+
# - #size runs a separate `SELECT COUNT(*)` of the same query. Unlike the
|
|
20
|
+
# pg path, it does NOT wrap the SELECT in a subquery: MySQL rejects a
|
|
21
|
+
# derived table with duplicate column names, which a rails-managed
|
|
22
|
+
# `SELECT *` joined to another table produces. Instead the projection
|
|
23
|
+
# is replaced by `COUNT(*)` (compile_ast(count_only: true)) — exact
|
|
24
|
+
# because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT,
|
|
25
|
+
# so the row count is independent of the projected columns.
|
|
26
|
+
# - the stream ties up the connection until fully drained. The Runner
|
|
27
|
+
# always drains it (write_inserts) before any further query
|
|
28
|
+
# (post_insert_sql / DELETE), and MysqlClient#stream_rows drains the
|
|
29
|
+
# remainder if iteration is abandoned, so the connection stays usable.
|
|
30
|
+
class StreamingResult
|
|
31
|
+
include Enumerable
|
|
32
|
+
|
|
33
|
+
def initialize(client:, data_sql:, count_sql:)
|
|
34
|
+
@client = client
|
|
35
|
+
@data_sql = data_sql
|
|
36
|
+
@count_sql = count_sql
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def size
|
|
40
|
+
@size ||= @client.query(@count_sql).rows.dig(0, 0).to_i
|
|
41
|
+
end
|
|
42
|
+
alias length size
|
|
43
|
+
|
|
44
|
+
# Stream the result set row by row. Each row is an Array of String|nil
|
|
45
|
+
# (mysql2 `cast: false` / stringified) — identical to what
|
|
46
|
+
# `connection.query(sql).rows` produced, so the generated INSERT is
|
|
47
|
+
# unchanged.
|
|
48
|
+
def each(&block)
|
|
49
|
+
return enum_for(:each) { size } unless block_given?
|
|
50
|
+
|
|
51
|
+
@client.stream_rows(@data_sql, &block)
|
|
52
|
+
self
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
8
56
|
def build_query(table, dump_target, table_by_name)
|
|
9
57
|
Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
|
|
10
58
|
end
|
|
11
59
|
|
|
12
60
|
def execute(query_ast)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
61
|
+
data_sql = commented_sql(query_ast)
|
|
62
|
+
# Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
|
|
63
|
+
# the Runner can skip empty tables and log the row count without draining
|
|
64
|
+
# the stream. See StreamingResult for why this is not a subquery wrap.
|
|
65
|
+
count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
|
|
66
|
+
|
|
67
|
+
@logger.debug(" Executing SQL (streaming): \n#{data_sql}")
|
|
68
|
+
StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
|
|
17
69
|
end
|
|
18
70
|
|
|
19
71
|
def explain(query_ast)
|
|
@@ -99,22 +151,16 @@ module Exwiw
|
|
|
99
151
|
"SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
|
|
100
152
|
end
|
|
101
153
|
|
|
102
|
-
|
|
154
|
+
# The INSERT header for this adapter. MySQL backtick-quotes the table and
|
|
155
|
+
# column identifiers. #to_bulk_insert / #write_inserts (SqlBulkInsert)
|
|
156
|
+
# append the value tuples and the trailing `;`.
|
|
157
|
+
private def insert_header(table)
|
|
103
158
|
table_name = table.name
|
|
104
|
-
|
|
105
|
-
value_list = results.map do |row|
|
|
106
|
-
quoted_values = row.map do |value|
|
|
107
|
-
escape_value(value)
|
|
108
|
-
end
|
|
109
|
-
"(" + quoted_values.join(', ') + ")"
|
|
110
|
-
end
|
|
111
|
-
values = value_list.join(",\n")
|
|
112
|
-
|
|
113
159
|
if table.rails_managed?
|
|
114
|
-
"INSERT INTO `#{table_name}` VALUES\n
|
|
160
|
+
"INSERT INTO `#{table_name}` VALUES\n"
|
|
115
161
|
else
|
|
116
162
|
column_names = table.columns.map { |c| "`#{c.name}`" }.join(', ')
|
|
117
|
-
"INSERT INTO `#{table_name}` (#{column_names}) VALUES\n
|
|
163
|
+
"INSERT INTO `#{table_name}` (#{column_names}) VALUES\n"
|
|
118
164
|
end
|
|
119
165
|
end
|
|
120
166
|
|
|
@@ -176,11 +222,17 @@ module Exwiw
|
|
|
176
222
|
sql
|
|
177
223
|
end
|
|
178
224
|
|
|
179
|
-
|
|
225
|
+
# @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
|
|
226
|
+
# projected columns (used by StreamingResult#size). Safe because exwiw's
|
|
227
|
+
# extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
|
|
228
|
+
# not depend on the projection.
|
|
229
|
+
def compile_ast(query_ast, count_only: false)
|
|
180
230
|
raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
|
|
181
231
|
|
|
182
232
|
sql = "SELECT "
|
|
183
|
-
sql += if
|
|
233
|
+
sql += if count_only
|
|
234
|
+
"COUNT(*)"
|
|
235
|
+
elsif query_ast.select_all
|
|
184
236
|
"*"
|
|
185
237
|
else
|
|
186
238
|
query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
|
|
@@ -118,6 +118,49 @@ module Exwiw
|
|
|
118
118
|
end
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
+
# Stream a query's rows one at a time, yielding each as an
|
|
122
|
+
# Array<String|nil> (the same row shape as #query) instead of buffering
|
|
123
|
+
# the whole result set. This keeps a large dump's dominant memory cost — a
|
|
124
|
+
# Ruby array as big as the table — from materializing.
|
|
125
|
+
#
|
|
126
|
+
# mysql2 streams server-side (`stream: true` + `cache_rows: false`).
|
|
127
|
+
# Its contract: a streamed result MUST be fully consumed before the next
|
|
128
|
+
# query on this connection, or the driver raises "Commands out of sync".
|
|
129
|
+
# The Runner consumes every row (it writes them all), but if the consumer
|
|
130
|
+
# block raises mid-stream we drain the remaining rows so the same
|
|
131
|
+
# connection is still usable for the next table's query.
|
|
132
|
+
#
|
|
133
|
+
# trilogy has no streaming cursor (no QUERY_FLAGS_STREAMING), so it buffers
|
|
134
|
+
# the result and yields from it — parity, but without the memory win (the
|
|
135
|
+
# same situation as the sqlite adapter). trilogy is a test-only driver;
|
|
136
|
+
# production connects via mysql2.
|
|
137
|
+
#
|
|
138
|
+
# @param sql [String]
|
|
139
|
+
# @yieldparam row [Array<String|nil>]
|
|
140
|
+
def stream_rows(sql)
|
|
141
|
+
return enum_for(:stream_rows, sql) unless block_given?
|
|
142
|
+
|
|
143
|
+
case @driver
|
|
144
|
+
when :mysql2
|
|
145
|
+
res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
|
|
146
|
+
begin
|
|
147
|
+
res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
|
|
148
|
+
rescue StandardError
|
|
149
|
+
begin
|
|
150
|
+
res.each { |_row| } # drain the remainder so the connection stays usable
|
|
151
|
+
rescue StandardError
|
|
152
|
+
nil
|
|
153
|
+
end
|
|
154
|
+
raise
|
|
155
|
+
end
|
|
156
|
+
when :trilogy
|
|
157
|
+
raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
|
|
158
|
+
else
|
|
159
|
+
raise "Unsupported MySQL driver: #{@driver.inspect}"
|
|
160
|
+
end
|
|
161
|
+
self
|
|
162
|
+
end
|
|
163
|
+
|
|
121
164
|
private def ensure_driver_loaded!
|
|
122
165
|
case @driver
|
|
123
166
|
when :mysql2 then require 'mysql2'
|
|
@@ -3,15 +3,91 @@
|
|
|
3
3
|
module Exwiw
|
|
4
4
|
module Adapter
|
|
5
5
|
class PostgresqlAdapter < Base
|
|
6
|
+
include SqlBulkInsert
|
|
7
|
+
|
|
8
|
+
# A lazy, streaming stand-in for the materialized rows #execute used to
|
|
9
|
+
# return (`connection.exec(sql).values`). It pulls rows off the wire one
|
|
10
|
+
# at a time via libpq's single-row mode instead of buffering the whole
|
|
11
|
+
# result set, so the dump's dominant memory cost — a Ruby array as large
|
|
12
|
+
# as the table — never materializes. The Runner drives it exactly like the
|
|
13
|
+
# old Array: #size to skip empty tables and log the count, then a single
|
|
14
|
+
# streaming pass (SqlBulkInsert#write_inserts -> each_slice) to write the
|
|
15
|
+
# INSERT.
|
|
16
|
+
#
|
|
17
|
+
# Mirrors MongodbAdapter::StreamingResult; two SQL-specific differences:
|
|
18
|
+
# - #size cannot be answered cheaply from the cursor, so it runs a
|
|
19
|
+
# separate `SELECT COUNT(*)` of the same query. (MongoDB uses
|
|
20
|
+
# count_documents, an index-only walk; the SQL COUNT re-runs the query
|
|
21
|
+
# plan but transfers no row data — Postgres prunes the unused
|
|
22
|
+
# projection of the wrapped subquery.) This keeps the Runner contract
|
|
23
|
+
# unchanged, so MongoDB and the other SQL adapters are untouched.
|
|
24
|
+
# - the streaming pass ties up the connection until fully drained. The
|
|
25
|
+
# Runner always drains it (write_inserts) before issuing any further
|
|
26
|
+
# query (post_insert_sql / DELETE) on the same connection, so the
|
|
27
|
+
# ordering invariant holds.
|
|
28
|
+
class StreamingResult
|
|
29
|
+
include Enumerable
|
|
30
|
+
|
|
31
|
+
def initialize(connection:, data_sql:, count_sql:)
|
|
32
|
+
@connection = connection
|
|
33
|
+
@data_sql = data_sql
|
|
34
|
+
@count_sql = count_sql
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def size
|
|
38
|
+
@size ||= @connection.exec(@count_sql).getvalue(0, 0).to_i
|
|
39
|
+
end
|
|
40
|
+
alias length size
|
|
41
|
+
|
|
42
|
+
# Stream the result set row by row. Each row is an Array of String|nil
|
|
43
|
+
# in libpq's text format — byte-identical to what `#exec(sql).values`
|
|
44
|
+
# produced, so the generated INSERT is unchanged.
|
|
45
|
+
def each
|
|
46
|
+
return enum_for(:each) { size } unless block_given?
|
|
47
|
+
|
|
48
|
+
@connection.send_query(@data_sql)
|
|
49
|
+
@connection.set_single_row_mode
|
|
50
|
+
begin
|
|
51
|
+
while (result = @connection.get_result)
|
|
52
|
+
begin
|
|
53
|
+
result.check
|
|
54
|
+
result.each_row { |row| yield row }
|
|
55
|
+
ensure
|
|
56
|
+
result.clear
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
rescue StandardError
|
|
60
|
+
# If iteration is abandoned mid-stream (a SQL error surfaced by
|
|
61
|
+
# #check, or the consumer raised), drain any results still queued so
|
|
62
|
+
# a later query on this same connection does not fail with "another
|
|
63
|
+
# command is already in progress".
|
|
64
|
+
drain
|
|
65
|
+
raise
|
|
66
|
+
end
|
|
67
|
+
self
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
private def drain
|
|
71
|
+
while (result = @connection.get_result)
|
|
72
|
+
result.clear
|
|
73
|
+
end
|
|
74
|
+
rescue PG::Error
|
|
75
|
+
# Connection already errored/clean; nothing left to drain.
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
6
79
|
def build_query(table, dump_target, table_by_name)
|
|
7
80
|
Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
|
|
8
81
|
end
|
|
9
82
|
|
|
10
83
|
def execute(query_ast)
|
|
11
|
-
|
|
84
|
+
data_sql = commented_sql(query_ast)
|
|
85
|
+
# Count via the same query (wrapped as a subquery) so the Runner can
|
|
86
|
+
# skip empty tables and log the row count without draining the stream.
|
|
87
|
+
count_sql = "#{sql_query_comment(query_ast)} SELECT COUNT(*) FROM (#{compile_ast(query_ast)}) AS exwiw_count_src"
|
|
12
88
|
|
|
13
|
-
@logger.debug(" Executing SQL: \n#{
|
|
14
|
-
|
|
89
|
+
@logger.debug(" Executing SQL (single-row stream): \n#{data_sql}")
|
|
90
|
+
StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
|
|
15
91
|
end
|
|
16
92
|
|
|
17
93
|
def explain(query_ast)
|
|
@@ -97,22 +173,16 @@ module Exwiw
|
|
|
97
173
|
@logger.info(" Wrote schema for #{table_names.size} table(s) to #{output_path}.")
|
|
98
174
|
end
|
|
99
175
|
|
|
100
|
-
|
|
176
|
+
# The INSERT header for this adapter. PostgreSQL uses bare identifiers.
|
|
177
|
+
# #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
|
|
178
|
+
# and the trailing `;`.
|
|
179
|
+
private def insert_header(table)
|
|
101
180
|
table_name = table.name
|
|
102
|
-
|
|
103
|
-
value_list = results.map do |row|
|
|
104
|
-
quoted_values = row.map do |value|
|
|
105
|
-
escape_value(value)
|
|
106
|
-
end
|
|
107
|
-
"(" + quoted_values.join(', ') + ")"
|
|
108
|
-
end
|
|
109
|
-
values = value_list.join(",\n")
|
|
110
|
-
|
|
111
181
|
if table.rails_managed?
|
|
112
|
-
"INSERT INTO #{table_name} VALUES\n
|
|
182
|
+
"INSERT INTO #{table_name} VALUES\n"
|
|
113
183
|
else
|
|
114
184
|
column_names = table.columns.map(&:name).join(', ')
|
|
115
|
-
"INSERT INTO #{table_name} (#{column_names}) VALUES\n
|
|
185
|
+
"INSERT INTO #{table_name} (#{column_names}) VALUES\n"
|
|
116
186
|
end
|
|
117
187
|
end
|
|
118
188
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Exwiw
|
|
4
|
+
module Adapter
|
|
5
|
+
# Shared bulk-INSERT construction for the SQL adapters (mysql / postgresql /
|
|
6
|
+
# sqlite). They produce the same `INSERT INTO ... VALUES (...),(...);` shape
|
|
7
|
+
# and differ only in the header's identifier quoting (see #insert_header) and
|
|
8
|
+
# in #escape_value, so both the in-memory builder (#to_bulk_insert) and the
|
|
9
|
+
# bounded-memory streaming writer (#write_inserts) live here.
|
|
10
|
+
#
|
|
11
|
+
# Each including adapter must provide two private methods:
|
|
12
|
+
# - insert_header(table) -> the "INSERT INTO ... VALUES\n" prefix
|
|
13
|
+
# - escape_value(value) -> the SQL literal for one column value
|
|
14
|
+
module SqlBulkInsert
|
|
15
|
+
# How many rows' tuples to build-and-flush at a time when streaming. Bounds
|
|
16
|
+
# peak memory to this many tuples (plus their joined string) instead of the
|
|
17
|
+
# whole table's INSERT string, while keeping each flush a single fast
|
|
18
|
+
# Array#map + Array#join (the same C-level path #to_bulk_insert uses) so it
|
|
19
|
+
# stays close to whole-string speed — far faster than a naive row-at-a-time
|
|
20
|
+
# IO#print (see script/bench_sql_dump.rb / docs/sql-dump-optimization-notes.md).
|
|
21
|
+
# Same idea as MongoDB's default chunk size (not the same value): bounded work per flush, but the SQL
|
|
22
|
+
# adapters still emit ONE statement (byte-identical to the un-chunked build).
|
|
23
|
+
STREAM_FLUSH_ROWS = 2_000
|
|
24
|
+
|
|
25
|
+
# Build the whole INSERT statement as a single String. Kept for callers
|
|
26
|
+
# that want the string form (and as the readable definition of the exact
|
|
27
|
+
# bytes #write_inserts streams).
|
|
28
|
+
def to_bulk_insert(results, table)
|
|
29
|
+
value_list = results.map { |row| insert_tuple(row) }
|
|
30
|
+
"#{insert_header(table)}#{value_list.join(",\n")};"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Stream the bulk INSERT(s) straight to `io` instead of materializing the
|
|
34
|
+
# whole statement string first. Byte-for-byte identical to writing
|
|
35
|
+
# #to_bulk_insert per chunk joined by "\n" (verified by
|
|
36
|
+
# insert_output_snapshot_spec), but only one ~STREAM_FLUSH_ROWS-row buffer is
|
|
37
|
+
# resident at a time rather than the entire table's INSERT string. Returns
|
|
38
|
+
# the number of statements written.
|
|
39
|
+
def write_inserts(io, results, table, chunk_size)
|
|
40
|
+
chunks = chunk_size ? results.each_slice(chunk_size) : [results]
|
|
41
|
+
statement_count = 0
|
|
42
|
+
chunks.each do |chunk_rows|
|
|
43
|
+
io.print("\n") if statement_count.positive?
|
|
44
|
+
stream_single_insert(io, chunk_rows, table)
|
|
45
|
+
statement_count += 1
|
|
46
|
+
end
|
|
47
|
+
statement_count
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Emit one `INSERT INTO ... VALUES <tuples>;` statement to `io`, building
|
|
51
|
+
# and flushing the value tuples STREAM_FLUSH_ROWS at a time so the full
|
|
52
|
+
# statement text is never held in memory at once. Each slice is one fast
|
|
53
|
+
# map+join; the ",\n" between slices reproduces the same separator
|
|
54
|
+
# #to_bulk_insert puts between every tuple, so the bytes are identical.
|
|
55
|
+
private def stream_single_insert(io, rows, table)
|
|
56
|
+
io.print(insert_header(table))
|
|
57
|
+
first = true
|
|
58
|
+
rows.each_slice(STREAM_FLUSH_ROWS) do |slice|
|
|
59
|
+
io.print(",\n") unless first
|
|
60
|
+
first = false
|
|
61
|
+
io.print(slice.map { |row| insert_tuple(row) }.join(",\n"))
|
|
62
|
+
end
|
|
63
|
+
io.print(";")
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private def insert_tuple(row)
|
|
67
|
+
"(" + row.map { |value| escape_value(value) }.join(', ') + ")"
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|