exwiw 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,57 @@ module Exwiw
14
14
  Exwiw::MongodbCollectionConfig
15
15
  end
16
16
 
17
+ # A lazy, streaming stand-in for the materialized result array #execute
18
+ # used to return. Wrapping the Mongo cursor (instead of `.to_a`) keeps the
19
+ # dump's dominant memory cost — the full result set — off the heap: the
20
+ # Runner pulls documents through `each_slice`, so at most one chunk of
21
+ # documents (plus the small propagation-key arrays) is resident at a time,
22
+ # even for large or embed-heavy collections.
23
+ #
24
+ # It satisfies the two things the Runner asks of an execute result:
25
+ # - #size: the record count, used to skip empty collections and to log.
26
+ # Answered with a `count_documents` query (which only walks index
27
+ # entries, far cheaper than fetching every document) rather than by
28
+ # draining the cursor.
29
+ # - #each (via Enumerable / each_slice): a single streaming pass over the
30
+ # cursor. While streaming it captures — per propagation key, BEFORE
31
+ # handing the document to the caller's masking — the values downstream
32
+ # children will `$in`-match against, publishing them into @state once
33
+ # the pass completes.
34
+ #
35
+ # Contract note: unlike the old `.to_a` execute, which populated @state
36
+ # eagerly, this defers state capture until the result is consumed. The
37
+ # Runner always fully consumes a non-empty result before any child
38
+ # collection is processed, so propagation is unaffected; a caller that only
39
+ # needs @state must iterate the result (e.g. `.to_a`).
40
+ class StreamingResult
41
+ include Enumerable
42
+
43
+ def initialize(view:, collection:, keys:, state:)
44
+ @view = view
45
+ @collection = collection
46
+ @keys = keys
47
+ @state = state
48
+ end
49
+
50
+ def size
51
+ @size ||= @view.count_documents
52
+ end
53
+ alias length size
54
+
55
+ def each
56
+ return enum_for(:each) { size } unless block_given?
57
+
58
+ captured = @keys.each_with_object({}) { |key, acc| acc[key] = [] }
59
+ @view.each do |doc|
60
+ @keys.each { |key| captured[key] << doc[key] }
61
+ yield doc
62
+ end
63
+ @state[@collection] = captured
64
+ self
65
+ end
66
+ end
67
+
17
68
  def initialize(connection_config, logger)
18
69
  super
19
70
  @state = {}
@@ -86,22 +137,24 @@ module Exwiw
86
137
  def execute(query)
87
138
  @logger.debug(" Executing Mongo find on '#{query.collection}': filter=#{query.filter.inspect} projection=#{query.projection.inspect}")
88
139
 
89
- docs = db[query.collection]
140
+ view = db[query.collection]
90
141
  .find(query.filter)
91
142
  .projection(query.projection)
92
143
  .comment(query_comment_text("collection=#{query.collection}"))
93
- .to_a
94
144
 
95
- # Stash, per referenced field, the values children will `$in`-match
96
- # against. @propagation_keys is set by the build_query call for this same
145
+ # Per referenced field, the values children will `$in`-match against.
146
+ # @propagation_keys is set by the build_query call for this same
97
147
  # collection; fall back to the primary key if execute is driven without a
98
148
  # preceding build_query (e.g. in isolation from a test).
99
149
  keys = @propagation_keys || [query.primary_key]
100
- @state[query.collection] = keys.each_with_object({}) do |key, acc|
101
- acc[key] = docs.map { |doc| doc[key] }
102
- end
103
150
 
104
- docs
151
+ # Return a streaming view of the result set rather than `.to_a`-ing the
152
+ # whole collection into memory. The Runner pulls documents through
153
+ # `each_slice`, so only one chunk's worth is resident at a time even for
154
+ # large / embed-heavy collections — the dump's dominant memory cost. The
155
+ # propagation-key values are captured as the cursor streams and published
156
+ # into @state once the pass completes (see StreamingResult).
157
+ StreamingResult.new(view: view, collection: query.collection, keys: keys, state: @state)
105
158
  end
106
159
 
107
160
  # NOTE: relies on @embedded_children_by_parent set by a prior build_query
@@ -110,10 +163,10 @@ module Exwiw
110
163
  # to_bulk_insert (SQL adapters don't need it). Safe in Runner, fragile in
111
164
  # tests — call build_query first.
112
165
  def to_bulk_insert(rows, config)
166
+ plan = mask_plan(config)
113
167
  rows.map do |doc|
114
- apply_replace_with!(doc, config)
115
- apply_embedded_masking!(doc, config)
116
- JSON.generate(extended_json(doc))
168
+ apply_mask_plan!(doc, plan)
169
+ Exwiw::ExtJson.encode(doc)
117
170
  end.join("\n")
118
171
  end
119
172
 
@@ -135,6 +188,20 @@ module Exwiw
135
188
  'jsonl'
136
189
  end
137
190
 
191
+ # Bound how many documents are serialized at once when a collection config
192
+ # carries no explicit bulk_insert_chunk_size. A MongoDB dump is one JSONL
193
+ # line per document and, without chunking, the Runner would materialize the
194
+ # entire collection's output as a single giant string while the full
195
+ # in-memory result set is still alive — doubling peak memory on large or
196
+ # embed-heavy collections. Chunking lets the Runner stream each slice to the
197
+ # file and release its serialized string (and the transient extended-JSON
198
+ # trees) before building the next.
199
+ DEFAULT_BULK_INSERT_CHUNK_SIZE = 1_000
200
+
201
+ def default_bulk_insert_chunk_size
202
+ DEFAULT_BULK_INSERT_CHUNK_SIZE
203
+ end
204
+
138
205
  def schema_output_extension
139
206
  'js'
140
207
  end
@@ -325,49 +392,101 @@ module Exwiw
325
392
  parent_fields[reference_field]
326
393
  end
327
394
 
328
- private def apply_replace_with!(doc, config)
329
- config.fields.each do |field|
395
+ # A masking plan compiled once per collection config and reused for every
396
+ # document of that collection. `masked_fields` is `[field_name,
397
+ # template_segments]` for each field carrying a `replace_with`;
398
+ # `embedded` is one EmbeddedMask per embedded child.
399
+ MaskPlan = Struct.new(:masked_fields, :embedded)
400
+
401
+ # A pre-resolved embedded-child mask: the parent path split once into
402
+ # `prefix` (the containers to descend into) and `last` (the field holding
403
+ # the subdocument(s)), plus the child's own MaskPlan.
404
+ EmbeddedMask = Struct.new(:prefix, :last, :plan)
405
+
406
+ # Build (or fetch) the cached MaskPlan for `config`. Masking runs over every
407
+ # document AND every embedded subdocument, so for an embed-heavy collection
408
+ # the same per-config decisions — which fields carry a `replace_with`, how
409
+ # each template splits into segments, where the embedded children live —
410
+ # were previously recomputed tens of times per document. Compiling them once
411
+ # per config lets #apply_mask_plan! do nothing but the work that actually
412
+ # varies per document (rendering templates, descending into subdocuments),
413
+ # so the per-subdocument overhead saved grows with the embedding count.
414
+ #
415
+ # Cached by config name: names are unique within a run and the configs do
416
+ # not mutate mid-dump. Relies on @embedded_children_by_parent, set by the
417
+ # build_query call that always precedes to_bulk_insert (see #to_bulk_insert).
418
+ private def mask_plan(config)
419
+ (@mask_plans ||= {})[config.name] ||= build_mask_plan(config)
420
+ end
421
+
422
+ private def build_mask_plan(config)
423
+ masked_fields = config.fields.each_with_object([]) do |field, acc|
330
424
  next unless field.replace_with
331
425
 
332
- doc[field.name] = field.replace_with.gsub(/\{([^{}]+)\}/) do
333
- ref = Regexp.last_match(1)
334
- (doc.key?(ref) ? doc[ref] : nil).to_s
335
- end
426
+ acc << [field.name, compile_template(field.replace_with)]
427
+ end
428
+ embedded = embedded_children_of(config).map do |child|
429
+ *prefix, last = child.embedded_in.path.split(".")
430
+ EmbeddedMask.new(prefix, last, build_mask_plan(child))
336
431
  end
432
+ MaskPlan.new(masked_fields, embedded)
337
433
  end
338
434
 
339
- private def apply_embedded_masking!(doc, parent_config)
340
- embedded_children_of(parent_config).each do |child|
341
- walk(doc, child.embedded_in.path) do |subdoc|
342
- apply_replace_with!(subdoc, child)
343
- apply_embedded_masking!(subdoc, child)
435
+ # Apply a precompiled MaskPlan to a document in place: render each masked
436
+ # field, then descend into each embedded child (recursing into its own
437
+ # plan). Equivalent to the old apply_replace_with! + apply_embedded_masking!
438
+ # pair, with all per-config lookups hoisted into the plan.
439
+ private def apply_mask_plan!(doc, plan)
440
+ plan.masked_fields.each do |name, segments|
441
+ doc[name] = render_template(segments, doc)
442
+ end
443
+ plan.embedded.each do |child|
444
+ container = child.prefix.reduce(doc) { |acc, seg| acc.is_a?(Hash) ? acc[seg] : nil }
445
+ next unless container.is_a?(Hash)
446
+
447
+ case (value = container[child.last])
448
+ when Array then value.each { |sub| apply_mask_plan!(sub, child.plan) if sub.is_a?(Hash) }
449
+ when Hash then apply_mask_plan!(value, child.plan)
344
450
  end
345
451
  end
346
452
  end
347
453
 
348
- private def embedded_children_of(parent_config)
349
- @embedded_children_by_parent.fetch(parent_config.name, [])
454
+ PLACEHOLDER_PATTERN = /\{([^{}]+)\}/
455
+
456
+ # Split a `replace_with` template into a flat list of segments (called once
457
+ # per masked field at plan-build time, see #build_mask_plan). A segment is
458
+ # either a literal String or a 1-element Array `[ref]` marking a `{ref}`
459
+ # placeholder. #render_template then concatenates them, skipping the regex
460
+ # scan / block / `Regexp.last_match` a per-document `gsub` would repeat (~2.5x
461
+ # faster per field). The segment walk reproduces the old gsub byte-for-byte
462
+ # (missing keys render as "", literals pass through unchanged).
463
+ private def compile_template(template)
464
+ segments = []
465
+ pos = 0
466
+ while (md = PLACEHOLDER_PATTERN.match(template, pos))
467
+ segments << template[pos...md.begin(0)] if md.begin(0) > pos
468
+ segments << [md[1]]
469
+ pos = md.end(0)
470
+ end
471
+ segments << template[pos..] if pos < template.length
472
+ segments
350
473
  end
351
474
 
352
- private def walk(doc, dotted_path)
353
- segments = dotted_path.split(".")
354
- *prefix, last = segments
355
- container = prefix.reduce(doc) { |acc, seg| acc.is_a?(Hash) ? acc[seg] : nil }
356
- return unless container.is_a?(Hash)
357
-
358
- value = container[last]
359
- case value
360
- when Array then value.each { |sub| yield sub if sub.is_a?(Hash) }
361
- when Hash then yield value
475
+ private def render_template(segments, doc)
476
+ out = +''
477
+ segments.each do |seg|
478
+ if seg.is_a?(Array)
479
+ ref = seg[0]
480
+ out << (doc.key?(ref) ? doc[ref] : nil).to_s
481
+ else
482
+ out << seg
483
+ end
362
484
  end
485
+ out
363
486
  end
364
487
 
365
- private def extended_json(doc)
366
- if doc.respond_to?(:as_extended_json)
367
- doc.as_extended_json(mode: :relaxed)
368
- else
369
- doc
370
- end
488
+ private def embedded_children_of(parent_config)
489
+ @embedded_children_by_parent.fetch(parent_config.name, [])
371
490
  end
372
491
 
373
492
  private def db
@@ -5,15 +5,67 @@ require 'open3'
5
5
  module Exwiw
6
6
  module Adapter
7
7
  class MysqlAdapter < Base
8
+ include SqlBulkInsert
9
+
10
+ # A lazy, streaming stand-in for the materialized rows #execute used to
11
+ # return (`connection.query(sql).rows`). It pulls rows off the wire one at
12
+ # a time (mysql2 single-row stream) instead of buffering the whole result
13
+ # set, so the dump's dominant memory cost — a Ruby array as large as the
14
+ # table — never materializes. The Runner drives it exactly like the old
15
+ # Array: #size to skip empty tables and log the count, then a single
16
+ # streaming pass (SqlBulkInsert#write_inserts -> each_slice).
17
+ #
18
+ # Mirrors PostgresqlAdapter::StreamingResult, with two MySQL specifics:
19
+ # - #size runs a separate `SELECT COUNT(*)` of the same query. Unlike the
20
+ # pg path, it does NOT wrap the SELECT in a subquery: MySQL rejects a
21
+ # derived table with duplicate column names, which a rails-managed
22
+ # `SELECT *` joined to another table produces. Instead the projection
23
+ # is replaced by `COUNT(*)` (compile_ast(count_only: true)) — exact
24
+ # because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT,
25
+ # so the row count is independent of the projected columns.
26
+ # - the stream ties up the connection until fully drained. The Runner
27
+ # always drains it (write_inserts) before any further query
28
+ # (post_insert_sql / DELETE), and MysqlClient#stream_rows drains the
29
+ # remainder if the consumer raises mid-stream, so the connection stays usable.
30
+ class StreamingResult
31
+ include Enumerable
32
+
33
+ def initialize(client:, data_sql:, count_sql:)
34
+ @client = client
35
+ @data_sql = data_sql
36
+ @count_sql = count_sql
37
+ end
38
+
39
+ def size
40
+ @size ||= @client.query(@count_sql).rows.dig(0, 0).to_i
41
+ end
42
+ alias length size
43
+
44
+ # Stream the result set row by row. Each row is an Array of String|nil
45
+ # (mysql2 `cast: false` / stringified) — identical to what
46
+ # `connection.query(sql).rows` produced, so the generated INSERT is
47
+ # unchanged.
48
+ def each(&block)
49
+ return enum_for(:each) { size } unless block_given?
50
+
51
+ @client.stream_rows(@data_sql, &block)
52
+ self
53
+ end
54
+ end
55
+
8
56
  def build_query(table, dump_target, table_by_name)
9
57
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
10
58
  end
11
59
 
12
60
  def execute(query_ast)
13
- sql = commented_sql(query_ast)
14
-
15
- @logger.debug(" Executing SQL: \n#{sql}")
16
- connection.query(sql).rows
61
+ data_sql = commented_sql(query_ast)
62
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
63
+ # the Runner can skip empty tables and log the row count without draining
64
+ # the stream. See StreamingResult for why this is not a subquery wrap.
65
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
66
+
67
+ @logger.debug(" Executing SQL (streaming): \n#{data_sql}")
68
+ StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
17
69
  end
18
70
 
19
71
  def explain(query_ast)
@@ -99,22 +151,16 @@ module Exwiw
99
151
  "SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
100
152
  end
101
153
 
102
- def to_bulk_insert(results, table)
154
+ # The INSERT header for this adapter. MySQL backtick-quotes the table and
155
+ # column identifiers. #to_bulk_insert / #write_inserts (SqlBulkInsert)
156
+ # append the value tuples and the trailing `;`.
157
+ private def insert_header(table)
103
158
  table_name = table.name
104
-
105
- value_list = results.map do |row|
106
- quoted_values = row.map do |value|
107
- escape_value(value)
108
- end
109
- "(" + quoted_values.join(', ') + ")"
110
- end
111
- values = value_list.join(",\n")
112
-
113
159
  if table.rails_managed?
114
- "INSERT INTO `#{table_name}` VALUES\n#{values};"
160
+ "INSERT INTO `#{table_name}` VALUES\n"
115
161
  else
116
162
  column_names = table.columns.map { |c| "`#{c.name}`" }.join(', ')
117
- "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n#{values};"
163
+ "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n"
118
164
  end
119
165
  end
120
166
 
@@ -176,11 +222,17 @@ module Exwiw
176
222
  sql
177
223
  end
178
224
 
179
- def compile_ast(query_ast)
225
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
226
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
227
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
228
+ # not depend on the projection.
229
+ def compile_ast(query_ast, count_only: false)
180
230
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
181
231
 
182
232
  sql = "SELECT "
183
- sql += if query_ast.select_all
233
+ sql += if count_only
234
+ "COUNT(*)"
235
+ elsif query_ast.select_all
184
236
  "*"
185
237
  else
186
238
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
@@ -118,6 +118,49 @@ module Exwiw
118
118
  end
119
119
  end
120
120
 
121
+ # Stream a query's rows one at a time, yielding each as an
122
+ # Array<String|nil> (the same row shape as #query) instead of buffering
123
+ # the whole result set. This keeps a large dump's dominant memory cost — a
124
+ # Ruby array as big as the table — from materializing.
125
+ #
126
+ # mysql2 streams server-side (`stream: true` + `cache_rows: false`).
127
+ # Its contract: a streamed result MUST be fully consumed before the next
128
+ # query on this connection, or the driver raises "Commands out of sync".
129
+ # The Runner consumes every row (it writes them all), but if the consumer
130
+ # block raises mid-stream we drain the remaining rows so the same
131
+ # connection is still usable for the next table's query.
132
+ #
133
+ # trilogy has no streaming cursor (no QUERY_FLAGS_STREAMING), so it buffers
134
+ # the result and yields from it — parity, but without the memory win (the
135
+ # same situation as the sqlite adapter). trilogy is a test-only driver;
136
+ # production connects via mysql2.
137
+ #
138
+ # @param sql [String]
139
+ # @yieldparam row [Array<String|nil>]
140
+ def stream_rows(sql)
141
+ return enum_for(:stream_rows, sql) unless block_given?
142
+
143
+ case @driver
144
+ when :mysql2
145
+ res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
146
+ begin
147
+ res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
148
+ rescue StandardError
149
+ begin
150
+ res.each { |_row| } # drain the remainder so the connection stays usable
151
+ rescue StandardError
152
+ nil
153
+ end
154
+ raise
155
+ end
156
+ when :trilogy
157
+ raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
158
+ else
159
+ raise "Unsupported MySQL driver: #{@driver.inspect}"
160
+ end
161
+ self
162
+ end
163
+
121
164
  private def ensure_driver_loaded!
122
165
  case @driver
123
166
  when :mysql2 then require 'mysql2'
@@ -3,15 +3,91 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class PostgresqlAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.exec(sql).values`). It pulls rows off the wire one
10
+ # at a time via libpq's single-row mode instead of buffering the whole
11
+ # result set, so the dump's dominant memory cost — a Ruby array as large
12
+ # as the table — never materializes. The Runner drives it exactly like the
13
+ # old Array: #size to skip empty tables and log the count, then a single
14
+ # streaming pass (SqlBulkInsert#write_inserts -> each_slice) to write the
15
+ # INSERT.
16
+ #
17
+ # Mirrors MongodbAdapter::StreamingResult; two SQL-specific differences:
18
+ # - #size cannot be answered cheaply from the cursor, so it runs a
19
+ # separate `SELECT COUNT(*)` of the same query. (MongoDB uses
20
+ # count_documents, an index-only walk; the SQL COUNT re-runs the query
21
+ # plan but transfers no row data — Postgres prunes the unused
22
+ # projection of the wrapped subquery.) This keeps the Runner contract
23
+ # unchanged, so MongoDB and the other SQL adapters are untouched.
24
+ # - the streaming pass ties up the connection until fully drained. The
25
+ # Runner always drains it (write_inserts) before issuing any further
26
+ # query (post_insert_sql / DELETE) on the same connection, so the
27
+ # ordering invariant holds.
28
+ class StreamingResult
29
+ include Enumerable
30
+
31
+ def initialize(connection:, data_sql:, count_sql:)
32
+ @connection = connection
33
+ @data_sql = data_sql
34
+ @count_sql = count_sql
35
+ end
36
+
37
+ def size
38
+ @size ||= @connection.exec(@count_sql).getvalue(0, 0).to_i
39
+ end
40
+ alias length size
41
+
42
+ # Stream the result set row by row. Each row is an Array of String|nil
43
+ # in libpq's text format — byte-identical to what `#exec(sql).values`
44
+ # produced, so the generated INSERT is unchanged.
45
+ def each
46
+ return enum_for(:each) { size } unless block_given?
47
+
48
+ @connection.send_query(@data_sql)
49
+ @connection.set_single_row_mode
50
+ begin
51
+ while (result = @connection.get_result)
52
+ begin
53
+ result.check
54
+ result.each_row { |row| yield row }
55
+ ensure
56
+ result.clear
57
+ end
58
+ end
59
+ rescue StandardError
60
+ # If iteration is abandoned mid-stream (a SQL error surfaced by
61
+ # #check, or the consumer raised), drain any results still queued so
62
+ # a later query on this same connection does not fail with "another
63
+ # command is already in progress".
64
+ drain
65
+ raise
66
+ end
67
+ self
68
+ end
69
+
70
+ private def drain
71
+ while (result = @connection.get_result)
72
+ result.clear
73
+ end
74
+ rescue PG::Error
75
+ # Connection already errored/clean; nothing left to drain.
76
+ end
77
+ end
78
+
6
79
  def build_query(table, dump_target, table_by_name)
7
80
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
81
  end
9
82
 
10
83
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
84
+ data_sql = commented_sql(query_ast)
85
+ # Count via the same query (wrapped as a subquery) so the Runner can
86
+ # skip empty tables and log the row count without draining the stream.
87
+ count_sql = "#{sql_query_comment(query_ast)} SELECT COUNT(*) FROM (#{compile_ast(query_ast)}) AS exwiw_count_src"
12
88
 
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.exec(sql).values
89
+ @logger.debug(" Executing SQL (single-row stream): \n#{data_sql}")
90
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
91
  end
16
92
 
17
93
  def explain(query_ast)
@@ -97,22 +173,16 @@ module Exwiw
97
173
  @logger.info(" Wrote schema for #{table_names.size} table(s) to #{output_path}.")
98
174
  end
99
175
 
100
- def to_bulk_insert(results, table)
176
+ # The INSERT header for this adapter. PostgreSQL uses bare identifiers.
177
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
178
+ # and the trailing `;`.
179
+ private def insert_header(table)
101
180
  table_name = table.name
102
-
103
- value_list = results.map do |row|
104
- quoted_values = row.map do |value|
105
- escape_value(value)
106
- end
107
- "(" + quoted_values.join(', ') + ")"
108
- end
109
- values = value_list.join(",\n")
110
-
111
181
  if table.rails_managed?
112
- "INSERT INTO #{table_name} VALUES\n#{values};"
182
+ "INSERT INTO #{table_name} VALUES\n"
113
183
  else
114
184
  column_names = table.columns.map(&:name).join(', ')
115
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
185
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
116
186
  end
117
187
  end
118
188
 
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Exwiw
4
+ module Adapter
5
+ # Shared bulk-INSERT construction for the SQL adapters (mysql / postgresql /
6
+ # sqlite). They produce the same `INSERT INTO ... VALUES (...),(...);` shape
7
+ # and differ only in the header's identifier quoting (see #insert_header) and
8
+ # in #escape_value, so both the in-memory builder (#to_bulk_insert) and the
9
+ # bounded-memory streaming writer (#write_inserts) live here.
10
+ #
11
+ # Each including adapter must provide two private methods:
12
+ # - insert_header(table) -> the "INSERT INTO ... VALUES\n" prefix
13
+ # - escape_value(value) -> the SQL literal for one column value
14
+ module SqlBulkInsert
15
+ # How many rows' tuples to build-and-flush at a time when streaming. Bounds
16
+ # peak memory to this many tuples (plus their joined string) instead of the
17
+ # whole table's INSERT string, while keeping each flush a single fast
18
+ # Array#map + Array#join (the same C-level path #to_bulk_insert uses) so it
19
+ # stays close to whole-string speed — far faster than a naive row-at-a-time
20
+ # IO#print (see script/bench_sql_dump.rb / docs/sql-dump-optimization-notes.md).
21
+ # Same idea as MongoDB's default chunk size (bounded work per flush), but the SQL
22
+ # adapters still emit ONE statement (byte-identical to the un-chunked build).
23
+ STREAM_FLUSH_ROWS = 2_000
24
+
25
+ # Build the whole INSERT statement as a single String. Kept for callers
26
+ # that want the string form (and as the readable definition of the exact
27
+ # bytes #write_inserts streams).
28
+ def to_bulk_insert(results, table)
29
+ value_list = results.map { |row| insert_tuple(row) }
30
+ "#{insert_header(table)}#{value_list.join(",\n")};"
31
+ end
32
+
33
+ # Stream the bulk INSERT(s) straight to `io` instead of materializing the
34
+ # whole statement string first. Byte-for-byte identical to writing
35
+ # #to_bulk_insert per chunk joined by "\n" (verified by
36
+ # insert_output_snapshot_spec), but only one STREAM_FLUSH_ROWS-row buffer is
37
+ # resident at a time rather than the entire table's INSERT string. Returns
38
+ # the number of statements written.
39
+ def write_inserts(io, results, table, chunk_size)
40
+ chunks = chunk_size ? results.each_slice(chunk_size) : [results]
41
+ statement_count = 0
42
+ chunks.each do |chunk_rows|
43
+ io.print("\n") if statement_count.positive?
44
+ stream_single_insert(io, chunk_rows, table)
45
+ statement_count += 1
46
+ end
47
+ statement_count
48
+ end
49
+
50
+ # Emit one `INSERT INTO ... VALUES <tuples>;` statement to `io`, building
51
+ # and flushing the value tuples STREAM_FLUSH_ROWS at a time so the full
52
+ # statement text is never held in memory at once. Each slice is one fast
53
+ # map+join; the ",\n" between slices reproduces the same separator
54
+ # #to_bulk_insert puts between every tuple, so the bytes are identical.
55
+ private def stream_single_insert(io, rows, table)
56
+ io.print(insert_header(table))
57
+ first = true
58
+ rows.each_slice(STREAM_FLUSH_ROWS) do |slice|
59
+ io.print(",\n") unless first
60
+ first = false
61
+ io.print(slice.map { |row| insert_tuple(row) }.join(",\n"))
62
+ end
63
+ io.print(";")
64
+ end
65
+
66
+ private def insert_tuple(row)
67
+ "(" + row.map { |value| escape_value(value) }.join(', ') + ")"
68
+ end
69
+ end
70
+ end
71
+ end