exwiw 0.5.3 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ // Native emitter for MongoDB Relaxed Extended JSON.
2
+ //
3
+ // Replaces the pure-Ruby `JSON.generate(doc.as_extended_json(mode: :relaxed))`
4
+ // (which rebuilds the whole document into an intermediate transformed Hash tree
5
+ // and then walks it a second time in JSON.generate) with a single native
6
+ // tree-walk that emits the JSONL line directly.
7
+ //
8
+ // Byte-identity strategy (see docs/optimize-mongodb-export-with-native-ext.md):
9
+ // only the structural bulk + the cheapest, most-stable leaves are formatted in
10
+ // C — Hash, Array, String, fixnum Integer, true/false/nil, BSON::ObjectId, and
11
+ // in-range Time (years 1970..9999; see encode_time_native). Everything else
12
+ // (Float, out-of-int64 Integer, out-of-range Time, Symbol, Decimal128, Binary,
13
+ // ...) is handed back to Ruby's `encode_fragment`, which is the exact pure-Ruby
14
+ // path. This is provably byte-identical because Hash#as_extended_json
15
+ // and Array#as_extended_json are non-transforming structural recursion: the
16
+ // bytes `JSON.generate(v.as_extended_json(mode: :relaxed))` produces for any
17
+ // sub-value `v` are exactly the bytes the whole-document generate would produce
18
+ // in that position, so a value the native walk does not format can be spliced
19
+ // in verbatim with no divergence.
20
+
21
+ #include <ruby.h>
22
+ #include <ruby/encoding.h>
23
+ #include <stdio.h>
24
+ #include <time.h>
25
+
26
+ static VALUE rb_mExtJson;
27
+ // Cached BSON::ObjectId class, or Qnil until bson is loaded and it resolves.
28
+ // Resolution is lazy (bson is required only when the Mongo adapter touches the
29
+ // DB, which always precedes serialization in a real run); see resolve below.
30
+ static VALUE rb_cObjectId;
31
+
32
+ static ID id_encode_fragment;
33
+ static ID id_to_s;
34
+ static ID id_const_BSON;
35
+ static ID id_const_ObjectId;
36
+
37
+ static const char hexdigits[] = "0123456789abcdef";
38
+
39
+ static void encode_value(VALUE buf, VALUE val);
40
+
41
+ // Append `str` as a JSON string literal (surrounding quotes included), escaping
42
+ // exactly as JSON.generate does: \b \t \n \f \r \" \\ get their short escapes,
43
+ // any other byte < 0x20 becomes a lowercase \u00xx, and every other byte —
44
+ // including '/', DEL (0x7f), U+2028/U+2029, and UTF-8 multi-byte sequences — is
45
+ // passed through raw. Unescaped runs are appended in bulk to avoid a per-byte
46
+ // rb_str_cat call.
47
+ static void encode_string(VALUE buf, const char *p, long len)
48
+ {
49
+ rb_str_cat(buf, "\"", 1);
50
+
51
+ long start = 0;
52
+ for (long i = 0; i < len; i++) {
53
+ unsigned char c = (unsigned char)p[i];
54
+ const char *esc = NULL;
55
+ long esclen = 0;
56
+ char ubuf[6];
57
+
58
+ switch (c) {
59
+ case '"': esc = "\\\""; esclen = 2; break;
60
+ case '\\': esc = "\\\\"; esclen = 2; break;
61
+ case '\b': esc = "\\b"; esclen = 2; break;
62
+ case '\t': esc = "\\t"; esclen = 2; break;
63
+ case '\n': esc = "\\n"; esclen = 2; break;
64
+ case '\f': esc = "\\f"; esclen = 2; break;
65
+ case '\r': esc = "\\r"; esclen = 2; break;
66
+ default:
67
+ if (c < 0x20) {
68
+ ubuf[0] = '\\'; ubuf[1] = 'u'; ubuf[2] = '0'; ubuf[3] = '0';
69
+ ubuf[4] = hexdigits[(c >> 4) & 0xf];
70
+ ubuf[5] = hexdigits[c & 0xf];
71
+ esc = ubuf; esclen = 6;
72
+ }
73
+ }
74
+
75
+ if (esc) {
76
+ if (i > start) rb_str_cat(buf, p + start, i - start);
77
+ rb_str_cat(buf, esc, esclen);
78
+ start = i + 1;
79
+ }
80
+ }
81
+ if (len > start) rb_str_cat(buf, p + start, len - start);
82
+
83
+ rb_str_cat(buf, "\"", 1);
84
+ }
85
+
86
+ // Hash keys mirror JSON.generate: a String key is emitted as-is, anything else
87
+ // is stringified (Symbol via its name, otherwise #to_s) before escaping.
88
+ static void encode_key(VALUE buf, VALUE key)
89
+ {
90
+ VALUE kstr;
91
+ if (RB_TYPE_P(key, T_STRING)) {
92
+ kstr = key;
93
+ } else if (RB_TYPE_P(key, T_SYMBOL)) {
94
+ kstr = rb_sym2str(key);
95
+ } else {
96
+ kstr = rb_funcall(key, id_to_s, 0);
97
+ }
98
+ encode_string(buf, RSTRING_PTR(kstr), RSTRING_LEN(kstr));
99
+ }
100
+
101
+ typedef struct {
102
+ VALUE buf;
103
+ int first;
104
+ } hash_ctx;
105
+
106
+ static int hash_iter(VALUE key, VALUE value, VALUE arg)
107
+ {
108
+ hash_ctx *ctx = (hash_ctx *)arg;
109
+ if (!ctx->first) rb_str_cat(ctx->buf, ",", 1);
110
+ ctx->first = 0;
111
+ encode_key(ctx->buf, key);
112
+ rb_str_cat(ctx->buf, ":", 1);
113
+ encode_value(ctx->buf, value);
114
+ return ST_CONTINUE;
115
+ }
116
+
117
+ // Splice the pure-Ruby fragment for a value the native path does not format.
118
+ static void delegate(VALUE buf, VALUE val)
119
+ {
120
+ VALUE frag = rb_funcall(rb_mExtJson, id_encode_fragment, 1, val);
121
+ rb_str_cat(buf, RSTRING_PTR(frag), RSTRING_LEN(frag));
122
+ }
123
+
124
+ // Epoch second for 10000-01-01T00:00:00Z. `bson`'s relaxed Time encoding uses
125
+ // the ISO-8601 string form only for years 1970..9999 (inclusive) and the
126
+ // {"$numberLong":"<ms>"} form otherwise; that year window is exactly the
127
+ // half-open epoch-second range [0, MAX_ISO_EPOCH).
128
+ #define MAX_ISO_EPOCH 253402300800LL
129
+
130
+ // Format a Time as Relaxed Extended JSON in C, matching bson 5.2.0 byte for
131
+ // byte for the common in-range case (see bson/time.rb and the empirical probe):
132
+ // - whole second (usec == 0, i.e. nsec < 1000): {"$date":"...:SSZ"} (no fraction)
133
+ // - sub-second (nsec >= 1000): {"$date":"...:SS.mmmZ"}, where the
134
+ // millisecond is floor(nsec / 1e6) — bson floors the Time to milliseconds.
135
+ // Returns 1 when handled. Returns 0 (leaving buf untouched) for years outside
136
+ // 1970..9999, whose {"$numberLong"} form involves negative-epoch arithmetic too
137
+ // fiddly to risk in C — the caller then delegates that rare case to Ruby.
138
+ static int encode_time_native(VALUE buf, VALUE val)
139
+ {
140
+ struct timespec ts = rb_time_timespec(val);
141
+ if (ts.tv_sec < 0 || ts.tv_sec >= MAX_ISO_EPOCH) return 0;
142
+
143
+ time_t secs = (time_t)ts.tv_sec;
144
+ struct tm tm;
145
+ if (gmtime_r(&secs, &tm) == NULL) return 0;
146
+
147
+ char tmp[40];
148
+ int n;
149
+ if (ts.tv_nsec >= 1000) {
150
+ int ms = (int)(ts.tv_nsec / 1000000L);
151
+ n = snprintf(tmp, sizeof(tmp),
152
+ "{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02d.%03dZ\"}",
153
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
154
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ms);
155
+ } else {
156
+ n = snprintf(tmp, sizeof(tmp),
157
+ "{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02dZ\"}",
158
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
159
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
160
+ }
161
+ rb_str_cat(buf, tmp, n);
162
+ return 1;
163
+ }
164
+
165
+ static void encode_value(VALUE buf, VALUE val)
166
+ {
167
+ switch (TYPE(val)) {
168
+ case T_NIL:
169
+ rb_str_cat(buf, "null", 4);
170
+ return;
171
+ case T_TRUE:
172
+ rb_str_cat(buf, "true", 4);
173
+ return;
174
+ case T_FALSE:
175
+ rb_str_cat(buf, "false", 5);
176
+ return;
177
+ case T_FIXNUM: {
178
+ // A Fixnum always fits in a C long (and thus int64) on the platforms
179
+ // exwiw targets, so it can never be the out-of-int64 case that must
180
+ // raise; emit it directly. Bignums fall through to delegate, where
181
+ // encode_fragment emits in-range ones and raises RangeError for the
182
+ // rest — matching today's behavior exactly.
183
+ char tmp[24];
184
+ int n = snprintf(tmp, sizeof(tmp), "%ld", (long)FIX2LONG(val));
185
+ rb_str_cat(buf, tmp, n);
186
+ return;
187
+ }
188
+ case T_STRING:
189
+ encode_string(buf, RSTRING_PTR(val), RSTRING_LEN(val));
190
+ return;
191
+ case T_ARRAY: {
192
+ long len = RARRAY_LEN(val);
193
+ rb_str_cat(buf, "[", 1);
194
+ for (long i = 0; i < len; i++) {
195
+ if (i > 0) rb_str_cat(buf, ",", 1);
196
+ encode_value(buf, rb_ary_entry(val, i));
197
+ }
198
+ rb_str_cat(buf, "]", 1);
199
+ return;
200
+ }
201
+ case T_HASH: {
202
+ // rb_hash_foreach preserves insertion order, matching JSON output.
203
+ hash_ctx ctx = { buf, 1 };
204
+ rb_str_cat(buf, "{", 1);
205
+ rb_hash_foreach(val, hash_iter, (VALUE)&ctx);
206
+ rb_str_cat(buf, "}", 1);
207
+ return;
208
+ }
209
+ default:
210
+ // BSON::ObjectId is the single most common leaf (`_id`) and its
211
+ // Relaxed form is the stable {"$oid":"<24 hex>"}, so format it here.
212
+ // The hex comes from #to_s (the same source as as_extended_json) and
213
+ // is always [0-9a-f]{24}, so it needs no escaping.
214
+ if (!NIL_P(rb_cObjectId) && RTEST(rb_obj_is_kind_of(val, rb_cObjectId))) {
215
+ VALUE hex = rb_funcall(val, id_to_s, 0);
216
+ rb_str_cat(buf, "{\"$oid\":\"", 9);
217
+ rb_str_cat(buf, RSTRING_PTR(hex), RSTRING_LEN(hex));
218
+ rb_str_cat(buf, "\"}", 2);
219
+ return;
220
+ }
221
+ // Time is the other common leaf in dumped documents (Mongoid's
222
+ // created_at/updated_at); format the in-range case natively. The
223
+ // out-of-range $numberLong form returns 0 and falls through to Ruby.
224
+ if (RTEST(rb_obj_is_kind_of(val, rb_cTime)) && encode_time_native(buf, val)) {
225
+ return;
226
+ }
227
+ // Float, Bignum, Symbol, Decimal128, Binary, out-of-range Time, ... -> Ruby.
228
+ delegate(buf, val);
229
+ return;
230
+ }
231
+ }
232
+
233
+ // Resolve and cache BSON::ObjectId the first time a document is encoded with
234
+ // bson loaded. Cheap const lookups guarded by the Qnil cache; once resolved it
235
+ // is skipped. Until resolved, ObjectId simply takes the (correct) delegate path.
236
+ static void resolve_objectid_class(void)
237
+ {
238
+ if (!NIL_P(rb_cObjectId)) return;
239
+ if (!rb_const_defined(rb_cObject, id_const_BSON)) return;
240
+
241
+ VALUE bson = rb_const_get(rb_cObject, id_const_BSON);
242
+ if (rb_const_defined(bson, id_const_ObjectId)) {
243
+ rb_cObjectId = rb_const_get(bson, id_const_ObjectId);
244
+ }
245
+ }
246
+
247
+ // Exwiw::ExtJson.encode_native(doc) -> String
248
+ // Returns one JSONL line (no trailing newline); the caller owns separators.
249
+ static VALUE rb_encode_native(VALUE self, VALUE doc)
250
+ {
251
+ resolve_objectid_class();
252
+
253
+ VALUE buf = rb_str_buf_new(256);
254
+ rb_enc_associate(buf, rb_utf8_encoding());
255
+ encode_value(buf, doc);
256
+ return buf;
257
+ }
258
+
259
+ void Init_ext_json_native(void)
260
+ {
261
+ id_encode_fragment = rb_intern("encode_fragment");
262
+ id_to_s = rb_intern("to_s");
263
+ id_const_BSON = rb_intern("BSON");
264
+ id_const_ObjectId = rb_intern("ObjectId");
265
+
266
+ VALUE mExwiw = rb_define_module("Exwiw");
267
+ rb_mExtJson = rb_define_module_under(mExwiw, "ExtJson");
268
+ rb_global_variable(&rb_mExtJson);
269
+
270
+ rb_cObjectId = Qnil;
271
+ rb_global_variable(&rb_cObjectId);
272
+
273
+ rb_define_singleton_method(rb_mExtJson, "encode_native", rb_encode_native, 1);
274
+ }
@@ -0,0 +1,8 @@
1
# frozen_string_literal: true

require "mkmf"

# Build target for the native extension; it is compiled to
# lib/exwiw/ext_json_native.{so,bundle}. The `_native` suffix keeps it apart
# from the `ext_json.rb` shim, so `require "exwiw/ext_json_native"` and
# `require_relative "exwiw/ext_json"` never resolve to the same name.
extension_target = "exwiw/ext_json_native"

create_makefile(extension_target)
@@ -123,7 +123,7 @@ module Exwiw
123
123
  { config.primary_key => { "$in" => coerce_ids(dump_target.ids) } }
124
124
  end
125
125
  else
126
- related_collection_filter(config, config_by_name)
126
+ related_collection_filter(config, config_by_name, dump_target)
127
127
  end
128
128
 
129
129
  Exwiw::MongoQuery::Find.new(
@@ -166,7 +166,7 @@ module Exwiw
166
166
  plan = mask_plan(config)
167
167
  rows.map do |doc|
168
168
  apply_mask_plan!(doc, plan)
169
- JSON.generate(extended_json(doc))
169
+ Exwiw::ExtJson.encode(doc)
170
170
  end.join("\n")
171
171
  end
172
172
 
@@ -352,23 +352,74 @@ module Exwiw
352
352
  # the values were captured from that field in #execute, so their BSON type
353
353
  # already matches the stored FK — no coercion.
354
354
  #
355
- # A belongs_to whose parent produced no ids contributes no constraint:
356
- # either the parent matched nothing, or it is not dumped here (e.g. an
357
- # embedded collection, or one excluded from the run). If that leaves the
358
- # filter empty even though the collection HAS belongs_to, the collection
359
- # cannot be scoped from the dump target — and falling back to an empty `{}`
360
- # filter would scan and dump the ENTIRE collection across every scope. That
361
- # is never what a scoped extraction wants, so constrain it to match nothing
362
- # and warn instead. (A collection with no belongs_to at all is genuine
363
- # reference/master data and is still dumped in full via `{}`.)
364
- private def related_collection_filter(config, config_by_name)
365
- filter = config.belongs_tos.each_with_object({}) do |relation, acc|
355
+ # Scope flows from the dump target along belongs_to edges. A belongs_to is
356
+ # classified by whether its parent is *genuinely scoped* (reachable back to
357
+ # the dump target through belongs_to chains; see #genuine_scope_set), which
358
+ # determines how its constraint is applied:
359
+ #
360
+ # - Among the genuine parents, the most selective one (fewest captured ids)
361
+ # is the ANCHOR and is applied strictly. It carries the real narrowing and,
362
+ # being strict, bounds the result to a small set, which keeps both this
363
+ # query and the `$in` sets it feeds downstream from ballooning.
364
+ #
365
+ # - The OTHER genuine parents are applied null-aware: a row whose (nullable)
366
+ # FK is null/absent has no reference through that relation and must not be
367
+ # excluded by it. `nil` is added to the `$in` set (Mongo's `$in: [nil]`
368
+ # matches both explicit nulls and missing fields). Without this, a nullable
369
+ # genuine FK that is null on otherwise in-scope rows ANDs the result to
370
+ # empty — dropping legitimate rows, and (when it zeroes a parent) making
371
+ # children lose that parent's selective+indexed scope and degenerate to a
372
+ # full COLLSCAN. See docs/mongodb-scoping-fullscan-notes.md. Null-aware is
373
+ # applied to non-anchor parents only: making the sole/anchor scope itself
374
+ # null-aware would match every row whose FK is null (e.g. a not-yet-
375
+ # backfilled column), ballooning the result instead of scoping it.
376
+ #
377
+ # - Reference parents (NOT reachable to the dump target — master/reference
378
+ # data dumped in full, or only reachable via such data) produce a non-
379
+ # scoping id set: "all/most of a reference table", which neither narrows
380
+ # meaningfully nor, made null-aware, stays bounded. So when the collection
381
+ # has a genuine parent to anchor on, reference-parent constraints are
382
+ # dropped entirely.
383
+ #
384
+ # When NO genuine parent produced ids, the collection is not reachable from
385
+ # the dump target; fall back to the historical strict-AND of whatever
386
+ # constraints exist (bounded, preserves prior behavior).
387
+ #
388
+ # A belongs_to whose parent produced no ids contributes no constraint: either
389
+ # the parent matched nothing, or it is not dumped here (e.g. an embedded
390
+ # collection, or one excluded from the run). If that leaves the filter empty
391
+ # even though the collection HAS belongs_to, the collection cannot be scoped
392
+ # from the dump target — and an empty `{}` filter would scan and dump the
393
+ # ENTIRE collection across every scope. That is never what a scoped
394
+ # extraction wants, so constrain it to match nothing and warn instead. (A
395
+ # collection with no belongs_to at all is genuine reference/master data and
396
+ # is still dumped in full via `{}`.)
397
+ private def related_collection_filter(config, config_by_name, dump_target)
398
+ genuine = genuine_scope_set(config_by_name, dump_target.table_name)
399
+
400
+ genuine_clauses = []
401
+ reference_clauses = []
402
+ config.belongs_tos.each do |relation|
366
403
  values = parent_state_for(relation, config_by_name)
367
404
  next if values.nil? || values.empty?
368
405
 
369
- acc[relation.foreign_key] = { "$in" => values }
406
+ target = genuine.include?(relation.table_name) ? genuine_clauses : reference_clauses
407
+ target << [relation.foreign_key, values]
370
408
  end
371
409
 
410
+ filter =
411
+ if genuine_clauses.any?
412
+ anchor_index = (0...genuine_clauses.size).min_by { |i| genuine_clauses[i][1].size }
413
+ genuine_clauses.each_with_index.each_with_object({}) do |((foreign_key, values), index), acc|
414
+ acc[foreign_key] =
415
+ index == anchor_index ? { "$in" => values } : { "$in" => [nil] + values }
416
+ end
417
+ else
418
+ reference_clauses.each_with_object({}) do |(foreign_key, values), acc|
419
+ acc[foreign_key] = { "$in" => values }
420
+ end
421
+ end
422
+
372
423
  return filter unless filter.empty? && config.belongs_tos.any?
373
424
 
374
425
  @logger.warn(
@@ -379,6 +430,31 @@ module Exwiw
379
430
  { config.primary_key => { "$in" => [] } }
380
431
  end
381
432
 
433
+ # The set of collection names *genuinely scoped* by the dump target: the
434
+ # target itself, plus every collection that can reach it by following
435
+ # belongs_to edges (child -> parent) transitively. Computed by fixpoint over
436
+ # the configs. Everything outside this set is reference/master data (or only
437
+ # reachable through it) whose belongs_to id sets do not represent a real
438
+ # scope. Memoized per target name; the configs do not mutate mid-run.
439
+ private def genuine_scope_set(config_by_name, target_name)
440
+ (@genuine_scope_set_cache ||= {})[target_name] ||=
441
+ begin
442
+ reachable = Set.new([target_name])
443
+ loop do
444
+ added = false
445
+ config_by_name.each_value do |cfg|
446
+ next if cfg.embedded? || reachable.include?(cfg.name)
447
+ next unless cfg.belongs_tos.any? { |relation| reachable.include?(relation.table_name) }
448
+
449
+ reachable << cfg.name
450
+ added = true
451
+ end
452
+ break unless added
453
+ end
454
+ reachable
455
+ end
456
+ end
457
+
382
458
  # The captured parent-collection values a child belongs_to should be
383
459
  # constrained by: the values of the parent field the FK references
384
460
  # (`relation.references`, default the parent primary_key). nil when the
@@ -489,14 +565,6 @@ module Exwiw
489
565
  @embedded_children_by_parent.fetch(parent_config.name, [])
490
566
  end
491
567
 
492
- private def extended_json(doc)
493
- if doc.respond_to?(:as_extended_json)
494
- doc.as_extended_json(mode: :relaxed)
495
- else
496
- doc
497
- end
498
- end
499
-
500
568
  private def db
501
569
  @db ||=
502
570
  begin
@@ -5,15 +5,67 @@ require 'open3'
5
5
  module Exwiw
6
6
  module Adapter
7
7
  class MysqlAdapter < Base
8
+ include SqlBulkInsert
9
+
10
# Lazy replacement for the fully materialized rows that #execute used to
# return (`connection.query(sql).rows`). Rows are fetched one by one through
# the client's #stream_rows, so a Ruby array as large as the table is never
# built. It is meant to be used like the old Array: #size to skip empty tables
# and log the count, then one streaming pass over the rows
# (SqlBulkInsert#write_inserts -> each_slice).
#
# Counterpart of PostgresqlAdapter::StreamingResult, with two MySQL specifics:
# - #size issues a separate COUNT query (`count_sql`). That query is the same
#   SELECT with its projection replaced by `COUNT(*)`
#   (compile_ast(count_only: true)) rather than a subquery wrap, because MySQL
#   rejects a derived table with duplicate column names — which a
#   rails-managed `SELECT *` joined to another table produces. The count is
#   exact because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT.
# - a stream occupies the connection until it has been read to the end. The
#   Runner drains it (write_inserts) before any further query
#   (post_insert_sql / DELETE).
class StreamingResult
  include Enumerable

  # @param client [#query, #stream_rows] connection wrapper
  # @param data_sql [String] SELECT whose rows are streamed by #each
  # @param count_sql [String] COUNT(*) form of the same query, used by #size
  def initialize(client:, data_sql:, count_sql:)
    @client, @data_sql, @count_sql = client, data_sql, count_sql
  end

  # Number of rows the data query returns. Runs the count query on first use
  # and caches the answer; an empty count result yields 0.
  #
  # @return [Integer]
  def size
    return @size if @size

    count_rows = @client.query(@count_sql).rows
    @size = count_rows.dig(0, 0).to_i
  end
  alias length size

  # Yield each row of the data query as it is streamed. A row is an Array of
  # String|nil (mysql2 `cast: false` / stringified), the same shape
  # `connection.query(sql).rows` produced, so the generated INSERT is
  # unchanged. Without a block, returns an Enumerator whose size is #size.
  #
  # @return [self, Enumerator]
  def each(&block)
    if block
      @client.stream_rows(@data_sql, &block)
      self
    else
      enum_for(:each) { size }
    end
  end
end
55
+
8
56
# Build the query AST for `table` by handing the table name, the table lookup,
# the dump target and this adapter's logger to Exwiw::QueryAstBuilder.
def build_query(table, dump_target, table_by_name)
  builder = Exwiw::QueryAstBuilder
  builder.run(table.name, table_by_name, dump_target, @logger)
end
11
59
 
12
60
  def execute(query_ast)
13
- sql = commented_sql(query_ast)
14
-
15
- @logger.debug(" Executing SQL: \n#{sql}")
16
- connection.query(sql).rows
61
+ data_sql = commented_sql(query_ast)
62
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
63
+ # the Runner can skip empty tables and log the row count without draining
64
+ # the stream. See StreamingResult for why this is not a subquery wrap.
65
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
66
+
67
+ @logger.debug(" Executing SQL (streaming): \n#{data_sql}")
68
+ StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
17
69
  end
18
70
 
19
71
  def explain(query_ast)
@@ -99,22 +151,16 @@ module Exwiw
99
151
  "SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
100
152
  end
101
153
 
102
- def to_bulk_insert(results, table)
154
+ # The INSERT header for this adapter. MySQL backtick-quotes the table and
155
+ # column identifiers. #to_bulk_insert / #write_inserts (SqlBulkInsert)
156
+ # append the value tuples and the trailing `;`.
157
+ private def insert_header(table)
103
158
  table_name = table.name
104
-
105
- value_list = results.map do |row|
106
- quoted_values = row.map do |value|
107
- escape_value(value)
108
- end
109
- "(" + quoted_values.join(', ') + ")"
110
- end
111
- values = value_list.join(",\n")
112
-
113
159
  if table.rails_managed?
114
- "INSERT INTO `#{table_name}` VALUES\n#{values};"
160
+ "INSERT INTO `#{table_name}` VALUES\n"
115
161
  else
116
162
  column_names = table.columns.map { |c| "`#{c.name}`" }.join(', ')
117
- "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n#{values};"
163
+ "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n"
118
164
  end
119
165
  end
120
166
 
@@ -176,11 +222,17 @@ module Exwiw
176
222
  sql
177
223
  end
178
224
 
179
- def compile_ast(query_ast)
225
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
226
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
227
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
228
+ # not depend on the projection.
229
+ def compile_ast(query_ast, count_only: false)
180
230
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
181
231
 
182
232
  sql = "SELECT "
183
- sql += if query_ast.select_all
233
+ sql += if count_only
234
+ "COUNT(*)"
235
+ elsif query_ast.select_all
184
236
  "*"
185
237
  else
186
238
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
@@ -118,6 +118,49 @@ module Exwiw
118
118
  end
119
119
  end
120
120
 
121
# Stream a query's rows one at a time, yielding each as an
# Array<String|nil> (the same row shape as #query) instead of buffering
# the whole result set. This keeps a large dump's dominant memory cost — a
# Ruby array as big as the table — from materializing.
#
# mysql2 streams server-side (`stream: true` + `cache_rows: false`).
# Its contract: a streamed result MUST be fully consumed before the next
# query on this connection, or the driver raises "Commands out of sync".
# The remaining rows are therefore drained whenever the consumer stops early:
# - the block raises a StandardError (re-raised after the drain), or
# - the block leaves without an exception (`break`, `throw`, a `return` from
#   the enclosing method — e.g. `first` / `find` on an enumerable wrapper).
# Non-StandardError exceptions (Interrupt, SystemExit, ...) are re-raised
# without draining, so a shutdown request is not held up reading the rest of
# a large result. Errors raised by the drain itself are swallowed: it is a
# best-effort cleanup and must not mask the original outcome.
#
# trilogy has no streaming cursor (no QUERY_FLAGS_STREAMING), so it buffers
# the result and yields from it — parity, but without the memory win (the
# same situation as the sqlite adapter). trilogy is a test-only driver;
# production connects via mysql2.
#
# @param sql [String]
# @yieldparam row [Array<String|nil>]
# @return [self, Enumerator] an Enumerator when no block is given
# @raise [RuntimeError] when the configured driver is not supported
def stream_rows(sql)
  return enum_for(:stream_rows, sql) unless block_given?

  case @driver
  when :mysql2
    res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
    # Stays true on every early exit except a non-StandardError exception.
    drain_needed = true
    begin
      res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
      drain_needed = false
    rescue Exception => e # rubocop:disable Lint/RescueException -- always re-raised
      drain_needed = e.is_a?(StandardError)
      raise
    ensure
      if drain_needed
        begin
          res.each { |_row| } # drain the remainder so the connection stays usable
        rescue StandardError
          nil
        end
      end
    end
  when :trilogy
    raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
  else
    raise "Unsupported MySQL driver: #{@driver.inspect}"
  end
  self
end
163
+
121
164
  private def ensure_driver_loaded!
122
165
  case @driver
123
166
  when :mysql2 then require 'mysql2'