exwiw 0.5.3 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +1 -1
- data/docs/mongodb-scoping-fullscan-notes.md +145 -0
- data/docs/optimize-mongodb-export-with-native-ext.md +31 -11
- data/docs/sql-dump-optimization-notes.md +278 -0
- data/ext/exwiw/ext_json/ext_json.c +274 -0
- data/ext/exwiw/ext_json/extconf.rb +8 -0
- data/lib/exwiw/adapter/mongodb_adapter.rb +90 -22
- data/lib/exwiw/adapter/mysql_adapter.rb +70 -18
- data/lib/exwiw/adapter/mysql_client.rb +43 -0
- data/lib/exwiw/adapter/postgresql_adapter.rb +85 -15
- data/lib/exwiw/adapter/sql_bulk_insert.rb +71 -0
- data/lib/exwiw/adapter/sqlite_adapter.rb +75 -18
- data/lib/exwiw/adapter.rb +28 -0
- data/lib/exwiw/ext_json.rb +33 -0
- data/lib/exwiw/runner.rb +10 -16
- data/lib/exwiw/version.rb +1 -1
- data/lib/exwiw.rb +2 -0
- metadata +9 -2
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
// Native emitter for MongoDB Relaxed Extended JSON.
|
|
2
|
+
//
|
|
3
|
+
// Replaces the pure-Ruby `JSON.generate(doc.as_extended_json(mode: :relaxed))`
|
|
4
|
+
// (which rebuilds the whole document into an intermediate transformed Hash tree
|
|
5
|
+
// and then walks it a second time in JSON.generate) with a single native
|
|
6
|
+
// tree-walk that emits the JSONL line directly.
|
|
7
|
+
//
|
|
8
|
+
// Byte-identity strategy (see docs/optimize-mongodb-export-with-native-ext.md):
|
|
9
|
+
// only the structural bulk + the cheapest, most-stable leaves are formatted in
|
|
10
|
+
// C — Hash, Array, String, fixnum Integer, true/false/nil, BSON::ObjectId, and
|
|
11
|
+
// in-range Time (years 1970..9999; see encode_time_native). Everything else
|
|
12
|
+
// (Float, out-of-int64 Integer, out-of-range Time, Symbol, Decimal128, Binary,
|
|
13
|
+
// ...) is handed back to Ruby's `encode_fragment`, which is the exact pure-Ruby
|
|
14
|
+
// path. This is provably byte-identical because Hash#as_extended_json
|
|
15
|
+
// and Array#as_extended_json are non-transforming structural recursion: the
|
|
16
|
+
// bytes `JSON.generate(v.as_extended_json(mode: :relaxed))` produces for any
|
|
17
|
+
// sub-value `v` are exactly the bytes the whole-document generate would produce
|
|
18
|
+
// in that position, so a value the native walk does not format can be spliced
|
|
19
|
+
// in verbatim with no divergence.
|
|
20
|
+
|
|
21
|
+
#include <ruby.h>
|
|
22
|
+
#include <ruby/encoding.h>
|
|
23
|
+
#include <stdio.h>
|
|
24
|
+
#include <time.h>
|
|
25
|
+
|
|
26
|
+
static VALUE rb_mExtJson;
|
|
27
|
+
// Cached BSON::ObjectId class, or Qnil until bson is loaded and it resolves.
|
|
28
|
+
// Resolution is lazy (bson is required only when the Mongo adapter touches the
|
|
29
|
+
// DB, which always precedes serialization in a real run); see resolve below.
|
|
30
|
+
static VALUE rb_cObjectId;
|
|
31
|
+
|
|
32
|
+
static ID id_encode_fragment;
|
|
33
|
+
static ID id_to_s;
|
|
34
|
+
static ID id_const_BSON;
|
|
35
|
+
static ID id_const_ObjectId;
|
|
36
|
+
|
|
37
|
+
static const char hexdigits[] = "0123456789abcdef";
|
|
38
|
+
|
|
39
|
+
static void encode_value(VALUE buf, VALUE val);
|
|
40
|
+
|
|
41
|
+
// Append `str` as a JSON string literal (surrounding quotes included), escaping
|
|
42
|
+
// exactly as JSON.generate does: \b \t \n \f \r \" \\ get their short escapes,
|
|
43
|
+
// any other byte < 0x20 becomes a lowercase \u00xx, and every other byte —
|
|
44
|
+
// including '/', DEL (0x7f), U+2028/U+2029, and UTF-8 multi-byte sequences — is
|
|
45
|
+
// passed through raw. Unescaped runs are appended in bulk to avoid a per-byte
|
|
46
|
+
// rb_str_cat call.
|
|
47
|
+
static void encode_string(VALUE buf, const char *p, long len)
|
|
48
|
+
{
|
|
49
|
+
rb_str_cat(buf, "\"", 1);
|
|
50
|
+
|
|
51
|
+
long start = 0;
|
|
52
|
+
for (long i = 0; i < len; i++) {
|
|
53
|
+
unsigned char c = (unsigned char)p[i];
|
|
54
|
+
const char *esc = NULL;
|
|
55
|
+
long esclen = 0;
|
|
56
|
+
char ubuf[6];
|
|
57
|
+
|
|
58
|
+
switch (c) {
|
|
59
|
+
case '"': esc = "\\\""; esclen = 2; break;
|
|
60
|
+
case '\\': esc = "\\\\"; esclen = 2; break;
|
|
61
|
+
case '\b': esc = "\\b"; esclen = 2; break;
|
|
62
|
+
case '\t': esc = "\\t"; esclen = 2; break;
|
|
63
|
+
case '\n': esc = "\\n"; esclen = 2; break;
|
|
64
|
+
case '\f': esc = "\\f"; esclen = 2; break;
|
|
65
|
+
case '\r': esc = "\\r"; esclen = 2; break;
|
|
66
|
+
default:
|
|
67
|
+
if (c < 0x20) {
|
|
68
|
+
ubuf[0] = '\\'; ubuf[1] = 'u'; ubuf[2] = '0'; ubuf[3] = '0';
|
|
69
|
+
ubuf[4] = hexdigits[(c >> 4) & 0xf];
|
|
70
|
+
ubuf[5] = hexdigits[c & 0xf];
|
|
71
|
+
esc = ubuf; esclen = 6;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
if (esc) {
|
|
76
|
+
if (i > start) rb_str_cat(buf, p + start, i - start);
|
|
77
|
+
rb_str_cat(buf, esc, esclen);
|
|
78
|
+
start = i + 1;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
if (len > start) rb_str_cat(buf, p + start, len - start);
|
|
82
|
+
|
|
83
|
+
rb_str_cat(buf, "\"", 1);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Hash keys mirror JSON.generate: a String key is emitted as-is, anything else
|
|
87
|
+
// is stringified (Symbol via its name, otherwise #to_s) before escaping.
|
|
88
|
+
static void encode_key(VALUE buf, VALUE key)
|
|
89
|
+
{
|
|
90
|
+
VALUE kstr;
|
|
91
|
+
if (RB_TYPE_P(key, T_STRING)) {
|
|
92
|
+
kstr = key;
|
|
93
|
+
} else if (RB_TYPE_P(key, T_SYMBOL)) {
|
|
94
|
+
kstr = rb_sym2str(key);
|
|
95
|
+
} else {
|
|
96
|
+
kstr = rb_funcall(key, id_to_s, 0);
|
|
97
|
+
}
|
|
98
|
+
encode_string(buf, RSTRING_PTR(kstr), RSTRING_LEN(kstr));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
typedef struct {
|
|
102
|
+
VALUE buf;
|
|
103
|
+
int first;
|
|
104
|
+
} hash_ctx;
|
|
105
|
+
|
|
106
|
+
static int hash_iter(VALUE key, VALUE value, VALUE arg)
|
|
107
|
+
{
|
|
108
|
+
hash_ctx *ctx = (hash_ctx *)arg;
|
|
109
|
+
if (!ctx->first) rb_str_cat(ctx->buf, ",", 1);
|
|
110
|
+
ctx->first = 0;
|
|
111
|
+
encode_key(ctx->buf, key);
|
|
112
|
+
rb_str_cat(ctx->buf, ":", 1);
|
|
113
|
+
encode_value(ctx->buf, value);
|
|
114
|
+
return ST_CONTINUE;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Splice the pure-Ruby fragment for a value the native path does not format.
|
|
118
|
+
static void delegate(VALUE buf, VALUE val)
|
|
119
|
+
{
|
|
120
|
+
VALUE frag = rb_funcall(rb_mExtJson, id_encode_fragment, 1, val);
|
|
121
|
+
rb_str_cat(buf, RSTRING_PTR(frag), RSTRING_LEN(frag));
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// Epoch second for 10000-01-01T00:00:00Z. `bson`'s relaxed Time encoding uses
|
|
125
|
+
// the ISO-8601 string form only for years 1970..9999 (inclusive) and the
|
|
126
|
+
// {"$numberLong":"<ms>"} form otherwise; that year window is exactly the
|
|
127
|
+
// half-open epoch-second range [0, MAX_ISO_EPOCH).
|
|
128
|
+
#define MAX_ISO_EPOCH 253402300800LL
|
|
129
|
+
|
|
130
|
+
// Format a Time as Relaxed Extended JSON in C, matching bson 5.2.0 byte for
|
|
131
|
+
// byte for the common in-range case (see bson/time.rb and the empirical probe):
|
|
132
|
+
// - whole second (usec == 0, i.e. nsec < 1000): {"$date":"...:SSZ"} (no fraction)
|
|
133
|
+
// - sub-second (nsec >= 1000): {"$date":"...:SS.mmmZ"}, where the
|
|
134
|
+
// millisecond is floor(nsec / 1e6) — bson floors the Time to milliseconds.
|
|
135
|
+
// Returns 1 when handled. Returns 0 (leaving buf untouched) for years outside
|
|
136
|
+
// 1970..9999, whose {"$numberLong"} form involves negative-epoch arithmetic too
|
|
137
|
+
// fiddly to risk in C — the caller then delegates that rare case to Ruby.
|
|
138
|
+
static int encode_time_native(VALUE buf, VALUE val)
|
|
139
|
+
{
|
|
140
|
+
struct timespec ts = rb_time_timespec(val);
|
|
141
|
+
if (ts.tv_sec < 0 || ts.tv_sec >= MAX_ISO_EPOCH) return 0;
|
|
142
|
+
|
|
143
|
+
time_t secs = (time_t)ts.tv_sec;
|
|
144
|
+
struct tm tm;
|
|
145
|
+
if (gmtime_r(&secs, &tm) == NULL) return 0;
|
|
146
|
+
|
|
147
|
+
char tmp[40];
|
|
148
|
+
int n;
|
|
149
|
+
if (ts.tv_nsec >= 1000) {
|
|
150
|
+
int ms = (int)(ts.tv_nsec / 1000000L);
|
|
151
|
+
n = snprintf(tmp, sizeof(tmp),
|
|
152
|
+
"{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02d.%03dZ\"}",
|
|
153
|
+
tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
|
|
154
|
+
tm.tm_hour, tm.tm_min, tm.tm_sec, ms);
|
|
155
|
+
} else {
|
|
156
|
+
n = snprintf(tmp, sizeof(tmp),
|
|
157
|
+
"{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02dZ\"}",
|
|
158
|
+
tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
|
|
159
|
+
tm.tm_hour, tm.tm_min, tm.tm_sec);
|
|
160
|
+
}
|
|
161
|
+
rb_str_cat(buf, tmp, n);
|
|
162
|
+
return 1;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// Append the Relaxed Extended JSON encoding of `val` to `buf`.
//
// Formatted natively: nil, true, false, Fixnum, String, Array, Hash (both
// recursively), BSON::ObjectId (once its class has been resolved) and
// in-range Time. Every other value is handed to delegate(), which splices in
// the fragment produced by the Ruby-side encode_fragment.
//
// Raises TypeError if an ObjectId's #to_s returns a non-String; exceptions
// from Ruby callbacks (#to_s, encode_fragment) propagate unchanged.
static void encode_value(VALUE buf, VALUE val)
{
    switch (TYPE(val)) {
    case T_NIL:
        rb_str_cat(buf, "null", 4);
        return;
    case T_TRUE:
        rb_str_cat(buf, "true", 4);
        return;
    case T_FALSE:
        rb_str_cat(buf, "false", 5);
        return;
    case T_FIXNUM: {
        // A Fixnum always fits in a C long (and thus int64) on the platforms
        // exwiw targets, so it can never be the out-of-int64 case that must
        // raise; emit it directly. Bignums fall through to delegate, where
        // encode_fragment emits in-range ones and raises RangeError for the
        // rest.
        char tmp[24];
        int n = snprintf(tmp, sizeof(tmp), "%ld", (long)FIX2LONG(val));
        rb_str_cat(buf, tmp, n);
        return;
    }
    case T_STRING:
        encode_string(buf, RSTRING_PTR(val), RSTRING_LEN(val));
        return;
    case T_ARRAY: {
        long len = RARRAY_LEN(val);
        rb_str_cat(buf, "[", 1);
        for (long i = 0; i < len; i++) {
            if (i > 0) rb_str_cat(buf, ",", 1);
            encode_value(buf, rb_ary_entry(val, i));
        }
        rb_str_cat(buf, "]", 1);
        return;
    }
    case T_HASH: {
        // rb_hash_foreach preserves insertion order, matching JSON output.
        hash_ctx ctx = { buf, 1 };
        rb_str_cat(buf, "{", 1);
        rb_hash_foreach(val, hash_iter, (VALUE)&ctx);
        rb_str_cat(buf, "}", 1);
        return;
    }
    default:
        // BSON::ObjectId is the single most common leaf (`_id`) and its
        // Relaxed form is the stable {"$oid":"<24 hex>"}, so format it here.
        // The hex comes from #to_s and is expected to be [0-9a-f]{24}, so it
        // is appended without escaping.
        if (!NIL_P(rb_cObjectId) && RTEST(rb_obj_is_kind_of(val, rb_cObjectId))) {
            VALUE hex = rb_funcall(val, id_to_s, 0);
            // Guard against an overridden #to_s returning a non-String before
            // reading the String internals.
            Check_Type(hex, T_STRING);
            rb_str_cat(buf, "{\"$oid\":\"", 9);
            rb_str_cat(buf, RSTRING_PTR(hex), RSTRING_LEN(hex));
            rb_str_cat(buf, "\"}", 2);
            // hex is a temporary referenced only from this frame; the appends
            // above can allocate and run GC while only its raw pointer is in
            // use, so keep the VALUE live until they are done.
            RB_GC_GUARD(hex);
            return;
        }
        // Time is the other common leaf in dumped documents (Mongoid's
        // created_at/updated_at); format the in-range case natively. The
        // out-of-range $numberLong form returns 0 and falls through to Ruby.
        if (RTEST(rb_obj_is_kind_of(val, rb_cTime)) && encode_time_native(buf, val)) {
            return;
        }
        // Float, Bignum, Symbol, Decimal128, Binary, out-of-range Time, ... -> Ruby.
        delegate(buf, val);
        return;
    }
}
|
|
232
|
+
|
|
233
|
+
// Resolve and cache BSON::ObjectId the first time a document is encoded with
|
|
234
|
+
// bson loaded. Cheap const lookups guarded by the Qnil cache; once resolved it
|
|
235
|
+
// is skipped. Until resolved, ObjectId simply takes the (correct) delegate path.
|
|
236
|
+
static void resolve_objectid_class(void)
|
|
237
|
+
{
|
|
238
|
+
if (!NIL_P(rb_cObjectId)) return;
|
|
239
|
+
if (!rb_const_defined(rb_cObject, id_const_BSON)) return;
|
|
240
|
+
|
|
241
|
+
VALUE bson = rb_const_get(rb_cObject, id_const_BSON);
|
|
242
|
+
if (rb_const_defined(bson, id_const_ObjectId)) {
|
|
243
|
+
rb_cObjectId = rb_const_get(bson, id_const_ObjectId);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Exwiw::ExtJson.encode_native(doc) -> String
|
|
248
|
+
// Returns one JSONL line (no trailing newline); the caller owns separators.
|
|
249
|
+
static VALUE rb_encode_native(VALUE self, VALUE doc)
|
|
250
|
+
{
|
|
251
|
+
resolve_objectid_class();
|
|
252
|
+
|
|
253
|
+
VALUE buf = rb_str_buf_new(256);
|
|
254
|
+
rb_enc_associate(buf, rb_utf8_encoding());
|
|
255
|
+
encode_value(buf, doc);
|
|
256
|
+
return buf;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
void Init_ext_json_native(void)
|
|
260
|
+
{
|
|
261
|
+
id_encode_fragment = rb_intern("encode_fragment");
|
|
262
|
+
id_to_s = rb_intern("to_s");
|
|
263
|
+
id_const_BSON = rb_intern("BSON");
|
|
264
|
+
id_const_ObjectId = rb_intern("ObjectId");
|
|
265
|
+
|
|
266
|
+
VALUE mExwiw = rb_define_module("Exwiw");
|
|
267
|
+
rb_mExtJson = rb_define_module_under(mExwiw, "ExtJson");
|
|
268
|
+
rb_global_variable(&rb_mExtJson);
|
|
269
|
+
|
|
270
|
+
rb_cObjectId = Qnil;
|
|
271
|
+
rb_global_variable(&rb_cObjectId);
|
|
272
|
+
|
|
273
|
+
rb_define_singleton_method(rb_mExtJson, "encode_native", rb_encode_native, 1);
|
|
274
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "mkmf"
|
|
4
|
+
|
|
5
|
+
# Compiled to lib/exwiw/ext_json_native.{so,bundle}. The name is distinct from
|
|
6
|
+
# the `ext_json.rb` shim so `require "exwiw/ext_json_native"` does not collide
|
|
7
|
+
# with `require_relative "exwiw/ext_json"`.
|
|
8
|
+
create_makefile("exwiw/ext_json_native")
|
|
@@ -123,7 +123,7 @@ module Exwiw
|
|
|
123
123
|
{ config.primary_key => { "$in" => coerce_ids(dump_target.ids) } }
|
|
124
124
|
end
|
|
125
125
|
else
|
|
126
|
-
related_collection_filter(config, config_by_name)
|
|
126
|
+
related_collection_filter(config, config_by_name, dump_target)
|
|
127
127
|
end
|
|
128
128
|
|
|
129
129
|
Exwiw::MongoQuery::Find.new(
|
|
@@ -166,7 +166,7 @@ module Exwiw
|
|
|
166
166
|
plan = mask_plan(config)
|
|
167
167
|
rows.map do |doc|
|
|
168
168
|
apply_mask_plan!(doc, plan)
|
|
169
|
-
|
|
169
|
+
Exwiw::ExtJson.encode(doc)
|
|
170
170
|
end.join("\n")
|
|
171
171
|
end
|
|
172
172
|
|
|
@@ -352,23 +352,74 @@ module Exwiw
|
|
|
352
352
|
# the values were captured from that field in #execute, so their BSON type
|
|
353
353
|
# already matches the stored FK — no coercion.
|
|
354
354
|
#
|
|
355
|
-
#
|
|
356
|
-
#
|
|
357
|
-
#
|
|
358
|
-
#
|
|
359
|
-
#
|
|
360
|
-
#
|
|
361
|
-
#
|
|
362
|
-
#
|
|
363
|
-
#
|
|
364
|
-
|
|
365
|
-
|
|
355
|
+
# Scope flows from the dump target along belongs_to edges. A belongs_to is
|
|
356
|
+
# classified by whether its parent is *genuinely scoped* — reachable back to
|
|
357
|
+
# the dump target through belongs_to chains (see #genuine_scope_set) — which
|
|
358
|
+
# determines how its constraint is applied:
|
|
359
|
+
#
|
|
360
|
+
# - Among the genuine parents, the most selective one (fewest captured ids)
|
|
361
|
+
# is the ANCHOR and is applied strictly. It carries the real narrowing and,
|
|
362
|
+
# being strict, bounds the result to a small set — which keeps both this
|
|
363
|
+
# query and the `$in` sets it feeds downstream from ballooning.
|
|
364
|
+
#
|
|
365
|
+
# - The OTHER genuine parents are applied null-aware: a row whose (nullable)
|
|
366
|
+
# FK is null/absent has no reference through that relation and must not be
|
|
367
|
+
# excluded by it. `nil` is added to the `$in` set (Mongo's `$in: [nil]`
|
|
368
|
+
# matches both explicit nulls and missing fields). Without this, a nullable
|
|
369
|
+
# genuine FK that is null on otherwise in-scope rows ANDs the result to
|
|
370
|
+
# empty — dropping legitimate rows, and (when it zeroes a parent) making
|
|
371
|
+
# children lose that parent's selective+indexed scope and degenerate to a
|
|
372
|
+
# full COLLSCAN. See docs/mongodb-scoping-fullscan-notes.md. Null-aware is
|
|
373
|
+
# applied to non-anchor parents only: making the sole/anchor scope itself
|
|
374
|
+
# null-aware would match every row whose FK is null (e.g. a not-yet-
|
|
375
|
+
# backfilled column), ballooning the result instead of scoping it.
|
|
376
|
+
#
|
|
377
|
+
# - Reference parents (NOT reachable to the dump target — master/reference
|
|
378
|
+
# data dumped in full, or only reachable via such data) produce a non-
|
|
379
|
+
# scoping id set: "all/most of a reference table", which neither narrows
|
|
380
|
+
# meaningfully nor, made null-aware, stays bounded. So when the collection
|
|
381
|
+
# has a genuine parent to anchor on, reference-parent constraints are
|
|
382
|
+
# dropped entirely.
|
|
383
|
+
#
|
|
384
|
+
# When NO genuine parent produced ids, the collection is not reachable from
|
|
385
|
+
# the dump target; fall back to the historical strict-AND of whatever
|
|
386
|
+
# constraints exist (bounded, preserves prior behavior).
|
|
387
|
+
#
|
|
388
|
+
# A belongs_to whose parent produced no ids contributes no constraint: either
|
|
389
|
+
# the parent matched nothing, or it is not dumped here (e.g. an embedded
|
|
390
|
+
# collection, or one excluded from the run). If that leaves the filter empty
|
|
391
|
+
# even though the collection HAS belongs_to, the collection cannot be scoped
|
|
392
|
+
# from the dump target — and an empty `{}` filter would scan and dump the
|
|
393
|
+
# ENTIRE collection across every scope. That is never what a scoped
|
|
394
|
+
# extraction wants, so constrain it to match nothing and warn instead. (A
|
|
395
|
+
# collection with no belongs_to at all is genuine reference/master data and
|
|
396
|
+
# is still dumped in full via `{}`.)
|
|
397
|
+
private def related_collection_filter(config, config_by_name, dump_target)
|
|
398
|
+
genuine = genuine_scope_set(config_by_name, dump_target.table_name)
|
|
399
|
+
|
|
400
|
+
genuine_clauses = []
|
|
401
|
+
reference_clauses = []
|
|
402
|
+
config.belongs_tos.each do |relation|
|
|
366
403
|
values = parent_state_for(relation, config_by_name)
|
|
367
404
|
next if values.nil? || values.empty?
|
|
368
405
|
|
|
369
|
-
|
|
406
|
+
target = genuine.include?(relation.table_name) ? genuine_clauses : reference_clauses
|
|
407
|
+
target << [relation.foreign_key, values]
|
|
370
408
|
end
|
|
371
409
|
|
|
410
|
+
filter =
|
|
411
|
+
if genuine_clauses.any?
|
|
412
|
+
anchor_index = (0...genuine_clauses.size).min_by { |i| genuine_clauses[i][1].size }
|
|
413
|
+
genuine_clauses.each_with_index.each_with_object({}) do |((foreign_key, values), index), acc|
|
|
414
|
+
acc[foreign_key] =
|
|
415
|
+
index == anchor_index ? { "$in" => values } : { "$in" => [nil] + values }
|
|
416
|
+
end
|
|
417
|
+
else
|
|
418
|
+
reference_clauses.each_with_object({}) do |(foreign_key, values), acc|
|
|
419
|
+
acc[foreign_key] = { "$in" => values }
|
|
420
|
+
end
|
|
421
|
+
end
|
|
422
|
+
|
|
372
423
|
return filter unless filter.empty? && config.belongs_tos.any?
|
|
373
424
|
|
|
374
425
|
@logger.warn(
|
|
@@ -379,6 +430,31 @@ module Exwiw
|
|
|
379
430
|
{ config.primary_key => { "$in" => [] } }
|
|
380
431
|
end
|
|
381
432
|
|
|
433
|
+
# Names of the collections *genuinely scoped* by the dump target: the target
# itself plus every non-embedded collection that reaches it by following
# belongs_to edges (child -> parent), directly or transitively. The set is
# grown by repeated passes over the configs until a pass adds nothing
# (fixpoint). Anything outside the set is reference/master data (or only
# reachable through it), so its belongs_to id sets are not a real scope.
#
# The result is memoized per target name only; this relies on the configs
# not changing during a run.
#
# @param config_by_name [Hash] collection name => config
# @param target_name [String] name of the dump target collection
# @return [Set<String>]
private def genuine_scope_set(config_by_name, target_name)
  @genuine_scope_set_cache ||= {}
  @genuine_scope_set_cache[target_name] ||= begin
    scoped = Set.new([target_name])

    loop do
      grew = false

      config_by_name.each_value do |candidate|
        next if candidate.embedded?
        next if scoped.include?(candidate.name)
        # Only a collection with at least one parent already in scope joins.
        next if candidate.belongs_tos.none? { |relation| scoped.include?(relation.table_name) }

        scoped << candidate.name
        grew = true
      end

      break unless grew
    end

    scoped
  end
end
|
|
457
|
+
|
|
382
458
|
# The captured parent-collection values a child belongs_to should be
|
|
383
459
|
# constrained by: the values of the parent field the FK references
|
|
384
460
|
# (`relation.references`, default the parent primary_key). nil when the
|
|
@@ -489,14 +565,6 @@ module Exwiw
|
|
|
489
565
|
@embedded_children_by_parent.fetch(parent_config.name, [])
|
|
490
566
|
end
|
|
491
567
|
|
|
492
|
-
private def extended_json(doc)
|
|
493
|
-
if doc.respond_to?(:as_extended_json)
|
|
494
|
-
doc.as_extended_json(mode: :relaxed)
|
|
495
|
-
else
|
|
496
|
-
doc
|
|
497
|
-
end
|
|
498
|
-
end
|
|
499
|
-
|
|
500
568
|
private def db
|
|
501
569
|
@db ||=
|
|
502
570
|
begin
|
|
@@ -5,15 +5,67 @@ require 'open3'
|
|
|
5
5
|
module Exwiw
|
|
6
6
|
module Adapter
|
|
7
7
|
class MysqlAdapter < Base
|
|
8
|
+
include SqlBulkInsert
|
|
9
|
+
|
|
10
|
+
# A lazy, streaming stand-in for the materialized rows #execute used to
|
|
11
|
+
# return (`connection.query(sql).rows`). It pulls rows off the wire one at
|
|
12
|
+
# a time (mysql2 single-row stream) instead of buffering the whole result
|
|
13
|
+
# set, so the dump's dominant memory cost — a Ruby array as large as the
|
|
14
|
+
# table — never materializes. The Runner drives it exactly like the old
|
|
15
|
+
# Array: #size to skip empty tables and log the count, then a single
|
|
16
|
+
# streaming pass (SqlBulkInsert#write_inserts -> each_slice).
|
|
17
|
+
#
|
|
18
|
+
# Mirrors PostgresqlAdapter::StreamingResult, with two MySQL specifics:
|
|
19
|
+
# - #size runs a separate `SELECT COUNT(*)` of the same query. Unlike the
|
|
20
|
+
# pg path, it does NOT wrap the SELECT in a subquery: MySQL rejects a
|
|
21
|
+
# derived table with duplicate column names, which a rails-managed
|
|
22
|
+
# `SELECT *` joined to another table produces. Instead the projection
|
|
23
|
+
# is replaced by `COUNT(*)` (compile_ast(count_only: true)) — exact
|
|
24
|
+
# because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT,
|
|
25
|
+
# so the row count is independent of the projected columns.
|
|
26
|
+
# - the stream ties up the connection until fully drained. The Runner
|
|
27
|
+
# always drains it (write_inserts) before any further query
|
|
28
|
+
# (post_insert_sql / DELETE), and MysqlClient#stream_rows drains the
|
|
29
|
+
# remainder if iteration is abandoned, so the connection stays usable.
|
|
30
|
+
class StreamingResult
  include Enumerable

  # @param client [#query, #stream_rows] connection wrapper used for both
  #   the count query and the streamed data query
  # @param data_sql [String] the SELECT whose rows are streamed by #each
  # @param count_sql [String] the COUNT(*) query backing #size
  def initialize(client:, data_sql:, count_sql:)
    @client, @data_sql, @count_sql = client, data_sql, count_sql
  end

  # Row count, taken from the first column of the first row of the count
  # query. The query runs on first use and the result is cached.
  #
  # @return [Integer]
  def size
    return @size if @size

    count_rows = @client.query(@count_sql).rows
    @size = count_rows.dig(0, 0).to_i
  end
  alias length size

  # Stream the result set row by row, yielding each row as produced by the
  # client's #stream_rows. Without a block, returns an Enumerator whose
  # size is computed lazily from #size.
  #
  # @return [self, Enumerator]
  def each(&block)
    return enum_for(:each) { size } if block.nil?

    @client.stream_rows(@data_sql, &block)
    self
  end
end
|
|
55
|
+
|
|
8
56
|
def build_query(table, dump_target, table_by_name)
|
|
9
57
|
Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
|
|
10
58
|
end
|
|
11
59
|
|
|
12
60
|
# Build a lazy StreamingResult for the query instead of running it eagerly.
# Nothing is sent to the server here: the data query runs when the result
# is iterated, and the count query when its size is first requested.
#
# @param query_ast the query AST to compile
# @return [StreamingResult]
def execute(query_ast)
  data_sql = commented_sql(query_ast)

  # The count query reuses the same FROM/JOIN/WHERE with the projection
  # replaced by COUNT(*), so the caller can learn the row count without
  # draining the stream.
  comment = sql_query_comment(query_ast)
  count_body = compile_ast(query_ast, count_only: true)
  count_sql = "#{comment} #{count_body}"

  @logger.debug("  Executing SQL (streaming): \n#{data_sql}")

  StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
end
|
|
18
70
|
|
|
19
71
|
def explain(query_ast)
|
|
@@ -99,22 +151,16 @@ module Exwiw
|
|
|
99
151
|
"SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
|
|
100
152
|
end
|
|
101
153
|
|
|
102
|
-
|
|
154
|
+
# Build the leading `INSERT INTO ... VALUES\n` line for `table`, with the
# table and column identifiers backtick-quoted (MySQL style). Only the
# header is produced here; the value tuples and the trailing `;` are
# appended elsewhere (the bulk-insert helpers).
#
# Rails-managed tables get no explicit column list; all other tables list
# every configured column in order.
#
# @param table [#name, #rails_managed?, #columns]
# @return [String]
private def insert_header(table)
  quoted_table = "`#{table.name}`"
  return "INSERT INTO #{quoted_table} VALUES\n" if table.rails_managed?

  quoted_columns = table.columns.map { |column| "`#{column.name}`" }
  "INSERT INTO #{quoted_table} (#{quoted_columns.join(', ')}) VALUES\n"
end
|
|
120
166
|
|
|
@@ -176,11 +222,17 @@ module Exwiw
|
|
|
176
222
|
sql
|
|
177
223
|
end
|
|
178
224
|
|
|
179
|
-
|
|
225
|
+
# @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
|
|
226
|
+
# projected columns (used by StreamingResult#size). Safe because exwiw's
|
|
227
|
+
# extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
|
|
228
|
+
# not depend on the projection.
|
|
229
|
+
def compile_ast(query_ast, count_only: false)
|
|
180
230
|
raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
|
|
181
231
|
|
|
182
232
|
sql = "SELECT "
|
|
183
|
-
sql += if
|
|
233
|
+
sql += if count_only
|
|
234
|
+
"COUNT(*)"
|
|
235
|
+
elsif query_ast.select_all
|
|
184
236
|
"*"
|
|
185
237
|
else
|
|
186
238
|
query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
|
|
@@ -118,6 +118,49 @@ module Exwiw
|
|
|
118
118
|
end
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
+
# Stream a query's rows one at a time, yielding each as an
# Array<String|nil> (the same row shape as #query) instead of buffering
# the whole result set. This keeps a large dump's dominant memory cost — a
# Ruby array as big as the table — from materializing.
#
# mysql2 streams server-side (`stream: true` + `cache_rows: false`).
# Its contract: a streamed result MUST be fully consumed before the next
# query on this connection, or the driver raises "Commands out of sync".
# So whenever iteration stops early the remaining rows are drained here,
# keeping the connection usable for the next query. That covers both a
# StandardError raised mid-stream and a consumer that leaves the block
# without an exception (`break`, `throw`, or an Enumerable short-circuit
# such as `first`/`find` on a wrapper). Non-StandardError exceptions
# (Interrupt, SystemExit, ...) are re-raised WITHOUT draining so a
# shutdown is not held up by reading a potentially huge remainder.
#
# trilogy has no streaming cursor (no QUERY_FLAGS_STREAMING), so it buffers
# the result and yields from it — parity, but without the memory win.
#
# @param sql [String]
# @yieldparam row [Array<String|nil>]
# @return [self, Enumerator] an Enumerator when no block is given
def stream_rows(sql)
  return enum_for(:stream_rows, sql) unless block_given?

  case @driver
  when :mysql2
    res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
    needs_drain = true
    begin
      res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
      needs_drain = false # fully consumed, nothing left on the wire
    rescue Exception => e # rubocop:disable Lint/RescueException
      # Drain only for ordinary errors; see the method comment.
      needs_drain = e.is_a?(StandardError)
      raise
    ensure
      if needs_drain
        begin
          res.each { |_row| } # drain the remainder so the connection stays usable
        rescue StandardError
          nil # best effort: the original error / unwinding takes precedence
        end
      end
    end
  when :trilogy
    raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
  else
    raise "Unsupported MySQL driver: #{@driver.inspect}"
  end
  self
end
|
|
163
|
+
|
|
121
164
|
private def ensure_driver_loaded!
|
|
122
165
|
case @driver
|
|
123
166
|
when :mysql2 then require 'mysql2'
|