exwiw 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ // Native emitter for MongoDB Relaxed Extended JSON.
2
+ //
3
+ // Replaces the pure-Ruby `JSON.generate(doc.as_extended_json(mode: :relaxed))`
4
+ // (which rebuilds the whole document into an intermediate transformed Hash tree
5
+ // and then walks it a second time in JSON.generate) with a single native
6
+ // tree-walk that emits the JSONL line directly.
7
+ //
8
+ // Byte-identity strategy (see docs/optimize-mongodb-export-with-native-ext.md):
9
+ // only the structural bulk + the cheapest, most-stable leaves are formatted in
10
+ // C — Hash, Array, String, fixnum Integer, true/false/nil, BSON::ObjectId, and
11
+ // in-range Time (years 1970..9999; see encode_time_native). Everything else
12
+ // (Float, out-of-int64 Integer, out-of-range Time, Symbol, Decimal128, Binary,
13
+ // ...) is handed back to Ruby's `encode_fragment`, which is the exact pure-Ruby
14
+ // path. This is provably byte-identical because Hash#as_extended_json
15
+ // and Array#as_extended_json are non-transforming structural recursion: the
16
+ // bytes `JSON.generate(v.as_extended_json(mode: :relaxed))` produces for any
17
+ // sub-value `v` are exactly the bytes the whole-document generate would produce
18
+ // in that position, so a value the native walk does not format can be spliced
19
+ // in verbatim with no divergence.
20
+
21
+ #include <ruby.h>
22
+ #include <ruby/encoding.h>
23
+ #include <stdio.h>
24
+ #include <time.h>
25
+
26
+ static VALUE rb_mExtJson;
27
+ // Cached BSON::ObjectId class, or Qnil until bson is loaded and it resolves.
28
+ // Resolution is lazy (bson is required only when the Mongo adapter touches the
29
+ // DB, which always precedes serialization in a real run); see resolve below.
30
+ static VALUE rb_cObjectId;
31
+
32
+ static ID id_encode_fragment;
33
+ static ID id_to_s;
34
+ static ID id_const_BSON;
35
+ static ID id_const_ObjectId;
36
+
37
+ static const char hexdigits[] = "0123456789abcdef";
38
+
39
+ static void encode_value(VALUE buf, VALUE val);
40
+
41
+ // Append `str` as a JSON string literal (surrounding quotes included), escaping
42
+ // exactly as JSON.generate does: \b \t \n \f \r \" \\ get their short escapes,
43
+ // any other byte < 0x20 becomes a lowercase \u00xx, and every other byte —
44
+ // including '/', DEL (0x7f), U+2028/U+2029, and UTF-8 multi-byte sequences — is
45
+ // passed through raw. Unescaped runs are appended in bulk to avoid a per-byte
46
+ // rb_str_cat call.
47
+ static void encode_string(VALUE buf, const char *p, long len)
48
+ {
49
+ rb_str_cat(buf, "\"", 1);
50
+
51
+ long start = 0;
52
+ for (long i = 0; i < len; i++) {
53
+ unsigned char c = (unsigned char)p[i];
54
+ const char *esc = NULL;
55
+ long esclen = 0;
56
+ char ubuf[6];
57
+
58
+ switch (c) {
59
+ case '"': esc = "\\\""; esclen = 2; break;
60
+ case '\\': esc = "\\\\"; esclen = 2; break;
61
+ case '\b': esc = "\\b"; esclen = 2; break;
62
+ case '\t': esc = "\\t"; esclen = 2; break;
63
+ case '\n': esc = "\\n"; esclen = 2; break;
64
+ case '\f': esc = "\\f"; esclen = 2; break;
65
+ case '\r': esc = "\\r"; esclen = 2; break;
66
+ default:
67
+ if (c < 0x20) {
68
+ ubuf[0] = '\\'; ubuf[1] = 'u'; ubuf[2] = '0'; ubuf[3] = '0';
69
+ ubuf[4] = hexdigits[(c >> 4) & 0xf];
70
+ ubuf[5] = hexdigits[c & 0xf];
71
+ esc = ubuf; esclen = 6;
72
+ }
73
+ }
74
+
75
+ if (esc) {
76
+ if (i > start) rb_str_cat(buf, p + start, i - start);
77
+ rb_str_cat(buf, esc, esclen);
78
+ start = i + 1;
79
+ }
80
+ }
81
+ if (len > start) rb_str_cat(buf, p + start, len - start);
82
+
83
+ rb_str_cat(buf, "\"", 1);
84
+ }
85
+
86
+ // Hash keys mirror JSON.generate: a String key is emitted as-is, anything else
87
+ // is stringified (Symbol via its name, otherwise #to_s) before escaping.
88
+ static void encode_key(VALUE buf, VALUE key)
89
+ {
90
+ VALUE kstr;
91
+ if (RB_TYPE_P(key, T_STRING)) {
92
+ kstr = key;
93
+ } else if (RB_TYPE_P(key, T_SYMBOL)) {
94
+ kstr = rb_sym2str(key);
95
+ } else {
96
+ kstr = rb_funcall(key, id_to_s, 0);
97
+ }
98
+ encode_string(buf, RSTRING_PTR(kstr), RSTRING_LEN(kstr));
99
+ }
100
+
101
+ typedef struct {
102
+ VALUE buf;
103
+ int first;
104
+ } hash_ctx;
105
+
106
+ static int hash_iter(VALUE key, VALUE value, VALUE arg)
107
+ {
108
+ hash_ctx *ctx = (hash_ctx *)arg;
109
+ if (!ctx->first) rb_str_cat(ctx->buf, ",", 1);
110
+ ctx->first = 0;
111
+ encode_key(ctx->buf, key);
112
+ rb_str_cat(ctx->buf, ":", 1);
113
+ encode_value(ctx->buf, value);
114
+ return ST_CONTINUE;
115
+ }
116
+
117
+ // Splice the pure-Ruby fragment for a value the native path does not format.
118
+ static void delegate(VALUE buf, VALUE val)
119
+ {
120
+ VALUE frag = rb_funcall(rb_mExtJson, id_encode_fragment, 1, val);
121
+ rb_str_cat(buf, RSTRING_PTR(frag), RSTRING_LEN(frag));
122
+ }
123
+
124
+ // Epoch second for 10000-01-01T00:00:00Z. `bson`'s relaxed Time encoding uses
125
+ // the ISO-8601 string form only for years 1970..9999 (inclusive) and the
126
+ // {"$numberLong":"<ms>"} form otherwise; that year window is exactly the
127
+ // half-open epoch-second range [0, MAX_ISO_EPOCH).
128
+ #define MAX_ISO_EPOCH 253402300800LL
129
+
130
+ // Format a Time as Relaxed Extended JSON in C, matching bson 5.2.0 byte for
131
+ // byte for the common in-range case (see bson/time.rb and the empirical probe):
132
+ // - whole second (usec == 0, i.e. nsec < 1000): {"$date":"...:SSZ"} (no fraction)
133
+ // - sub-second (nsec >= 1000): {"$date":"...:SS.mmmZ"}, where the
134
+ // millisecond is floor(nsec / 1e6) — bson floors the Time to milliseconds.
135
+ // Returns 1 when handled. Returns 0 (leaving buf untouched) for years outside
136
+ // 1970..9999, whose {"$numberLong"} form involves negative-epoch arithmetic too
137
+ // fiddly to risk in C — the caller then delegates that rare case to Ruby.
138
+ static int encode_time_native(VALUE buf, VALUE val)
139
+ {
140
+ struct timespec ts = rb_time_timespec(val);
141
+ if (ts.tv_sec < 0 || ts.tv_sec >= MAX_ISO_EPOCH) return 0;
142
+
143
+ time_t secs = (time_t)ts.tv_sec;
144
+ struct tm tm;
145
+ if (gmtime_r(&secs, &tm) == NULL) return 0;
146
+
147
+ char tmp[40];
148
+ int n;
149
+ if (ts.tv_nsec >= 1000) {
150
+ int ms = (int)(ts.tv_nsec / 1000000L);
151
+ n = snprintf(tmp, sizeof(tmp),
152
+ "{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02d.%03dZ\"}",
153
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
154
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ms);
155
+ } else {
156
+ n = snprintf(tmp, sizeof(tmp),
157
+ "{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02dZ\"}",
158
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
159
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
160
+ }
161
+ rb_str_cat(buf, tmp, n);
162
+ return 1;
163
+ }
164
+
165
+ static void encode_value(VALUE buf, VALUE val)
166
+ {
167
+ switch (TYPE(val)) {
168
+ case T_NIL:
169
+ rb_str_cat(buf, "null", 4);
170
+ return;
171
+ case T_TRUE:
172
+ rb_str_cat(buf, "true", 4);
173
+ return;
174
+ case T_FALSE:
175
+ rb_str_cat(buf, "false", 5);
176
+ return;
177
+ case T_FIXNUM: {
178
+ // A Fixnum always fits in a C long (and thus int64) on the platforms
179
+ // exwiw targets, so it can never be the out-of-int64 case that must
180
+ // raise; emit it directly. Bignums fall through to delegate, where
181
+ // encode_fragment emits in-range ones and raises RangeError for the
182
+ // rest — matching today's behavior exactly.
183
+ char tmp[24];
184
+ int n = snprintf(tmp, sizeof(tmp), "%ld", (long)FIX2LONG(val));
185
+ rb_str_cat(buf, tmp, n);
186
+ return;
187
+ }
188
+ case T_STRING:
189
+ encode_string(buf, RSTRING_PTR(val), RSTRING_LEN(val));
190
+ return;
191
+ case T_ARRAY: {
192
+ long len = RARRAY_LEN(val);
193
+ rb_str_cat(buf, "[", 1);
194
+ for (long i = 0; i < len; i++) {
195
+ if (i > 0) rb_str_cat(buf, ",", 1);
196
+ encode_value(buf, rb_ary_entry(val, i));
197
+ }
198
+ rb_str_cat(buf, "]", 1);
199
+ return;
200
+ }
201
+ case T_HASH: {
202
+ // rb_hash_foreach preserves insertion order, matching JSON output.
203
+ hash_ctx ctx = { buf, 1 };
204
+ rb_str_cat(buf, "{", 1);
205
+ rb_hash_foreach(val, hash_iter, (VALUE)&ctx);
206
+ rb_str_cat(buf, "}", 1);
207
+ return;
208
+ }
209
+ default:
210
+ // BSON::ObjectId is the single most common leaf (`_id`) and its
211
+ // Relaxed form is the stable {"$oid":"<24 hex>"}, so format it here.
212
+ // The hex comes from #to_s (the same source as as_extended_json) and
213
+ // is always [0-9a-f]{24}, so it needs no escaping.
214
+ if (!NIL_P(rb_cObjectId) && RTEST(rb_obj_is_kind_of(val, rb_cObjectId))) {
215
+ VALUE hex = rb_funcall(val, id_to_s, 0);
216
+ rb_str_cat(buf, "{\"$oid\":\"", 9);
217
+ rb_str_cat(buf, RSTRING_PTR(hex), RSTRING_LEN(hex));
218
+ rb_str_cat(buf, "\"}", 2);
219
+ return;
220
+ }
221
+ // Time is the other common leaf in dumped documents (Mongoid's
222
+ // created_at/updated_at); format the in-range case natively. The
223
+ // out-of-range $numberLong form returns 0 and falls through to Ruby.
224
+ if (RTEST(rb_obj_is_kind_of(val, rb_cTime)) && encode_time_native(buf, val)) {
225
+ return;
226
+ }
227
+ // Float, Bignum, Symbol, Decimal128, Binary, out-of-range Time, ... -> Ruby.
228
+ delegate(buf, val);
229
+ return;
230
+ }
231
+ }
232
+
233
+ // Resolve and cache BSON::ObjectId the first time a document is encoded with
234
+ // bson loaded. Cheap const lookups guarded by the Qnil cache; once resolved it
235
+ // is skipped. Until resolved, ObjectId simply takes the (correct) delegate path.
236
+ static void resolve_objectid_class(void)
237
+ {
238
+ if (!NIL_P(rb_cObjectId)) return;
239
+ if (!rb_const_defined(rb_cObject, id_const_BSON)) return;
240
+
241
+ VALUE bson = rb_const_get(rb_cObject, id_const_BSON);
242
+ if (rb_const_defined(bson, id_const_ObjectId)) {
243
+ rb_cObjectId = rb_const_get(bson, id_const_ObjectId);
244
+ }
245
+ }
246
+
247
+ // Exwiw::ExtJson.encode_native(doc) -> String
248
+ // Returns one JSONL line (no trailing newline); the caller owns separators.
249
+ static VALUE rb_encode_native(VALUE self, VALUE doc)
250
+ {
251
+ resolve_objectid_class();
252
+
253
+ VALUE buf = rb_str_buf_new(256);
254
+ rb_enc_associate(buf, rb_utf8_encoding());
255
+ encode_value(buf, doc);
256
+ return buf;
257
+ }
258
+
259
+ void Init_ext_json_native(void)
260
+ {
261
+ id_encode_fragment = rb_intern("encode_fragment");
262
+ id_to_s = rb_intern("to_s");
263
+ id_const_BSON = rb_intern("BSON");
264
+ id_const_ObjectId = rb_intern("ObjectId");
265
+
266
+ VALUE mExwiw = rb_define_module("Exwiw");
267
+ rb_mExtJson = rb_define_module_under(mExwiw, "ExtJson");
268
+ rb_global_variable(&rb_mExtJson);
269
+
270
+ rb_cObjectId = Qnil;
271
+ rb_global_variable(&rb_cObjectId);
272
+
273
+ rb_define_singleton_method(rb_mExtJson, "encode_native", rb_encode_native, 1);
274
+ }
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+
5
+ # Compiled to lib/exwiw/ext_json_native.{so,bundle}. The name is distinct from
6
+ # the `ext_json.rb` shim so `require "exwiw/ext_json_native"` does not collide
7
+ # with `require_relative "exwiw/ext_json"`.
8
+ create_makefile("exwiw/ext_json_native")
@@ -166,7 +166,7 @@ module Exwiw
166
166
  plan = mask_plan(config)
167
167
  rows.map do |doc|
168
168
  apply_mask_plan!(doc, plan)
169
- JSON.generate(extended_json(doc))
169
+ Exwiw::ExtJson.encode(doc)
170
170
  end.join("\n")
171
171
  end
172
172
 
@@ -489,14 +489,6 @@ module Exwiw
489
489
  @embedded_children_by_parent.fetch(parent_config.name, [])
490
490
  end
491
491
 
492
- private def extended_json(doc)
493
- if doc.respond_to?(:as_extended_json)
494
- doc.as_extended_json(mode: :relaxed)
495
- else
496
- doc
497
- end
498
- end
499
-
500
492
  private def db
501
493
  @db ||=
502
494
  begin
@@ -5,15 +5,67 @@ require 'open3'
5
5
  module Exwiw
6
6
  module Adapter
7
7
  class MysqlAdapter < Base
8
+ include SqlBulkInsert
9
+
10
+ # A lazy, streaming stand-in for the materialized rows #execute used to
11
+ # return (`connection.query(sql).rows`). It pulls rows off the wire one at
12
+ # a time (mysql2 single-row stream) instead of buffering the whole result
13
+ # set, so the dump's dominant memory cost — a Ruby array as large as the
14
+ # table — never materializes. The Runner drives it exactly like the old
15
+ # Array: #size to skip empty tables and log the count, then a single
16
+ # streaming pass (SqlBulkInsert#write_inserts -> each_slice).
17
+ #
18
+ # Mirrors PostgresqlAdapter::StreamingResult, with two MySQL specifics:
19
+ # - #size runs a separate `SELECT COUNT(*)` of the same query. Unlike the
20
+ # pg path, it does NOT wrap the SELECT in a subquery: MySQL rejects a
21
+ # derived table with duplicate column names, which a rails-managed
22
+ # `SELECT *` joined to another table produces. Instead the projection
23
+ # is replaced by `COUNT(*)` (compile_ast(count_only: true)) — exact
24
+ # because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT,
25
+ # so the row count is independent of the projected columns.
26
+ # - the stream ties up the connection until fully drained. The Runner
27
+ # always drains it (write_inserts) before any further query
28
+ # (post_insert_sql / DELETE), and MysqlClient#stream_rows drains the
29
+ # remainder if iteration is abandoned, so the connection stays usable.
30
class StreamingResult
  include Enumerable

  # @param client [#query, #stream_rows] connection wrapper used for both the
  #   count query and the streamed data query
  # @param data_sql [String] the SELECT whose rows are streamed by #each
  # @param count_sql [String] a query whose first column of the first row is
  #   the row count
  def initialize(client:, data_sql:, count_sql:)
    @client, @data_sql, @count_sql = client, data_sql, count_sql
  end

  # Row count, obtained by running the count query on first use and memoized
  # afterwards.
  #
  # @return [Integer]
  def size
    return @size if @size

    first_cell = @client.query(@count_sql).rows.dig(0, 0)
    @size = first_cell.to_i
  end
  alias length size

  # Yield each row of the data query by delegating to the client's
  # #stream_rows. Without a block, returns an Enumerator whose size is
  # computed lazily from #size.
  #
  # @return [self, Enumerator]
  def each(&block)
    if block.nil?
      return enum_for(:each) { size }
    end

    @client.stream_rows(@data_sql, &block)
    self
  end
end
55
+
8
56
  def build_query(table, dump_target, table_by_name)
9
57
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
10
58
  end
11
59
 
12
60
  def execute(query_ast)
13
- sql = commented_sql(query_ast)
14
-
15
- @logger.debug(" Executing SQL: \n#{sql}")
16
- connection.query(sql).rows
61
+ data_sql = commented_sql(query_ast)
62
+ # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
63
+ # the Runner can skip empty tables and log the row count without draining
64
+ # the stream. See StreamingResult for why this is not a subquery wrap.
65
+ count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
66
+
67
+ @logger.debug(" Executing SQL (streaming): \n#{data_sql}")
68
+ StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
17
69
  end
18
70
 
19
71
  def explain(query_ast)
@@ -99,22 +151,16 @@ module Exwiw
99
151
  "SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
100
152
  end
101
153
 
102
- def to_bulk_insert(results, table)
154
+ # The INSERT header for this adapter. MySQL backtick-quotes the table and
155
+ # column identifiers. #to_bulk_insert / #write_inserts (SqlBulkInsert)
156
+ # append the value tuples and the trailing `;`.
157
+ private def insert_header(table)
103
158
  table_name = table.name
104
-
105
- value_list = results.map do |row|
106
- quoted_values = row.map do |value|
107
- escape_value(value)
108
- end
109
- "(" + quoted_values.join(', ') + ")"
110
- end
111
- values = value_list.join(",\n")
112
-
113
159
  if table.rails_managed?
114
- "INSERT INTO `#{table_name}` VALUES\n#{values};"
160
+ "INSERT INTO `#{table_name}` VALUES\n"
115
161
  else
116
162
  column_names = table.columns.map { |c| "`#{c.name}`" }.join(', ')
117
- "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n#{values};"
163
+ "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n"
118
164
  end
119
165
  end
120
166
 
@@ -176,11 +222,17 @@ module Exwiw
176
222
  sql
177
223
  end
178
224
 
179
- def compile_ast(query_ast)
225
+ # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
226
+ # projected columns (used by StreamingResult#size). Safe because exwiw's
227
+ # extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
228
+ # not depend on the projection.
229
+ def compile_ast(query_ast, count_only: false)
180
230
  raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
181
231
 
182
232
  sql = "SELECT "
183
- sql += if query_ast.select_all
233
+ sql += if count_only
234
+ "COUNT(*)"
235
+ elsif query_ast.select_all
184
236
  "*"
185
237
  else
186
238
  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')
@@ -118,6 +118,49 @@ module Exwiw
118
118
  end
119
119
  end
120
120
 
121
# Stream a query's rows one at a time, yielding each as an Array whose values
# have been passed through stringify_value, instead of buffering the whole
# result set.
#
# mysql2: the query is issued with `stream: true` and `cache_rows: false`. A
# streamed result must be fully consumed before the next query on the same
# connection (otherwise the driver reports "Commands out of sync"), so when
# iteration stops early the remaining rows are read and discarded. That covers
# both a consumer block that raises a StandardError and one that leaves
# without an exception (`break`, `return`, `throw`, e.g. via Enumerable#first).
# Errors raised while draining are swallowed so the original error/exit wins.
# Non-StandardError exceptions (e.g. Interrupt) propagate immediately without
# draining.
#
# trilogy: the result is fetched in full and then yielded row by row, so there
# is nothing to drain.
#
# Without a block, returns an Enumerator over the same rows.
#
# @param sql [String]
# @yieldparam row [Array<String|nil>]
# @return [self, Enumerator]
# @raise [RuntimeError] if the configured driver is neither :mysql2 nor :trilogy
def stream_rows(sql)
  return enum_for(:stream_rows, sql) unless block_given?

  case @driver
  when :mysql2
    res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
    consumed = false
    skip_drain = false
    begin
      res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
      consumed = true
    rescue Exception => e # rubocop:disable Lint/RescueException
      # Only recoverable errors are worth keeping the connection usable for.
      skip_drain = !e.is_a?(StandardError)
      raise
    ensure
      unless consumed || skip_drain
        begin
          res.each { |_row| } # drain the remainder so the connection stays usable
        rescue StandardError
          nil
        end
      end
    end
  when :trilogy
    raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
  else
    raise "Unsupported MySQL driver: #{@driver.inspect}"
  end
  self
end
163
+
121
164
  private def ensure_driver_loaded!
122
165
  case @driver
123
166
  when :mysql2 then require 'mysql2'
@@ -3,15 +3,91 @@
3
3
  module Exwiw
4
4
  module Adapter
5
5
  class PostgresqlAdapter < Base
6
+ include SqlBulkInsert
7
+
8
+ # A lazy, streaming stand-in for the materialized rows #execute used to
9
+ # return (`connection.exec(sql).values`). It pulls rows off the wire one
10
+ # at a time via libpq's single-row mode instead of buffering the whole
11
+ # result set, so the dump's dominant memory cost — a Ruby array as large
12
+ # as the table — never materializes. The Runner drives it exactly like the
13
+ # old Array: #size to skip empty tables and log the count, then a single
14
+ # streaming pass (SqlBulkInsert#write_inserts -> each_slice) to write the
15
+ # INSERT.
16
+ #
17
+ # Mirrors MongodbAdapter::StreamingResult; two SQL-specific differences:
18
+ # - #size cannot be answered cheaply from the cursor, so it runs a
19
+ # separate `SELECT COUNT(*)` of the same query. (MongoDB uses
20
+ # count_documents, an index-only walk; the SQL COUNT re-runs the query
21
+ # plan but transfers no row data — Postgres prunes the unused
22
+ # projection of the wrapped subquery.) This keeps the Runner contract
23
+ # unchanged, so MongoDB and the other SQL adapters are untouched.
24
+ # - the streaming pass ties up the connection until fully drained. The
25
+ # Runner always drains it (write_inserts) before issuing any further
26
+ # query (post_insert_sql / DELETE) on the same connection, so the
27
+ # ordering invariant holds.
28
class StreamingResult
  include Enumerable

  # @param connection [#exec, #send_query, #set_single_row_mode, #get_result]
  # @param data_sql [String] the SELECT whose rows are streamed by #each
  # @param count_sql [String] a query whose single cell is the row count
  def initialize(connection:, data_sql:, count_sql:)
    @connection = connection
    @data_sql = data_sql
    @count_sql = count_sql
  end

  # Row count, obtained by running the count query on first use and memoized
  # afterwards.
  #
  # @return [Integer]
  def size
    @size ||= @connection.exec(@count_sql).getvalue(0, 0).to_i
  end
  alias length size

  # Stream the result set row by row: the query is sent asynchronously, the
  # connection is switched to single-row mode, and every result returned by
  # #get_result is checked, iterated and cleared.
  #
  # If iteration stops before #get_result has returned nil, the results still
  # queued are drained so a later query on this connection does not fail with
  # "another command is already in progress". That covers a StandardError (a
  # SQL error surfaced by #check, or one raised by the consumer) as well as a
  # consumer leaving without an exception (`break`, `return`, `throw`, e.g.
  # via Enumerable#first). Non-StandardError exceptions (e.g. Interrupt)
  # propagate without draining.
  #
  # Without a block, returns an Enumerator whose size comes from #size.
  #
  # @return [self, Enumerator]
  def each
    return enum_for(:each) { size } unless block_given?

    @connection.send_query(@data_sql)
    @connection.set_single_row_mode
    finished = false
    skip_drain = false
    begin
      while (result = @connection.get_result)
        begin
          result.check
          result.each_row { |row| yield row }
        ensure
          result.clear
        end
      end
      finished = true
    rescue Exception => e # rubocop:disable Lint/RescueException
      skip_drain = !e.is_a?(StandardError)
      raise
    ensure
      drain unless finished || skip_drain
    end
    self
  end

  # Fetch and clear every result still queued on the connection. A PG::Error
  # raised while doing so is ignored.
  private def drain
    while (result = @connection.get_result)
      result.clear
    end
  rescue PG::Error
    # Connection already errored/clean; nothing left to drain.
  end
end
78
+
6
79
  def build_query(table, dump_target, table_by_name)
7
80
  Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
8
81
  end
9
82
 
10
83
  def execute(query_ast)
11
- sql = commented_sql(query_ast)
84
+ data_sql = commented_sql(query_ast)
85
+ # Count via the same query (wrapped as a subquery) so the Runner can
86
+ # skip empty tables and log the row count without draining the stream.
87
+ count_sql = "#{sql_query_comment(query_ast)} SELECT COUNT(*) FROM (#{compile_ast(query_ast)}) AS exwiw_count_src"
12
88
 
13
- @logger.debug(" Executing SQL: \n#{sql}")
14
- connection.exec(sql).values
89
+ @logger.debug(" Executing SQL (single-row stream): \n#{data_sql}")
90
+ StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
15
91
  end
16
92
 
17
93
  def explain(query_ast)
@@ -97,22 +173,16 @@ module Exwiw
97
173
  @logger.info(" Wrote schema for #{table_names.size} table(s) to #{output_path}.")
98
174
  end
99
175
 
100
- def to_bulk_insert(results, table)
176
+ # The INSERT header for this adapter. PostgreSQL uses bare identifiers.
177
+ # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
178
+ # and the trailing `;`.
179
+ private def insert_header(table)
101
180
  table_name = table.name
102
-
103
- value_list = results.map do |row|
104
- quoted_values = row.map do |value|
105
- escape_value(value)
106
- end
107
- "(" + quoted_values.join(', ') + ")"
108
- end
109
- values = value_list.join(",\n")
110
-
111
181
  if table.rails_managed?
112
- "INSERT INTO #{table_name} VALUES\n#{values};"
182
+ "INSERT INTO #{table_name} VALUES\n"
113
183
  else
114
184
  column_names = table.columns.map(&:name).join(', ')
115
- "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
185
+ "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
116
186
  end
117
187
  end
118
188
 
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module Exwiw
  module Adapter
    # Shared bulk-INSERT construction for the SQL adapters that include it.
    # They produce the same `INSERT INTO ... VALUES (...),(...);` shape and
    # differ only in the header's identifier quoting (see #insert_header) and
    # in #escape_value, so both the in-memory builder (#to_bulk_insert) and the
    # bounded-memory streaming writer (#write_inserts) live here.
    #
    # Each including adapter must provide two private methods:
    #   - insert_header(table) -> the "INSERT INTO ... VALUES\n" prefix
    #   - escape_value(value)  -> the SQL literal for one column value
    module SqlBulkInsert
      # How many rows' tuples to build and flush at a time when streaming.
      # Bounds the text held in memory to this many tuples (plus their joined
      # string) instead of the whole statement. Slicing does not change the
      # output: slices of one statement are joined with the same ",\n" that
      # separates individual tuples.
      STREAM_FLUSH_ROWS = 2_000

      # Build the whole INSERT statement as a single String.
      #
      # @param results [Enumerable<Array>] rows; each row is an Array of values
      # @param table [Object] passed through to #insert_header
      # @return [String]
      def to_bulk_insert(results, table)
        value_list = results.map { |row| insert_tuple(row) }
        "#{insert_header(table)}#{value_list.join(",\n")};"
      end

      # Stream the bulk INSERT(s) straight to `io` instead of materializing the
      # whole statement string first. For a non-empty row set the bytes written
      # equal #to_bulk_insert applied per chunk, with the statements joined by
      # "\n"; only about STREAM_FLUSH_ROWS tuples of text are built at a time.
      #
      # A chunk that turns out to contain no rows writes nothing, so an empty
      # row stream never produces the invalid statement `INSERT ... VALUES\n;`
      # (the row count and the streamed rows come from separate queries and
      # need not agree).
      #
      # @param io [#print] destination
      # @param results [Enumerable<Array>] rows to write
      # @param table [Object] passed through to #insert_header
      # @param chunk_size [Integer, nil] rows per statement; nil writes all
      #   rows as one statement
      # @return [Integer] the number of statements written
      def write_inserts(io, results, table, chunk_size)
        chunks = chunk_size ? results.each_slice(chunk_size) : [results]
        statement_count = 0
        chunks.each do |chunk_rows|
          separator = statement_count.positive? ? "\n" : ""
          statement_count += 1 if stream_single_insert(io, chunk_rows, table, separator)
        end
        statement_count
      end

      # Emit one `INSERT INTO ... VALUES <tuples>;` statement to `io`, building
      # and flushing the value tuples STREAM_FLUSH_ROWS at a time. `leading` is
      # printed immediately before the header. The header (and the trailing
      # `;`) are written only once a first slice of rows has arrived.
      #
      # @return [Boolean] true if a statement was written, false if `rows`
      #   yielded nothing
      private def stream_single_insert(io, rows, table, leading = "")
        wrote_any = false
        rows.each_slice(STREAM_FLUSH_ROWS) do |slice|
          if wrote_any
            io.print(",\n")
          else
            io.print(leading) unless leading.empty?
            io.print(insert_header(table))
            wrote_any = true
          end
          io.print(slice.map { |row| insert_tuple(row) }.join(",\n"))
        end
        io.print(";") if wrote_any
        wrote_any
      end

      # One parenthesized value tuple: each value escaped via #escape_value and
      # joined with ", ".
      private def insert_tuple(row)
        "(" + row.map { |value| escape_value(value) }.join(', ') + ")"
      end
    end
  end
end