RubyGems - exwiw - Versions diffs - 0.5.3 → 0.6.0 - Mend

exwiw 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +7 -0
data/README.md +1 -1
data/docs/optimize-mongodb-export-with-native-ext.md +31 -11
data/docs/sql-dump-optimization-notes.md +278 -0
data/ext/exwiw/ext_json/ext_json.c +274 -0
data/ext/exwiw/ext_json/extconf.rb +8 -0
data/lib/exwiw/adapter/mongodb_adapter.rb +1 -9
data/lib/exwiw/adapter/mysql_adapter.rb +70 -18
data/lib/exwiw/adapter/mysql_client.rb +43 -0
data/lib/exwiw/adapter/postgresql_adapter.rb +85 -15
data/lib/exwiw/adapter/sql_bulk_insert.rb +71 -0
data/lib/exwiw/adapter/sqlite_adapter.rb +75 -18
data/lib/exwiw/adapter.rb +28 -0
data/lib/exwiw/ext_json.rb +33 -0
data/lib/exwiw/runner.rb +10 -16
data/lib/exwiw/version.rb +1 -1
data/lib/exwiw.rb +2 -0
metadata +8 -2

data/ext/exwiw/ext_json/ext_json.c ADDED Viewed

@@ -0,0 +1,274 @@
+// Native emitter for MongoDB Relaxed Extended JSON.
+//
+// Replaces the pure-Ruby `JSON.generate(doc.as_extended_json(mode: :relaxed))`
+// (which rebuilds the whole document into an intermediate transformed Hash tree
+// and then walks it a second time in JSON.generate) with a single native
+// tree-walk that emits the JSONL line directly.
+//
+// Byte-identity strategy (see docs/optimize-mongodb-export-with-native-ext.md):
+// only the structural bulk + the cheapest, most-stable leaves are formatted in
+// C — Hash, Array, String, fixnum Integer, true/false/nil, BSON::ObjectId, and
+// in-range Time (years 1970..9999; see encode_time_native). Everything else
+// (Float, out-of-int64 Integer, out-of-range Time, Symbol, Decimal128, Binary,
+// ...) is handed back to Ruby's `encode_fragment`, which is the exact pure-Ruby
+// path. This is provably byte-identical because Hash#as_extended_json
+// and Array#as_extended_json are non-transforming structural recursion: the
+// bytes `JSON.generate(v.as_extended_json(mode: :relaxed))` produces for any
+// sub-value `v` are exactly the bytes the whole-document generate would produce
+// in that position, so a value the native walk does not format can be spliced
+// in verbatim with no divergence.
+#include <ruby.h>
+#include <ruby/encoding.h>
+#include <stdio.h>
+#include <time.h>
+static VALUE rb_mExtJson;
+// Cached BSON::ObjectId class, or Qnil until bson is loaded and it resolves.
+// Resolution is lazy (bson is required only when the Mongo adapter touches the
+// DB, which always precedes serialization in a real run); see resolve below.
+static VALUE rb_cObjectId;
+static ID id_encode_fragment;
+static ID id_to_s;
+static ID id_const_BSON;
+static ID id_const_ObjectId;
+static const char hexdigits[] = "0123456789abcdef";
+static void encode_value(VALUE buf, VALUE val);
+// Emit the `len` bytes at `p` into `buf` as a quoted JSON string literal, using
+// the same escaping rules as JSON.generate:
+//   - `"` and `\` are backslash-escaped;
+//   - \b \t \n \f \r use their two-character short forms;
+//   - any other control byte (< 0x20) becomes a lowercase \u00xx;
+//   - every remaining byte ('/', DEL 0x7f, U+2028/U+2029, multi-byte UTF-8) is
+//     copied through untouched.
+// Bytes that need no escaping are batched and appended one run at a time
+// instead of issuing one rb_str_cat call per byte.
+static void encode_string(VALUE buf, const char *p, long len)
+{
+    long run_start = 0;  // first byte of the pending unescaped run
+    rb_str_cat(buf, "\"", 1);
+    for (long pos = 0; pos < len; pos++) {
+        const unsigned char byte = (unsigned char)p[pos];
+        char unicode[6] = { '\\', 'u', '0', '0', '0', '0' };
+        const char *replacement;
+        long replacement_len = 2;
+        // The short escapes are tested before the generic control-byte case
+        // because \b \t \n \f \r are themselves below 0x20.
+        if (byte == '"') {
+            replacement = "\\\"";
+        } else if (byte == '\\') {
+            replacement = "\\\\";
+        } else if (byte == '\b') {
+            replacement = "\\b";
+        } else if (byte == '\t') {
+            replacement = "\\t";
+        } else if (byte == '\n') {
+            replacement = "\\n";
+        } else if (byte == '\f') {
+            replacement = "\\f";
+        } else if (byte == '\r') {
+            replacement = "\\r";
+        } else if (byte < 0x20) {
+            unicode[4] = hexdigits[(byte >> 4) & 0xf];
+            unicode[5] = hexdigits[byte & 0xf];
+            replacement = unicode;
+            replacement_len = 6;
+        } else {
+            continue;  // ordinary byte: it stays in the current run
+        }
+        // Flush the run that precedes the escaped byte, then the escape itself.
+        if (pos > run_start) rb_str_cat(buf, p + run_start, pos - run_start);
+        rb_str_cat(buf, replacement, replacement_len);
+        run_start = pos + 1;
+    }
+    if (len > run_start) rb_str_cat(buf, p + run_start, len - run_start);
+    rb_str_cat(buf, "\"", 1);
+}
+// Emit a Hash key as a JSON string literal. Mirrors JSON.generate: a String
+// key is used as-is, a Symbol contributes its name, and any other object is
+// stringified before escaping.
+//
+// The generic case goes through rb_obj_as_string rather than a raw #to_s call
+// so the result is guaranteed to be a String; a #to_s that returned some other
+// object would otherwise be read through RSTRING_PTR as if it were one.
+static void encode_key(VALUE buf, VALUE key)
+{
+    VALUE kstr;
+    if (RB_TYPE_P(key, T_STRING)) {
+        kstr = key;
+    } else if (RB_TYPE_P(key, T_SYMBOL)) {
+        kstr = rb_sym2str(key);
+    } else {
+        kstr = rb_obj_as_string(key);
+    }
+    encode_string(buf, RSTRING_PTR(kstr), RSTRING_LEN(kstr));
+    // kstr can be a temporary that only this frame references, and
+    // encode_string allocates (rb_str_cat) while it is still reading the
+    // bytes. Keep the VALUE visible to the GC until the copy has finished.
+    RB_GC_GUARD(kstr);
+}
+// State threaded through rb_hash_foreach while one Hash is being emitted.
+typedef struct {
+    VALUE buf;   // output string being appended to
+    int first;   // non-zero until the first key/value pair has been written
+} hash_ctx;
+// rb_hash_foreach callback: writes one `"key":value` pair, preceded by a comma
+// for every pair after the first. `arg` is a pointer to the caller's hash_ctx.
+// Always asks the iteration to continue.
+static int hash_iter(VALUE key, VALUE value, VALUE arg)
+{
+    hash_ctx *state = (hash_ctx *)arg;
+    VALUE out = state->buf;
+    if (state->first) {
+        state->first = 0;
+    } else {
+        rb_str_cat(out, ",", 1);
+    }
+    encode_key(out, key);
+    rb_str_cat(out, ":", 1);
+    encode_value(out, value);
+    return ST_CONTINUE;
+}
+// Splice in the pure-Ruby encoding of a value the native path does not format
+// itself: calls Exwiw::ExtJson.encode_fragment(val) and appends the bytes it
+// returns, unchanged, to `buf`.
+static void delegate(VALUE buf, VALUE val)
+{
+    VALUE frag = rb_funcall(rb_mExtJson, id_encode_fragment, 1, val);
+    // Raise TypeError rather than read a non-String through RSTRING_PTR if the
+    // Ruby side ever returns something unexpected.
+    Check_Type(frag, T_STRING);
+    rb_str_cat(buf, RSTRING_PTR(frag), RSTRING_LEN(frag));
+    // frag is referenced only from this frame, and rb_str_cat may allocate
+    // (and so run the GC) before it copies the bytes. Keep the VALUE alive
+    // until the append has completed.
+    RB_GC_GUARD(frag);
+}
+// Epoch second for 10000-01-01T00:00:00Z. `bson`'s relaxed Time encoding uses
+// the ISO-8601 string form only for years 1970..9999 (inclusive) and the
+// {"$numberLong":"<ms>"} form otherwise; that year window is exactly the
+// half-open epoch-second range [0, MAX_ISO_EPOCH).
+#define MAX_ISO_EPOCH 253402300800LL
+// Append a Time to `buf` as Relaxed Extended JSON, formatted in C for the
+// common in-range case (the format is intended to match bson 5.2.0 byte for
+// byte; see bson/time.rb):
+//   - nsec < 1000 (i.e. usec == 0): {"$date":"YYYY-MM-DDTHH:MM:SSZ"}
+//   - nsec >= 1000:                 {"$date":"YYYY-MM-DDTHH:MM:SS.mmmZ"}, with
+//     mmm = floor(nsec / 1e6), since bson floors the Time to milliseconds.
+// Returns 1 when the value was written. Returns 0 without touching `buf` when
+// the epoch second lies outside [0, MAX_ISO_EPOCH) (years outside 1970..9999)
+// or gmtime_r fails; the caller then delegates to Ruby, which owns the
+// {"$numberLong"} form.
+static int encode_time_native(VALUE buf, VALUE val)
+{
+    const struct timespec when = rb_time_timespec(val);
+    if (when.tv_sec < 0 || when.tv_sec >= MAX_ISO_EPOCH) {
+        return 0;
+    }
+    time_t epoch = (time_t)when.tv_sec;
+    struct tm utc;
+    if (gmtime_r(&epoch, &utc) == NULL) {
+        return 0;
+    }
+    // The longest output is 36 bytes, so every piece below fits in `out`.
+    char out[40];
+    int used = snprintf(out, sizeof(out),
+                        "{\"$date\":\"%04d-%02d-%02dT%02d:%02d:%02d",
+                        utc.tm_year + 1900, utc.tm_mon + 1, utc.tm_mday,
+                        utc.tm_hour, utc.tm_min, utc.tm_sec);
+    if (when.tv_nsec >= 1000) {
+        // Sub-second value: add the millisecond fraction (may be ".000").
+        used += snprintf(out + used, sizeof(out) - (size_t)used, ".%03d",
+                         (int)(when.tv_nsec / 1000000L));
+    }
+    used += snprintf(out + used, sizeof(out) - (size_t)used, "Z\"}");
+    rb_str_cat(buf, out, used);
+    return 1;
+}
+// Append the Relaxed Extended JSON encoding of `val` to `buf`.
+// nil/true/false, Fixnum, String, Array and Hash are formatted here (Array and
+// Hash recurse into their elements); BSON::ObjectId and in-range Time get a
+// native fast path; every other value is handed to delegate(), i.e. to Ruby.
+static void encode_value(VALUE buf, VALUE val)
+{
+    switch (TYPE(val)) {
+        case T_NIL:
+            rb_str_cat(buf, "null", 4);
+            return;
+        case T_TRUE:
+            rb_str_cat(buf, "true", 4);
+            return;
+        case T_FALSE:
+            rb_str_cat(buf, "false", 5);
+            return;
+        case T_FIXNUM: {
+            // A Fixnum always fits in a C long (and thus int64) on the platforms
+            // exwiw targets, so it can never be the out-of-int64 case that must
+            // raise; emit it directly. Bignums fall through to delegate, where
+            // encode_fragment emits in-range ones and raises RangeError for the
+            // rest — matching today's behavior exactly.
+            char tmp[24];
+            int n = snprintf(tmp, sizeof(tmp), "%ld", (long)FIX2LONG(val));
+            rb_str_cat(buf, tmp, n);
+            return;
+        }
+        case T_STRING:
+            encode_string(buf, RSTRING_PTR(val), RSTRING_LEN(val));
+            return;
+        case T_ARRAY: {
+            long len = RARRAY_LEN(val);
+            rb_str_cat(buf, "[", 1);
+            for (long i = 0; i < len; i++) {
+                if (i > 0) rb_str_cat(buf, ",", 1);
+                encode_value(buf, rb_ary_entry(val, i));
+            }
+            rb_str_cat(buf, "]", 1);
+            return;
+        }
+        case T_HASH: {
+            // rb_hash_foreach preserves insertion order, matching JSON output.
+            hash_ctx ctx = { buf, 1 };
+            rb_str_cat(buf, "{", 1);
+            rb_hash_foreach(val, hash_iter, (VALUE)&ctx);
+            rb_str_cat(buf, "}", 1);
+            return;
+        }
+        default:
+            // BSON::ObjectId is the single most common leaf (`_id`) and its
+            // Relaxed form is the stable {"$oid":"<24 hex>"}, so format it here.
+            // The hex comes from #to_s (the same source as as_extended_json) and
+            // is always [0-9a-f]{24}, so it needs no escaping.
+            if (!NIL_P(rb_cObjectId) && RTEST(rb_obj_is_kind_of(val, rb_cObjectId))) {
+                VALUE hex = rb_funcall(val, id_to_s, 0);
+                // Raise TypeError instead of reading a non-String through
+                // RSTRING_PTR if #to_s has been overridden to return one.
+                Check_Type(hex, T_STRING);
+                rb_str_cat(buf, "{\"$oid\":\"", 9);
+                rb_str_cat(buf, RSTRING_PTR(hex), RSTRING_LEN(hex));
+                rb_str_cat(buf, "\"}", 2);
+                // hex is a temporary referenced only from this frame and
+                // rb_str_cat may allocate before it copies the bytes; keep the
+                // VALUE visible to the GC until the append is done.
+                RB_GC_GUARD(hex);
+                return;
+            }
+            // Time is the other common leaf in dumped documents (Mongoid's
+            // created_at/updated_at); format the in-range case natively. The
+            // out-of-range $numberLong form returns 0 and falls through to Ruby.
+            if (RTEST(rb_obj_is_kind_of(val, rb_cTime)) && encode_time_native(buf, val)) {
+                return;
+            }
+            // Float, Bignum, Symbol, Decimal128, Binary, out-of-range Time, ... -> Ruby.
+            delegate(buf, val);
+            return;
+    }
+}
+// Look up BSON::ObjectId and cache it in rb_cObjectId. Does nothing once the
+// cache is filled, and nothing while either constant (::BSON or
+// BSON::ObjectId) is still undefined; in that case rb_cObjectId stays Qnil and
+// encode_value routes ObjectId values through the delegate path instead.
+static void resolve_objectid_class(void)
+{
+    if (NIL_P(rb_cObjectId) && rb_const_defined(rb_cObject, id_const_BSON)) {
+        VALUE bson_ns = rb_const_get(rb_cObject, id_const_BSON);
+        if (rb_const_defined(bson_ns, id_const_ObjectId)) {
+            rb_cObjectId = rb_const_get(bson_ns, id_const_ObjectId);
+        }
+    }
+}
+// Exwiw::ExtJson.encode_native(doc) -> String
+// Encodes `doc` into a fresh UTF-8 tagged String holding one JSONL line. No
+// trailing newline is added; separators are the caller's responsibility.
+static VALUE rb_encode_native(VALUE self, VALUE doc)
+{
+    VALUE line;
+    // Try to pick up BSON::ObjectId before walking, in case bson has been
+    // loaded since the previous call.
+    resolve_objectid_class();
+    line = rb_str_buf_new(256);
+    rb_enc_associate(line, rb_utf8_encoding());
+    encode_value(line, doc);
+    return line;
+}
+// Extension entry point. Defines (or reopens) Exwiw::ExtJson, registers the
+// two cached VALUEs with the GC, interns the method/constant names used by the
+// encoder, and exposes Exwiw::ExtJson.encode_native.
+void Init_ext_json_native(void)
+{
+    VALUE exwiw_module = rb_define_module("Exwiw");
+    rb_mExtJson = rb_define_module_under(exwiw_module, "ExtJson");
+    rb_cObjectId = Qnil;  // resolved lazily by resolve_objectid_class
+    rb_global_variable(&rb_mExtJson);
+    rb_global_variable(&rb_cObjectId);
+
+    id_const_BSON = rb_intern("BSON");
+    id_const_ObjectId = rb_intern("ObjectId");
+    id_to_s = rb_intern("to_s");
+    id_encode_fragment = rb_intern("encode_fragment");
+
+    rb_define_singleton_method(rb_mExtJson, "encode_native", rb_encode_native, 1);
+}

data/ext/exwiw/ext_json/extconf.rb ADDED Viewed

@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+require "mkmf"
+# Generates the Makefile for the extension built as lib/exwiw/ext_json_native.{so,bundle}.
+# The target name is kept distinct from the `ext_json.rb` shim so that
+# `require "exwiw/ext_json_native"` does not collide with `require_relative "exwiw/ext_json"`.
+create_makefile("exwiw/ext_json_native")

data/lib/exwiw/adapter/mongodb_adapter.rb CHANGED Viewed

@@ -166,7 +166,7 @@ module Exwiw
         plan = mask_plan(config)
         rows.map do |doc|
           apply_mask_plan!(doc, plan)
-          JSON.generate(extended_json(doc))
+          Exwiw::ExtJson.encode(doc)
         end.join("\n")
       end
@@ -489,14 +489,6 @@ module Exwiw
         @embedded_children_by_parent.fetch(parent_config.name, [])
       end
-      private def extended_json(doc)
-        if doc.respond_to?(:as_extended_json)
-          doc.as_extended_json(mode: :relaxed)
-        else
-          doc
-        end
-      end
       private def db
         @db ||=
           begin

data/lib/exwiw/adapter/mysql_adapter.rb CHANGED Viewed

@@ -5,15 +5,67 @@ require 'open3'
 module Exwiw
   module Adapter
     class MysqlAdapter < Base
+      include SqlBulkInsert
+      # A lazy, streaming stand-in for the materialized rows #execute used to
+      # return (`connection.query(sql).rows`). It pulls rows off the wire one at
+      # a time (mysql2 single-row stream) instead of buffering the whole result
+      # set, so the dump's dominant memory cost — a Ruby array as large as the
+      # table — never materializes. The Runner drives it exactly like the old
+      # Array: #size to skip empty tables and log the count, then a single
+      # streaming pass (SqlBulkInsert#write_inserts -> each_slice).
+      #
+      # Mirrors PostgresqlAdapter::StreamingResult, with two MySQL specifics:
+      #   - #size runs a separate `SELECT COUNT(*)` of the same query. Unlike the
+      #     pg path, it does NOT wrap the SELECT in a subquery: MySQL rejects a
+      #     derived table with duplicate column names, which a rails-managed
+      #     `SELECT *` joined to another table produces. Instead the projection
+      #     is replaced by `COUNT(*)` (compile_ast(count_only: true)) — exact
+      #     because exwiw's extraction queries have no DISTINCT/GROUP BY/LIMIT,
+      #     so the row count is independent of the projected columns.
+      #   - the stream ties up the connection until fully drained. The Runner
+      #     always drains it (write_inserts) before any further query
+      #     (post_insert_sql / DELETE), and MysqlClient#stream_rows drains the
+      #     remainder if iteration is abandoned, so the connection stays usable.
+      class StreamingResult
+        include Enumerable
+        def initialize(client:, data_sql:, count_sql:)
+          @client = client
+          @data_sql = data_sql
+          @count_sql = count_sql
+        end
+        def size
+          @size ||= @client.query(@count_sql).rows.dig(0, 0).to_i
+        end
+        alias length size
+        # Stream the result set row by row. Each row is an Array of String|nil
+        # (mysql2 `cast: false` / stringified) — identical to what
+        # `connection.query(sql).rows` produced, so the generated INSERT is
+        # unchanged.
+        def each(&block)
+          return enum_for(:each) { size } unless block_given?
+          @client.stream_rows(@data_sql, &block)
+          self
+        end
+      end
       def build_query(table, dump_target, table_by_name)
         Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
       end
       def execute(query_ast)
-        sql = commented_sql(query_ast)
-        @logger.debug("  Executing SQL: \n#{sql}")
-        connection.query(sql).rows
+        data_sql = commented_sql(query_ast)
+        # Count via the same FROM/JOIN/WHERE (projection replaced by COUNT(*)) so
+        # the Runner can skip empty tables and log the row count without draining
+        # the stream. See StreamingResult for why this is not a subquery wrap.
+        count_sql = "#{sql_query_comment(query_ast)} #{compile_ast(query_ast, count_only: true)}"
+        @logger.debug("  Executing SQL (streaming): \n#{data_sql}")
+        StreamingResult.new(client: connection, data_sql: data_sql, count_sql: count_sql)
       end
       def explain(query_ast)
@@ -99,22 +151,16 @@ module Exwiw
         "SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;"
       end
-      def to_bulk_insert(results, table)
+      # The INSERT header for this adapter. MySQL backtick-quotes the table and
+      # column identifiers. #to_bulk_insert / #write_inserts (SqlBulkInsert)
+      # append the value tuples and the trailing `;`.
+      private def insert_header(table)
         table_name = table.name
-        value_list = results.map do |row|
-          quoted_values = row.map do |value|
-            escape_value(value)
-          end
-          "(" + quoted_values.join(', ') + ")"
-        end
-        values = value_list.join(",\n")
         if table.rails_managed?
-          "INSERT INTO `#{table_name}` VALUES\n#{values};"
+          "INSERT INTO `#{table_name}` VALUES\n"
         else
           column_names = table.columns.map { |c| "`#{c.name}`" }.join(', ')
-          "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n#{values};"
+          "INSERT INTO `#{table_name}` (#{column_names}) VALUES\n"
         end
       end
@@ -176,11 +222,17 @@ module Exwiw
         sql
       end
-      def compile_ast(query_ast)
+      # @param count_only [Boolean] emit `SELECT COUNT(*)` instead of the
+      #   projected columns (used by StreamingResult#size). Safe because exwiw's
+      #   extraction queries have no DISTINCT/GROUP BY/LIMIT, so the count does
+      #   not depend on the projection.
+      def compile_ast(query_ast, count_only: false)
         raise NotImplementedError unless query_ast.is_a?(Exwiw::QueryAst::Select)
         sql = "SELECT "
-        sql += if query_ast.select_all
+        sql += if count_only
+                 "COUNT(*)"
+               elsif query_ast.select_all
                  "*"
                else
                  query_ast.columns.map { |col| compile_column_name(query_ast, col) }.join(', ')

data/lib/exwiw/adapter/mysql_client.rb CHANGED Viewed

@@ -118,6 +118,49 @@ module Exwiw
         end
       end
+      # Stream a query's rows one at a time, yielding each as an
+      # Array<String|nil> (the same row shape as #query) instead of buffering
+      # the whole result set. This keeps a large dump's dominant memory cost — a
+      # Ruby array as big as the table — from materializing.
+      #
+      # mysql2 streams server-side (`stream: true` + `cache_rows: false`).
+      # Its contract: a streamed result MUST be fully consumed before the next
+      # query on this connection, or the driver raises "Commands out of sync".
+      # The Runner consumes every row (it writes them all), but if the consumer
+      # block raises mid-stream we drain the remaining rows so the same
+      # connection is still usable for the next table's query.
+      #
+      # trilogy has no streaming cursor (no QUERY_FLAGS_STREAMING), so it buffers
+      # the result and yields from it — parity, but without the memory win (the
+      # same situation as the sqlite adapter). trilogy is a test-only driver;
+      # production connects via mysql2.
+      #
+      # @param sql [String]
+      # @yieldparam row [Array<String|nil>]
+      def stream_rows(sql)
+        return enum_for(:stream_rows, sql) unless block_given?
+        case @driver
+        when :mysql2
+          res = raw.query(sql, cast: false, as: :array, stream: true, cache_rows: false)
+          begin
+            res.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
+          rescue StandardError
+            begin
+              res.each { |_row| } # drain the remainder so the connection stays usable
+            rescue StandardError
+              nil
+            end
+            raise
+          end
+        when :trilogy
+          raw.query(sql).rows.each { |row| yield row.map { |value| self.class.stringify_value(value) } }
+        else
+          raise "Unsupported MySQL driver: #{@driver.inspect}"
+        end
+        self
+      end
       private def ensure_driver_loaded!
         case @driver
         when :mysql2 then require 'mysql2'

data/lib/exwiw/adapter/postgresql_adapter.rb CHANGED Viewed

@@ -3,15 +3,91 @@
 module Exwiw
   module Adapter
     class PostgresqlAdapter < Base
+      include SqlBulkInsert
+      # A lazy, streaming stand-in for the materialized rows #execute used to
+      # return (`connection.exec(sql).values`). It pulls rows off the wire one
+      # at a time via libpq's single-row mode instead of buffering the whole
+      # result set, so the dump's dominant memory cost — a Ruby array as large
+      # as the table — never materializes. The Runner drives it exactly like the
+      # old Array: #size to skip empty tables and log the count, then a single
+      # streaming pass (SqlBulkInsert#write_inserts -> each_slice) to write the
+      # INSERT.
+      #
+      # Mirrors MongodbAdapter::StreamingResult; two SQL-specific differences:
+      #   - #size cannot be answered cheaply from the cursor, so it runs a
+      #     separate `SELECT COUNT(*)` of the same query. (MongoDB uses
+      #     count_documents, an index-only walk; the SQL COUNT re-runs the query
+      #     plan but transfers no row data — Postgres prunes the unused
+      #     projection of the wrapped subquery.) This keeps the Runner contract
+      #     unchanged, so MongoDB and the other SQL adapters are untouched.
+      #   - the streaming pass ties up the connection until fully drained. The
+      #     Runner always drains it (write_inserts) before issuing any further
+      #     query (post_insert_sql / DELETE) on the same connection, so the
+      #     ordering invariant holds.
+      class StreamingResult
+        include Enumerable
+        # @param connection [#exec, #send_query, #set_single_row_mode, #get_result]
+        #   the connection both statements run on
+        # @param data_sql [String] the SELECT whose rows #each streams
+        # @param count_sql [String] the COUNT(*) wrapper of the same query, used by #size
+        def initialize(connection:, data_sql:, count_sql:)
+          @connection = connection
+          @data_sql = data_sql
+          @count_sql = count_sql
+        end
+        # Row count of the data query. Runs the COUNT statement on first use
+        # and memoizes the result.
+        def size
+          @size ||= @connection.exec(@count_sql).getvalue(0, 0).to_i
+        end
+        alias length size
+        # Stream the result set row by row. Each row is an Array of String|nil
+        # in libpq's text format — byte-identical to what `#exec(sql).values`
+        # produced, so the generated INSERT is unchanged.
+        #
+        # Whenever iteration is abandoned mid-stream — a SQL error surfaced by
+        # #check, a StandardError raised by the consumer, or the consumer
+        # leaving via `break`/`throw` (which is what Enumerable#first, #find,
+        # ... do) — any results still queued are drained so a later query on
+        # this same connection does not fail with "another command is already
+        # in progress". Other exceptions (e.g. Interrupt) propagate without
+        # draining.
+        #
+        # @return [self] when a block is given and iteration runs to completion;
+        #   an Enumerator (sized lazily by #size) when called without a block
+        def each
+          return enum_for(:each) { size } unless block_given?
+          @connection.send_query(@data_sql)
+          @connection.set_single_row_mode
+          # `settled` becomes true once the stream was either fully consumed
+          # or its abandonment has already been dealt with; the ensure clause
+          # then only has the non-exception exits (break/throw) left to cover.
+          settled = false
+          begin
+            while (result = @connection.get_result)
+              begin
+                result.check
+                result.each_row { |row| yield row }
+              ensure
+                result.clear
+              end
+            end
+            settled = true
+          rescue StandardError
+            settled = true
+            drain
+            raise
+          rescue Exception # rubocop:disable Lint/RescueException
+            # Interrupt, SystemExit, ...: do not drain, just propagate.
+            settled = true
+            raise
+          ensure
+            drain unless settled
+          end
+          self
+        end
+        # Discard every result still queued on the connection.
+        private def drain
+          while (result = @connection.get_result)
+            result.clear
+          end
+        rescue PG::Error
+          # Connection already errored/clean; nothing left to drain.
+        end
+      end
       def build_query(table, dump_target, table_by_name)
         Exwiw::QueryAstBuilder.run(table.name, table_by_name, dump_target, @logger)
       end
       def execute(query_ast)
-        sql = commented_sql(query_ast)
+        data_sql = commented_sql(query_ast)
+        # Count via the same query (wrapped as a subquery) so the Runner can
+        # skip empty tables and log the row count without draining the stream.
+        count_sql = "#{sql_query_comment(query_ast)} SELECT COUNT(*) FROM (#{compile_ast(query_ast)}) AS exwiw_count_src"
-        @logger.debug("  Executing SQL: \n#{sql}")
-        connection.exec(sql).values
+        @logger.debug("  Executing SQL (single-row stream): \n#{data_sql}")
+        StreamingResult.new(connection: connection, data_sql: data_sql, count_sql: count_sql)
       end
       def explain(query_ast)
@@ -97,22 +173,16 @@ module Exwiw
         @logger.info("  Wrote schema for #{table_names.size} table(s) to #{output_path}.")
       end
-      def to_bulk_insert(results, table)
+      # The INSERT header for this adapter. PostgreSQL uses bare identifiers.
+      # #to_bulk_insert / #write_inserts (SqlBulkInsert) append the value tuples
+      # and the trailing `;`.
+      private def insert_header(table)
         table_name = table.name
-        value_list = results.map do |row|
-          quoted_values = row.map do |value|
-            escape_value(value)
-          end
-          "(" + quoted_values.join(', ') + ")"
-        end
-        values = value_list.join(",\n")
         if table.rails_managed?
-          "INSERT INTO #{table_name} VALUES\n#{values};"
+          "INSERT INTO #{table_name} VALUES\n"
         else
           column_names = table.columns.map(&:name).join(', ')
-          "INSERT INTO #{table_name} (#{column_names}) VALUES\n#{values};"
+          "INSERT INTO #{table_name} (#{column_names}) VALUES\n"
         end
       end

data/lib/exwiw/adapter/sql_bulk_insert.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+module Exwiw
+  module Adapter
+    # Shared bulk-INSERT construction for the SQL adapters (mysql / postgresql /
+    # sqlite). They produce the same `INSERT INTO ... VALUES (...),(...);` shape
+    # and differ only in the header's identifier quoting (see #insert_header) and
+    # in #escape_value, so both the in-memory builder (#to_bulk_insert) and the
+    # bounded-memory streaming writer (#write_inserts) live here.
+    #
+    # Each including adapter must provide two private methods:
+    #   - insert_header(table) -> the "INSERT INTO ... VALUES\n" prefix
+    #   - escape_value(value)  -> the SQL literal for one column value
+    module SqlBulkInsert
+      # How many rows' tuples to build-and-flush at a time when streaming. Bounds
+      # peak memory to this many tuples (plus their joined string) instead of the
+      # whole table's INSERT string, while keeping each flush a single fast
+      # Array#map + Array#join (the same C-level path #to_bulk_insert uses) so it
+      # stays close to whole-string speed — far faster than a naive row-at-a-time
+      # IO#print (see script/bench_sql_dump.rb / docs/sql-dump-optimization-notes.md).
+      # Mirrors MongoDB's default chunk size: bounded work per flush, but the SQL
+      # adapters still emit ONE statement (byte-identical to the un-chunked build).
+      STREAM_FLUSH_ROWS = 2_000
+      # Build the whole INSERT statement as a single String. Kept for callers
+      # that want the string form (and as the readable definition of the exact
+      # bytes #write_inserts streams).
+      def to_bulk_insert(results, table)
+        value_list = results.map { |row| insert_tuple(row) }
+        "#{insert_header(table)}#{value_list.join(",\n")};"
+      end
+      # Stream the bulk INSERT(s) straight to `io` instead of materializing the
+      # whole statement string first. Byte-for-byte identical to writing
+      # #to_bulk_insert per chunk joined by "\n" (verified by
+      # insert_output_snapshot_spec), but only one ~STREAM_FLUSH_BYTES buffer is
+      # resident at a time rather than the entire table's INSERT string. Returns
+      # the number of statements written.
+      def write_inserts(io, results, table, chunk_size)
+        chunks = chunk_size ? results.each_slice(chunk_size) : [results]
+        statement_count = 0
+        chunks.each do |chunk_rows|
+          io.print("\n") if statement_count.positive?
+          stream_single_insert(io, chunk_rows, table)
+          statement_count += 1
+        end
+        statement_count
+      end
+      # Emit one `INSERT INTO ... VALUES <tuples>;` statement to `io`, building
+      # and flushing the value tuples STREAM_FLUSH_ROWS at a time so the full
+      # statement text is never held in memory at once. Each slice is one fast
+      # map+join; the ",\n" between slices reproduces the same separator
+      # #to_bulk_insert puts between every tuple, so the bytes are identical.
+      private def stream_single_insert(io, rows, table)
+        io.print(insert_header(table))
+        first = true
+        rows.each_slice(STREAM_FLUSH_ROWS) do |slice|
+          io.print(",\n") unless first
+          first = false
+          io.print(slice.map { |row| insert_tuple(row) }.join(",\n"))
+        end
+        io.print(";")
+      end
+      private def insert_tuple(row)
+        "(" + row.map { |value| escape_value(value) }.join(', ') + ")"
+      end
+    end
+  end
+end