purplelight 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 89114bd20a65a5a398be619718ae9a92e535d8118d9f928c61735da9a11cb5aa
-  data.tar.gz: 372372e6f5efe1cedd30033d661523fcb45a986c20e6e614ca46ca69def97e70
+  metadata.gz: b87960253dbd1ab6aae3b60dc790068d851f3798b124c23451bdae96734d6d67
+  data.tar.gz: b1eab05f8580a282b836da8eddb5dfe964ef6cb90a94300304ecd0426f786998
 SHA512:
-  metadata.gz: 17103c062f0c6002ee53fbb1c2eed3179fb0df582c0dd99ef5100f8a26dbb0c56432f02f9a7f62c49bd201a05d414d7797c0e32044551522eb9d625ba9c179bf
-  data.tar.gz: e64bfd67ce31afae0c2209eaf058c731c331c63562f8e23de5e1f86a16a4fd3c8033f5938b7a509b3348daa8f4349afe926acb5e32fba8c7882996bd0d435616
+  metadata.gz: 7bff1db0acebc6416b7dd484fe882947bc74927a6833e99a0fec64d03203babfbf625f44c6a8d6c29cab31a6bc7ccae31de3a7d0b55283d073053a21515faeb3
+  data.tar.gz: b56bd93e12571aafe2ab47a1dc087d3429c4a15a731d50159552fbe70a0f63b40ee2d44fb23bf27752045df9f6e146376af906a00afdfada7e068420a4012925
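These are the SHA-256/SHA-512 digests of the two archives packed inside the `.gem` file (a plain tar archive containing `metadata.gz`, `data.tar.gz`, and `checksums.yaml.gz`). A minimal sketch for checking a locally unpacked copy against the values above; the file paths are assumptions about where you extracted the archives:

```ruby
require 'digest'

# Assumes metadata.gz and data.tar.gz were extracted from
# purplelight-0.1.9.gem into the current directory, e.g. with:
#   tar -xf purplelight-0.1.9.gem
%w[metadata.gz data.tar.gz].each do |name|
  next unless File.exist?(name)

  puts "#{name} SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```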
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.8'
+gem 'purplelight', '~> 0.1.9'
 ```
 
 Or install directly:
@@ -248,14 +248,17 @@ bundle exec bin/purplelight \
 
 ### Quick Benchmark
 ```
-% bash -lc 'BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip bundle exec rspec spec/benchmark_perf_spec.rb --format doc | cat'
+% BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip bundle exec rspec spec/benchmark_perf_spec.rb --format doc
 
 Performance benchmark (1M docs, gated by BENCH=1)
-W, [2025-09-03T16:10:40.437304 #33546]  WARN -- : MONGODB | Error checking 127.0.0.1:27018: Mongo::Error::SocketError: Errno::ECONNREFUSED: Connection refused - connect(2) for 127.0.0.1:27018 (for 127.0.0.1:27018 (no TLS)) (on 127.0.0.1:27018)
 Benchmark results:
-  Inserted: 1000000 docs in 8.16s
-  Exported: 1000000 docs in 8.21s
-  Parts: 1, Bytes: 10646279
-  Throughput: 121729.17 docs/s, 1.24 MB/s
+  Inserted: 1000000 docs in 8.13s
+  Exported: 1000000 docs in 4.03s
+  Parts: 1, Bytes: 10625336
+  Throughput: 248241.7 docs/s, 2.52 MB/s
 Settings: partitions=16, batch_size=8000, queue_mb=512, rotate_mb=512, compression=gzip
+  exports 1,000,000 documents and reports throughput
+
+Finished in 14.02 seconds (files took 0.31974 seconds to load)
+1 example, 0 failures
 ```
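The reported throughput follows from the figures in the output. A quick sanity check, assuming the `MB/s` figure is computed with MiB (1024²) and noting that the benchmark divides by its exact elapsed time rather than the two-decimal value it prints:

```ruby
docs  = 1_000_000
bytes = 10_625_336
secs  = 4.03 # printed value; the benchmark uses the precise timer reading

puts format('%.2f docs/s', docs / secs)                  # => 248138.96 (reported: 248241.7)
puts format('%.2f MB/s', bytes / (1024.0 * 1024) / secs) # => 2.51 (reported: 2.52)
```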
data/bin/purplelight CHANGED
@@ -18,7 +18,17 @@ options = {
   resume: { enabled: true },
   read_preference: nil,
   read_tags: nil,
-  dry_run: false
+  dry_run: false,
+  queue_size_bytes: nil,
+  rotate_bytes: nil,
+  compression_level: nil,
+  writer_threads: nil,
+  write_chunk_bytes: nil,
+  parquet_row_group: nil,
+  telemetry_flag: nil,
+  read_concern: nil,
+  no_cursor_timeout: nil,
+  resume_overwrite_incompatible: false
 }
 
 parser = OptionParser.new do |opts|
@@ -30,8 +40,11 @@ parser = OptionParser.new do |opts|
   opts.on('-o', '--output PATH', 'Output directory or file (required)') { |v| options[:output] = v }
   opts.on('-f', '--format FORMAT', 'Format: jsonl|csv|parquet (default jsonl)') { |v| options[:format] = v.to_sym }
   opts.on('--compression NAME', 'Compression: zstd|gzip|none') { |v| options[:compression] = v.to_sym }
+  opts.on('--compression-level N', Integer, 'Compression level for zstd/gzip (JSONL/CSV)') { |v| options[:compression_level] = v }
   opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
   opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
+  opts.on('--queue-mb MB', Integer, 'Queue size in MB (default 256)') { |v| options[:queue_size_bytes] = v * 1024 * 1024 }
+  opts.on('--rotate-mb MB', Integer, 'Rotate part size in MB (default 256)') { |v| options[:rotate_bytes] = v * 1024 * 1024 }
   opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
     options[:sharding] ||= {}
     options[:sharding][:mode] = :by_size
@@ -45,6 +58,9 @@ parser = OptionParser.new do |opts|
     options[:sharding] ||= {}
     options[:sharding][:prefix] = v
   end
+  opts.on('--writer-threads N', Integer, 'Number of writer threads (experimental, JSONL only)') { |v| options[:writer_threads] = v }
+  opts.on('--write-chunk-mb MB', Integer, 'JSONL encode/write chunk size in MB') { |v| options[:write_chunk_bytes] = v * 1024 * 1024 }
+  opts.on('--parquet-row-group N', Integer, 'Parquet row group size (rows)') { |v| options[:parquet_row_group] = v }
   opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
     # Prefer BSON Extended JSON to support $date, $oid, etc.
     options[:query] = BSON::ExtJSON.parse(v)
@@ -57,6 +73,7 @@ parser = OptionParser.new do |opts|
           'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
     options[:read_preference] = v.to_sym
   end
+  opts.on('--read-concern LEVEL', 'Read concern: majority|local|linearizable|available|snapshot') { |v| options[:read_concern] = v.to_sym }
   opts.on('--read-tags TAGS',
           'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
     tags = {}
@@ -69,6 +86,13 @@ parser = OptionParser.new do |opts|
     options[:read_tags] = tags unless tags.empty?
   end
   opts.on('--dry-run', 'Parse options and print effective read preference JSON, then exit') { options[:dry_run] = true }
+  opts.on('--telemetry MODE', 'Telemetry on|off (overrides PL_TELEMETRY)') { |v| options[:telemetry_flag] = v }
+  opts.on('--no-cursor-timeout BOOL', 'noCursorTimeout true|false (default true)') do |v|
+    options[:no_cursor_timeout] = %w[true 1 yes].include?(v.to_s.downcase)
+  end
+  opts.on('--resume-overwrite-incompatible', 'Overwrite incompatible existing manifest on resume') do
+    options[:resume_overwrite_incompatible] = true
+  end
   opts.on('--version', 'Show version') do
     puts Purplelight::VERSION
     exit 0
@@ -110,7 +134,7 @@ end
 client = Mongo::Client.new(options[:uri])
 options[:partitions] ||= (Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4)
 
-ok = Purplelight.snapshot(
+snapshot_args = {
   client: client.use(options[:db]),
   collection: options[:collection],
   output: options[:output],
@@ -122,8 +146,31 @@ ok = Purplelight.snapshot(
   projection: options[:projection],
   sharding: options[:sharding],
   read_preference: effective_read || options[:read_preference],
-  resume: { enabled: true },
+  resume: { enabled: true, overwrite_incompatible: options[:resume_overwrite_incompatible] },
   on_progress: ->(s) { warn("progress: #{s.to_json}") }
-)
+}
+
+# optional tunables
+snapshot_args[:queue_size_bytes] = options[:queue_size_bytes] if options[:queue_size_bytes]
+snapshot_args[:rotate_bytes] = options[:rotate_bytes] if options[:rotate_bytes]
+snapshot_args[:read_concern] = options[:read_concern] if options[:read_concern]
+snapshot_args[:no_cursor_timeout] = options[:no_cursor_timeout] unless options[:no_cursor_timeout].nil?
+snapshot_args[:compression_level] = options[:compression_level] if options[:compression_level]
+snapshot_args[:writer_threads] = options[:writer_threads] if options[:writer_threads]
+snapshot_args[:write_chunk_bytes] = options[:write_chunk_bytes] if options[:write_chunk_bytes]
+snapshot_args[:parquet_row_group] = options[:parquet_row_group] if options[:parquet_row_group]
+
+# telemetry env override
+if options[:telemetry_flag]
+  ENV['PL_TELEMETRY'] = (options[:telemetry_flag].to_s.downcase == 'on' ? '1' : '0')
+end
+
+# writer-specific overrides via environment for v1 compatibility
+ENV['PL_ZSTD_LEVEL'] = options[:compression_level].to_s if options[:compression_level]
+ENV['PL_WRITE_CHUNK_BYTES'] = options[:write_chunk_bytes].to_s if options[:write_chunk_bytes]
+ENV['PL_PARQUET_ROW_GROUP'] = options[:parquet_row_group].to_s if options[:parquet_row_group]
+ENV['PL_WRITER_THREADS'] = options[:writer_threads].to_s if options[:writer_threads]
+
+ok = Purplelight.snapshot(**snapshot_args)
 
 exit(ok ? 0 : 1)
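For callers using the library directly, the new CLI flags map onto keyword arguments of `Purplelight.snapshot`, as the `snapshot_args` construction above shows. A minimal sketch; the URI, collection name, and chosen values are placeholders:

```ruby
require 'mongo'
require 'purplelight'

client = Mongo::Client.new('mongodb://localhost:27017/mydb') # placeholder URI

ok = Purplelight.snapshot(
  client: client,
  collection: 'events',                # placeholder collection
  output: '/tmp/export',
  format: :jsonl,
  compression: :zstd,
  compression_level: 6,                # new in 0.1.9
  queue_size_bytes: 512 * 1024 * 1024, # equivalent of --queue-mb 512
  rotate_bytes: 512 * 1024 * 1024,     # equivalent of --rotate-mb 512
  writer_threads: 2,                   # experimental, JSONL only
  resume: { enabled: true, overwrite_incompatible: true }
)
exit(ok ? 0 : 1)
```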
data/lib/purplelight/snapshot.rb CHANGED
@@ -38,7 +38,8 @@ module Purplelight
                    resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
-                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil)
+                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -59,6 +60,10 @@ module Purplelight
       @read_concern = read_concern
       @read_preference = read_preference
       @no_cursor_timeout = no_cursor_timeout
+      @compression_level = compression_level
+      @writer_threads = writer_threads || 1
+      @write_chunk_bytes = write_chunk_bytes
+      @parquet_row_group = parquet_row_group
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -90,7 +95,20 @@ module Purplelight
       end
 
       manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
-                            partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes, hint: @hint
+                            partitions: @partitions,
+                            batch_size: @batch_size,
+                            queue_size_bytes: @queue_size_bytes,
+                            rotate_bytes: @rotate_bytes,
+                            hint: @hint,
+                            read_concern: (@read_concern.is_a?(Hash) ? @read_concern : { level: @read_concern }),
+                            no_cursor_timeout: @no_cursor_timeout,
+                            writer_threads: @writer_threads,
+                            compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
+                            write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
+                            parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+                            sharding: @sharding,
+                            resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
+                            telemetry: @telemetry_enabled
                           })
       manifest.ensure_partitions!(@partitions)
 
@@ -114,8 +132,9 @@ module Purplelight
                          logger: @logger, manifest: manifest, single_file: single_file)
         when :parquet
           single_file = @sharding && @sharding[:mode].to_s == 'single_file'
+          row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
           WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-                            manifest: manifest, single_file: single_file)
+                            manifest: manifest, single_file: single_file, row_group_size: row_group)
         else
           raise ArgumentError, "format not implemented: #{@format}"
         end
@@ -225,6 +244,11 @@ module Purplelight
       string_batch = +''
       buffer = []
       buffer_bytes = 0
+      json_state = if encode_lines
+                     JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false,
+                                                     buffer_initial_length: 4_096)
+                   end
+      size_state = encode_lines ? nil : JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
       last_id = checkpoint
       begin
         cursor.each do |doc|
@@ -232,13 +256,15 @@ module Purplelight
           doc = @mapper.call(doc) if @mapper
           t_ser = telemetry.start(:serialize_time)
           if encode_lines
-            line = "#{JSON.fast_generate(doc)}\n"
+            json = json_state.generate(doc)
             telemetry.finish(:serialize_time, t_ser)
-            bytes = line.bytesize
-            string_batch << line
+            string_batch << json
+            string_batch << "\n"
+            bytes = json.bytesize + 1
           else
             # For CSV/Parquet keep raw docs to allow schema/row building
-            bytes = (JSON.fast_generate(doc).bytesize + 1)
+            json = size_state.generate(doc)
+            bytes = json.bytesize + 1
             telemetry.finish(:serialize_time, t_ser)
             buffer << doc
           end
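The serialization change above replaces per-document `JSON.fast_generate` calls with a `JSON::Ext::Generator::State` that is created once and reused, so generator option parsing and buffer setup leave the hot loop. A standalone sketch of the pattern, with made-up sample documents:

```ruby
require 'json'

# One generator State, configured once and reused for every document.
# buffer_initial_length pre-sizes the internal output buffer, matching
# the options used in the diff above.
state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false,
                                        buffer_initial_length: 4_096)

docs = [{ id: 1, name: 'a' }, { id: 2, name: 'b' }] # made-up sample docs
out = +''
docs.each do |doc|
  json = state.generate(doc)
  out << json << "\n" # appending json then newline avoids an intermediate string
end
puts out
```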
data/lib/purplelight/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Purplelight
-  VERSION = '0.1.8'
+  VERSION = '0.1.9'
 end
data/lib/purplelight/writer_csv.rb CHANGED
@@ -5,16 +5,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; fallback handled later via gzip
-end
-
 begin
   require 'zstd-ruby'
 rescue LoadError
-  # alternative zstd gem not available
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback used later
+  end
 end
 
 module Purplelight
@@ -200,14 +198,22 @@ module Purplelight
     end
 
     def determine_effective_compression(requested)
-      case requested.to_s
-      when 'zstd'
-        (defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
-      when 'none'
-        :none
-      else
-        :gzip
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
       end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
     end
 
     def infer_columns(docs)
data/lib/purplelight/writer_jsonl.rb CHANGED
@@ -4,16 +4,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; will fallback to gzip
-end
-
 begin
   require 'zstd-ruby'
rescue LoadError
-  # alternative zstd gem not available
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback
+  end
 end
 
 module Purplelight
@@ -40,6 +38,7 @@ module Purplelight
       @closed = false
 
       @effective_compression = determine_effective_compression(@compression)
+      @json_state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
       if @logger
         level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
         @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
@@ -86,20 +85,22 @@ module Purplelight
       chunk = +''
       chunk_bytes = 0
       batch.each do |doc|
-        line = "#{JSON.fast_generate(doc)}\n"
+        json = @json_state.generate(doc)
         rows += 1
-        chunk << line
-        chunk_bytes += line.bytesize
+        bytes = json.bytesize + 1
+        chunk << json
+        chunk << "\n"
+        chunk_bytes += bytes
         next unless chunk_bytes >= chunk_threshold
 
         write_buffer(chunk)
-        total_bytes += chunk.bytesize
+        total_bytes += chunk_bytes
         chunk = +''
         chunk_bytes = 0
       end
       unless chunk.empty?
         write_buffer(chunk)
-        total_bytes += chunk.bytesize
+        total_bytes += chunk_bytes
       end
     end
 
@@ -198,14 +199,22 @@ module Purplelight
     end
 
     def determine_effective_compression(requested)
-      case requested.to_s
-      when 'zstd'
-        (defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
-      when 'none'
-        :none
-      else
-        :gzip
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
      end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
    end
  end
end
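Both writers now probe for the `zstd-ruby` gem before `zstds`, and fall back to gzip when neither loads. A small probe script, assuming the same requires as the writers, shows which backend a given environment ends up with:

```ruby
begin
  require 'zstd-ruby'
rescue LoadError
  begin
    require 'zstds'
  rescue LoadError
    # neither zstd backend installed; gzip (stdlib Zlib) is used
  end
end

# Mirrors the selection order in determine_effective_compression above.
backend =
  if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
    'zstd (zstd-ruby)'
  elsif defined?(ZSTDS)
    'zstd (zstds)'
  else
    'gzip (Zlib)'
  end
puts "effective compression backend: #{backend}"
```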
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: purplelight
 version: !ruby/object:Gem::Version
-  version: 0.1.8
+  version: 0.1.9
 platform: ruby
 authors:
 - Alexander Nicholson