purplelight 0.1.5 → 0.1.6

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a650fdd2113129b151396a1a90a83a6f1ede97eb5c34c60e028eb7639d5cc4fd
-  data.tar.gz: 1ab1bc421ddaf1c457639ae2ac5968245c6141b8504897ab762a49bb69c51a69
+  metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
+  data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
 SHA512:
-  metadata.gz: 506e52dce7c474998c8bc4b9afa9f5140349e8e2eed2eed7cccbacac0bd9d9f41528b234f1b99ad8407674791471368ee5b99d93b7ab058522311f2642006a20
-  data.tar.gz: 5c17e387f0d67a21d1351cf4e1e69beaa7beecdf5b9f8011e16bd740e990902abf32c54b02257cdead9c296241557571608b6004446cca5d429675fda07ff61a
+  metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
+  data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.5'
+gem 'purplelight', '~> 0.1.6'
 ```
 
 Or install directly:
data/bin/purplelight CHANGED
@@ -40,6 +40,7 @@ parser = OptionParser.new do |opts|
     options[:sharding][:prefix] = v
   end
   opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
+  opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
   opts.on('--read-preference MODE',
           'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
     options[:read_preference] = v.to_sym
@@ -106,6 +107,7 @@ ok = Purplelight.snapshot(
   partitions: options[:partitions],
   batch_size: options[:batch_size],
   query: options[:query],
+  projection: options[:projection],
   sharding: options[:sharding],
   read_preference: effective_read || options[:read_preference],
   resume: { enabled: true },
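
Note: the new `--projection` flag follows the same pattern as `--query`, parsing its argument with `JSON.parse` before handing it to `Purplelight.snapshot`. A minimal standalone sketch of that option-parsing pattern, using only stdlib `optparse` and `json` (this is not the gem's actual CLI code):

```ruby
# Illustrative only: how a JSON-valued flag like --projection can be parsed
# into a Hash with stdlib optparse + json.
require 'optparse'
require 'json'

options = {}
OptionParser.new do |opts|
  opts.on('--projection JSON', 'Projection as JSON') do |v|
    options[:projection] = JSON.parse(v)
  end
end.parse!(['--projection', '{"_id":1,"field":1}'])

p options[:projection] # => {"_id"=>1, "field"=>1}
```
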
@@ -221,6 +221,8 @@ module Purplelight
       cursor = @collection.find(filter, opts)
 
       encode_lines = (@format == :jsonl)
+      # When JSONL, build one big string per batch to offload join cost from writer.
+      string_batch = +''
       buffer = []
       buffer_bytes = 0
       last_id = checkpoint
@@ -230,27 +232,43 @@ module Purplelight
         doc = @mapper.call(doc) if @mapper
         t_ser = telemetry.start(:serialize_time)
         if encode_lines
-          line = "#{JSON.generate(doc)}\n"
+          line = "#{JSON.fast_generate(doc)}\n"
           telemetry.finish(:serialize_time, t_ser)
           bytes = line.bytesize
-          buffer << line
+          string_batch << line
         else
           # For CSV/Parquet keep raw docs to allow schema/row building
-          bytes = (JSON.generate(doc).bytesize + 1)
+          bytes = (JSON.fast_generate(doc).bytesize + 1)
           telemetry.finish(:serialize_time, t_ser)
           buffer << doc
         end
         buffer_bytes += bytes
-        next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
+        # For JSONL, we count rows via newline accumulation; for others, use array length
+        ready = encode_lines ? (buffer_bytes >= 1_000_000 || (string_batch.length >= 1_000_000)) : (buffer.length >= batch_size || buffer_bytes >= 1_000_000)
+        next unless ready
 
         t_q = telemetry.start(:queue_wait_time)
-        queue.push(buffer, bytes: buffer_bytes)
+        if encode_lines
+          queue.push(string_batch, bytes: buffer_bytes)
+          string_batch = +''
+        else
+          queue.push(buffer, bytes: buffer_bytes)
+          buffer = []
+        end
         telemetry.finish(:queue_wait_time, t_q)
         manifest.update_partition_checkpoint!(idx, last_id)
-        buffer = []
         buffer_bytes = 0
       end
-      unless buffer.empty?
+      if encode_lines
+        unless string_batch.empty?
+          t_q = telemetry.start(:queue_wait_time)
+          queue.push(string_batch, bytes: buffer_bytes)
+          telemetry.finish(:queue_wait_time, t_q)
+          manifest.update_partition_checkpoint!(idx, last_id)
+          string_batch = +''
+          buffer_bytes = 0
+        end
+      elsif !buffer.empty?
         t_q = telemetry.start(:queue_wait_time)
         queue.push(buffer, bytes: buffer_bytes)
         telemetry.finish(:queue_wait_time, t_q)
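
For readers skimming the hunk above: the reader now accumulates pre-encoded JSONL lines into one mutable String per batch and pushes that String to the queue, instead of pushing an Array of lines. A hedged, self-contained sketch of that batching idea (the queue, documents, and byte budget here are stand-ins, not the gem's classes):

```ruby
# Hypothetical sketch of the JSONL string-batching pattern: append encoded
# lines to one mutable String and hand the whole buffer off once a byte
# budget is reached.
require 'json'

FLUSH_BYTES = 1_000_000                    # same 1 MB budget as in the diff
docs = Array.new(5) { |i| { '_id' => i, 'name' => "doc-#{i}" } }
batches = []                               # stand-in for the writer queue

string_batch = +''                         # unfrozen String, mutated in place
batch_bytes = 0
docs.each do |doc|
  line = "#{JSON.fast_generate(doc)}\n"
  string_batch << line
  batch_bytes += line.bytesize
  next unless batch_bytes >= FLUSH_BYTES

  batches << string_batch                  # hand off the pre-joined buffer
  string_batch = +''
  batch_bytes = 0
end
batches << string_batch unless string_batch.empty?

puts batches.first.lines.count # => 5
```

Pushing a single pre-joined String means the writer no longer has to `join` an Array per batch, which is the "offload join cost" the inline comment refers to.
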
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Purplelight
-  VERSION = '0.1.5'
+  VERSION = '0.1.6'
 end
@@ -11,6 +11,12 @@ rescue LoadError
   # zstd not available; fallback handled later via gzip
 end
 
+begin
+  require 'zstd-ruby'
+rescue LoadError
+  # alternative zstd gem not available
+end
+
 module Purplelight
   # WriterCSV writes documents to CSV files with optional compression.
   class WriterCSV
@@ -24,6 +30,8 @@ module Purplelight
       @rotate_bytes = rotate_bytes
       @logger = logger
       @manifest = manifest
+      env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+      @compression_level = (env_level && env_level > 0 ? env_level : nil)
       @single_file = single_file
 
       @columns = columns&.map(&:to_s)
@@ -91,6 +99,39 @@ module Purplelight
 
     private
 
+    # Minimal wrapper to count bytes written for rotate logic when
+    # underlying compressed writer doesn't expose position (e.g., zstd-ruby).
+    class CountingIO
+      def initialize(io, on_write:)
+        @io = io
+        @on_write = on_write
+      end
+
+      def write(data)
+        bytes_written = @io.write(data)
+        @on_write.call(bytes_written) if bytes_written && @on_write
+        bytes_written
+      end
+
+      # CSV calls '<<' on the underlying IO in some code paths
+      def <<(data)
+        write(data)
+      end
+
+      # CSV#flush may forward flush to underlying IO; make it a no-op if unavailable
+      def flush
+        @io.flush if @io.respond_to?(:flush)
+      end
+
+      def method_missing(method_name, *args, &block)
+        @io.send(method_name, *args, &block)
+      end
+
+      def respond_to_missing?(method_name, include_private = false)
+        @io.respond_to?(method_name, include_private)
+      end
+    end
+
     def ensure_open!
       return if @io
 
@@ -98,7 +139,8 @@ module Purplelight
       path = next_part_path
       @part_index = @manifest&.open_part!(path) if @manifest
       raw = File.open(path, 'wb')
-      @io = build_compressed_io(raw)
+      compressed = build_compressed_io(raw)
+      @io = CountingIO.new(compressed, on_write: ->(n) { @bytes_written += n })
       @csv = CSV.new(@io)
       @bytes_written = 0
       @rows_written = 0
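
The hunk above wraps the compressed stream in the new `CountingIO` so that every `write` updates `@bytes_written`, which the rotation logic can consult even when the codec object exposes no byte position. A simplified, hypothetical version of that counting-delegator idea (wrapping a plain `StringIO` here purely for illustration):

```ruby
# Hypothetical, simplified version of the counting-wrapper idea: delegate
# writes to an inner IO and report each write's size to a callback so the
# caller can track bytes for rotation decisions.
require 'stringio'

class ByteCountingIO
  def initialize(io, on_write:)
    @io = io
    @on_write = on_write
  end

  def write(data)
    n = @io.write(data)
    @on_write.call(n) if n
    n
  end
  alias_method :<<, :write
end

total = 0
inner = StringIO.new
counted = ByteCountingIO.new(inner, on_write: ->(n) { total += n })
counted << "id,name\n"
counted << "1,alpha\n"
puts total # => 16 bytes counted as they pass through
```
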
@@ -107,7 +149,13 @@ module Purplelight
     def build_compressed_io(raw)
       case @effective_compression.to_s
       when 'zstd'
-        return ZSTDS::Writer.open(raw, level: 10) if defined?(ZSTDS)
+        if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+          level = @compression_level || 10
+          return ::Zstd::StreamWriter.new(raw, level: level)
+        elsif defined?(ZSTDS)
+          level = @compression_level || 10
+          return ZSTDS::Stream::Writer.new(raw, compression_level: level)
+        end
 
         @logger&.warn('zstd gem not loaded; using gzip')
         Zlib::GzipWriter.new(raw)
@@ -154,7 +202,7 @@ module Purplelight
     def determine_effective_compression(requested)
       case requested.to_s
       when 'zstd'
-        (defined?(ZSTDS) ? :zstd : :gzip)
+        ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
       when 'none'
         :none
       else
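
Taken together, the CSV hunks above mean zstd output is used whenever either `Zstd::StreamWriter` (zstd-ruby) or `ZSTDS` (ruby-zstds) is available, with gzip as the fallback. A self-contained sketch of that selection order, assuming neither zstd gem is necessarily installed; the constructor calls mirror the ones in the diff, and finalization is guarded since it differs between codec gems:

```ruby
# Sketch of the codec selection: prefer zstd-ruby's Zstd::StreamWriter, then
# ruby-zstds' ZSTDS::Stream::Writer, else fall back to stdlib gzip. Only
# zstd-ruby is require'd here (as in the diff); the ZSTDS branch applies only
# if ruby-zstds has been loaded elsewhere.
require 'zlib'
require 'stringio'

begin
  require 'zstd-ruby' # defines ::Zstd::StreamWriter when installed
rescue LoadError
  # not installed; the gzip branch below is used
end

def open_compressed_writer(raw, level: nil)
  if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
    ::Zstd::StreamWriter.new(raw, level: level || 10)
  elsif defined?(ZSTDS)
    ZSTDS::Stream::Writer.new(raw, compression_level: level || 10)
  else
    Zlib::GzipWriter.new(raw)
  end
end

buffer = StringIO.new
writer = open_compressed_writer(buffer, level: 10)
writer.write("hello, compressed world\n")
writer.close if writer.respond_to?(:close) # finalization APIs differ per gem
puts buffer.string.bytesize                # compressed size with the chosen codec
```
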
@@ -10,6 +10,12 @@ rescue LoadError
   # zstd not available; will fallback to gzip
 end
 
+begin
+  require 'zstd-ruby'
+rescue LoadError
+  # alternative zstd gem not available
+end
+
 module Purplelight
   # WriterJSONL writes newline-delimited JSON with optional compression.
   class WriterJSONL
@@ -23,7 +29,8 @@ module Purplelight
       @rotate_bytes = rotate_bytes
       @logger = logger
       @manifest = manifest
-      @compression_level = compression_level
+      env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+      @compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)
 
       @part_index = nil
       @io = nil
@@ -33,23 +40,71 @@ module Purplelight
       @closed = false
 
       @effective_compression = determine_effective_compression(@compression)
-      return unless @effective_compression.to_s != @compression.to_s
-
-      @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
+      if @logger
+        level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
+        @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
+      end
+      if @effective_compression.to_s != @compression.to_s
+        @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
+      end
     end
 
-    def write_many(array_of_docs)
+    def write_many(batch)
       ensure_open!
-      # If upstream already produced newline-terminated strings, join fast.
-      buffer = if array_of_docs.first.is_a?(String)
-                 array_of_docs.join
-               else
-                 array_of_docs.map { |doc| "#{JSON.generate(doc)}\n" }.join
-               end
-      rows = array_of_docs.size
-      write_buffer(buffer)
+
+      chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
+      total_bytes = 0
+      rows = 0
+
+      if batch.is_a?(String)
+        # Fast-path: writer received a preassembled buffer string
+        buffer = batch
+        rows = buffer.count("\n")
+        write_buffer(buffer)
+        total_bytes = buffer.bytesize
+      elsif batch.first.is_a?(String)
+        # Join and write in chunks to avoid large intermediate allocations
+        chunk = +''
+        chunk_bytes = 0
+        batch.each do |line|
+          chunk << line
+          rows += 1
+          chunk_bytes += line.bytesize
+          next unless chunk_bytes >= chunk_threshold
+
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+          chunk = +''
+          chunk_bytes = 0
+        end
+        unless chunk.empty?
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+        end
+      else
+        # Fallback: encode docs here (JSON.fast_generate preferred) and write in chunks
+        chunk = +''
+        chunk_bytes = 0
+        batch.each do |doc|
+          line = "#{JSON.fast_generate(doc)}\n"
+          rows += 1
+          chunk << line
+          chunk_bytes += line.bytesize
+          next unless chunk_bytes >= chunk_threshold
+
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+          chunk = +''
+          chunk_bytes = 0
+        end
+        unless chunk.empty?
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+        end
+      end
+
       @rows_written += rows
-      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: buffer.bytesize)
+      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: total_bytes)
     end
 
     def rotate_if_needed
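
The rewritten `write_many` above accepts either a preassembled String, an Array of encoded lines, or raw documents, and writes in chunks bounded by `PL_WRITE_CHUNK_BYTES` (8 MiB by default) rather than joining the whole batch first. A hedged sketch of that chunked-flush loop in isolation, with a tiny threshold and a `StringIO` stand-in so the flushes are visible:

```ruby
# Hedged sketch of chunked flushing: accumulate encoded lines until a byte
# budget is hit, write that chunk, and start a new one, so peak memory is
# bounded by the chunk size rather than the whole batch.
require 'json'
require 'stringio'

chunk_threshold = 32          # tiny budget for the demo; the diff defaults to 8 MiB
out = StringIO.new            # stand-in for the compressed writer
flushes = 0

chunk = +''
chunk_bytes = 0
lines = Array.new(10) { |i| "#{JSON.fast_generate({ 'n' => i })}\n" }
lines.each do |line|
  chunk << line
  chunk_bytes += line.bytesize
  next unless chunk_bytes >= chunk_threshold

  out.write(chunk)            # one write per full chunk
  flushes += 1
  chunk = +''
  chunk_bytes = 0
end
unless chunk.empty?
  out.write(chunk)            # trailing partial chunk
  flushes += 1
end

puts flushes                  # => 3 with these sizes
puts out.string.lines.count   # => 10, nothing lost
```
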
@@ -86,15 +141,18 @@ module Purplelight
     def build_compressed_io(raw)
       case @effective_compression.to_s
       when 'zstd'
-        if defined?(ZSTDS)
-          # ZSTDS::Writer supports IO-like interface
+        # Prefer zstd-ruby if available, else ruby-zstds
+        if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+          level = @compression_level || 3
+          return ::Zstd::StreamWriter.new(raw, level: level)
+        elsif defined?(ZSTDS)
           level = @compression_level || 3
-          ZSTDS::Writer.open(raw, level: level)
-        else
-          @logger&.warn('zstd gem not loaded; this should have been handled earlier')
-          level = @compression_level || Zlib::DEFAULT_COMPRESSION
-          Zlib::GzipWriter.new(raw, level)
+          return ZSTDS::Stream::Writer.new(raw, compression_level: level)
         end
+
+        @logger&.warn('zstd gems not loaded; falling back to gzip')
+        level = @compression_level || Zlib::DEFAULT_COMPRESSION
+        Zlib::GzipWriter.new(raw, level)
       when 'gzip'
         level = @compression_level || 1
         Zlib::GzipWriter.new(raw, level)
@@ -142,7 +200,7 @@ module Purplelight
     def determine_effective_compression(requested)
       case requested.to_s
       when 'zstd'
-        (defined?(ZSTDS) ? :zstd : :gzip)
+        ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
       when 'none'
         :none
       else
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: purplelight
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Alexander Nicholson