purplelight 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a650fdd2113129b151396a1a90a83a6f1ede97eb5c34c60e028eb7639d5cc4fd
4
- data.tar.gz: 1ab1bc421ddaf1c457639ae2ac5968245c6141b8504897ab762a49bb69c51a69
3
+ metadata.gz: 6e3771629528ecab067d858491ec4c5de06a8c88c90c64b56dcdb8658c3a6a89
4
+ data.tar.gz: 4c0427564bf04b5dc3da6b3af97bfd4e5dc0625b2faa0dd8acec84a81bd1c145
5
5
  SHA512:
6
- metadata.gz: 506e52dce7c474998c8bc4b9afa9f5140349e8e2eed2eed7cccbacac0bd9d9f41528b234f1b99ad8407674791471368ee5b99d93b7ab058522311f2642006a20
7
- data.tar.gz: 5c17e387f0d67a21d1351cf4e1e69beaa7beecdf5b9f8011e16bd740e990902abf32c54b02257cdead9c296241557571608b6004446cca5d429675fda07ff61a
6
+ metadata.gz: 871cf3c04dcaa017d1023ff79135a5050254158a7ba20145abb04c39887125c08eedfb8cae4d16ddd3c4fd0acb383ceb9c11efff943b497ff036b68643dbea79
7
+ data.tar.gz: 8a5e7fe7a1913939966df61173380e6ed853f45bec2ef90315410a6d36d6341c07d04ff62d200379629b2366cf68296c7c291b5060d0a080dc3015175f1e6780
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.5'
12
+ gem 'purplelight', '~> 0.1.7'
13
13
  ```
14
14
 
15
15
  Or install directly:
data/bin/purplelight CHANGED
@@ -4,6 +4,7 @@
4
4
  require 'optparse'
5
5
  require 'json'
6
6
  require 'mongo'
7
+ require 'time'
7
8
  require_relative '../lib/purplelight'
8
9
 
9
10
  options = {
@@ -39,7 +40,16 @@ parser = OptionParser.new do |opts|
39
40
  options[:sharding] ||= {}
40
41
  options[:sharding][:prefix] = v
41
42
  end
42
- opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
43
+ opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
44
+ begin
45
+ # Prefer BSON Extended JSON to support $date, $oid, etc.
46
+ options[:query] = BSON::ExtJSON.parse(v)
47
+ rescue StandardError
48
+ # Fallback to plain JSON for compatibility
49
+ options[:query] = JSON.parse(v)
50
+ end
51
+ end
52
+ opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
43
53
  opts.on('--read-preference MODE',
44
54
  'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
45
55
  options[:read_preference] = v.to_sym
@@ -106,6 +116,7 @@ ok = Purplelight.snapshot(
106
116
  partitions: options[:partitions],
107
117
  batch_size: options[:batch_size],
108
118
  query: options[:query],
119
+ projection: options[:projection],
109
120
  sharding: options[:sharding],
110
121
  read_preference: effective_read || options[:read_preference],
111
122
  resume: { enabled: true },
@@ -27,13 +27,23 @@ module Purplelight
27
27
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
28
28
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
29
29
 
30
- # Create numeric-ish interpolation by sampling
31
- ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
32
- boundaries = [min_id] + ids + [max_id]
30
+ # Create contiguous ranges using ascending inner boundaries.
31
+ # We intentionally skip the very first _id so the first range includes the smallest document.
32
+ inner_boundaries = collection.find(query || {})
33
+ .projection(_id: 1)
34
+ .sort(_id: 1)
35
+ .skip(1)
36
+ .limit([partitions - 1, 0].max)
37
+ .to_a
38
+ .map { |d| d['_id'] }
39
+
33
40
  ranges = []
34
- boundaries.each_cons(2) do |a, b|
35
- ranges << build_range(a, b)
41
+ prev = nil
42
+ inner_boundaries.each do |b|
43
+ ranges << build_range(prev, b)
44
+ prev = b
36
45
  end
46
+ ranges << build_range(prev, nil)
37
47
 
38
48
  ranges.map do |r|
39
49
  filter = query ? query.dup : {}
@@ -54,7 +64,7 @@ module Purplelight
54
64
  min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
55
65
  max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
56
66
 
57
- # Fallback to cursor sampling if _id isn't an ObjectId
67
+ # Fallback to cursor sampling if _id isn't an ObjectId
58
68
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
59
69
 
60
70
  step = [(max_ts - min_ts) / partitions, 1].max
@@ -221,6 +221,8 @@ module Purplelight
221
221
  cursor = @collection.find(filter, opts)
222
222
 
223
223
  encode_lines = (@format == :jsonl)
224
+ # When JSONL, build one big string per batch to offload join cost from writer.
225
+ string_batch = +''
224
226
  buffer = []
225
227
  buffer_bytes = 0
226
228
  last_id = checkpoint
@@ -230,27 +232,43 @@ module Purplelight
230
232
  doc = @mapper.call(doc) if @mapper
231
233
  t_ser = telemetry.start(:serialize_time)
232
234
  if encode_lines
233
- line = "#{JSON.generate(doc)}\n"
235
+ line = "#{JSON.fast_generate(doc)}\n"
234
236
  telemetry.finish(:serialize_time, t_ser)
235
237
  bytes = line.bytesize
236
- buffer << line
238
+ string_batch << line
237
239
  else
238
240
  # For CSV/Parquet keep raw docs to allow schema/row building
239
- bytes = (JSON.generate(doc).bytesize + 1)
241
+ bytes = (JSON.fast_generate(doc).bytesize + 1)
240
242
  telemetry.finish(:serialize_time, t_ser)
241
243
  buffer << doc
242
244
  end
243
245
  buffer_bytes += bytes
244
- next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
246
+ # For JSONL, we count rows via newline accumulation; for others, use array length
247
+ ready = encode_lines ? (buffer_bytes >= 1_000_000 || (string_batch.length >= 1_000_000)) : (buffer.length >= batch_size || buffer_bytes >= 1_000_000)
248
+ next unless ready
245
249
 
246
250
  t_q = telemetry.start(:queue_wait_time)
247
- queue.push(buffer, bytes: buffer_bytes)
251
+ if encode_lines
252
+ queue.push(string_batch, bytes: buffer_bytes)
253
+ string_batch = +''
254
+ else
255
+ queue.push(buffer, bytes: buffer_bytes)
256
+ buffer = []
257
+ end
248
258
  telemetry.finish(:queue_wait_time, t_q)
249
259
  manifest.update_partition_checkpoint!(idx, last_id)
250
- buffer = []
251
260
  buffer_bytes = 0
252
261
  end
253
- unless buffer.empty?
262
+ if encode_lines
263
+ unless string_batch.empty?
264
+ t_q = telemetry.start(:queue_wait_time)
265
+ queue.push(string_batch, bytes: buffer_bytes)
266
+ telemetry.finish(:queue_wait_time, t_q)
267
+ manifest.update_partition_checkpoint!(idx, last_id)
268
+ string_batch = +''
269
+ buffer_bytes = 0
270
+ end
271
+ elsif !buffer.empty?
254
272
  t_q = telemetry.start(:queue_wait_time)
255
273
  queue.push(buffer, bytes: buffer_bytes)
256
274
  telemetry.finish(:queue_wait_time, t_q)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.5'
4
+ VERSION = '0.1.7'
5
5
  end
@@ -11,6 +11,12 @@ rescue LoadError
11
11
  # zstd not available; fallback handled later via gzip
12
12
  end
13
13
 
14
+ begin
15
+ require 'zstd-ruby'
16
+ rescue LoadError
17
+ # alternative zstd gem not available
18
+ end
19
+
14
20
  module Purplelight
15
21
  # WriterCSV writes documents to CSV files with optional compression.
16
22
  class WriterCSV
@@ -24,6 +30,8 @@ module Purplelight
24
30
  @rotate_bytes = rotate_bytes
25
31
  @logger = logger
26
32
  @manifest = manifest
33
+ env_level = ENV['PL_ZSTD_LEVEL']&.to_i
34
+ @compression_level = (env_level && env_level > 0 ? env_level : nil)
27
35
  @single_file = single_file
28
36
 
29
37
  @columns = columns&.map(&:to_s)
@@ -91,6 +99,39 @@ module Purplelight
91
99
 
92
100
  private
93
101
 
102
+ # Minimal wrapper to count bytes written for rotate logic when
103
+ # underlying compressed writer doesn't expose position (e.g., zstd-ruby).
104
+ class CountingIO
105
+ def initialize(io, on_write:)
106
+ @io = io
107
+ @on_write = on_write
108
+ end
109
+
110
+ def write(data)
111
+ bytes_written = @io.write(data)
112
+ @on_write.call(bytes_written) if bytes_written && @on_write
113
+ bytes_written
114
+ end
115
+
116
+ # CSV calls '<<' on the underlying IO in some code paths
117
+ def <<(data)
118
+ write(data)
119
+ end
120
+
121
+ # CSV#flush may forward flush to underlying IO; make it a no-op if unavailable
122
+ def flush
123
+ @io.flush if @io.respond_to?(:flush)
124
+ end
125
+
126
+ def method_missing(method_name, *args, &block)
127
+ @io.send(method_name, *args, &block)
128
+ end
129
+
130
+ def respond_to_missing?(method_name, include_private = false)
131
+ @io.respond_to?(method_name, include_private)
132
+ end
133
+ end
134
+
94
135
  def ensure_open!
95
136
  return if @io
96
137
 
@@ -98,7 +139,8 @@ module Purplelight
98
139
  path = next_part_path
99
140
  @part_index = @manifest&.open_part!(path) if @manifest
100
141
  raw = File.open(path, 'wb')
101
- @io = build_compressed_io(raw)
142
+ compressed = build_compressed_io(raw)
143
+ @io = CountingIO.new(compressed, on_write: ->(n) { @bytes_written += n })
102
144
  @csv = CSV.new(@io)
103
145
  @bytes_written = 0
104
146
  @rows_written = 0
@@ -107,7 +149,13 @@ module Purplelight
107
149
  def build_compressed_io(raw)
108
150
  case @effective_compression.to_s
109
151
  when 'zstd'
110
- return ZSTDS::Writer.open(raw, level: 10) if defined?(ZSTDS)
152
+ if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
153
+ level = @compression_level || 10
154
+ return ::Zstd::StreamWriter.new(raw, level: level)
155
+ elsif defined?(ZSTDS)
156
+ level = @compression_level || 10
157
+ return ZSTDS::Stream::Writer.new(raw, compression_level: level)
158
+ end
111
159
 
112
160
  @logger&.warn('zstd gem not loaded; using gzip')
113
161
  Zlib::GzipWriter.new(raw)
@@ -154,7 +202,7 @@ module Purplelight
154
202
  def determine_effective_compression(requested)
155
203
  case requested.to_s
156
204
  when 'zstd'
157
- (defined?(ZSTDS) ? :zstd : :gzip)
205
+ ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
158
206
  when 'none'
159
207
  :none
160
208
  else
@@ -10,6 +10,12 @@ rescue LoadError
10
10
  # zstd not available; will fallback to gzip
11
11
  end
12
12
 
13
+ begin
14
+ require 'zstd-ruby'
15
+ rescue LoadError
16
+ # alternative zstd gem not available
17
+ end
18
+
13
19
  module Purplelight
14
20
  # WriterJSONL writes newline-delimited JSON with optional compression.
15
21
  class WriterJSONL
@@ -23,7 +29,8 @@ module Purplelight
23
29
  @rotate_bytes = rotate_bytes
24
30
  @logger = logger
25
31
  @manifest = manifest
26
- @compression_level = compression_level
32
+ env_level = ENV['PL_ZSTD_LEVEL']&.to_i
33
+ @compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)
27
34
 
28
35
  @part_index = nil
29
36
  @io = nil
@@ -33,23 +40,71 @@ module Purplelight
33
40
  @closed = false
34
41
 
35
42
  @effective_compression = determine_effective_compression(@compression)
36
- return unless @effective_compression.to_s != @compression.to_s
37
-
38
- @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
43
+ if @logger
44
+ level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
45
+ @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
46
+ end
47
+ if @effective_compression.to_s != @compression.to_s
48
+ @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
49
+ end
39
50
  end
40
51
 
41
- def write_many(array_of_docs)
52
+ def write_many(batch)
42
53
  ensure_open!
43
- # If upstream already produced newline-terminated strings, join fast.
44
- buffer = if array_of_docs.first.is_a?(String)
45
- array_of_docs.join
46
- else
47
- array_of_docs.map { |doc| "#{JSON.generate(doc)}\n" }.join
48
- end
49
- rows = array_of_docs.size
50
- write_buffer(buffer)
54
+
55
+ chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
56
+ total_bytes = 0
57
+ rows = 0
58
+
59
+ if batch.is_a?(String)
60
+ # Fast-path: writer received a preassembled buffer string
61
+ buffer = batch
62
+ rows = buffer.count("\n")
63
+ write_buffer(buffer)
64
+ total_bytes = buffer.bytesize
65
+ elsif batch.first.is_a?(String)
66
+ # Join and write in chunks to avoid large intermediate allocations
67
+ chunk = +''
68
+ chunk_bytes = 0
69
+ batch.each do |line|
70
+ chunk << line
71
+ rows += 1
72
+ chunk_bytes += line.bytesize
73
+ next unless chunk_bytes >= chunk_threshold
74
+
75
+ write_buffer(chunk)
76
+ total_bytes += chunk.bytesize
77
+ chunk = +''
78
+ chunk_bytes = 0
79
+ end
80
+ unless chunk.empty?
81
+ write_buffer(chunk)
82
+ total_bytes += chunk.bytesize
83
+ end
84
+ else
85
+ # Fallback: encode docs here (JSON.fast_generate preferred) and write in chunks
86
+ chunk = +''
87
+ chunk_bytes = 0
88
+ batch.each do |doc|
89
+ line = "#{JSON.fast_generate(doc)}\n"
90
+ rows += 1
91
+ chunk << line
92
+ chunk_bytes += line.bytesize
93
+ next unless chunk_bytes >= chunk_threshold
94
+
95
+ write_buffer(chunk)
96
+ total_bytes += chunk.bytesize
97
+ chunk = +''
98
+ chunk_bytes = 0
99
+ end
100
+ unless chunk.empty?
101
+ write_buffer(chunk)
102
+ total_bytes += chunk.bytesize
103
+ end
104
+ end
105
+
51
106
  @rows_written += rows
52
- @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: buffer.bytesize)
107
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: total_bytes)
53
108
  end
54
109
 
55
110
  def rotate_if_needed
@@ -86,15 +141,18 @@ module Purplelight
86
141
  def build_compressed_io(raw)
87
142
  case @effective_compression.to_s
88
143
  when 'zstd'
89
- if defined?(ZSTDS)
90
- # ZSTDS::Writer supports IO-like interface
144
+ # Prefer zstd-ruby if available, else ruby-zstds
145
+ if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
146
+ level = @compression_level || 3
147
+ return ::Zstd::StreamWriter.new(raw, level: level)
148
+ elsif defined?(ZSTDS)
91
149
  level = @compression_level || 3
92
- ZSTDS::Writer.open(raw, level: level)
93
- else
94
- @logger&.warn('zstd gem not loaded; this should have been handled earlier')
95
- level = @compression_level || Zlib::DEFAULT_COMPRESSION
96
- Zlib::GzipWriter.new(raw, level)
150
+ return ZSTDS::Stream::Writer.new(raw, compression_level: level)
97
151
  end
152
+
153
+ @logger&.warn('zstd gems not loaded; falling back to gzip')
154
+ level = @compression_level || Zlib::DEFAULT_COMPRESSION
155
+ Zlib::GzipWriter.new(raw, level)
98
156
  when 'gzip'
99
157
  level = @compression_level || 1
100
158
  Zlib::GzipWriter.new(raw, level)
@@ -142,7 +200,7 @@ module Purplelight
142
200
  def determine_effective_compression(requested)
143
201
  case requested.to_s
144
202
  when 'zstd'
145
- (defined?(ZSTDS) ? :zstd : :gzip)
203
+ ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
146
204
  when 'none'
147
205
  :none
148
206
  else
@@ -116,9 +116,9 @@ module Purplelight
116
116
  def next_part_path
117
117
  ext = 'parquet'
118
118
  filename = if @single_file
119
- format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
119
+ "#{@prefix}.#{ext}"
120
120
  else
121
- format('%<prefix}s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
121
+ format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
122
122
  end
123
123
  File.join(@directory, filename)
124
124
  end
@@ -132,7 +132,12 @@ module Purplelight
132
132
  end
133
133
 
134
134
  def extract_value(doc, key)
135
- doc[key] || doc[key.to_sym]
135
+ value = doc[key] || doc[key.to_sym]
136
+ # Normalize common MongoDB/BSON types to Parquet-friendly values
137
+ if defined?(BSON) && value.is_a?(BSON::ObjectId)
138
+ return value.to_s
139
+ end
140
+ value
136
141
  end
137
142
  end
138
143
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson