purplelight 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/purplelight +2 -0
- data/lib/purplelight/snapshot.rb +25 -7
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +51 -3
- data/lib/purplelight/writer_jsonl.rb +80 -22
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
|
4
|
+
data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
|
7
|
+
data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
|
data/README.md
CHANGED
data/bin/purplelight
CHANGED
@@ -40,6 +40,7 @@ parser = OptionParser.new do |opts|
|
|
40
40
|
options[:sharding][:prefix] = v
|
41
41
|
end
|
42
42
|
opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
|
43
|
+
opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
|
43
44
|
opts.on('--read-preference MODE',
|
44
45
|
'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
|
45
46
|
options[:read_preference] = v.to_sym
|
@@ -106,6 +107,7 @@ ok = Purplelight.snapshot(
|
|
106
107
|
partitions: options[:partitions],
|
107
108
|
batch_size: options[:batch_size],
|
108
109
|
query: options[:query],
|
110
|
+
projection: options[:projection],
|
109
111
|
sharding: options[:sharding],
|
110
112
|
read_preference: effective_read || options[:read_preference],
|
111
113
|
resume: { enabled: true },
|
data/lib/purplelight/snapshot.rb
CHANGED
@@ -221,6 +221,8 @@ module Purplelight
|
|
221
221
|
cursor = @collection.find(filter, opts)
|
222
222
|
|
223
223
|
encode_lines = (@format == :jsonl)
|
224
|
+
# When JSONL, build one big string per batch to offload join cost from writer.
|
225
|
+
string_batch = +''
|
224
226
|
buffer = []
|
225
227
|
buffer_bytes = 0
|
226
228
|
last_id = checkpoint
|
@@ -230,27 +232,43 @@ module Purplelight
|
|
230
232
|
doc = @mapper.call(doc) if @mapper
|
231
233
|
t_ser = telemetry.start(:serialize_time)
|
232
234
|
if encode_lines
|
233
|
-
line = "#{JSON.
|
235
|
+
line = "#{JSON.fast_generate(doc)}\n"
|
234
236
|
telemetry.finish(:serialize_time, t_ser)
|
235
237
|
bytes = line.bytesize
|
236
|
-
|
238
|
+
string_batch << line
|
237
239
|
else
|
238
240
|
# For CSV/Parquet keep raw docs to allow schema/row building
|
239
|
-
bytes = (JSON.
|
241
|
+
bytes = (JSON.fast_generate(doc).bytesize + 1)
|
240
242
|
telemetry.finish(:serialize_time, t_ser)
|
241
243
|
buffer << doc
|
242
244
|
end
|
243
245
|
buffer_bytes += bytes
|
244
|
-
|
246
|
+
# For JSONL, we count rows via newline accumulation; for others, use array length
|
247
|
+
ready = encode_lines ? (buffer_bytes >= 1_000_000 || (string_batch.length >= 1_000_000)) : (buffer.length >= batch_size || buffer_bytes >= 1_000_000)
|
248
|
+
next unless ready
|
245
249
|
|
246
250
|
t_q = telemetry.start(:queue_wait_time)
|
247
|
-
|
251
|
+
if encode_lines
|
252
|
+
queue.push(string_batch, bytes: buffer_bytes)
|
253
|
+
string_batch = +''
|
254
|
+
else
|
255
|
+
queue.push(buffer, bytes: buffer_bytes)
|
256
|
+
buffer = []
|
257
|
+
end
|
248
258
|
telemetry.finish(:queue_wait_time, t_q)
|
249
259
|
manifest.update_partition_checkpoint!(idx, last_id)
|
250
|
-
buffer = []
|
251
260
|
buffer_bytes = 0
|
252
261
|
end
|
253
|
-
|
262
|
+
if encode_lines
|
263
|
+
unless string_batch.empty?
|
264
|
+
t_q = telemetry.start(:queue_wait_time)
|
265
|
+
queue.push(string_batch, bytes: buffer_bytes)
|
266
|
+
telemetry.finish(:queue_wait_time, t_q)
|
267
|
+
manifest.update_partition_checkpoint!(idx, last_id)
|
268
|
+
string_batch = +''
|
269
|
+
buffer_bytes = 0
|
270
|
+
end
|
271
|
+
elsif !buffer.empty?
|
254
272
|
t_q = telemetry.start(:queue_wait_time)
|
255
273
|
queue.push(buffer, bytes: buffer_bytes)
|
256
274
|
telemetry.finish(:queue_wait_time, t_q)
|
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_csv.rb
CHANGED
@@ -11,6 +11,12 @@ rescue LoadError
|
|
11
11
|
# zstd not available; fallback handled later via gzip
|
12
12
|
end
|
13
13
|
|
14
|
+
begin
|
15
|
+
require 'zstd-ruby'
|
16
|
+
rescue LoadError
|
17
|
+
# alternative zstd gem not available
|
18
|
+
end
|
19
|
+
|
14
20
|
module Purplelight
|
15
21
|
# WriterCSV writes documents to CSV files with optional compression.
|
16
22
|
class WriterCSV
|
@@ -24,6 +30,8 @@ module Purplelight
|
|
24
30
|
@rotate_bytes = rotate_bytes
|
25
31
|
@logger = logger
|
26
32
|
@manifest = manifest
|
33
|
+
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
34
|
+
@compression_level = (env_level && env_level > 0 ? env_level : nil)
|
27
35
|
@single_file = single_file
|
28
36
|
|
29
37
|
@columns = columns&.map(&:to_s)
|
@@ -91,6 +99,39 @@ module Purplelight
|
|
91
99
|
|
92
100
|
private
|
93
101
|
|
102
|
+
# Minimal wrapper to count bytes written for rotate logic when
|
103
|
+
# underlying compressed writer doesn't expose position (e.g., zstd-ruby).
|
104
|
+
class CountingIO
|
105
|
+
def initialize(io, on_write:)
|
106
|
+
@io = io
|
107
|
+
@on_write = on_write
|
108
|
+
end
|
109
|
+
|
110
|
+
def write(data)
|
111
|
+
bytes_written = @io.write(data)
|
112
|
+
@on_write.call(bytes_written) if bytes_written && @on_write
|
113
|
+
bytes_written
|
114
|
+
end
|
115
|
+
|
116
|
+
# CSV calls '<<' on the underlying IO in some code paths
|
117
|
+
def <<(data)
|
118
|
+
write(data)
|
119
|
+
end
|
120
|
+
|
121
|
+
# CSV#flush may forward flush to underlying IO; make it a no-op if unavailable
|
122
|
+
def flush
|
123
|
+
@io.flush if @io.respond_to?(:flush)
|
124
|
+
end
|
125
|
+
|
126
|
+
def method_missing(method_name, *args, &block)
|
127
|
+
@io.send(method_name, *args, &block)
|
128
|
+
end
|
129
|
+
|
130
|
+
def respond_to_missing?(method_name, include_private = false)
|
131
|
+
@io.respond_to?(method_name, include_private)
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
94
135
|
def ensure_open!
|
95
136
|
return if @io
|
96
137
|
|
@@ -98,7 +139,8 @@ module Purplelight
|
|
98
139
|
path = next_part_path
|
99
140
|
@part_index = @manifest&.open_part!(path) if @manifest
|
100
141
|
raw = File.open(path, 'wb')
|
101
|
-
|
142
|
+
compressed = build_compressed_io(raw)
|
143
|
+
@io = CountingIO.new(compressed, on_write: ->(n) { @bytes_written += n })
|
102
144
|
@csv = CSV.new(@io)
|
103
145
|
@bytes_written = 0
|
104
146
|
@rows_written = 0
|
@@ -107,7 +149,13 @@ module Purplelight
|
|
107
149
|
def build_compressed_io(raw)
|
108
150
|
case @effective_compression.to_s
|
109
151
|
when 'zstd'
|
110
|
-
|
152
|
+
if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
|
153
|
+
level = @compression_level || 10
|
154
|
+
return ::Zstd::StreamWriter.new(raw, level: level)
|
155
|
+
elsif defined?(ZSTDS)
|
156
|
+
level = @compression_level || 10
|
157
|
+
return ZSTDS::Stream::Writer.new(raw, compression_level: level)
|
158
|
+
end
|
111
159
|
|
112
160
|
@logger&.warn('zstd gem not loaded; using gzip')
|
113
161
|
Zlib::GzipWriter.new(raw)
|
@@ -154,7 +202,7 @@ module Purplelight
|
|
154
202
|
def determine_effective_compression(requested)
|
155
203
|
case requested.to_s
|
156
204
|
when 'zstd'
|
157
|
-
(defined?(ZSTDS) ? :zstd : :gzip)
|
205
|
+
((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
|
158
206
|
when 'none'
|
159
207
|
:none
|
160
208
|
else
|
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -10,6 +10,12 @@ rescue LoadError
|
|
10
10
|
# zstd not available; will fallback to gzip
|
11
11
|
end
|
12
12
|
|
13
|
+
begin
|
14
|
+
require 'zstd-ruby'
|
15
|
+
rescue LoadError
|
16
|
+
# alternative zstd gem not available
|
17
|
+
end
|
18
|
+
|
13
19
|
module Purplelight
|
14
20
|
# WriterJSONL writes newline-delimited JSON with optional compression.
|
15
21
|
class WriterJSONL
|
@@ -23,7 +29,8 @@ module Purplelight
|
|
23
29
|
@rotate_bytes = rotate_bytes
|
24
30
|
@logger = logger
|
25
31
|
@manifest = manifest
|
26
|
-
|
32
|
+
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
33
|
+
@compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)
|
27
34
|
|
28
35
|
@part_index = nil
|
29
36
|
@io = nil
|
@@ -33,23 +40,71 @@ module Purplelight
|
|
33
40
|
@closed = false
|
34
41
|
|
35
42
|
@effective_compression = determine_effective_compression(@compression)
|
36
|
-
|
37
|
-
|
38
|
-
|
43
|
+
if @logger
|
44
|
+
level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
|
45
|
+
@logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
|
46
|
+
end
|
47
|
+
if @effective_compression.to_s != @compression.to_s
|
48
|
+
@logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
|
49
|
+
end
|
39
50
|
end
|
40
51
|
|
41
|
-
def write_many(
|
52
|
+
def write_many(batch)
|
42
53
|
ensure_open!
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
54
|
+
|
55
|
+
chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
|
56
|
+
total_bytes = 0
|
57
|
+
rows = 0
|
58
|
+
|
59
|
+
if batch.is_a?(String)
|
60
|
+
# Fast-path: writer received a preassembled buffer string
|
61
|
+
buffer = batch
|
62
|
+
rows = buffer.count("\n")
|
63
|
+
write_buffer(buffer)
|
64
|
+
total_bytes = buffer.bytesize
|
65
|
+
elsif batch.first.is_a?(String)
|
66
|
+
# Join and write in chunks to avoid large intermediate allocations
|
67
|
+
chunk = +''
|
68
|
+
chunk_bytes = 0
|
69
|
+
batch.each do |line|
|
70
|
+
chunk << line
|
71
|
+
rows += 1
|
72
|
+
chunk_bytes += line.bytesize
|
73
|
+
next unless chunk_bytes >= chunk_threshold
|
74
|
+
|
75
|
+
write_buffer(chunk)
|
76
|
+
total_bytes += chunk.bytesize
|
77
|
+
chunk = +''
|
78
|
+
chunk_bytes = 0
|
79
|
+
end
|
80
|
+
unless chunk.empty?
|
81
|
+
write_buffer(chunk)
|
82
|
+
total_bytes += chunk.bytesize
|
83
|
+
end
|
84
|
+
else
|
85
|
+
# Fallback: encode docs here (JSON.fast_generate preferred) and write in chunks
|
86
|
+
chunk = +''
|
87
|
+
chunk_bytes = 0
|
88
|
+
batch.each do |doc|
|
89
|
+
line = "#{JSON.fast_generate(doc)}\n"
|
90
|
+
rows += 1
|
91
|
+
chunk << line
|
92
|
+
chunk_bytes += line.bytesize
|
93
|
+
next unless chunk_bytes >= chunk_threshold
|
94
|
+
|
95
|
+
write_buffer(chunk)
|
96
|
+
total_bytes += chunk.bytesize
|
97
|
+
chunk = +''
|
98
|
+
chunk_bytes = 0
|
99
|
+
end
|
100
|
+
unless chunk.empty?
|
101
|
+
write_buffer(chunk)
|
102
|
+
total_bytes += chunk.bytesize
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
51
106
|
@rows_written += rows
|
52
|
-
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta:
|
107
|
+
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: total_bytes)
|
53
108
|
end
|
54
109
|
|
55
110
|
def rotate_if_needed
|
@@ -86,15 +141,18 @@ module Purplelight
|
|
86
141
|
def build_compressed_io(raw)
|
87
142
|
case @effective_compression.to_s
|
88
143
|
when 'zstd'
|
89
|
-
if
|
90
|
-
|
144
|
+
# Prefer zstd-ruby if available, else ruby-zstds
|
145
|
+
if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
|
146
|
+
level = @compression_level || 3
|
147
|
+
return ::Zstd::StreamWriter.new(raw, level: level)
|
148
|
+
elsif defined?(ZSTDS)
|
91
149
|
level = @compression_level || 3
|
92
|
-
ZSTDS::Writer.
|
93
|
-
else
|
94
|
-
@logger&.warn('zstd gem not loaded; this should have been handled earlier')
|
95
|
-
level = @compression_level || Zlib::DEFAULT_COMPRESSION
|
96
|
-
Zlib::GzipWriter.new(raw, level)
|
150
|
+
return ZSTDS::Stream::Writer.new(raw, compression_level: level)
|
97
151
|
end
|
152
|
+
|
153
|
+
@logger&.warn('zstd gems not loaded; falling back to gzip')
|
154
|
+
level = @compression_level || Zlib::DEFAULT_COMPRESSION
|
155
|
+
Zlib::GzipWriter.new(raw, level)
|
98
156
|
when 'gzip'
|
99
157
|
level = @compression_level || 1
|
100
158
|
Zlib::GzipWriter.new(raw, level)
|
@@ -142,7 +200,7 @@ module Purplelight
|
|
142
200
|
def determine_effective_compression(requested)
|
143
201
|
case requested.to_s
|
144
202
|
when 'zstd'
|
145
|
-
(defined?(ZSTDS) ? :zstd : :gzip)
|
203
|
+
((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
|
146
204
|
when 'none'
|
147
205
|
:none
|
148
206
|
else
|