purplelight 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/purplelight +2 -0
- data/lib/purplelight/partitioner.rb +80 -22
- data/lib/purplelight/snapshot.rb +62 -11
- data/lib/purplelight/telemetry.rb +51 -0
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +55 -3
- data/lib/purplelight/writer_jsonl.rb +84 -22
- data/lib/purplelight/writer_parquet.rb +5 -0
- metadata +27 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
+  data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
+  data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
data/README.md
CHANGED
data/bin/purplelight
CHANGED
@@ -40,6 +40,7 @@ parser = OptionParser.new do |opts|
     options[:sharding][:prefix] = v
   end
   opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
+  opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
   opts.on('--read-preference MODE',
           'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
     options[:read_preference] = v.to_sym

@@ -106,6 +107,7 @@ ok = Purplelight.snapshot(
  partitions: options[:partitions],
  batch_size: options[:batch_size],
  query: options[:query],
+  projection: options[:projection],
  sharding: options[:sharding],
  read_preference: effective_read || options[:read_preference],
  resume: { enabled: true },
data/lib/purplelight/partitioner.rb
CHANGED
@@ -11,7 +11,86 @@ module Purplelight
   class Partitioner
     # Builds contiguous _id range filters for N partitions.
     # For ObjectId _id, we sample quantiles to split into near-equal document counts.
-    def self.object_id_partitions(collection:, query:, partitions:)
+    def self.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil)
+      # Choose planning mode: :timestamp (fast), :cursor (legacy)
+      chosen_mode = (mode || ENV['PL_PARTITIONER_MODE'] || :timestamp).to_sym
+      telemetry ||= (defined?(Telemetry) ? Telemetry::NULL : nil)
+
+      return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if chosen_mode == :cursor
+
+      timestamp_partitions(collection: collection, query: query, partitions: partitions, telemetry: telemetry)
+    end
+
+    def self.simple_ranges(collection:, query:, partitions:)
+      # Split by _id quantiles using min/max endpoints
+      min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
+      max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
+      return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
+
+      # Create numeric-ish interpolation by sampling
+      ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
+      boundaries = [min_id] + ids + [max_id]
+      ranges = []
+      boundaries.each_cons(2) do |a, b|
+        ranges << build_range(a, b)
+      end
+
+      ranges.map do |r|
+        filter = query ? query.dup : {}
+        filter['_id'] = r
+        { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+      end
+    end
+
+    # Faster planning using ObjectId timestamps: O(partitions) indexed lookups
+    def self.timestamp_partitions(collection:, query:, partitions:, telemetry: nil)
+      t_minmax = telemetry&.start(:plan_minmax_time)
+      min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
+      max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
+      telemetry&.finish(:plan_minmax_time, t_minmax)
+
+      return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
+
+      min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
+      max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
+
+      # Fallback to cursor sampling if _id isn't an ObjectId
+      return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
+
+      step = [(max_ts - min_ts) / partitions, 1].max
+      inner_boundaries = []
+      t_boundaries = telemetry&.start(:plan_boundary_queries_time)
+      1.upto(partitions - 1) do |i|
+        target_ts = min_ts + (step * i)
+        candidate = BSON::ObjectId.from_time(Time.at(target_ts))
+        f = query ? query.dup : {}
+        f['_id'] = { '$gt' => candidate }
+        b = collection.find(f).projection(_id: 1).sort(_id: 1).hint(_id: 1).limit(1).first&.dig('_id')
+        inner_boundaries << b if b
+      end
+      telemetry&.finish(:plan_boundary_queries_time, t_boundaries)
+
+      # Build ranges: first range has nil lower bound to include min_id,
+      # middle ranges are (prev, current], and last is (last, +inf)
+      ranges = []
+      t_ranges = telemetry&.start(:plan_ranges_build_time)
+      prev = nil
+      inner_boundaries.each do |b|
+        ranges << build_range(prev, b)
+        prev = b
+      end
+      ranges << build_range(prev, nil)
+      telemetry&.finish(:plan_ranges_build_time, t_ranges)
+
+      ranges.map do |r|
+        filter = query ? query.dup : {}
+        filter['_id'] = r
+        { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+      end
+    end
+
+    # Legacy cursor sampling planner
+    def self.cursor_sampling_partitions(collection:, query:, partitions:)
       # Ensure sort order for sampling
       base_query = collection.find(query || {}, {}.merge(sort: { _id: 1 }))


@@ -49,27 +128,6 @@ module Purplelight
       end
     end

-    def self.simple_ranges(collection:, query:, partitions:)
-      # Split by _id quantiles using min/max endpoints
-      min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
-      max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
-      return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
-
-      # Create numeric-ish interpolation by sampling
-      ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
-      boundaries = [min_id] + ids + [max_id]
-      ranges = []
-      boundaries.each_cons(2) do |a, b|
-        ranges << build_range(a, b)
-      end
-
-      ranges.map do |r|
-        filter = query ? query.dup : {}
-        filter['_id'] = r
-        { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
-      end
-    end
-
     def self.build_range(from_id, to_id)
       if from_id && to_id
         { '$gt' => from_id, '$lte' => to_id }
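The timestamp planner above exploits the creation timestamp embedded in every ObjectId: near-uniform time slices yield near-uniform _id ranges with only O(partitions) indexed lookups, instead of walking a cursor. A minimal illustrative sketch of the boundary-candidate idea, assuming only the bson gem (the collection queries and telemetry plumbing from the diff are omitted; candidate_boundaries is a hypothetical helper, not the gem's API):

require 'bson'

# Derive N-1 candidate _id boundaries from the ObjectId embedded timestamps.
# The real planner then snaps each candidate to the first actual _id greater
# than it, using an indexed find.
def candidate_boundaries(min_id, max_id, partitions)
  min_ts = min_id.generation_time.to_i
  max_ts = max_id.generation_time.to_i
  step = [(max_ts - min_ts) / partitions, 1].max
  (1...partitions).map do |i|
    BSON::ObjectId.from_time(Time.at(min_ts + (step * i)))
  end
end

early = BSON::ObjectId.from_time(Time.utc(2024, 1, 1))
late  = BSON::ObjectId.from_time(Time.utc(2024, 6, 1))
p candidate_boundaries(early, late, 4).map(&:generation_time)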
data/lib/purplelight/snapshot.rb
CHANGED
@@ -10,6 +10,7 @@ require_relative 'writer_csv'
 require_relative 'writer_parquet'
 require_relative 'manifest'
 require_relative 'errors'
+require_relative 'telemetry'

 module Purplelight
   # Snapshot orchestrates partition planning, parallel reads, and writing.

@@ -37,7 +38,7 @@ module Purplelight
                    resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
-                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout])
+                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil)
      @client = client
      @collection = client[collection]
      @output = output

@@ -60,6 +61,10 @@ module Purplelight
      @no_cursor_timeout = no_cursor_timeout

      @running = true
+      @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
+      @telemetry = telemetry || (
+        @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
+      )
    end

    # rubocop:disable Naming/PredicateMethod

@@ -90,8 +95,10 @@ module Purplelight
       manifest.ensure_partitions!(@partitions)

       # Plan partitions
+      t_plan = @telemetry.start(:partition_plan_time)
       partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query,
-                                                           partitions: @partitions)
+                                                           partitions: @partitions, telemetry: @telemetry)
+      @telemetry.finish(:partition_plan_time, t_plan)

       # Reader queue
       queue = ByteQueue.new(max_bytes: @queue_size_bytes)

@@ -116,12 +123,17 @@ module Purplelight
       # Start reader threads
       readers = partition_filters.each_with_index.map do |pf, idx|
         Thread.new do
-          read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest)
+          local_telemetry = @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
+          read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest, telemetry: local_telemetry)
+          # Merge per-thread telemetry
+          @telemetry.merge!(local_telemetry) if @telemetry_enabled
         end
       end

       # Writer loop
+      writer_telemetry = @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
       writer_thread = Thread.new do
+        Thread.current[:pl_telemetry] = writer_telemetry if @telemetry_enabled
         loop do
           batch = queue.pop
           break if batch.nil?

@@ -146,8 +158,22 @@ module Purplelight
       readers.each(&:join)
       queue.close
       writer_thread.join
+      @telemetry.merge!(writer_telemetry) if @telemetry_enabled
       @running = false
       progress_thread.join
+      if @telemetry_enabled
+        total = @telemetry.timers.values.sum
+        breakdown = @telemetry.timers
+                              .sort_by { |_k, v| -v }
+                              .map { |k, v| [k, v, total.zero? ? 0 : ((v / total) * 100.0)] }
+        if @logger
+          @logger.info('Telemetry (seconds and % of timed work):')
+          breakdown.each { |k, v, pct| @logger.info("  #{k}: #{v.round(3)}s (#{pct.round(1)}%)") }
+        else
+          puts 'Telemetry (seconds and % of timed work):'
+          breakdown.each { |k, v, pct| puts "  #{k}: #{v.round(3)}s (#{pct.round(1)}%)" }
+        end
+      end
       true
     end
     # rubocop:enable Naming/PredicateMethod

@@ -167,7 +193,7 @@ module Purplelight
       [dir, prefix]
     end

-    def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:)
+    def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:, telemetry: Telemetry::NULL)
       filter = filter_spec[:filter]
       sort = filter_spec[:sort] || { _id: 1 }
       hint = @hint || filter_spec[:hint] || { _id: 1 }

@@ -195,6 +221,8 @@ module Purplelight
       cursor = @collection.find(filter, opts)

       encode_lines = (@format == :jsonl)
+      # When JSONL, build one big string per batch to offload join cost from writer.
+      string_batch = +''
       buffer = []
       buffer_bytes = 0
       last_id = checkpoint

@@ -202,25 +230,48 @@ module Purplelight
       cursor.each do |doc|
         last_id = doc['_id']
         doc = @mapper.call(doc) if @mapper
+        t_ser = telemetry.start(:serialize_time)
         if encode_lines
-          line = "#{JSON.generate(doc)}\n"
+          line = "#{JSON.fast_generate(doc)}\n"
+          telemetry.finish(:serialize_time, t_ser)
           bytes = line.bytesize
-          buffer << line
+          string_batch << line
         else
           # For CSV/Parquet keep raw docs to allow schema/row building
-          bytes = (JSON.generate(doc).bytesize + 1)
+          bytes = (JSON.fast_generate(doc).bytesize + 1)
+          telemetry.finish(:serialize_time, t_ser)
           buffer << doc
         end
         buffer_bytes += bytes
-        next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
+        # For JSONL, we count rows via newline accumulation; for others, use array length
+        ready = encode_lines ? (buffer_bytes >= 1_000_000 || (string_batch.length >= 1_000_000)) : (buffer.length >= batch_size || buffer_bytes >= 1_000_000)
+        next unless ready

-        queue.push(buffer, bytes: buffer_bytes)
+        t_q = telemetry.start(:queue_wait_time)
+        if encode_lines
+          queue.push(string_batch, bytes: buffer_bytes)
+          string_batch = +''
+        else
+          queue.push(buffer, bytes: buffer_bytes)
+          buffer = []
+        end
+        telemetry.finish(:queue_wait_time, t_q)
         manifest.update_partition_checkpoint!(idx, last_id)
-        buffer = []
         buffer_bytes = 0
       end
-      unless buffer.empty?
+      if encode_lines
+        unless string_batch.empty?
+          t_q = telemetry.start(:queue_wait_time)
+          queue.push(string_batch, bytes: buffer_bytes)
+          telemetry.finish(:queue_wait_time, t_q)
+          manifest.update_partition_checkpoint!(idx, last_id)
+          string_batch = +''
+          buffer_bytes = 0
+        end
+      elsif !buffer.empty?
+        t_q = telemetry.start(:queue_wait_time)
         queue.push(buffer, bytes: buffer_bytes)
+        telemetry.finish(:queue_wait_time, t_q)
         manifest.update_partition_checkpoint!(idx, last_id)
         buffer = []
         buffer_bytes = 0
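Putting the snapshot changes together: telemetry arrives either through the new telemetry: keyword or the PL_TELEMETRY=1 environment flag, per-thread instances are merged into the main one, and a timing breakdown is printed at the end of the run. A hedged usage sketch (the URI, collection name, and output path are placeholders; the option names follow the CLI hunk earlier in this diff):

require 'mongo'
require 'purplelight'

ENV['PL_TELEMETRY'] = '1' # print the timing breakdown when the run completes

client = Mongo::Client.new('mongodb://localhost:27017/mydb') # placeholder URI
ok = Purplelight.snapshot(
  client: client,
  collection: 'users',            # placeholder collection name
  output: '/tmp/users_snapshot',  # placeholder output path
  partitions: 8,
  query: { 'status' => 'active' },
  projection: { '_id' => 1, 'email' => 1 }, # new in this release
  resume: { enabled: true }
)
puts ok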
data/lib/purplelight/telemetry.rb
ADDED
@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+
+module Purplelight
+  # Lightweight, low-overhead timing and counters with a ticket API.
+  class Telemetry
+    def initialize(enabled: true)
+      @enabled = enabled
+      @counters = Hash.new(0)
+      @timers = Hash.new(0.0)
+      @mutex = Mutex.new
+    end
+
+    def enabled?
+      @enabled
+    end
+
+    # Start a timer. Returns a ticket (Float) when enabled, or nil when disabled.
+    def start(_key)
+      return nil unless @enabled
+
+      Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    end
+
+    # Finish a timer using a ticket from start. No-ops if ticket is nil.
+    def finish(key, ticket)
+      return unless @enabled && ticket
+
+      dt = Process.clock_gettime(Process::CLOCK_MONOTONIC) - ticket
+      @timers[key] += dt
+    end
+
+    def add(key, count = 1)
+      return unless @enabled
+
+      @counters[key] += count
+    end
+
+    def merge!(other)
+      return self unless @enabled
+
+      other.counters.each { |k, v| @counters[k] += v }
+      other.timers.each { |k, v| @timers[k] += v }
+      self
+    end
+
+    attr_reader :counters, :timers
+
+    # A disabled singleton for zero overhead checks if needed.
+    NULL = new(enabled: false)
+  end
+end
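The ticket API keeps the disabled path nearly free: start returns nil when disabled and finish no-ops on a nil ticket, so instrumented code needs no conditionals around timing calls. A small usage sketch of the class added above, mirroring how Snapshot folds per-thread instances into a shared one:

telemetry = Purplelight::Telemetry.new(enabled: true)

ticket = telemetry.start(:example_work_time) # Float timestamp, or nil when disabled
sleep(0.01)                                  # stand-in for real work
telemetry.finish(:example_work_time, ticket)
telemetry.add(:batches)

# Per-thread instance merged back into the shared one.
per_thread = Purplelight::Telemetry.new(enabled: true)
t = per_thread.start(:example_work_time)
per_thread.finish(:example_work_time, t)
telemetry.merge!(per_thread)

telemetry.timers.each { |k, v| puts "#{k}: #{v.round(4)}s" }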
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_csv.rb
CHANGED
@@ -11,6 +11,12 @@ rescue LoadError
   # zstd not available; fallback handled later via gzip
 end

+begin
+  require 'zstd-ruby'
+rescue LoadError
+  # alternative zstd gem not available
+end
+
 module Purplelight
   # WriterCSV writes documents to CSV files with optional compression.
   class WriterCSV

@@ -24,6 +30,8 @@ module Purplelight
       @rotate_bytes = rotate_bytes
       @logger = logger
       @manifest = manifest
+      env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+      @compression_level = (env_level && env_level > 0 ? env_level : nil)
       @single_file = single_file

       @columns = columns&.map(&:to_s)

@@ -81,14 +89,49 @@ module Purplelight

       @csv&.flush
       if @io
+        t = Thread.current[:pl_telemetry]&.start(:rotate_time)
         finalize_current_part!
         @io.close
+        Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
       end
       @closed = true
     end

     private

+    # Minimal wrapper to count bytes written for rotate logic when
+    # underlying compressed writer doesn't expose position (e.g., zstd-ruby).
+    class CountingIO
+      def initialize(io, on_write:)
+        @io = io
+        @on_write = on_write
+      end
+
+      def write(data)
+        bytes_written = @io.write(data)
+        @on_write.call(bytes_written) if bytes_written && @on_write
+        bytes_written
+      end
+
+      # CSV calls '<<' on the underlying IO in some code paths
+      def <<(data)
+        write(data)
+      end
+
+      # CSV#flush may forward flush to underlying IO; make it a no-op if unavailable
+      def flush
+        @io.flush if @io.respond_to?(:flush)
+      end
+
+      def method_missing(method_name, *args, &block)
+        @io.send(method_name, *args, &block)
+      end
+
+      def respond_to_missing?(method_name, include_private = false)
+        @io.respond_to?(method_name, include_private)
+      end
+    end
+
     def ensure_open!
       return if @io

@@ -96,7 +139,8 @@ module Purplelight
       path = next_part_path
       @part_index = @manifest&.open_part!(path) if @manifest
       raw = File.open(path, 'wb')
-      @io = build_compressed_io(raw)
+      compressed = build_compressed_io(raw)
+      @io = CountingIO.new(compressed, on_write: ->(n) { @bytes_written += n })
       @csv = CSV.new(@io)
       @bytes_written = 0
       @rows_written = 0

@@ -105,7 +149,13 @@ module Purplelight
     def build_compressed_io(raw)
       case @effective_compression.to_s
       when 'zstd'
-
+        if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+          level = @compression_level || 10
+          return ::Zstd::StreamWriter.new(raw, level: level)
+        elsif defined?(ZSTDS)
+          level = @compression_level || 10
+          return ZSTDS::Stream::Writer.new(raw, compression_level: level)
+        end

         @logger&.warn('zstd gem not loaded; using gzip')
         Zlib::GzipWriter.new(raw)

@@ -122,8 +172,10 @@ module Purplelight
     def rotate!
       return unless @io

+      t = Thread.current[:pl_telemetry]&.start(:rotate_time)
       finalize_current_part!
       @io.close
+      Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
       @io = nil
       @csv = nil
       ensure_open!

@@ -150,7 +202,7 @@ module Purplelight
     def determine_effective_compression(requested)
       case requested.to_s
       when 'zstd'
-        (defined?(ZSTDS) ? :zstd : :gzip)
+        ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
       when 'none'
         :none
       else
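CountingIO exists because streaming compressors such as the zstd writers don't reliably expose a byte position the way File#pos does, so rotation-by-size needs its own tally of compressed bytes as they pass through. A standalone sketch of the same pattern against a StringIO (ByteCounter is an illustrative name, not the gem's internal class):

require 'stringio'

# Wrap any IO and tally bytes as they pass through, so rotate-by-size
# logic can work even when the writer exposes no position.
class ByteCounter
  attr_reader :bytes_written

  def initialize(io)
    @io = io
    @bytes_written = 0
  end

  def write(data)
    n = @io.write(data)
    @bytes_written += n if n
    n
  end
  alias_method :<<, :write
end

out = ByteCounter.new(StringIO.new)
out.write("a,b,c\n")
puts out.bytes_written # => 6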
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -10,6 +10,12 @@ rescue LoadError
   # zstd not available; will fallback to gzip
 end

+begin
+  require 'zstd-ruby'
+rescue LoadError
+  # alternative zstd gem not available
+end
+
 module Purplelight
   # WriterJSONL writes newline-delimited JSON with optional compression.
   class WriterJSONL

@@ -23,7 +29,8 @@ module Purplelight
       @rotate_bytes = rotate_bytes
       @logger = logger
       @manifest = manifest
-      @compression_level = compression_level
+      env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+      @compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)

       @part_index = nil
       @io = nil

@@ -33,23 +40,71 @@ module Purplelight
       @closed = false

       @effective_compression = determine_effective_compression(@compression)
-
-
-
+      if @logger
+        level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
+        @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
+      end
+      if @effective_compression.to_s != @compression.to_s
+        @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
+      end
     end

-    def write_many(
+    def write_many(batch)
       ensure_open!
-
-
-
-
-
-
-
-
+
+      chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
+      total_bytes = 0
+      rows = 0
+
+      if batch.is_a?(String)
+        # Fast-path: writer received a preassembled buffer string
+        buffer = batch
+        rows = buffer.count("\n")
+        write_buffer(buffer)
+        total_bytes = buffer.bytesize
+      elsif batch.first.is_a?(String)
+        # Join and write in chunks to avoid large intermediate allocations
+        chunk = +''
+        chunk_bytes = 0
+        batch.each do |line|
+          chunk << line
+          rows += 1
+          chunk_bytes += line.bytesize
+          next unless chunk_bytes >= chunk_threshold
+
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+          chunk = +''
+          chunk_bytes = 0
+        end
+        unless chunk.empty?
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+        end
+      else
+        # Fallback: encode docs here (JSON.fast_generate preferred) and write in chunks
+        chunk = +''
+        chunk_bytes = 0
+        batch.each do |doc|
+          line = "#{JSON.fast_generate(doc)}\n"
+          rows += 1
+          chunk << line
+          chunk_bytes += line.bytesize
+          next unless chunk_bytes >= chunk_threshold
+
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+          chunk = +''
+          chunk_bytes = 0
+        end
+        unless chunk.empty?
+          write_buffer(chunk)
+          total_bytes += chunk.bytesize
+        end
+      end
+
       @rows_written += rows
-      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta:
+      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: total_bytes)
     end

     def rotate_if_needed

@@ -86,15 +141,18 @@ module Purplelight
     def build_compressed_io(raw)
       case @effective_compression.to_s
       when 'zstd'
-        if defined?(ZSTDS)
-
+        # Prefer zstd-ruby if available, else ruby-zstds
+        if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+          level = @compression_level || 3
+          return ::Zstd::StreamWriter.new(raw, level: level)
+        elsif defined?(ZSTDS)
           level = @compression_level || 3
-          ZSTDS::Writer.
-        else
-          @logger&.warn('zstd gem not loaded; this should have been handled earlier')
-          level = @compression_level || Zlib::DEFAULT_COMPRESSION
-          Zlib::GzipWriter.new(raw, level)
+          return ZSTDS::Stream::Writer.new(raw, compression_level: level)
         end
+
+        @logger&.warn('zstd gems not loaded; falling back to gzip')
+        level = @compression_level || Zlib::DEFAULT_COMPRESSION
+        Zlib::GzipWriter.new(raw, level)
       when 'gzip'
         level = @compression_level || 1
         Zlib::GzipWriter.new(raw, level)

@@ -106,7 +164,9 @@ module Purplelight
     end

     def write_buffer(buffer)
+      t = Thread.current[:pl_telemetry]&.start(:write_time)
       @io.write(buffer)
+      Thread.current[:pl_telemetry]&.finish(:write_time, t)
       @bytes_written += buffer.bytesize
       rotate_if_needed
     end

@@ -114,8 +174,10 @@ module Purplelight
     def rotate!
       return unless @io

+      t = Thread.current[:pl_telemetry]&.start(:rotate_time)
       finalize_current_part!
       @io.close
+      Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
       @io = nil
       ensure_open!
     end

@@ -138,7 +200,7 @@ module Purplelight
     def determine_effective_compression(requested)
       case requested.to_s
       when 'zstd'
-        (defined?(ZSTDS) ? :zstd : :gzip)
+        ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
       when 'none'
         :none
       else
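The chunked write path trades a few extra write calls for bounded intermediate allocations: lines accumulate until roughly PL_WRITE_CHUNK_BYTES (8 MiB by default) and are flushed as one buffer. A reduced illustrative sketch of that accumulation loop (write_lines_chunked is a hypothetical helper; the writer, compression, and manifest plumbing are omitted):

# Flush accumulated lines whenever the chunk crosses the threshold,
# mirroring the write_many chunking added in this release.
def write_lines_chunked(io, lines, chunk_threshold: 8 * 1024 * 1024)
  chunk = +''
  chunk_bytes = 0
  lines.each do |line|
    chunk << line
    chunk_bytes += line.bytesize
    next unless chunk_bytes >= chunk_threshold

    io.write(chunk)
    chunk = +''
    chunk_bytes = 0
  end
  io.write(chunk) unless chunk.empty?
end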
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -44,8 +44,13 @@ module Purplelight

      ensure_open!
      unless @buffer_docs.empty?
+        t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
        table = build_table(@buffer_docs)
+        Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+        t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
        write_table(table, @writer_path, append: false)
+        Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
      end
      finalize_current_part!
      @closed = true
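All three writers read their telemetry through Thread.current[:pl_telemetry], so the writer thread can be instrumented without threading a handle through every call. A compact sketch of that handoff, assuming the Telemetry class added in this release:

# The orchestrator pins a telemetry instance to the writer thread;
# writer internals pick it up via Thread.current without new parameters.
writer_telemetry = Purplelight::Telemetry.new(enabled: true)

writer = Thread.new do
  Thread.current[:pl_telemetry] = writer_telemetry
  t = Thread.current[:pl_telemetry]&.start(:write_time)
  # ... perform a batch write here ...
  Thread.current[:pl_telemetry]&.finish(:write_time, t)
end
writer.join
puts writer_telemetry.timers[:write_time]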
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: purplelight
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.6
 platform: ruby
 authors:
 - Alexander Nicholson

@@ -13,44 +13,62 @@ dependencies:
   name: csv
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 3.3.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 3.3.5
 - !ruby/object:Gem::Dependency
   name: logger
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 1.7.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.7'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 1.7.0
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.21'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 2.21.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.21'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 2.21.3
 description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
   multi-threaded readers, and size-based sharded outputs.
 email:

@@ -70,15 +88,16 @@ files:
 - lib/purplelight/partitioner.rb
 - lib/purplelight/queue.rb
 - lib/purplelight/snapshot.rb
+- lib/purplelight/telemetry.rb
 - lib/purplelight/version.rb
 - lib/purplelight/writer_csv.rb
 - lib/purplelight/writer_jsonl.rb
 - lib/purplelight/writer_parquet.rb
+homepage: https://github.com/alexandernicholson/purplelight
 licenses:
 - MIT
 metadata:
   rubygems_mfa_required: 'true'
-  homepage_uri: https://github.com/alexandernicholson/purplelight
   source_code_uri: https://github.com/alexandernicholson/purplelight
   changelog_uri: https://github.com/alexandernicholson/purplelight/releases
 rdoc_options: []