purplelight 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 707f5c45aade0603b8a3c500e91015b9850c2c92deb91492f06cb3f8bbee76d5
- data.tar.gz: dd6217a55fa5bcd3f008699bd8710e8e0ac72f9bb0b8bf5dd71d2f2eccbc5d84
+ metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
+ data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
  SHA512:
- metadata.gz: 23da05fd59362787069ae1df5168d85c210d2c7cfc0fd254c064cae26212ee177b84dff8c72ba825a612b3aef3072188e54fad267c60331105dc026e96b42d50
- data.tar.gz: 8b8a728f0002f60d55c31270ee2751aaddab422777aa2a51d6a8b8d3a25a93fe2be27d218316b6fd077103d2901f7561b4983a27a27fde895b80fdc6ee67089f
+ metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
+ data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
  Add to your Gemfile:

  ```ruby
- gem 'purplelight', '~> 0.1.4'
+ gem 'purplelight', '~> 0.1.6'
  ```

  Or install directly:
data/bin/purplelight CHANGED
@@ -40,6 +40,7 @@ parser = OptionParser.new do |opts|
  options[:sharding][:prefix] = v
  end
  opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
+ opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
  opts.on('--read-preference MODE',
  'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
  options[:read_preference] = v.to_sym
@@ -106,6 +107,7 @@ ok = Purplelight.snapshot(
  partitions: options[:partitions],
  batch_size: options[:batch_size],
  query: options[:query],
+ projection: options[:projection],
  sharding: options[:sharding],
  read_preference: effective_read || options[:read_preference],
  resume: { enabled: true },
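The new `--projection` flag parses a JSON document and threads it straight through to `Purplelight.snapshot`. A minimal sketch of the equivalent library call follows; the URI, database, and collection names are placeholders, and the exact keyword set is assumed from the call visible in this diff.

```ruby
# Hedged sketch: library-level equivalent of
#   purplelight ... --projection '{"_id":1,"status":1}'
# The connection URI and collection name are illustrative placeholders.
require 'mongo'
require 'purplelight'

client = Mongo::Client.new('mongodb://localhost:27017/appdb')
Purplelight.snapshot(
  client: client,
  collection: 'events',
  output: '/tmp/events_snapshot',
  query: { 'status' => 'active' },
  projection: { '_id' => 1, 'status' => 1 }, # export only these fields
  resume: { enabled: true }
)
```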
data/lib/purplelight/partitioner.rb CHANGED
@@ -11,7 +11,86 @@ module Purplelight
  class Partitioner
  # Builds contiguous _id range filters for N partitions.
  # For ObjectId _id, we sample quantiles to split into near-equal document counts.
- def self.object_id_partitions(collection:, query:, partitions:)
+ def self.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil)
+ # Choose planning mode: :timestamp (fast), :cursor (legacy)
+ chosen_mode = (mode || ENV['PL_PARTITIONER_MODE'] || :timestamp).to_sym
+ telemetry ||= (defined?(Telemetry) ? Telemetry::NULL : nil)
+
+ return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if chosen_mode == :cursor
+
+ timestamp_partitions(collection: collection, query: query, partitions: partitions, telemetry: telemetry)
+ end
+
+ def self.simple_ranges(collection:, query:, partitions:)
+ # Split by _id quantiles using min/max endpoints
+ min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
+ max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
+ return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
+
+ # Create numeric-ish interpolation by sampling
+ ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
+ boundaries = [min_id] + ids + [max_id]
+ ranges = []
+ boundaries.each_cons(2) do |a, b|
+ ranges << build_range(a, b)
+ end
+
+ ranges.map do |r|
+ filter = query ? query.dup : {}
+ filter['_id'] = r
+ { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+ end
+ end
+
+ # Faster planning using ObjectId timestamps: O(partitions) indexed lookups
+ def self.timestamp_partitions(collection:, query:, partitions:, telemetry: nil)
+ t_minmax = telemetry&.start(:plan_minmax_time)
+ min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
+ max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
+ telemetry&.finish(:plan_minmax_time, t_minmax)
+
+ return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
+
+ min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
+ max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
+
+ # Fallback to cursor sampling if _id isn't an ObjectId
+ return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
+
+ step = [(max_ts - min_ts) / partitions, 1].max
+ inner_boundaries = []
+ t_boundaries = telemetry&.start(:plan_boundary_queries_time)
+ 1.upto(partitions - 1) do |i|
+ target_ts = min_ts + (step * i)
+ candidate = BSON::ObjectId.from_time(Time.at(target_ts))
+ f = query ? query.dup : {}
+ f['_id'] = { '$gt' => candidate }
+ b = collection.find(f).projection(_id: 1).sort(_id: 1).hint(_id: 1).limit(1).first&.dig('_id')
+ inner_boundaries << b if b
+ end
+ telemetry&.finish(:plan_boundary_queries_time, t_boundaries)
+
+ # Build ranges: first range has nil lower bound to include min_id,
+ # middle ranges are (prev, current], and last is (last, +inf)
+ ranges = []
+ t_ranges = telemetry&.start(:plan_ranges_build_time)
+ prev = nil
+ inner_boundaries.each do |b|
+ ranges << build_range(prev, b)
+ prev = b
+ end
+ ranges << build_range(prev, nil)
+ telemetry&.finish(:plan_ranges_build_time, t_ranges)
+
+ ranges.map do |r|
+ filter = query ? query.dup : {}
+ filter['_id'] = r
+ { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+ end
+ end
+
+ # Legacy cursor sampling planner
+ def self.cursor_sampling_partitions(collection:, query:, partitions:)
  # Ensure sort order for sampling
  base_query = collection.find(query || {}, {}.merge(sort: { _id: 1 }))

@@ -49,27 +128,6 @@ module Purplelight
  end
  end

- def self.simple_ranges(collection:, query:, partitions:)
- # Split by _id quantiles using min/max endpoints
- min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
- max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
- return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
-
- # Create numeric-ish interpolation by sampling
- ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
- boundaries = [min_id] + ids + [max_id]
- ranges = []
- boundaries.each_cons(2) do |a, b|
- ranges << build_range(a, b)
- end
-
- ranges.map do |r|
- filter = query ? query.dup : {}
- filter['_id'] = r
- { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
- end
- end
-
  def self.build_range(from_id, to_id)
  if from_id && to_id
  { '$gt' => from_id, '$lte' => to_id }
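The new `timestamp_partitions` planner exploits the fact that the first four bytes of a BSON ObjectId encode its creation time in seconds, so evenly spaced timestamps between the min and max `_id` yield cheap candidate split points that one indexed lookup each can snap to real documents. A toy illustration of the boundary-probe arithmetic, with made-up dates:

```ruby
# Minimal sketch of the timestamp-boundary idea; the date range and partition
# count are made up. Only BSON::ObjectId.from_time and #generation_time are
# real bson-ruby API; everything else is illustrative.
require 'bson'

min_ts = Time.utc(2024, 1, 1).to_i
max_ts = Time.utc(2024, 12, 31).to_i
partitions = 4
step = [(max_ts - min_ts) / partitions, 1].max

(1...partitions).each do |i|
  candidate = BSON::ObjectId.from_time(Time.at(min_ts + step * i))
  # In the real planner this candidate seeds a { '_id' => { '$gt' => candidate } }
  # query with hint(_id: 1) to find the nearest existing _id as the boundary.
  puts "partition #{i} probe: #{candidate} (~#{candidate.generation_time})"
end
```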
data/lib/purplelight/snapshot.rb CHANGED
@@ -10,6 +10,7 @@ require_relative 'writer_csv'
  require_relative 'writer_parquet'
  require_relative 'manifest'
  require_relative 'errors'
+ require_relative 'telemetry'

  module Purplelight
  # Snapshot orchestrates partition planning, parallel reads, and writing.
@@ -37,7 +38,7 @@ module Purplelight
  resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
  sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
  logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
- no_cursor_timeout: DEFAULTS[:no_cursor_timeout])
+ no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil)
  @client = client
  @collection = client[collection]
  @output = output
@@ -60,6 +61,10 @@ module Purplelight
  @no_cursor_timeout = no_cursor_timeout

  @running = true
+ @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
+ @telemetry = telemetry || (
+ @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
+ )
  end

  # rubocop:disable Naming/PredicateMethod
@@ -90,8 +95,10 @@ module Purplelight
  manifest.ensure_partitions!(@partitions)

  # Plan partitions
+ t_plan = @telemetry.start(:partition_plan_time)
  partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query,
- partitions: @partitions)
+ partitions: @partitions, telemetry: @telemetry)
+ @telemetry.finish(:partition_plan_time, t_plan)

  # Reader queue
  queue = ByteQueue.new(max_bytes: @queue_size_bytes)
@@ -116,12 +123,17 @@ module Purplelight
  # Start reader threads
  readers = partition_filters.each_with_index.map do |pf, idx|
  Thread.new do
- read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest)
+ local_telemetry = @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
+ read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest, telemetry: local_telemetry)
+ # Merge per-thread telemetry
+ @telemetry.merge!(local_telemetry) if @telemetry_enabled
  end
  end

  # Writer loop
+ writer_telemetry = @telemetry_enabled ? Telemetry.new(enabled: true) : Telemetry::NULL
  writer_thread = Thread.new do
+ Thread.current[:pl_telemetry] = writer_telemetry if @telemetry_enabled
  loop do
  batch = queue.pop
  break if batch.nil?
@@ -146,8 +158,22 @@ module Purplelight
  readers.each(&:join)
  queue.close
  writer_thread.join
+ @telemetry.merge!(writer_telemetry) if @telemetry_enabled
  @running = false
  progress_thread.join
+ if @telemetry_enabled
+ total = @telemetry.timers.values.sum
+ breakdown = @telemetry.timers
+ .sort_by { |_k, v| -v }
+ .map { |k, v| [k, v, total.zero? ? 0 : ((v / total) * 100.0)] }
+ if @logger
+ @logger.info('Telemetry (seconds and % of timed work):')
+ breakdown.each { |k, v, pct| @logger.info(" #{k}: #{v.round(3)}s (#{pct.round(1)}%)") }
+ else
+ puts 'Telemetry (seconds and % of timed work):'
+ breakdown.each { |k, v, pct| puts " #{k}: #{v.round(3)}s (#{pct.round(1)}%)" }
+ end
+ end
  true
  end
  # rubocop:enable Naming/PredicateMethod
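Telemetry is opt-in: set `PL_TELEMETRY=1`, or hand `Snapshot` a `Telemetry` instance via the new `telemetry:` keyword and inspect it afterwards. A hedged sketch of the latter, assuming the module-level `Purplelight.snapshot` forwards `telemetry:` to the constructor shown above (URI and names are placeholders):

```ruby
# Hedged sketch: supplying your own Telemetry instance instead of PL_TELEMETRY=1.
# Assumes Purplelight.snapshot forwards telemetry: to Snapshot#initialize.
require 'mongo'
require 'purplelight'

telemetry = Purplelight::Telemetry.new(enabled: true)
client = Mongo::Client.new('mongodb://localhost:27017/appdb')

Purplelight.snapshot(
  client: client,
  collection: 'events',
  output: '/tmp/events_snapshot',
  partitions: 8,
  telemetry: telemetry
)

# Timers are accumulated seconds keyed by phase (serialize_time,
# queue_wait_time, write_time, rotate_time, partition_plan_time, ...).
telemetry.timers.sort_by { |_k, v| -v }.each do |name, seconds|
  puts format('%s: %.3fs', name, seconds)
end
```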
@@ -167,7 +193,7 @@ module Purplelight
  [dir, prefix]
  end

- def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:)
+ def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:, telemetry: Telemetry::NULL)
  filter = filter_spec[:filter]
  sort = filter_spec[:sort] || { _id: 1 }
  hint = @hint || filter_spec[:hint] || { _id: 1 }
@@ -195,6 +221,8 @@ module Purplelight
  cursor = @collection.find(filter, opts)

  encode_lines = (@format == :jsonl)
+ # When JSONL, build one big string per batch to offload join cost from writer.
+ string_batch = +''
  buffer = []
  buffer_bytes = 0
  last_id = checkpoint
@@ -202,25 +230,48 @@ module Purplelight
  cursor.each do |doc|
  last_id = doc['_id']
  doc = @mapper.call(doc) if @mapper
+ t_ser = telemetry.start(:serialize_time)
  if encode_lines
- line = "#{JSON.generate(doc)}\n"
+ line = "#{JSON.fast_generate(doc)}\n"
+ telemetry.finish(:serialize_time, t_ser)
  bytes = line.bytesize
- buffer << line
+ string_batch << line
  else
  # For CSV/Parquet keep raw docs to allow schema/row building
- bytes = (JSON.generate(doc).bytesize + 1)
+ bytes = (JSON.fast_generate(doc).bytesize + 1)
+ telemetry.finish(:serialize_time, t_ser)
  buffer << doc
  end
  buffer_bytes += bytes
- next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
+ # For JSONL, we count rows via newline accumulation; for others, use array length
+ ready = encode_lines ? (buffer_bytes >= 1_000_000 || (string_batch.length >= 1_000_000)) : (buffer.length >= batch_size || buffer_bytes >= 1_000_000)
+ next unless ready

- queue.push(buffer, bytes: buffer_bytes)
+ t_q = telemetry.start(:queue_wait_time)
+ if encode_lines
+ queue.push(string_batch, bytes: buffer_bytes)
+ string_batch = +''
+ else
+ queue.push(buffer, bytes: buffer_bytes)
+ buffer = []
+ end
+ telemetry.finish(:queue_wait_time, t_q)
  manifest.update_partition_checkpoint!(idx, last_id)
- buffer = []
  buffer_bytes = 0
  end
- unless buffer.empty?
+ if encode_lines
+ unless string_batch.empty?
+ t_q = telemetry.start(:queue_wait_time)
+ queue.push(string_batch, bytes: buffer_bytes)
+ telemetry.finish(:queue_wait_time, t_q)
+ manifest.update_partition_checkpoint!(idx, last_id)
+ string_batch = +''
+ buffer_bytes = 0
+ end
+ elsif !buffer.empty?
+ t_q = telemetry.start(:queue_wait_time)
  queue.push(buffer, bytes: buffer_bytes)
+ telemetry.finish(:queue_wait_time, t_q)
  manifest.update_partition_checkpoint!(idx, last_id)
  buffer = []
  buffer_bytes = 0
data/lib/purplelight/telemetry.rb ADDED
@@ -0,0 +1,51 @@
+ # frozen_string_literal: true
+
+ module Purplelight
+ # Lightweight, low-overhead timing and counters with a ticket API.
+ class Telemetry
+ def initialize(enabled: true)
+ @enabled = enabled
+ @counters = Hash.new(0)
+ @timers = Hash.new(0.0)
+ @mutex = Mutex.new
+ end
+
+ def enabled?
+ @enabled
+ end
+
+ # Start a timer. Returns a ticket (Float) when enabled, or nil when disabled.
+ def start(_key)
+ return nil unless @enabled
+
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
+ end
+
+ # Finish a timer using a ticket from start. No-ops if ticket is nil.
+ def finish(key, ticket)
+ return unless @enabled && ticket
+
+ dt = Process.clock_gettime(Process::CLOCK_MONOTONIC) - ticket
+ @timers[key] += dt
+ end
+
+ def add(key, count = 1)
+ return unless @enabled
+
+ @counters[key] += count
+ end
+
+ def merge!(other)
+ return self unless @enabled
+
+ other.counters.each { |k, v| @counters[k] += v }
+ other.timers.each { |k, v| @timers[k] += v }
+ self
+ end
+
+ attr_reader :counters, :timers
+
+ # A disabled singleton for zero overhead checks if needed.
+ NULL = new(enabled: false)
+ end
+ end
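The ticket API keeps the disabled path allocation-free: `start` returns either a monotonic-clock Float or nil, and `finish` no-ops on a nil ticket, so instrumented code never branches on whether telemetry is on. A standalone sketch of the pattern the readers and writers use:

```ruby
# Standalone sketch of the Telemetry ticket pattern used throughout this diff.
require 'json'
require 'purplelight'

telemetry = Purplelight::Telemetry.new(enabled: true)

t = telemetry.start(:serialize_time)  # ticket: a CLOCK_MONOTONIC Float
1_000.times { JSON.generate({ 'a' => 1 }) }
telemetry.finish(:serialize_time, t)  # accumulates elapsed seconds under the key

telemetry.add(:docs, 1_000)           # counters accumulate the same way

# Telemetry::NULL returns nil tickets, so the same call sites cost ~nothing.
# Per-thread instances merge into a parent, as Snapshot#run does above:
parent = Purplelight::Telemetry.new(enabled: true)
parent.merge!(telemetry)
puts parent.timers[:serialize_time]
puts parent.counters[:docs]
```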
data/lib/purplelight/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Purplelight
- VERSION = '0.1.4'
+ VERSION = '0.1.6'
  end
data/lib/purplelight/writer_csv.rb CHANGED
@@ -11,6 +11,12 @@ rescue LoadError
  # zstd not available; fallback handled later via gzip
  end

+ begin
+ require 'zstd-ruby'
+ rescue LoadError
+ # alternative zstd gem not available
+ end
+
  module Purplelight
  # WriterCSV writes documents to CSV files with optional compression.
  class WriterCSV
@@ -24,6 +30,8 @@ module Purplelight
  @rotate_bytes = rotate_bytes
  @logger = logger
  @manifest = manifest
+ env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+ @compression_level = (env_level && env_level > 0 ? env_level : nil)
  @single_file = single_file

  @columns = columns&.map(&:to_s)
@@ -81,14 +89,49 @@ module Purplelight

  @csv&.flush
  if @io
+ t = Thread.current[:pl_telemetry]&.start(:rotate_time)
  finalize_current_part!
  @io.close
+ Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
  end
  @closed = true
  end

  private

+ # Minimal wrapper to count bytes written for rotate logic when
+ # underlying compressed writer doesn't expose position (e.g., zstd-ruby).
+ class CountingIO
+ def initialize(io, on_write:)
+ @io = io
+ @on_write = on_write
+ end
+
+ def write(data)
+ bytes_written = @io.write(data)
+ @on_write.call(bytes_written) if bytes_written && @on_write
+ bytes_written
+ end
+
+ # CSV calls '<<' on the underlying IO in some code paths
+ def <<(data)
+ write(data)
+ end
+
+ # CSV#flush may forward flush to underlying IO; make it a no-op if unavailable
+ def flush
+ @io.flush if @io.respond_to?(:flush)
+ end
+
+ def method_missing(method_name, *args, &block)
+ @io.send(method_name, *args, &block)
+ end
+
+ def respond_to_missing?(method_name, include_private = false)
+ @io.respond_to?(method_name, include_private)
+ end
+ end
+
  def ensure_open!
  return if @io

@@ -96,7 +139,8 @@ module Purplelight
  path = next_part_path
  @part_index = @manifest&.open_part!(path) if @manifest
  raw = File.open(path, 'wb')
- @io = build_compressed_io(raw)
+ compressed = build_compressed_io(raw)
+ @io = CountingIO.new(compressed, on_write: ->(n) { @bytes_written += n })
  @csv = CSV.new(@io)
  @bytes_written = 0
  @rows_written = 0
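CountingIO exists because `zstd-ruby`'s stream writer does not expose a byte position the way the `ruby-zstds` writer does, so size-based part rotation would otherwise never trigger. The counting idea is easy to see in isolation; the sketch below uses a simplified stand-in wrapper and a `StringIO` sink, since the real `CountingIO` is private to `WriterCSV` and also forwards unknown methods:

```ruby
# Illustration only: a trimmed-down counting wrapper around an IO, mirroring
# what CountingIO does for WriterCSV's rotate logic. TinyCounter is hypothetical.
require 'csv'
require 'stringio'

class TinyCounter
  attr_reader :bytes

  def initialize(io)
    @io = io
    @bytes = 0
  end

  # Count whatever the wrapped IO reports as written.
  def write(data)
    n = @io.write(data)
    @bytes += n
    n
  end

  # CSV writes rows via '<<' on the underlying IO.
  def <<(data)
    write(data)
    self
  end

  def flush
    @io.flush if @io.respond_to?(:flush)
  end
end

sink = TinyCounter.new(StringIO.new)
csv = CSV.new(sink)
csv << %w[a b c]
csv << [1, 2, 3]
csv.flush
puts sink.bytes # this running total is what drives size-based part rotation
```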
@@ -105,7 +149,13 @@ module Purplelight
  def build_compressed_io(raw)
  case @effective_compression.to_s
  when 'zstd'
- return ZSTDS::Writer.open(raw, level: 10) if defined?(ZSTDS)
+ if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+ level = @compression_level || 10
+ return ::Zstd::StreamWriter.new(raw, level: level)
+ elsif defined?(ZSTDS)
+ level = @compression_level || 10
+ return ZSTDS::Stream::Writer.new(raw, compression_level: level)
+ end

  @logger&.warn('zstd gem not loaded; using gzip')
  Zlib::GzipWriter.new(raw)
@@ -122,8 +172,10 @@ module Purplelight
  def rotate!
  return unless @io

+ t = Thread.current[:pl_telemetry]&.start(:rotate_time)
  finalize_current_part!
  @io.close
+ Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
  @io = nil
  @csv = nil
  ensure_open!
@@ -150,7 +202,7 @@ module Purplelight
  def determine_effective_compression(requested)
  case requested.to_s
  when 'zstd'
- (defined?(ZSTDS) ? :zstd : :gzip)
+ ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
  when 'none'
  :none
  else
data/lib/purplelight/writer_jsonl.rb CHANGED
@@ -10,6 +10,12 @@ rescue LoadError
  # zstd not available; will fallback to gzip
  end

+ begin
+ require 'zstd-ruby'
+ rescue LoadError
+ # alternative zstd gem not available
+ end
+
  module Purplelight
  # WriterJSONL writes newline-delimited JSON with optional compression.
  class WriterJSONL
@@ -23,7 +29,8 @@ module Purplelight
  @rotate_bytes = rotate_bytes
  @logger = logger
  @manifest = manifest
- @compression_level = compression_level
+ env_level = ENV['PL_ZSTD_LEVEL']&.to_i
+ @compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)

  @part_index = nil
  @io = nil
@@ -33,23 +40,71 @@ module Purplelight
  @closed = false

  @effective_compression = determine_effective_compression(@compression)
- return unless @effective_compression.to_s != @compression.to_s
-
- @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
+ if @logger
+ level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
+ @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
+ end
+ if @effective_compression.to_s != @compression.to_s
+ @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
+ end
  end

- def write_many(array_of_docs)
+ def write_many(batch)
  ensure_open!
- # If upstream already produced newline-terminated strings, join fast.
- buffer = if array_of_docs.first.is_a?(String)
- array_of_docs.join
- else
- array_of_docs.map { |doc| "#{JSON.generate(doc)}\n" }.join
- end
- rows = array_of_docs.size
- write_buffer(buffer)
+
+ chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
+ total_bytes = 0
+ rows = 0
+
+ if batch.is_a?(String)
+ # Fast-path: writer received a preassembled buffer string
+ buffer = batch
+ rows = buffer.count("\n")
+ write_buffer(buffer)
+ total_bytes = buffer.bytesize
+ elsif batch.first.is_a?(String)
+ # Join and write in chunks to avoid large intermediate allocations
+ chunk = +''
+ chunk_bytes = 0
+ batch.each do |line|
+ chunk << line
+ rows += 1
+ chunk_bytes += line.bytesize
+ next unless chunk_bytes >= chunk_threshold
+
+ write_buffer(chunk)
+ total_bytes += chunk.bytesize
+ chunk = +''
+ chunk_bytes = 0
+ end
+ unless chunk.empty?
+ write_buffer(chunk)
+ total_bytes += chunk.bytesize
+ end
+ else
+ # Fallback: encode docs here (JSON.fast_generate preferred) and write in chunks
+ chunk = +''
+ chunk_bytes = 0
+ batch.each do |doc|
+ line = "#{JSON.fast_generate(doc)}\n"
+ rows += 1
+ chunk << line
+ chunk_bytes += line.bytesize
+ next unless chunk_bytes >= chunk_threshold
+
+ write_buffer(chunk)
+ total_bytes += chunk.bytesize
+ chunk = +''
+ chunk_bytes = 0
+ end
+ unless chunk.empty?
+ write_buffer(chunk)
+ total_bytes += chunk.bytesize
+ end
+ end
+
  @rows_written += rows
- @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: buffer.bytesize)
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: total_bytes)
  end

  def rotate_if_needed
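`write_many` now accepts three batch shapes, matching what the reader threads may enqueue: a preassembled newline-terminated String (the new JSONL fast path), an Array of newline-terminated Strings, or an Array of raw docs. A runnable sketch of that shape dispatch, with a hypothetical `describe_batch` helper standing in for the method's branching:

```ruby
# Runnable sketch of write_many's shape dispatch; describe_batch is a
# hypothetical helper that mirrors the branching, not purplelight API.
def describe_batch(batch)
  if batch.is_a?(String)
    ['string', batch.count("\n")]             # rows counted via newlines
  elsif batch.first.is_a?(String)
    ['string array', batch.size]              # joined and written in chunks
  else
    ['doc array', batch.size]                 # encoded with JSON.fast_generate
  end
end

p describe_batch("{\"a\":1}\n{\"a\":2}\n")     #=> ["string", 2]
p describe_batch(["{\"a\":1}\n"])              #=> ["string array", 1]
p describe_batch([{ 'a' => 1 }, { 'a' => 2 }]) #=> ["doc array", 2]

# The chunked paths flush at PL_WRITE_CHUNK_BYTES (default 8 MiB), e.g.:
# ENV['PL_WRITE_CHUNK_BYTES'] = (4 * 1024 * 1024).to_s
```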
@@ -86,15 +141,18 @@ module Purplelight
  def build_compressed_io(raw)
  case @effective_compression.to_s
  when 'zstd'
- if defined?(ZSTDS)
- # ZSTDS::Writer supports IO-like interface
+ # Prefer zstd-ruby if available, else ruby-zstds
+ if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+ level = @compression_level || 3
+ return ::Zstd::StreamWriter.new(raw, level: level)
+ elsif defined?(ZSTDS)
  level = @compression_level || 3
- ZSTDS::Writer.open(raw, level: level)
- else
- @logger&.warn('zstd gem not loaded; this should have been handled earlier')
- level = @compression_level || Zlib::DEFAULT_COMPRESSION
- Zlib::GzipWriter.new(raw, level)
+ return ZSTDS::Stream::Writer.new(raw, compression_level: level)
  end
+
+ @logger&.warn('zstd gems not loaded; falling back to gzip')
+ level = @compression_level || Zlib::DEFAULT_COMPRESSION
+ Zlib::GzipWriter.new(raw, level)
  when 'gzip'
  level = @compression_level || 1
  Zlib::GzipWriter.new(raw, level)
@@ -106,7 +164,9 @@ module Purplelight
  end

  def write_buffer(buffer)
+ t = Thread.current[:pl_telemetry]&.start(:write_time)
  @io.write(buffer)
+ Thread.current[:pl_telemetry]&.finish(:write_time, t)
  @bytes_written += buffer.bytesize
  rotate_if_needed
  end
@@ -114,8 +174,10 @@ module Purplelight
  def rotate!
  return unless @io

+ t = Thread.current[:pl_telemetry]&.start(:rotate_time)
  finalize_current_part!
  @io.close
+ Thread.current[:pl_telemetry]&.finish(:rotate_time, t)
  @io = nil
  ensure_open!
  end
@@ -138,7 +200,7 @@ module Purplelight
  def determine_effective_compression(requested)
  case requested.to_s
  when 'zstd'
- (defined?(ZSTDS) ? :zstd : :gzip)
+ ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
  when 'none'
  :none
  else
data/lib/purplelight/writer_parquet.rb CHANGED
@@ -44,8 +44,13 @@ module Purplelight

  ensure_open!
  unless @buffer_docs.empty?
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
  table = build_table(@buffer_docs)
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
  write_table(table, @writer_path, append: false)
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
  end
  finalize_current_part!
  @closed = true
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: purplelight
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.1.6
  platform: ruby
  authors:
  - Alexander Nicholson
@@ -13,44 +13,62 @@ dependencies:
  name: csv
  requirement: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '3.3'
  - - ">="
  - !ruby/object:Gem::Version
- version: '0'
+ version: 3.3.5
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '3.3'
  - - ">="
  - !ruby/object:Gem::Version
- version: '0'
+ version: 3.3.5
  - !ruby/object:Gem::Dependency
  name: logger
  requirement: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.7'
  - - ">="
  - !ruby/object:Gem::Version
- version: '1.6'
+ version: 1.7.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.7'
  - - ">="
  - !ruby/object:Gem::Version
- version: '1.6'
+ version: 1.7.0
  - !ruby/object:Gem::Dependency
  name: mongo
  requirement: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '2.21'
  - - ">="
  - !ruby/object:Gem::Version
- version: '2.19'
+ version: 2.21.3
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '2.21'
  - - ">="
  - !ruby/object:Gem::Version
- version: '2.19'
+ version: 2.21.3
  description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
  multi-threaded readers, and size-based sharded outputs.
  email:
@@ -70,15 +88,16 @@ files:
  - lib/purplelight/partitioner.rb
  - lib/purplelight/queue.rb
  - lib/purplelight/snapshot.rb
+ - lib/purplelight/telemetry.rb
  - lib/purplelight/version.rb
  - lib/purplelight/writer_csv.rb
  - lib/purplelight/writer_jsonl.rb
  - lib/purplelight/writer_parquet.rb
+ homepage: https://github.com/alexandernicholson/purplelight
  licenses:
  - MIT
  metadata:
  rubygems_mfa_required: 'true'
- homepage_uri: https://github.com/alexandernicholson/purplelight
  source_code_uri: https://github.com/alexandernicholson/purplelight
  changelog_uri: https://github.com/alexandernicholson/purplelight/releases
  rdoc_options: []