purplelight 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/bin/purplelight +5 -1
- data/lib/purplelight/snapshot.rb +2 -2
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +148 -15
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c500768ed34e247a92be3979f56c1036314b51003cb1ff7953562f1ca8278677
+  data.tar.gz: f12b305e35b201192e219aec23a0c66c13a7b9eb262c038dbac9d1e2a3c77bc5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0f6959b02fb695a4c28802ec555d7b6ce6e7020d348024d2df8381efbb022632916d2915c688131cf79aa042f05aa3e2ec8780ad866fdb774b45b51f53093d6b
+  data.tar.gz: aa2885e120f57bd492c1a98aee91f52b7c83bee15492ce34a0b6d3dae985e0355ea77489e69dfde2f27fb5b51974263b7f7358a51103697d8222a03cd98a4185

data/README.md
CHANGED

@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.
+gem 'purplelight', '~> 0.1.13'
 ```
 
 Or install directly:
@@ -212,6 +212,7 @@ bundle exec bin/purplelight \
 - `--read-concern LEVEL`: `majority|local|linearizable|available|snapshot`.
 - `--no-cursor-timeout BOOL`: Toggle `noCursorTimeout` (default true).
 - `--parquet-row-group N`: Parquet row group size (rows).
+- `--parquet-max-rows N`: Parquet max rows per part (enables multi-part when not `--single-file`).
 - `--write-chunk-mb MB`: JSONL encode/write chunk size before enqueueing.
 - `--writer-threads N` (experimental): Number of writer threads (JSONL only).
 - `--telemetry on|off`: Force enable/disable telemetry output.
@@ -222,7 +223,7 @@ bundle exec bin/purplelight \
 Notes:
 - Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
 - `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
-
+- Parquet multi-part sizing can be controlled via `parquet_max_rows` programmatically, or via CLI `--parquet-max-rows`.
 - To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
 
 ### Architecture

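The new `parquet_max_rows` option can also be exercised programmatically. A minimal sketch follows; only `:parquet_row_group` and `:parquet_max_rows` come from this diff, so the connection and output argument names here are assumptions to be checked against the full README:

```ruby
require 'mongo'
require 'purplelight'

args = {
  client: Mongo::Client.new('mongodb://localhost:27017/mydb?maxPoolSize=32'),
  collection: 'events',        # hypothetical collection name
  output: '/tmp/exports',      # hypothetical output directory
  format: :parquet,
  parquet_row_group: 50_000,   # rows per Parquet row group
  parquet_max_rows: 1_000_000  # rotate to a new part after ~1M rows
}
ok = Purplelight.snapshot(**args)
abort('snapshot failed') unless ok
```

Without `--single-file`, each part is finalized once it reaches `parquet_max_rows` and subsequent rows open a new part.
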
data/bin/purplelight
CHANGED

@@ -25,6 +25,7 @@ options = {
   writer_threads: nil,
   write_chunk_bytes: nil,
   parquet_row_group: nil,
+  parquet_max_rows: nil,
   telemetry_flag: nil,
   read_concern: nil,
   no_cursor_timeout: nil,
@@ -61,6 +62,7 @@ parser = OptionParser.new do |opts|
   opts.on('--writer-threads N', Integer, 'Number of writer threads (experimental, JSONL only)') { |v| options[:writer_threads] = v }
   opts.on('--write-chunk-mb MB', Integer, 'JSONL encode/write chunk size in MB') { |v| options[:write_chunk_bytes] = v * 1024 * 1024 }
   opts.on('--parquet-row-group N', Integer, 'Parquet row group size (rows)') { |v| options[:parquet_row_group] = v }
+  opts.on('--parquet-max-rows N', Integer, 'Parquet max rows per part (multi-part only)') { |v| options[:parquet_max_rows] = v }
   opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
     # Prefer BSON Extended JSON to support $date, $oid, etc.
     options[:query] = BSON::ExtJSON.parse(v)
@@ -158,7 +160,8 @@ snapshot_args[:no_cursor_timeout] = options[:no_cursor_timeout] unless options[:
 snapshot_args[:compression_level] = options[:compression_level] if options[:compression_level]
 snapshot_args[:writer_threads] = options[:writer_threads] if options[:writer_threads]
 snapshot_args[:write_chunk_bytes] = options[:write_chunk_bytes] if options[:write_chunk_bytes]
-snapshot_args[:parquet_row_group] = options[:parquet_row_group] if options[:parquet_row_group]
+snapshot_args[:parquet_row_group] = options[:parquet_row_group] if options[:parquet_row_group]
+snapshot_args[:parquet_max_rows] = options[:parquet_max_rows] if options[:parquet_max_rows]
 
 # telemetry env override
 if options[:telemetry_flag]
@@ -169,6 +172,7 @@ end
 ENV['PL_ZSTD_LEVEL'] = options[:compression_level].to_s if options[:compression_level]
 ENV['PL_WRITE_CHUNK_BYTES'] = options[:write_chunk_bytes].to_s if options[:write_chunk_bytes]
 ENV['PL_PARQUET_ROW_GROUP'] = options[:parquet_row_group].to_s if options[:parquet_row_group]
+# No env default for parquet_max_rows; it is snapshot-argument only
 ENV['PL_WRITER_THREADS'] = options[:writer_threads].to_s if options[:writer_threads]
 
 ok = Purplelight.snapshot(**snapshot_args)

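The `-q/--query` handler above parses filters with `BSON::ExtJSON.parse`, which preserves BSON types that a plain `JSON.parse` would flatten to strings. A quick illustration (values hypothetical):

```ruby
require 'bson'

filter = BSON::ExtJSON.parse(
  '{"_id": {"$oid": "64b7f0c2e1a4f00001234567"}, ' \
  '"created_at": {"$gte": {"$date": "2024-01-01T00:00:00Z"}}}'
)
filter['_id']                 #=> BSON::ObjectId('64b7f0c2e1a4f00001234567')
filter['created_at']['$gte']  #=> 2024-01-01 00:00:00 UTC (a Time, not a String)
```
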
data/lib/purplelight/snapshot.rb
CHANGED

@@ -137,8 +137,8 @@ module Purplelight
       single_file = @sharding && @sharding[:mode].to_s == 'single_file'
       row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
       WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-
-
+                        manifest: manifest, single_file: single_file, row_group_size: row_group,
+                        rotate_rows: @parquet_max_rows)
     else
       raise ArgumentError, "format not implemented: #{@format}"
     end

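The `row_group` line above resolves its value by precedence: explicit snapshot argument, then the `PL_PARQUET_ROW_GROUP` environment variable, then the writer default. A standalone sketch of the same chain (the default value here is illustrative, not the gem's actual `DEFAULT_ROW_GROUP_SIZE`):

```ruby
def resolve_row_group(explicit, default: 100_000)
  explicit || ENV['PL_PARQUET_ROW_GROUP']&.to_i || default
end

ENV['PL_PARQUET_ROW_GROUP'] = '25000'
resolve_row_group(nil)    #=> 25000  (env var wins when no argument is given)
resolve_row_group(50_000) #=> 50000  (explicit argument wins over the env var)
```
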
data/lib/purplelight/version.rb
CHANGED

-  VERSION = '0.1.11'
+  VERSION = '0.1.13'

data/lib/purplelight/writer_parquet.rb
CHANGED

@@ -40,7 +40,6 @@ module Purplelight
       ensure_open!
       array_of_docs.each { |doc| @buffer_docs << doc }
       flush_row_groups_if_needed
-      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
     end
 
     def close
@@ -91,7 +90,8 @@
       # Stream via ArrowFileWriter when available to avoid building huge tables
       if defined?(Parquet::ArrowFileWriter)
         unless @pq_writer
-
+          props = build_writer_properties_for_compression(@compression)
+          @pq_writer = create_arrow_file_writer(table.schema, path, props)
         end
         # Prefer passing row_group_size; fallback to single-arg for older APIs
         begin
@@ -103,7 +103,11 @@
       end
       # Fallback to one-shot save when streaming API is not available
       if table.respond_to?(:save)
-
+        begin
+          table.save(path, format: :parquet, compression: normalize_parquet_compression_name(@compression))
+        rescue StandardError
+          table.save(path, format: :parquet)
+        end
         return
       end
       raise 'Parquet writer not available in this environment'
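The one-shot fallback above leans on red-arrow's `Table#save`, retrying without `compression:` for releases that do not accept it. A self-contained illustration of that pattern (file path and column data hypothetical):

```ruby
require 'parquet' # red-parquet; adds Parquet support to red-arrow tables

table = Arrow::Table.new(
  'id'   => [1, 2, 3],
  'name' => %w[ada grace alan]
)
begin
  table.save('/tmp/demo.parquet', format: :parquet, compression: 'zstd')
rescue StandardError
  table.save('/tmp/demo.parquet', format: :parquet) # older APIs: no compression option
end
```
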
@@ -111,6 +115,7 @@
 
     def finalize_current_part!
       return if @writer_path.nil?
+
       if @pq_writer
         @pq_writer.close
         @pq_writer = nil
@@ -154,15 +159,50 @@
       while @buffer_docs.length >= @row_group_size
         ensure_open!
         group = @buffer_docs.shift(@row_group_size)
-
-
-
-
-
-
-
-
-
+        if @rotate_rows && !@single_file && (@rows_in_current_file + group.length) > @rotate_rows
+          # Write a partial chunk to fill the current file, then rotate and write the rest
+          remaining_allowed = @rotate_rows - @rows_in_current_file
+          if remaining_allowed.positive?
+            part_a = group.first(remaining_allowed)
+            t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+            table_a = build_table(part_a)
+            Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+            t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+            write_table(table_a, @writer_path, append: true)
+            Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+            @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_a.length, bytes_delta: 0)
+            @rows_in_current_file += part_a.length
+          end
+
+          finalize_current_part!
+          ensure_open!
+
+          part_b = group.drop(remaining_allowed)
+          unless part_b.empty?
+            t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+            table_b = build_table(part_b)
+            Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+            t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+            write_table(table_b, @writer_path, append: true)
+            Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+            @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_b.length, bytes_delta: 0)
+            @rows_in_current_file += part_b.length
+            maybe_rotate!
+          end
+        else
+          t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+          table = build_table(group)
+          Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+          t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+          write_table(table, @writer_path, append: true)
+          Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+          @manifest&.add_progress_to_part!(index: @part_index, rows_delta: group.length, bytes_delta: 0)
+          @rows_in_current_file += group.length
+          maybe_rotate!
+        end
       end
     end
 
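The branch above splits an oversized row group at the part boundary: a slice that still fits the current file (`part_a`), then the remainder (`part_b`) after rotation. The arithmetic in isolation, as a standalone sketch:

```ruby
# Given rows already in the current file and a rotate threshold, divide a
# group into the slice that still fits and the remainder for the next part.
def split_for_rotation(group, rows_in_current_file, rotate_rows)
  remaining_allowed = [rotate_rows - rows_in_current_file, 0].max
  [group.first(remaining_allowed), group.drop(remaining_allowed)]
end

part_a, part_b = split_for_rotation((1..10).to_a, 7, 10)
part_a #=> [1, 2, 3]              fills the current part to exactly 10 rows
part_b #=> [4, 5, 6, 7, 8, 9, 10] lands in the freshly rotated part
```
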
@@ -174,17 +214,25 @@
       return if @buffer_docs.empty?
 
       # Flush remaining as a final smaller group
+      remaining = @buffer_docs.length
       t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
       table = build_table(@buffer_docs)
       Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
 
-      t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
       ensure_open!
+      # Pre-rotate to avoid exceeding rotate_rows on this final write
+      if @rotate_rows && !@single_file && @rows_in_current_file.positive? && (@rows_in_current_file + remaining) > @rotate_rows
+        finalize_current_part!
+        ensure_open!
+      end
+
+      t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
       write_table(table, @writer_path, append: true)
       Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+      rows_written = (table.respond_to?(:n_rows) ? table.n_rows : remaining)
+      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows_written, bytes_delta: 0)
+      @rows_in_current_file += rows_written
       @buffer_docs.clear
-      @rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
-      @rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
       maybe_rotate!
     end
 
@@ -195,5 +243,90 @@
       finalize_current_part!
       # Next write will open a new part
     end
+
+    def build_writer_properties_for_compression(requested)
+      codec_const = parquet_codec_constant(requested)
+      return nil unless codec_const
+
+      # Prefer WriterProperties builder if available
+      begin
+        if defined?(Parquet::WriterProperties) && Parquet::WriterProperties.respond_to?(:builder)
+          builder = Parquet::WriterProperties.builder
+          if builder.respond_to?(:compression)
+            builder = builder.compression(codec_const)
+          elsif builder.respond_to?(:set_compression)
+            builder = builder.set_compression(codec_const)
+          end
+          return builder.build if builder.respond_to?(:build)
+        end
+      rescue StandardError
+        # fall through to other strategies
+      end
+
+      # Alternative builder class naming fallback
+      begin
+        if defined?(Parquet::WriterPropertiesBuilder)
+          b = Parquet::WriterPropertiesBuilder.new
+          if b.respond_to?(:compression)
+            b.compression(codec_const)
+          elsif b.respond_to?(:set_compression)
+            b.set_compression(codec_const)
+          end
+          return b.build if b.respond_to?(:build)
+        end
+      rescue StandardError
+        # ignore
+      end
+      nil
+    end
+
+    def create_arrow_file_writer(schema, path, props)
+      attempts = []
+      if props
+        attempts << -> { Parquet::ArrowFileWriter.open(schema, path, props) }
+        attempts << -> { Parquet::ArrowFileWriter.open(schema, path, properties: props) }
+      end
+      attempts << -> { Parquet::ArrowFileWriter.open(schema, path) }
+
+      attempts.each do |call|
+        return call.call
+      rescue StandardError
+        next
+      end
+      raise 'failed to open Parquet::ArrowFileWriter'
+    end
+
+    def parquet_codec_constant(requested)
+      name = normalize_parquet_compression_name(requested)
+      return nil unless name
+
+      up = case name
+           when 'zstd', 'zstandard' then 'ZSTD'
+           when 'gzip' then 'GZIP'
+           when 'snappy' then 'SNAPPY'
+           when 'none' then 'UNCOMPRESSED'
+           else name.upcase
+           end
+      candidates = %w[CompressionType Compression CompressionCodec]
+      candidates.each do |mod|
+        m = Parquet.const_get(mod)
+        return m.const_get(up) if m.const_defined?(up)
+      rescue StandardError
+        next
+      end
+      nil
+    end
+
+    def normalize_parquet_compression_name(requested)
+      return nil if requested.nil?
+
+      s = requested.to_s.downcase
+      return 'none' if s == 'none'
+      return 'gzip' if s == 'gzip'
+      return 'snappy' if s == 'snappy'
+      return 'zstd' if %w[zstd zstandard].include?(s)
+
+      nil
+    end
   end
 end
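With rotation in place, each emitted part should stay at or under `parquet_max_rows`. One way to spot-check the output is to read each part back with red-parquet and compare `n_rows` (the glob pattern is an assumption about where your parts landed):

```ruby
require 'parquet' # red-parquet

max_rows = 1_000_000
Dir.glob('/tmp/exports/*.parquet').sort.each do |path|
  table = Arrow::Table.load(path, format: :parquet)
  puts format('%-40s %10d rows', File.basename(path), table.n_rows)
  raise "part exceeds parquet_max_rows: #{path}" if table.n_rows > max_rows
end
```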