purplelight 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -1
- data/lib/purplelight/snapshot.rb +6 -2
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +165 -12
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 920b534dc9ac832d83600031277ddd35da2920cff494e0f96d0ca230652d4ba4
+  data.tar.gz: 2fd4476e73efc67d1f4a722dae8d7759ac8b51b5c21546dfb4b06d9fc1cd3934
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 370660a815b47c4aa4a0725a6188d6e0455074232000a24c4f909a313c3b9c2d5d3219a17edb628df58f167115b317347c159cc93d5bd59e216d6de1ec7ecd77
+  data.tar.gz: efec787b1e355af50ec07e45b8c125d0f1054640734afbe7790a3a35181f999900d3c843104911eabcea644d2dc7f4604050571e17469f08fe97d9fa5aec92e9
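The published digests can be verified locally. A minimal sketch, assuming the gem's inner archives sit in the current directory (the path is illustrative; the expected value is the SHA256 published above):

```ruby
require 'digest'

# Compare a local artifact against the SHA256 recorded in checksums.yaml.
expected = '2fd4476e73efc67d1f4a722dae8d7759ac8b51b5c21546dfb4b06d9fc1cd3934'
actual   = Digest::SHA256.file('data.tar.gz').hexdigest
puts(actual == expected ? 'checksum OK' : 'checksum MISMATCH')
```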
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.
+gem 'purplelight', '~> 0.1.12'
 ```
 
 Or install directly:
@@ -144,6 +144,29 @@ Purplelight.snapshot(
 )
 ```
 
+Parquet multi-part (rows-based rotation):
+
+```ruby
+Purplelight.snapshot(
+  client: client,
+  collection: 'users',
+  output: '/data/exports',
+  format: :parquet,
+  # Any mode other than :single_file enables multi-part filenames for Parquet
+  sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
+  # Split into multiple .parquet files, limiting rows per file
+  parquet_max_rows: 100_000,
+  # Optional: Parquet row group size (rows)
+  parquet_row_group: 10_000,
+  resume: { enabled: true }
+)
+```
+
+Notes for Parquet:
+- Parquet multi-part sizing is controlled by rows via `parquet_max_rows`.
+- `--rotate-mb` / `part_bytes` do not affect Parquet part size; they apply to JSONL/CSV.
+- Use `sharding: { mode: :single_file }` to force a single `.parquet` file.
+
 ### Environment variables (optional)
 
 CLI flags take precedence, but these environment variables can set sensible defaults:
@@ -199,6 +222,8 @@ bundle exec bin/purplelight \
 Notes:
 - Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
 - `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
+- Parquet multi-part sizing is programmatic via `parquet_max_rows`; there is no CLI flag for it.
+- To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
 
 ### Architecture
 
@@ -236,6 +261,7 @@ Key points:
 - **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
 - **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
 - **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
+- **Parquet parts (rows)**: split Parquet outputs by rows with `parquet_max_rows` (programmatic API). Set `sharding.mode` to anything other than `:single_file` to enable multi-part filenames.
 - **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
 - **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
 - **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
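The README's new pool-size note applies to the programmatic API as well. A minimal sketch of pairing `maxPoolSize` with the partition count; the `partitions:` keyword is an assumption mirroring the `--partitions` CLI flag, and the URI and values are illustrative:

```ruby
require 'mongo'
require 'purplelight'

# Per the README note, maxPoolSize should be >= the partition count so each
# partition's reader can hold a connection concurrently.
client = Mongo::Client.new('mongodb://localhost:27017/app?maxPoolSize=32')

Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  format: :jsonl,
  partitions: 32 # hypothetical keyword mirroring the --partitions CLI flag
)
```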
data/lib/purplelight/snapshot.rb
CHANGED
@@ -39,7 +39,8 @@ module Purplelight
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
                    no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
-                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil,
+                   parquet_max_rows: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -64,6 +65,7 @@ module Purplelight
       @writer_threads = writer_threads || 1
       @write_chunk_bytes = write_chunk_bytes
       @parquet_row_group = parquet_row_group
+      @parquet_max_rows = parquet_max_rows
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -106,6 +108,7 @@ module Purplelight
         compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
         write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
         parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+        parquet_max_rows: @parquet_max_rows,
         sharding: @sharding,
         resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
         telemetry: @telemetry_enabled
@@ -134,7 +137,8 @@ module Purplelight
         single_file = @sharding && @sharding[:mode].to_s == 'single_file'
         row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
         WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-                          manifest: manifest, single_file: single_file, row_group_size: row_group)
+                          manifest: manifest, single_file: single_file, row_group_size: row_group,
+                          rotate_rows: @parquet_max_rows)
       else
         raise ArgumentError, "format not implemented: #{@format}"
       end
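For clarity, the wiring above reduces to the following direct construction; a sketch only, since `WriterParquet` is normally built inside `Snapshot` as shown in the diff, and the values here are illustrative:

```ruby
# rotate_rows receives the snapshot's parquet_max_rows; rotation only takes
# effect when single_file is false (any non-:single_file sharding mode).
writer = Purplelight::WriterParquet.new(
  directory: '/data/exports',
  prefix: 'users_parquet_parts',
  compression: :zstd,
  row_group_size: 10_000,  # parquet_row_group / PL_PARQUET_ROW_GROUP fallback
  manifest: nil,           # manifest integration omitted in this sketch
  single_file: false,
  rotate_rows: 100_000
)
```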
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -15,7 +15,7 @@ module Purplelight
     DEFAULT_ROW_GROUP_SIZE = 10_000
 
     def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
-                   manifest: nil, single_file: true, schema: nil)
+                   manifest: nil, single_file: true, schema: nil, rotate_rows: nil)
       @directory = directory
       @prefix = prefix
       @compression = compression
@@ -24,11 +24,13 @@ module Purplelight
       @manifest = manifest
       @single_file = single_file
       @schema = schema
+      @rotate_rows = rotate_rows
 
       @closed = false
       @file_seq = 0
       @part_index = nil
       @pq_writer = nil
+      @rows_in_current_file = 0
 
       ensure_dependencies!
       reset_buffers
@@ -38,15 +40,13 @@ module Purplelight
       ensure_open!
       array_of_docs.each { |doc| @buffer_docs << doc }
       flush_row_groups_if_needed
-      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
     end
 
     def close
       return if @closed
 
-      ensure_open!
       flush_all_row_groups
-      finalize_current_part!
+      finalize_current_part! if @writer_path
       @closed = true
     end
 
@@ -70,6 +70,7 @@ module Purplelight
       FileUtils.mkdir_p(@directory)
       @writer_path = next_part_path
       @part_index = @manifest&.open_part!(@writer_path) if @manifest
+      @rows_in_current_file = 0
     end
 
     # No-op; we now write once on close for simplicity
@@ -89,7 +90,8 @@ module Purplelight
       # Stream via ArrowFileWriter when available to avoid building huge tables
       if defined?(Parquet::ArrowFileWriter)
         unless @pq_writer
-
+          props = build_writer_properties_for_compression(@compression)
+          @pq_writer = create_arrow_file_writer(table.schema, path, props)
         end
         # Prefer passing row_group_size; fallback to single-arg for older APIs
         begin
@@ -101,13 +103,19 @@ module Purplelight
         end
         # Fallback to one-shot save when streaming API is not available
         if table.respond_to?(:save)
-
+          begin
+            table.save(path, format: :parquet, compression: normalize_parquet_compression_name(@compression))
+          rescue StandardError
+            table.save(path, format: :parquet)
+          end
           return
         end
         raise 'Parquet writer not available in this environment'
       end
 
     def finalize_current_part!
+      return if @writer_path.nil?
+
       if @pq_writer
         @pq_writer.close
         @pq_writer = nil
@@ -115,6 +123,8 @@ module Purplelight
       @manifest&.complete_part!(index: @part_index, checksum: nil)
       @file_seq += 1 unless @single_file
       @writer_path = nil
+      @part_index = nil
+      @rows_in_current_file = 0
     end
 
     def next_part_path
@@ -147,14 +157,52 @@ module Purplelight
       return if @buffer_docs.empty?
 
       while @buffer_docs.length >= @row_group_size
+        ensure_open!
         group = @buffer_docs.shift(@row_group_size)
-
-
-
+        if @rotate_rows && !@single_file && (@rows_in_current_file + group.length) > @rotate_rows
+          # Write a partial chunk to fill the current file, then rotate and write the rest
+          remaining_allowed = @rotate_rows - @rows_in_current_file
+          if remaining_allowed.positive?
+            part_a = group.first(remaining_allowed)
+            t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+            table_a = build_table(part_a)
+            Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+            t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+            write_table(table_a, @writer_path, append: true)
+            Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+            @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_a.length, bytes_delta: 0)
+            @rows_in_current_file += part_a.length
+          end
+
+          finalize_current_part!
+          ensure_open!
+
+          part_b = group.drop(remaining_allowed)
+          unless part_b.empty?
+            t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+            table_b = build_table(part_b)
+            Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+            t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+            write_table(table_b, @writer_path, append: true)
+            Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+            @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_b.length, bytes_delta: 0)
+            @rows_in_current_file += part_b.length
+            maybe_rotate!
+          end
+        else
+          t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+          table = build_table(group)
+          Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
 
-
-
-
+          t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+          write_table(table, @writer_path, append: true)
+          Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+          @manifest&.add_progress_to_part!(index: @part_index, rows_delta: group.length, bytes_delta: 0)
+          @rows_in_current_file += group.length
+          maybe_rotate!
+        end
       end
     end
 
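The split arithmetic in that hunk is easiest to see in isolation. A self-contained sketch (the function name is illustrative, not part of the gem):

```ruby
# Given a row cap and the rows already written to the current file, split a
# row group into the portion that fills the current file (part_a) and the
# remainder that starts the next part (part_b).
def split_for_rotation(group, rotate_rows:, rows_in_current_file:)
  remaining_allowed = [rotate_rows - rows_in_current_file, 0].max
  [group.first(remaining_allowed), group.drop(remaining_allowed)]
end

part_a, part_b = split_for_rotation((1..15).to_a, rotate_rows: 10, rows_in_current_file: 3)
part_a.length # => 7 rows fill the current file to the 10-row cap
part_b.length # => 8 rows go to the next part after finalize_current_part!
```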
@@ -166,14 +214,119 @@ module Purplelight
       return if @buffer_docs.empty?
 
       # Flush remaining as a final smaller group
+      remaining = @buffer_docs.length
       t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
       table = build_table(@buffer_docs)
       Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
 
+      ensure_open!
+      # Pre-rotate to avoid exceeding rotate_rows on this final write
+      if @rotate_rows && !@single_file && @rows_in_current_file.positive? && (@rows_in_current_file + remaining) > @rotate_rows
+        finalize_current_part!
+        ensure_open!
+      end
+
       t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
       write_table(table, @writer_path, append: true)
       Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+      rows_written = (table.respond_to?(:n_rows) ? table.n_rows : remaining)
+      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows_written, bytes_delta: 0)
+      @rows_in_current_file += rows_written
       @buffer_docs.clear
+      maybe_rotate!
+    end
+
+    def maybe_rotate!
+      return if @single_file
+      return unless @rotate_rows && @rows_in_current_file >= @rotate_rows
+
+      finalize_current_part!
+      # Next write will open a new part
+    end
+
+    def build_writer_properties_for_compression(requested)
+      codec_const = parquet_codec_constant(requested)
+      return nil unless codec_const
+
+      # Prefer WriterProperties builder if available
+      begin
+        if defined?(Parquet::WriterProperties) && Parquet::WriterProperties.respond_to?(:builder)
+          builder = Parquet::WriterProperties.builder
+          if builder.respond_to?(:compression)
+            builder = builder.compression(codec_const)
+          elsif builder.respond_to?(:set_compression)
+            builder = builder.set_compression(codec_const)
+          end
+          return builder.build if builder.respond_to?(:build)
+        end
+      rescue StandardError
+        # fall through to other strategies
+      end
+
+      # Alternative builder class naming fallback
+      begin
+        if defined?(Parquet::WriterPropertiesBuilder)
+          b = Parquet::WriterPropertiesBuilder.new
+          if b.respond_to?(:compression)
+            b.compression(codec_const)
+          elsif b.respond_to?(:set_compression)
+            b.set_compression(codec_const)
+          end
+          return b.build if b.respond_to?(:build)
+        end
+      rescue StandardError
+        # ignore
+      end
+      nil
+    end
+
+    def create_arrow_file_writer(schema, path, props)
+      attempts = []
+      if props
+        attempts << -> { Parquet::ArrowFileWriter.open(schema, path, props) }
+        attempts << -> { Parquet::ArrowFileWriter.open(schema, path, properties: props) }
+      end
+      attempts << -> { Parquet::ArrowFileWriter.open(schema, path) }
+
+      attempts.each do |call|
+        return call.call
+      rescue StandardError
+        next
+      end
+      raise 'failed to open Parquet::ArrowFileWriter'
+    end
+
+    def parquet_codec_constant(requested)
+      name = normalize_parquet_compression_name(requested)
+      return nil unless name
+
+      up = case name
+           when 'zstd', 'zstandard' then 'ZSTD'
+           when 'gzip' then 'GZIP'
+           when 'snappy' then 'SNAPPY'
+           when 'none' then 'UNCOMPRESSED'
+           else name.upcase
+           end
+      candidates = %w[CompressionType Compression CompressionCodec]
+      candidates.each do |mod|
+        m = Parquet.const_get(mod)
+        return m.const_get(up) if m.const_defined?(up)
+      rescue StandardError
+        next
+      end
+      nil
+    end
+
+    def normalize_parquet_compression_name(requested)
+      return nil if requested.nil?
+
+      s = requested.to_s.downcase
+      return 'none' if s == 'none'
+      return 'gzip' if s == 'gzip'
+      return 'snappy' if s == 'snappy'
+      return 'zstd' if %w[zstd zstandard].include?(s)
+
+      nil
     end
   end
 end
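The compression plumbing added above leans on one duck-typed fallback pattern throughout: try the newest API shape first, swallow the error, and fall back on older shapes. A self-contained sketch of that pattern (the lambdas are stand-ins, not real red-parquet calls):

```ruby
# Return the result of the first attempt that succeeds, in order; raise if
# every attempt fails. Mirrors the attempts loop in create_arrow_file_writer.
def first_successful(attempts)
  attempts.each do |call|
    return call.call
  rescue StandardError
    next
  end
  raise 'all attempts failed'
end

result = first_successful([
  -> { raise NotImplementedError, 'newest API shape unavailable' },
  -> { :opened_with_legacy_signature }
])
puts result # => :opened_with_legacy_signature
```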