purplelight 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +97 -9
- data/lib/purplelight/snapshot.rb +6 -2
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +77 -20
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80f6e48231b485750fc65529ada74a07758befba9b324b58ef372f077305a144
+  data.tar.gz: 4d7eed034f90155d2686da45a76caa73928cbe2d080d2031539830f6f4399cfb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 63f49c8dab688ec5cd922304b472ee448aaada1a3e2c113b7a4ddbe2092f3a3d2a83e1fe396066e87971514d8069831486c1a5ad972e604807c6c3289efd8e31
+  data.tar.gz: 6a281a23a0abf3244045b3e99af606881f3f85bab7eeabd7b4ca94e36c823d2405d7e85b60048ec62e5a7ace63fceddd268e185a831398cc7dbd00c213198961
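For readers who want to check these values locally, here is a minimal verification sketch (not part of the package; it assumes the gem was fetched beforehand with `gem fetch purplelight --version 0.1.11`) that compares the new SHA256 entries against the files inside the downloaded `.gem`:

```ruby
# Sketch: verify the SHA256 checksums listed above against a locally fetched gem.
require 'digest'
require 'rubygems/package'

expected = {
  'metadata.gz' => '80f6e48231b485750fc65529ada74a07758befba9b324b58ef372f077305a144',
  'data.tar.gz' => '4d7eed034f90155d2686da45a76caa73928cbe2d080d2031539830f6f4399cfb'
}

File.open('purplelight-0.1.11.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless expected.key?(entry.full_name)

    actual = Digest::SHA256.hexdigest(entry.read)
    status = actual == expected[entry.full_name] ? 'OK' : 'MISMATCH'
    puts "#{entry.full_name}: #{status}"
  end
end
```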
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.
+gem 'purplelight', '~> 0.1.11'
 ```
 
 Or install directly:
@@ -138,10 +138,44 @@ Purplelight.snapshot(
   output: '/data/exports',
   format: :parquet,
   sharding: { mode: :single_file, prefix: 'users' },
+  # Optional: tune row group size
+  # parquet_row_group: 50_000,
   resume: { enabled: true }
 )
 ```
 
+Parquet multi-part (rows-based rotation):
+
+```ruby
+Purplelight.snapshot(
+  client: client,
+  collection: 'users',
+  output: '/data/exports',
+  format: :parquet,
+  # Any mode other than :single_file enables multi-part filenames for Parquet
+  sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
+  # Split into multiple .parquet files, limiting rows per file
+  parquet_max_rows: 100_000,
+  # Optional: Parquet row group size (rows)
+  parquet_row_group: 10_000,
+  resume: { enabled: true }
+)
+```
+
+Notes for Parquet:
+- Parquet multi-part sizing is controlled by rows via `parquet_max_rows`.
+- `--rotate-mb` / `part_bytes` do not affect Parquet part size; they apply to JSONL/CSV.
+- Use `sharding: { mode: :single_file }` to force a single `.parquet` file.
+
+### Environment variables (optional)
+
+CLI flags take precedence, but these environment variables can set sensible defaults:
+
+- `PL_ZSTD_LEVEL`: default zstd compression level used by writers.
+- `PL_WRITE_CHUNK_BYTES`: JSONL join/write chunk size in bytes.
+- `PL_PARQUET_ROW_GROUP`: default Parquet row group size (rows).
+- `PL_TELEMETRY`: set to `1` to enable telemetry by default.
+
 ### CLI
 
 ```bash
@@ -149,9 +183,48 @@ bundle exec bin/purplelight \
   --uri "$MONGO_URL" \
   --db mydb --collection users \
   --output /data/exports \
-  --format jsonl --partitions 8 --by-size $((256*1024*1024)) --prefix users
+  --format jsonl --partitions 8 --by-size $((256*1024*1024)) --prefix users \
+  --queue-mb 512 --rotate-mb 512 --compression zstd --compression-level 6 \
+  --read-preference secondary --read-tags nodeType=ANALYTICS,region=EAST \
+  --read-concern majority --no-cursor-timeout true
 ```
 
+#### CLI options (reference)
+
+- `--uri URI` (required): Mongo connection string.
+- `--db NAME` (required): Database name.
+- `--collection NAME` (required): Collection name.
+- `--output PATH` (required): Output directory or file path.
+- `--format FORMAT`: `jsonl|csv|parquet` (default `jsonl`).
+- `--compression NAME`: `zstd|gzip|none` (default `zstd`).
+- `--compression-level N`: Compression level (zstd or gzip; writer-specific defaults if omitted).
+- `--partitions N`: Number of reader partitions (defaults to ≥4 and ≤32 based on cores).
+- `--batch-size N`: Mongo batch size (default 2000).
+- `--queue-mb MB`: In-memory queue size in MB (default 256).
+- `--rotate-mb MB`: Target rotate size for JSONL/CSV parts in MB (default 256). For multi-part outputs, pairs well with `--by-size`.
+- `--by-size BYTES`: Plan size-based sharding for multi-part outputs.
+- `--single-file`: Single output file (CSV/Parquet; JSONL remains multi-part).
+- `--prefix NAME`: Output filename prefix (defaults to collection name when output is a directory).
+- `--query JSON`: Filter as JSON/Extended JSON (supports `$date`, `$oid`, etc.).
+- `--projection JSON`: Projection as JSON, e.g. `{"_id":1,"email":1}`.
+- `--read-preference MODE`: `primary|primary_preferred|secondary|secondary_preferred|nearest`.
+- `--read-tags key=value[,key=value...]`: Tag sets for node pinning.
+- `--read-concern LEVEL`: `majority|local|linearizable|available|snapshot`.
+- `--no-cursor-timeout BOOL`: Toggle `noCursorTimeout` (default true).
+- `--parquet-row-group N`: Parquet row group size (rows).
+- `--write-chunk-mb MB`: JSONL encode/write chunk size before enqueueing.
+- `--writer-threads N` (experimental): Number of writer threads (JSONL only).
+- `--telemetry on|off`: Force enable/disable telemetry output.
+- `--resume-overwrite-incompatible`: Overwrite an existing incompatible manifest to safely resume anew.
+- `--dry-run`: Print effective read preference JSON and exit (no snapshot).
+- `--version`, `--help`: Utility commands.
+
+Notes:
+- Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
+- `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
+- Parquet multi-part sizing is programmatic via `parquet_max_rows`; there is no CLI flag for it.
+- To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
+
 ### Architecture
 
 ```mermaid
@@ -181,19 +254,29 @@ Key points:
 
 ### Tuning for performance
 
-- Partitions
-- Batch size
-- Queue size
-- Compression
-- Rotation size
--
+- **Partitions**: start with `2 × cores` (default). Increase gradually if reads are underutilized; too high can add overhead. CLI: `--partitions`.
+- **Batch size**: 2k–10k usually works well. Larger batches reduce cursor roundtrips, but can raise latency/memory. CLI: `--batch-size`.
+- **Queue size**: increase to 256–512MB to reduce backpressure on readers for fast disks. CLI: `--queue-mb`.
+- **Compression**: prefer `zstd`; adjust level to balance speed/ratio. CLI: `--compression zstd --compression-level N`. For max speed, try `--compression gzip --compression-level 1`.
+- **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
+- **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
+- **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
+- **Parquet parts (rows)**: split Parquet outputs by rows with `parquet_max_rows` (programmatic API). Set `sharding.mode` to anything other than `:single_file` to enable multi-part filenames.
+- **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
+- **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
+- **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
+- **Telemetry**: enable to inspect timing breakdowns; disable for minimal output. CLI: `--telemetry on|off`.
 
 Benchmarking (optional):
 
 ```bash
-# 1M docs benchmark with tunables
+# 1M docs benchmark with tunables (JSONL)
 BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip \
 bundle exec rspec spec/benchmark_perf_spec.rb --format doc
+
+# Parquet benchmark (requires Arrow/Parquet)
+BENCH=1 BENCH_FORMAT=parquet BENCH_PARQUET_ROW_GROUP=50000 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 \
+bundle exec rspec spec/benchmark_perf_spec.rb --format doc
 ```
 
 ### Read preference and node pinning
@@ -262,3 +345,8 @@ Benchmark results:
 Finished in 14.02 seconds (files took 0.31974 seconds to load)
 1 example, 0 failures
 ```
+
+Additional BENCH variables:
+
+- `BENCH_FORMAT`: `jsonl|parquet` (default `jsonl`).
+- `BENCH_PARQUET_ROW_GROUP`: Parquet row group size (rows), e.g. `50000`.
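Pulling the README additions together, here is a hedged usage sketch (illustrative values only; `MONGO_URL`, the collection name, and the output path are placeholders) that combines the new `PL_*` environment defaults with rows-based Parquet rotation. Explicit keyword arguments and CLI flags still take precedence over the environment values:

```ruby
# Illustrative only: environment-variable defaults plus Parquet multi-part rotation.
ENV['PL_ZSTD_LEVEL']        = '6'                    # default zstd level for writers
ENV['PL_WRITE_CHUNK_BYTES'] = (4 * 1024 * 1024).to_s # JSONL join/write chunk size
ENV['PL_PARQUET_ROW_GROUP'] = '50000'                # default Parquet row group (rows)
ENV['PL_TELEMETRY']         = '1'                    # enable telemetry by default

require 'mongo'
require 'purplelight'

client = Mongo::Client.new(ENV.fetch('MONGO_URL'))

Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  format: :parquet,
  sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
  parquet_max_rows: 100_000, # rows per .parquet part
  resume: { enabled: true }
)
```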
data/lib/purplelight/snapshot.rb
CHANGED
@@ -39,7 +39,8 @@ module Purplelight
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
                    no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
-                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil,
+                   parquet_max_rows: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -64,6 +65,7 @@ module Purplelight
       @writer_threads = writer_threads || 1
       @write_chunk_bytes = write_chunk_bytes
       @parquet_row_group = parquet_row_group
+      @parquet_max_rows = parquet_max_rows
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -106,6 +108,7 @@ module Purplelight
         compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
         write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
         parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+        parquet_max_rows: @parquet_max_rows,
         sharding: @sharding,
         resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
         telemetry: @telemetry_enabled
@@ -134,7 +137,8 @@ module Purplelight
         single_file = @sharding && @sharding[:mode].to_s == 'single_file'
         row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
         WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-
+                          manifest: manifest, single_file: single_file, row_group_size: row_group,
+                          rotate_rows: @parquet_max_rows)
       else
         raise ArgumentError, "format not implemented: #{@format}"
       end
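A condensed sketch (a simplification for illustration, not the library's actual class) of what this change plumbs through: the snapshot layer stores `parquet_max_rows` and hands it to the Parquet writer as `rotate_rows`, while `parquet_row_group` keeps its `PL_PARQUET_ROW_GROUP` fallback:

```ruby
# Simplified stand-in for the options the snapshot layer now builds for WriterParquet.
DEFAULT_ROW_GROUP_SIZE = 10_000

def parquet_writer_options(parquet_row_group: nil, parquet_max_rows: nil, sharding: { mode: :by_size })
  {
    single_file: sharding[:mode].to_s == 'single_file',
    row_group_size: parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || DEFAULT_ROW_GROUP_SIZE,
    # parquet_max_rows becomes the writer's rotate_rows; nil disables rows-based rotation
    rotate_rows: parquet_max_rows
  }
end

p parquet_writer_options(parquet_max_rows: 100_000)
# => {:single_file=>false, :row_group_size=>10000, :rotate_rows=>100000}
```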
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -15,7 +15,7 @@ module Purplelight
     DEFAULT_ROW_GROUP_SIZE = 10_000
 
     def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
-                   manifest: nil, single_file: true, schema: nil)
+                   manifest: nil, single_file: true, schema: nil, rotate_rows: nil)
       @directory = directory
       @prefix = prefix
       @compression = compression
@@ -24,10 +24,13 @@ module Purplelight
       @manifest = manifest
       @single_file = single_file
       @schema = schema
+      @rotate_rows = rotate_rows
 
       @closed = false
       @file_seq = 0
       @part_index = nil
+      @pq_writer = nil
+      @rows_in_current_file = 0
 
       ensure_dependencies!
       reset_buffers
@@ -36,23 +39,15 @@ module Purplelight
     def write_many(array_of_docs)
       ensure_open!
       array_of_docs.each { |doc| @buffer_docs << doc }
+      flush_row_groups_if_needed
       @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
     end
 
     def close
       return if @closed
 
-
-
-        t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
-        table = build_table(@buffer_docs)
-        Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
-
-        t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
-        write_table(table, @writer_path, append: false)
-        Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
-      end
-      finalize_current_part!
+      flush_all_row_groups
+      finalize_current_part! if @writer_path
       @closed = true
     end
 
@@ -76,6 +71,7 @@ module Purplelight
       FileUtils.mkdir_p(@directory)
       @writer_path = next_part_path
       @part_index = @manifest&.open_part!(@writer_path) if @manifest
+      @rows_in_current_file = 0
     end
 
     # No-op; we now write once on close for simplicity
@@ -92,25 +88,38 @@ module Purplelight
     end
 
     def write_table(table, path, append: false) # rubocop:disable Lint/UnusedMethodArgument
-      #
-      if
-
+      # Stream via ArrowFileWriter when available to avoid building huge tables
+      if defined?(Parquet::ArrowFileWriter)
+        unless @pq_writer
+          @pq_writer = Parquet::ArrowFileWriter.open(table.schema, path)
+        end
+        # Prefer passing row_group_size; fallback to single-arg for older APIs
+        begin
+          @pq_writer.write_table(table, @row_group_size)
+        rescue ArgumentError
+          @pq_writer.write_table(table)
+        end
         return
       end
-      # Fallback to
-      if
-
-        writer.write_table(table)
-        writer.close
+      # Fallback to one-shot save when streaming API is not available
+      if table.respond_to?(:save)
+        table.save(path, format: :parquet)
         return
       end
       raise 'Parquet writer not available in this environment'
     end
 
     def finalize_current_part!
+      return if @writer_path.nil?
+      if @pq_writer
+        @pq_writer.close
+        @pq_writer = nil
+      end
       @manifest&.complete_part!(index: @part_index, checksum: nil)
       @file_seq += 1 unless @single_file
       @writer_path = nil
+      @part_index = nil
+      @rows_in_current_file = 0
     end
 
     def next_part_path
@@ -138,5 +147,53 @@ module Purplelight
 
       value
     end
+
+    def flush_row_groups_if_needed
+      return if @buffer_docs.empty?
+
+      while @buffer_docs.length >= @row_group_size
+        ensure_open!
+        group = @buffer_docs.shift(@row_group_size)
+        t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+        table = build_table(group)
+        Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+        t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+        write_table(table, @writer_path, append: true)
+        Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+        @rows_in_current_file += group.length
+        maybe_rotate!
+      end
+    end
+
+    def flush_all_row_groups
+      return if @buffer_docs.empty?
+
+      # Flush any full groups first
+      flush_row_groups_if_needed
+      return if @buffer_docs.empty?
+
+      # Flush remaining as a final smaller group
+      t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+      table = build_table(@buffer_docs)
+      Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+      t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+      ensure_open!
+      write_table(table, @writer_path, append: true)
+      Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+      @buffer_docs.clear
+      @rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
+      @rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
+      maybe_rotate!
+    end
+
+    def maybe_rotate!
+      return if @single_file
+      return unless @rotate_rows && @rows_in_current_file >= @rotate_rows
+
+      finalize_current_part!
+      # Next write will open a new part
+    end
   end
 end
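To make the new writer behaviour concrete, here is a self-contained sketch (assumed names, no Arrow/Parquet dependency, not the gem's code): documents buffer until a full row group is available, full groups are flushed immediately, and a part is finalized once `rotate_rows` rows have been written to it, mirroring the `flush_row_groups_if_needed` / `maybe_rotate!` flow above.

```ruby
# Sketch of the row-group buffering and rows-based rotation policy (illustrative only).
class RowGroupRotatorSketch
  def initialize(row_group_size:, rotate_rows: nil, single_file: false)
    @row_group_size = row_group_size   # docs per flushed group
    @rotate_rows = rotate_rows         # rows per part before rotation (nil = never rotate)
    @single_file = single_file
    @buffer = []
    @rows_in_current_file = 0
    @file_seq = 0
    @events = []                       # records what a real writer would do
  end

  attr_reader :events

  def write_many(docs)
    @buffer.concat(docs)
    flush_full_groups
  end

  def close
    flush_full_groups
    write_group(@buffer.dup) unless @buffer.empty? # final smaller group
    @buffer.clear
    @events << [:finalize, @file_seq]
  end

  private

  def flush_full_groups
    while @buffer.length >= @row_group_size
      write_group(@buffer.shift(@row_group_size))
      maybe_rotate
    end
  end

  def write_group(group)
    @events << [:row_group, @file_seq, group.length]
    @rows_in_current_file += group.length
  end

  def maybe_rotate
    return if @single_file
    return unless @rotate_rows && @rows_in_current_file >= @rotate_rows

    @events << [:finalize, @file_seq]
    @file_seq += 1
    @rows_in_current_file = 0
  end
end

# Example: 10-row groups rotated every 20 rows across 45 documents.
rotator = RowGroupRotatorSketch.new(row_group_size: 10, rotate_rows: 20)
rotator.write_many((1..45).map { |i| { _id: i } })
rotator.close
rotator.events.each { |event| p event }
```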