purplelight 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +80 -15
- data/bin/purplelight +51 -4
- data/lib/purplelight/snapshot.rb +33 -7
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +20 -14
- data/lib/purplelight/writer_jsonl.rb +28 -19
- data/lib/purplelight/writer_parquet.rb +54 -17
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0f51fd601a59915a2a022831663fd4f2468e781b68b96f59d396359be49adbc
+  data.tar.gz: c899a18e7ce390bfc05f832dd32248aa8cbdc7b43bccf86197350e2c7929e7a6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f6546911873ed22865b9d4cdd2cc62d855ab3b991030808d8f49f3e054727a406b80c7dc43c518a450915152f2934dcba180d53bf75807c540eef893b3ca50b8
+  data.tar.gz: 5e7176eec64956388e72fd3d894db12e006a18edd4aade7eaf13b144381802932d7207ffefac6ad06157c03363f41acec4ca997871fb8abe8efc9e06e2238804
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.8'
+gem 'purplelight', '~> 0.1.10'
 ```
 
 Or install directly:
@@ -138,10 +138,21 @@ Purplelight.snapshot(
   output: '/data/exports',
   format: :parquet,
   sharding: { mode: :single_file, prefix: 'users' },
+  # Optional: tune row group size
+  # parquet_row_group: 50_000,
   resume: { enabled: true }
 )
 ```
 
+### Environment variables (optional)
+
+CLI flags take precedence, but these environment variables can set sensible defaults:
+
+- `PL_ZSTD_LEVEL`: default zstd compression level used by writers.
+- `PL_WRITE_CHUNK_BYTES`: JSONL join/write chunk size in bytes.
+- `PL_PARQUET_ROW_GROUP`: default Parquet row group size (rows).
+- `PL_TELEMETRY`: set to `1` to enable telemetry by default.
+
 ### CLI
 
 ```bash
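Context for the hunk above: "CLI flags take precedence" reduces to a nil-coalescing lookup over the `PL_*` variables, matching the `option || ENV[...]&.to_i` pattern visible later in this diff (snapshot.rb). A minimal sketch — the `100_000` fallback is an assumed stand-in for the writer's actual default, which this diff does not show:

```ruby
# Illustrative only: an explicit option wins, then the PL_* env default,
# then the library's built-in default (value here is an assumption).
def effective_row_group(explicit = nil, builtin_default = 100_000)
  explicit || ENV['PL_PARQUET_ROW_GROUP']&.to_i || builtin_default
end

ENV['PL_PARQUET_ROW_GROUP'] = '50000'
effective_row_group          # => 50000 (env default applies)
effective_row_group(25_000)  # => 25000 (explicit option wins)
```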
@@ -149,9 +160,46 @@ bundle exec bin/purplelight \
   --uri "$MONGO_URL" \
   --db mydb --collection users \
   --output /data/exports \
-  --format jsonl --partitions 8 --by-size $((256*1024*1024)) --prefix users
+  --format jsonl --partitions 8 --by-size $((256*1024*1024)) --prefix users \
+  --queue-mb 512 --rotate-mb 512 --compression zstd --compression-level 6 \
+  --read-preference secondary --read-tags nodeType=ANALYTICS,region=EAST \
+  --read-concern majority --no-cursor-timeout true
 ```
 
+#### CLI options (reference)
+
+- `--uri URI` (required): Mongo connection string.
+- `--db NAME` (required): Database name.
+- `--collection NAME` (required): Collection name.
+- `--output PATH` (required): Output directory or file path.
+- `--format FORMAT`: `jsonl|csv|parquet` (default `jsonl`).
+- `--compression NAME`: `zstd|gzip|none` (default `zstd`).
+- `--compression-level N`: Compression level (zstd or gzip; writer-specific defaults if omitted).
+- `--partitions N`: Number of reader partitions (defaults to ≥4 and ≤32 based on cores).
+- `--batch-size N`: Mongo batch size (default 2000).
+- `--queue-mb MB`: In-memory queue size in MB (default 256).
+- `--rotate-mb MB`: Target rotate size for JSONL/CSV parts in MB (default 256). For multi-part outputs, pairs well with `--by-size`.
+- `--by-size BYTES`: Plan size-based sharding for multi-part outputs.
+- `--single-file`: Single output file (CSV/Parquet; JSONL remains multi-part).
+- `--prefix NAME`: Output filename prefix (defaults to collection name when output is a directory).
+- `--query JSON`: Filter as JSON/Extended JSON (supports `$date`, `$oid`, etc.).
+- `--projection JSON`: Projection as JSON, e.g. `{"_id":1,"email":1}`.
+- `--read-preference MODE`: `primary|primary_preferred|secondary|secondary_preferred|nearest`.
+- `--read-tags key=value[,key=value...]`: Tag sets for node pinning.
+- `--read-concern LEVEL`: `majority|local|linearizable|available|snapshot`.
+- `--no-cursor-timeout BOOL`: Toggle `noCursorTimeout` (default true).
+- `--parquet-row-group N`: Parquet row group size (rows).
+- `--write-chunk-mb MB`: JSONL encode/write chunk size before enqueueing.
+- `--writer-threads N` (experimental): Number of writer threads (JSONL only).
+- `--telemetry on|off`: Force enable/disable telemetry output.
+- `--resume-overwrite-incompatible`: Overwrite an existing incompatible manifest to safely resume anew.
+- `--dry-run`: Print effective read preference JSON and exit (no snapshot).
+- `--version`, `--help`: Utility commands.
+
+Notes:
+- Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
+- `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
+
 ### Architecture
 
 ```mermaid
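The `--read-tags key=value[,key=value...]` format documented above maps comma-separated pairs to a tag hash. A standalone sketch of that parsing — `parse_read_tags` is a hypothetical helper name; the shipped CLI does the equivalent inline in its OptionParser block (visible in the bin/purplelight diff below):

```ruby
# Illustrative parsing of --read-tags "nodeType=ANALYTICS,region=EAST".
def parse_read_tags(value)
  tags = {}
  value.split(',').each do |pair|
    k, v = pair.split('=', 2)
    tags[k] = v if k && v
  end
  tags
end

parse_read_tags('nodeType=ANALYTICS,region=EAST')
# => {"nodeType"=>"ANALYTICS", "region"=>"EAST"}
```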
@@ -181,19 +229,28 @@ Key points:
 
 ### Tuning for performance
 
-- Partitions
-- Batch size
-- Queue size
-- Compression
-- Rotation size
-
+- **Partitions**: start with `2 × cores` (default). Increase gradually if reads are underutilized; too high can add overhead. CLI: `--partitions`.
+- **Batch size**: 2k–10k usually works well. Larger batches reduce cursor roundtrips, but can raise latency/memory. CLI: `--batch-size`.
+- **Queue size**: increase to 256–512MB to reduce backpressure on readers for fast disks. CLI: `--queue-mb`.
+- **Compression**: prefer `zstd`; adjust level to balance speed/ratio. CLI: `--compression zstd --compression-level N`. For max speed, try `--compression gzip --compression-level 1`.
+- **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
+- **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
+- **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
+- **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
+- **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
+- **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
+- **Telemetry**: enable to inspect timing breakdowns; disable for minimal output. CLI: `--telemetry on|off`.
 
 Benchmarking (optional):
 
 ```bash
-# 1M docs benchmark with tunables
+# 1M docs benchmark with tunables (JSONL)
 BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip \
 bundle exec rspec spec/benchmark_perf_spec.rb --format doc
+
+# Parquet benchmark (requires Arrow/Parquet)
+BENCH=1 BENCH_FORMAT=parquet BENCH_PARQUET_ROW_GROUP=50000 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 \
+bundle exec rspec spec/benchmark_perf_spec.rb --format doc
 ```
 
 ### Read preference and node pinning
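The "2 × cores" default in the Partitions bullet matches the CLI's own fallback, which appears verbatim later in this diff (bin/purplelight). As a runnable sketch:

```ruby
require 'etc'

# Default partition count when --partitions is not given:
# twice the core count, but never fewer than 4.
partitions = Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4
```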
@@ -248,14 +305,22 @@ bundle exec bin/purplelight \
 
 ### Quick Benchmark
 ```
-%
+% BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip bundle exec rspec spec/benchmark_perf_spec.rb --format doc
 
 Performance benchmark (1M docs, gated by BENCH=1)
-W, [2025-09-03T16:10:40.437304 #33546] WARN -- : MONGODB | Error checking 127.0.0.1:27018: Mongo::Error::SocketError: Errno::ECONNREFUSED: Connection refused - connect(2) for 127.0.0.1:27018 (for 127.0.0.1:27018 (no TLS)) (on 127.0.0.1:27018)
 Benchmark results:
-Inserted: 1000000 docs in 8.
-Exported: 1000000 docs in
-Parts: 1, Bytes:
-Throughput:
+Inserted: 1000000 docs in 8.13s
+Exported: 1000000 docs in 4.03s
+Parts: 1, Bytes: 10625336
+Throughput: 248241.7 docs/s, 2.52 MB/s
 Settings: partitions=16, batch_size=8000, queue_mb=512, rotate_mb=512, compression=gzip
+exports 1,000,000 documents and reports throughput
+
+Finished in 14.02 seconds (files took 0.31974 seconds to load)
+1 example, 0 failures
 ```
+
+Additional BENCH variables:
+
+- `BENCH_FORMAT`: `jsonl|parquet` (default `jsonl`).
+- `BENCH_PARQUET_ROW_GROUP`: Parquet row group size (rows), e.g. `50000`.
data/bin/purplelight
CHANGED
@@ -18,7 +18,17 @@ options = {
   resume: { enabled: true },
   read_preference: nil,
   read_tags: nil,
-  dry_run: false
+  dry_run: false,
+  queue_size_bytes: nil,
+  rotate_bytes: nil,
+  compression_level: nil,
+  writer_threads: nil,
+  write_chunk_bytes: nil,
+  parquet_row_group: nil,
+  telemetry_flag: nil,
+  read_concern: nil,
+  no_cursor_timeout: nil,
+  resume_overwrite_incompatible: false
 }
 
 parser = OptionParser.new do |opts|
@@ -30,8 +40,11 @@ parser = OptionParser.new do |opts|
   opts.on('-o', '--output PATH', 'Output directory or file (required)') { |v| options[:output] = v }
   opts.on('-f', '--format FORMAT', 'Format: jsonl|csv|parquet (default jsonl)') { |v| options[:format] = v.to_sym }
   opts.on('--compression NAME', 'Compression: zstd|gzip|none') { |v| options[:compression] = v.to_sym }
+  opts.on('--compression-level N', Integer, 'Compression level for zstd/gzip (JSONL/CSV)') { |v| options[:compression_level] = v }
   opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
   opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
+  opts.on('--queue-mb MB', Integer, 'Queue size in MB (default 256)') { |v| options[:queue_size_bytes] = v * 1024 * 1024 }
+  opts.on('--rotate-mb MB', Integer, 'Rotate part size in MB (default 256)') { |v| options[:rotate_bytes] = v * 1024 * 1024 }
   opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
     options[:sharding] ||= {}
     options[:sharding][:mode] = :by_size
@@ -45,6 +58,9 @@ parser = OptionParser.new do |opts|
     options[:sharding] ||= {}
     options[:sharding][:prefix] = v
   end
+  opts.on('--writer-threads N', Integer, 'Number of writer threads (experimental, JSONL only)') { |v| options[:writer_threads] = v }
+  opts.on('--write-chunk-mb MB', Integer, 'JSONL encode/write chunk size in MB') { |v| options[:write_chunk_bytes] = v * 1024 * 1024 }
+  opts.on('--parquet-row-group N', Integer, 'Parquet row group size (rows)') { |v| options[:parquet_row_group] = v }
   opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
     # Prefer BSON Extended JSON to support $date, $oid, etc.
     options[:query] = BSON::ExtJSON.parse(v)
@@ -57,6 +73,7 @@ parser = OptionParser.new do |opts|
           'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
     options[:read_preference] = v.to_sym
   end
+  opts.on('--read-concern LEVEL', 'Read concern: majority|local|linearizable|available|snapshot') { |v| options[:read_concern] = v.to_sym }
   opts.on('--read-tags TAGS',
           'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
     tags = {}
@@ -69,6 +86,13 @@ parser = OptionParser.new do |opts|
     options[:read_tags] = tags unless tags.empty?
   end
   opts.on('--dry-run', 'Parse options and print effective read preference JSON, then exit') { options[:dry_run] = true }
+  opts.on('--telemetry MODE', 'Telemetry on|off (overrides PL_TELEMETRY)') { |v| options[:telemetry_flag] = v }
+  opts.on('--no-cursor-timeout BOOL', 'noCursorTimeout true|false (default true)') do |v|
+    options[:no_cursor_timeout] = %w[true 1 yes].include?(v.to_s.downcase)
+  end
+  opts.on('--resume-overwrite-incompatible', 'Overwrite incompatible existing manifest on resume') do
+    options[:resume_overwrite_incompatible] = true
+  end
   opts.on('--version', 'Show version') do
     puts Purplelight::VERSION
     exit 0
@@ -110,7 +134,7 @@ end
 client = Mongo::Client.new(options[:uri])
 options[:partitions] ||= (Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4)
 
-ok = Purplelight.snapshot(
+snapshot_args = {
   client: client.use(options[:db]),
   collection: options[:collection],
   output: options[:output],
@@ -122,8 +146,31 @@ ok = Purplelight.snapshot(
   projection: options[:projection],
   sharding: options[:sharding],
   read_preference: effective_read || options[:read_preference],
-  resume: { enabled: true },
+  resume: { enabled: true, overwrite_incompatible: options[:resume_overwrite_incompatible] },
   on_progress: ->(s) { warn("progress: #{s.to_json}") }
-)
+}
+
+# optional tunables
+snapshot_args[:queue_size_bytes] = options[:queue_size_bytes] if options[:queue_size_bytes]
+snapshot_args[:rotate_bytes] = options[:rotate_bytes] if options[:rotate_bytes]
+snapshot_args[:read_concern] = options[:read_concern] if options[:read_concern]
+snapshot_args[:no_cursor_timeout] = options[:no_cursor_timeout] unless options[:no_cursor_timeout].nil?
+snapshot_args[:compression_level] = options[:compression_level] if options[:compression_level]
+snapshot_args[:writer_threads] = options[:writer_threads] if options[:writer_threads]
+snapshot_args[:write_chunk_bytes] = options[:write_chunk_bytes] if options[:write_chunk_bytes]
+snapshot_args[:parquet_row_group] = options[:parquet_row_group] if options[:parquet_row_group]
+
+# telemetry env override
+if options[:telemetry_flag]
+  ENV['PL_TELEMETRY'] = (options[:telemetry_flag].to_s.downcase == 'on' ? '1' : '0')
+end
+
+# writer-specific overrides via environment for v1 compatibility
+ENV['PL_ZSTD_LEVEL'] = options[:compression_level].to_s if options[:compression_level]
+ENV['PL_WRITE_CHUNK_BYTES'] = options[:write_chunk_bytes].to_s if options[:write_chunk_bytes]
+ENV['PL_PARQUET_ROW_GROUP'] = options[:parquet_row_group].to_s if options[:parquet_row_group]
+ENV['PL_WRITER_THREADS'] = options[:writer_threads].to_s if options[:writer_threads]
+
+ok = Purplelight.snapshot(**snapshot_args)
 
 exit(ok ? 0 : 1)
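One detail worth noting in the wiring above: tunables are forwarded only when explicitly set, so the library's own defaults survive when a flag was omitted. A reduced sketch of that pattern (names illustrative):

```ruby
# Forward only explicitly-set options so library defaults still apply.
options = { queue_size_bytes: nil, no_cursor_timeout: false }

snapshot_args = {}
snapshot_args[:queue_size_bytes] = options[:queue_size_bytes] if options[:queue_size_bytes]
# Booleans need a nil check, not truthiness: `false` is a real setting.
snapshot_args[:no_cursor_timeout] = options[:no_cursor_timeout] unless options[:no_cursor_timeout].nil?

snapshot_args # => { no_cursor_timeout: false }
```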
data/lib/purplelight/snapshot.rb
CHANGED
@@ -38,7 +38,8 @@ module Purplelight
                    resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
-                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil)
+                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -59,6 +60,10 @@
       @read_concern = read_concern
       @read_preference = read_preference
       @no_cursor_timeout = no_cursor_timeout
+      @compression_level = compression_level
+      @writer_threads = writer_threads || 1
+      @write_chunk_bytes = write_chunk_bytes
+      @parquet_row_group = parquet_row_group
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -90,7 +95,20 @@
       end
 
       manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
-                            partitions: @partitions,
+                            partitions: @partitions,
+                            batch_size: @batch_size,
+                            queue_size_bytes: @queue_size_bytes,
+                            rotate_bytes: @rotate_bytes,
+                            hint: @hint,
+                            read_concern: (@read_concern.is_a?(Hash) ? @read_concern : { level: @read_concern }),
+                            no_cursor_timeout: @no_cursor_timeout,
+                            writer_threads: @writer_threads,
+                            compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
+                            write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
+                            parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+                            sharding: @sharding,
+                            resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
+                            telemetry: @telemetry_enabled
                           })
       manifest.ensure_partitions!(@partitions)
 
@@ -114,8 +132,9 @@
                              logger: @logger, manifest: manifest, single_file: single_file)
       when :parquet
         single_file = @sharding && @sharding[:mode].to_s == 'single_file'
+        row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
         WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-                          manifest: manifest, single_file: single_file)
+                          manifest: manifest, single_file: single_file, row_group_size: row_group)
       else
         raise ArgumentError, "format not implemented: #{@format}"
       end
@@ -225,6 +244,11 @@
       string_batch = +''
       buffer = []
       buffer_bytes = 0
+      json_state = if encode_lines
+                     JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false,
+                                                     buffer_initial_length: 4_096)
+                   end
+      size_state = encode_lines ? nil : JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
       last_id = checkpoint
       begin
         cursor.each do |doc|
@@ -232,13 +256,15 @@
           doc = @mapper.call(doc) if @mapper
           t_ser = telemetry.start(:serialize_time)
           if encode_lines
-
+            json = json_state.generate(doc)
             telemetry.finish(:serialize_time, t_ser)
-
-            string_batch <<
+            string_batch << json
+            string_batch << "\n"
+            bytes = json.bytesize + 1
           else
             # For CSV/Parquet keep raw docs to allow schema/row building
-
+            json = size_state.generate(doc)
+            bytes = json.bytesize + 1
             telemetry.finish(:serialize_time, t_ser)
             buffer << doc
           end
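Taken together, the new keyword arguments above extend the public `Purplelight.snapshot` API. A sketch of a call that exercises them — the connection string, database, and tunable values here are illustrative; all of the new tunables are optional and fall back to `PL_*` environment variables or library defaults when omitted:

```ruby
require 'mongo'
require 'purplelight'

client = Mongo::Client.new(ENV.fetch('MONGO_URL')).use('mydb')

Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  format: :parquet,
  sharding: { mode: :single_file, prefix: 'users' },
  parquet_row_group: 50_000,  # rows per Parquet row group
  compression_level: 6,       # zstd/gzip level (JSONL/CSV writers)
  resume: { enabled: true, overwrite_incompatible: true }
)
```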
data/lib/purplelight/version.rb
CHANGED
-  VERSION = '0.1.8'
+  VERSION = '0.1.10'

data/lib/purplelight/writer_csv.rb
CHANGED
@@ -5,16 +5,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; fallback handled later via gzip
-end
-
 begin
   require 'zstd-ruby'
 rescue LoadError
-
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback used later
+  end
 end
 
 module Purplelight
@@ -200,14 +198,22 @@ module Purplelight
     end
 
     def determine_effective_compression(requested)
-
-
-
-
-
-
-      :
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
       end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
     end
 
     def infer_columns(docs)
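The rewritten selection logic above is self-contained enough to test in isolation. A standalone sketch of the same ordering — `pick_compression` is a hypothetical name; the writers keep this as a private method:

```ruby
# Backend order now shared by the writers:
# explicit request -> zstd-ruby (::Zstd::StreamWriter) -> zstds (ZSTDS) -> gzip.
def pick_compression(requested)
  req = requested.to_s
  return :none if req == 'none'
  return :gzip if req == 'gzip'

  has_zstd = (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) || defined?(ZSTDS)
  return :zstd if has_zstd

  :gzip # covers both an explicit 'zstd' request with no backend and auto-select
end

pick_compression(:zstd) # => :zstd if a backend loaded, else :gzip
```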
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -4,16 +4,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; will fallback to gzip
-end
-
 begin
   require 'zstd-ruby'
 rescue LoadError
-
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback
+  end
 end
 
 module Purplelight
@@ -40,6 +38,7 @@ module Purplelight
       @closed = false
 
       @effective_compression = determine_effective_compression(@compression)
+      @json_state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
       if @logger
         level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
         @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
@@ -86,20 +85,22 @@
       chunk = +''
       chunk_bytes = 0
       batch.each do |doc|
-
+        json = @json_state.generate(doc)
         rows += 1
-
-
+        bytes = json.bytesize + 1
+        chunk << json
+        chunk << "\n"
+        chunk_bytes += bytes
         next unless chunk_bytes >= chunk_threshold
 
         write_buffer(chunk)
-        total_bytes +=
+        total_bytes += chunk_bytes
         chunk = +''
         chunk_bytes = 0
       end
       unless chunk.empty?
         write_buffer(chunk)
-        total_bytes +=
+        total_bytes += chunk_bytes
       end
     end
 
@@ -198,14 +199,22 @@
     end
 
     def determine_effective_compression(requested)
-
-
-
-
-
-
-      :
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
      end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
     end
   end
 end
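The `JSON::Ext::Generator::State` reuse above avoids re-parsing generator options for every document, compared with calling `doc.to_json` in a tight loop. A minimal sketch of the technique:

```ruby
require 'json'

# One reusable generator State for a whole batch of documents.
state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)

docs = [{ _id: 1, email: 'a@example.com' }, { _id: 2, email: 'b@example.com' }]

chunk = +''
docs.each do |doc|
  json = state.generate(doc)
  chunk << json << "\n" # newline-delimited JSON; accounts for bytesize + 1 per row
end
chunk
```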
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -28,6 +28,7 @@ module Purplelight
     @closed = false
     @file_seq = 0
     @part_index = nil
+    @pq_writer = nil
 
     ensure_dependencies!
     reset_buffers
@@ -36,6 +37,7 @@ module Purplelight
   def write_many(array_of_docs)
     ensure_open!
     array_of_docs.each { |doc| @buffer_docs << doc }
+    flush_row_groups_if_needed
     @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
   end
 
@@ -43,15 +45,7 @@
     return if @closed
 
     ensure_open!
-
-    t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
-    table = build_table(@buffer_docs)
-    Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
-
-    t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
-    write_table(table, @writer_path, append: false)
-    Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
-    end
+    flush_all_row_groups
     finalize_current_part!
     @closed = true
   end
@@ -92,22 +86,32 @@
     end
 
     def write_table(table, path, append: false) # rubocop:disable Lint/UnusedMethodArgument
-      #
-      if
-
+      # Stream via ArrowFileWriter when available to avoid building huge tables
+      if defined?(Parquet::ArrowFileWriter)
+        unless @pq_writer
+          @pq_writer = Parquet::ArrowFileWriter.open(table.schema, path)
+        end
+        # Prefer passing row_group_size; fallback to single-arg for older APIs
+        begin
+          @pq_writer.write_table(table, @row_group_size)
+        rescue ArgumentError
+          @pq_writer.write_table(table)
+        end
         return
       end
-      # Fallback to
-      if
-
-        writer.write_table(table)
-        writer.close
+      # Fallback to one-shot save when streaming API is not available
+      if table.respond_to?(:save)
+        table.save(path, format: :parquet)
         return
       end
       raise 'Parquet writer not available in this environment'
     end
 
     def finalize_current_part!
+      if @pq_writer
+        @pq_writer.close
+        @pq_writer = nil
+      end
       @manifest&.complete_part!(index: @part_index, checksum: nil)
       @file_seq += 1 unless @single_file
       @writer_path = nil
@@ -138,5 +142,38 @@
 
       value
     end
+
+    def flush_row_groups_if_needed
+      return if @buffer_docs.empty?
+
+      while @buffer_docs.length >= @row_group_size
+        group = @buffer_docs.shift(@row_group_size)
+        t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+        table = build_table(group)
+        Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+        t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+        write_table(table, @writer_path, append: true)
+        Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+      end
+    end
+
+    def flush_all_row_groups
+      return if @buffer_docs.empty?
+
+      # Flush any full groups first
+      flush_row_groups_if_needed
+      return if @buffer_docs.empty?
+
+      # Flush remaining as a final smaller group
+      t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
+      table = build_table(@buffer_docs)
+      Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
+
+      t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+      write_table(table, @writer_path, append: true)
+      Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+      @buffer_docs.clear
+    end
   end
 end
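The row-group flushing added above bounds memory by writing a table every `@row_group_size` documents instead of building one giant table at close. A simplified model of the control flow — `GroupedFlusher` is a hypothetical class, deliberately generic and not tied to Arrow:

```ruby
# Simplified model of the new flush strategy: emit fixed-size groups as
# data arrives, then one smaller trailing group at close.
class GroupedFlusher
  def initialize(row_group_size, &sink)
    @row_group_size = row_group_size
    @buffer = []
    @sink = sink # receives each group, e.g. build_table + write_table
  end

  def push(docs)
    @buffer.concat(docs)
    @sink.call(@buffer.shift(@row_group_size)) while @buffer.length >= @row_group_size
  end

  def close
    @sink.call(@buffer) unless @buffer.empty?
    @buffer.clear
  end
end

groups = []
f = GroupedFlusher.new(3) { |g| groups << g.length }
f.push([1, 2, 3, 4]) # flushes one full group of 3
f.close              # flushes trailing group of 1
groups # => [3, 1]
```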