purplelight 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -1
- data/lib/purplelight/snapshot.rb +6 -2
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +23 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80f6e48231b485750fc65529ada74a07758befba9b324b58ef372f077305a144
+  data.tar.gz: 4d7eed034f90155d2686da45a76caa73928cbe2d080d2031539830f6f4399cfb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 63f49c8dab688ec5cd922304b472ee448aaada1a3e2c113b7a4ddbe2092f3a3d2a83e1fe396066e87971514d8069831486c1a5ad972e604807c6c3289efd8e31
+  data.tar.gz: 6a281a23a0abf3244045b3e99af606881f3f85bab7eeabd7b4ca94e36c823d2405d7e85b60048ec62e5a7ace63fceddd268e185a831398cc7dbd00c213198961
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.10'
+gem 'purplelight', '~> 0.1.11'
 ```
 
 Or install directly:
@@ -144,6 +144,29 @@ Purplelight.snapshot(
 )
 ```
 
+Parquet multi-part (rows-based rotation):
+
+```ruby
+Purplelight.snapshot(
+  client: client,
+  collection: 'users',
+  output: '/data/exports',
+  format: :parquet,
+  # Any mode other than :single_file enables multi-part filenames for Parquet
+  sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
+  # Split into multiple .parquet files, limiting rows per file
+  parquet_max_rows: 100_000,
+  # Optional: Parquet row group size (rows)
+  parquet_row_group: 10_000,
+  resume: { enabled: true }
+)
+```
+
+Notes for Parquet:
+- Parquet multi-part sizing is controlled by rows via `parquet_max_rows`.
+- `--rotate-mb` / `part_bytes` do not affect Parquet part size; they apply to JSONL/CSV.
+- Use `sharding: { mode: :single_file }` to force a single `.parquet` file.
+
 ### Environment variables (optional)
 
 CLI flags take precedence, but these environment variables can set sensible defaults:
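As the Parquet notes in the hunk above mention, `sharding: { mode: :single_file }` forces a single `.parquet` output. A minimal sketch of that call, reusing the client/collection names from the README example (output path and prefix are illustrative):

```ruby
# Single-file Parquet: no rows-based rotation, one output part.
Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  format: :parquet,
  sharding: { mode: :single_file, prefix: 'users_parquet' },  # one .parquet file
  parquet_row_group: 10_000                                   # row groups still apply inside the file
)
```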
@@ -199,6 +222,8 @@ bundle exec bin/purplelight \
 Notes:
 - Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
 - `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
+- Parquet multi-part sizing is programmatic via `parquet_max_rows`; there is no CLI flag for it.
+- To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
 
 ### Architecture
 
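The `maxPoolSize` note added above applies equally to the programmatic API, where the Mongo client is passed in as `client:`. A sketch using the mongo gem's URI options (host and database names are hypothetical):

```ruby
require 'mongo'

# Rule of thumb from the note above: maxPoolSize at least the number of partitions you read with.
client = Mongo::Client.new('mongodb://mongo.example.com:27017/app?maxPoolSize=32')

# The same client is then handed to Purplelight.snapshot(client: client, ...).
```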
@@ -236,6 +261,7 @@ Key points:
 - **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
 - **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
 - **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
+- **Parquet parts (rows)**: split Parquet outputs by rows with `parquet_max_rows` (programmatic API). Set `sharding.mode` to anything other than `:single_file` to enable multi-part filenames.
 - **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
 - **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
 - **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
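The rotation and chunking bullets above correspond to programmatic options visible in the `snapshot.rb` diff below (`sharding[:part_bytes]`, `write_chunk_bytes`). A sketch for byte-based rotation, assuming `:jsonl` is the JSONL format symbol and with illustrative sizes:

```ruby
# Byte-based rotation applies to JSONL/CSV parts; Parquet parts rotate by rows instead.
Purplelight.snapshot(
  client: client,
  collection: 'events',
  output: '/data/exports',
  format: :jsonl,
  sharding: { mode: :by_size, part_bytes: 512 * 1024 * 1024 },  # ~512MB per part
  write_chunk_bytes: 8 * 1024 * 1024                            # 8MB builder write chunks
)
```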
data/lib/purplelight/snapshot.rb
CHANGED
@@ -39,7 +39,8 @@ module Purplelight
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
                    no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
-                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil,
+                   parquet_max_rows: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -64,6 +65,7 @@ module Purplelight
       @writer_threads = writer_threads || 1
       @write_chunk_bytes = write_chunk_bytes
       @parquet_row_group = parquet_row_group
+      @parquet_max_rows = parquet_max_rows
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -106,6 +108,7 @@ module Purplelight
         compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
         write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
         parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+        parquet_max_rows: @parquet_max_rows,
         sharding: @sharding,
         resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
         telemetry: @telemetry_enabled
@@ -134,7 +137,8 @@ module Purplelight
         single_file = @sharding && @sharding[:mode].to_s == 'single_file'
         row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
         WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-                          manifest: manifest, single_file: single_file, row_group_size: row_group)
+                          manifest: manifest, single_file: single_file, row_group_size: row_group,
+                          rotate_rows: @parquet_max_rows)
       else
         raise ArgumentError, "format not implemented: #{@format}"
       end
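For reference, the wiring above means a Parquet snapshot now builds its writer roughly as sketched below. `Snapshot` does this internally, so constructing `WriterParquet` directly is not part of the public API; directory, prefix, and row counts are illustrative:

```ruby
row_group = ENV['PL_PARQUET_ROW_GROUP']&.to_i || Purplelight::WriterParquet::DEFAULT_ROW_GROUP_SIZE

writer = Purplelight::WriterParquet.new(
  directory: '/data/exports',
  prefix: 'users_parquet_parts',
  compression: :zstd,
  manifest: nil,
  single_file: false,        # any sharding mode other than :single_file
  row_group_size: row_group,
  rotate_rows: 100_000       # the value passed as parquet_max_rows
)
```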
data/lib/purplelight/version.rb
CHANGED
-  VERSION = '0.1.10'
+  VERSION = '0.1.11'

data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -15,7 +15,7 @@ module Purplelight
     DEFAULT_ROW_GROUP_SIZE = 10_000
 
     def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
-                   manifest: nil, single_file: true, schema: nil)
+                   manifest: nil, single_file: true, schema: nil, rotate_rows: nil)
       @directory = directory
       @prefix = prefix
       @compression = compression
@@ -24,11 +24,13 @@ module Purplelight
       @manifest = manifest
       @single_file = single_file
       @schema = schema
+      @rotate_rows = rotate_rows
 
       @closed = false
       @file_seq = 0
       @part_index = nil
       @pq_writer = nil
+      @rows_in_current_file = 0
 
       ensure_dependencies!
       reset_buffers
@@ -44,9 +46,8 @@ module Purplelight
     def close
       return if @closed
 
-      ensure_open!
       flush_all_row_groups
-      finalize_current_part!
+      finalize_current_part! if @writer_path
       @closed = true
     end
 
@@ -70,6 +71,7 @@ module Purplelight
       FileUtils.mkdir_p(@directory)
       @writer_path = next_part_path
       @part_index = @manifest&.open_part!(@writer_path) if @manifest
+      @rows_in_current_file = 0
     end
 
     # No-op; we now write once on close for simplicity
@@ -108,6 +110,7 @@ module Purplelight
     end
 
     def finalize_current_part!
+      return if @writer_path.nil?
       if @pq_writer
         @pq_writer.close
         @pq_writer = nil
@@ -115,6 +118,8 @@ module Purplelight
       @manifest&.complete_part!(index: @part_index, checksum: nil)
       @file_seq += 1 unless @single_file
       @writer_path = nil
+      @part_index = nil
+      @rows_in_current_file = 0
     end
 
     def next_part_path
@@ -147,6 +152,7 @@ module Purplelight
       return if @buffer_docs.empty?
 
       while @buffer_docs.length >= @row_group_size
+        ensure_open!
         group = @buffer_docs.shift(@row_group_size)
         t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
         table = build_table(group)
@@ -155,6 +161,8 @@ module Purplelight
         t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
         write_table(table, @writer_path, append: true)
         Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
+        @rows_in_current_file += group.length
+        maybe_rotate!
       end
     end
 
@@ -171,9 +179,21 @@ module Purplelight
       Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
 
       t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
+      ensure_open!
       write_table(table, @writer_path, append: true)
       Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
       @buffer_docs.clear
+      @rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
+      @rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
+      maybe_rotate!
+    end
+
+    def maybe_rotate!
+      return if @single_file
+      return unless @rotate_rows && @rows_in_current_file >= @rotate_rows
+
+      finalize_current_part!
+      # Next write will open a new part
     end
   end
 end
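Taken together, the writer changes implement a simple rotate-by-rows pattern: count rows written into the current part, finalize the part once the threshold is reached, and let the next write open a fresh part. A standalone sketch of that pattern (hypothetical class, not part of the gem):

```ruby
# Minimal rotate-by-rows writer: not purplelight's API, just the pattern in isolation.
class RowRotatingWriter
  def initialize(rotate_rows:, single_file: false)
    @rotate_rows = rotate_rows
    @single_file = single_file
    @rows_in_current_file = 0
    @file_seq = 0
    @current = nil
  end

  def write(rows)
    ensure_open!
    @current.concat(rows)                 # stand-in for writing a Parquet row group
    @rows_in_current_file += rows.length
    maybe_rotate!
  end

  def close
    finalize_current_part! if @current
  end

  private

  def ensure_open!
    return if @current

    @current = []                         # stand-in for opening part_<seq>.parquet
    @rows_in_current_file = 0
  end

  def maybe_rotate!
    return if @single_file
    return unless @rotate_rows && @rows_in_current_file >= @rotate_rows

    finalize_current_part!                # next write opens a new part
  end

  def finalize_current_part!
    @file_seq += 1 unless @single_file
    @current = nil
    @rows_in_current_file = 0
  end
end
```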
|