purplelight 0.1.10 → 0.1.11

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f0f51fd601a59915a2a022831663fd4f2468e781b68b96f59d396359be49adbc
4
- data.tar.gz: c899a18e7ce390bfc05f832dd32248aa8cbdc7b43bccf86197350e2c7929e7a6
3
+ metadata.gz: 80f6e48231b485750fc65529ada74a07758befba9b324b58ef372f077305a144
4
+ data.tar.gz: 4d7eed034f90155d2686da45a76caa73928cbe2d080d2031539830f6f4399cfb
5
5
  SHA512:
6
- metadata.gz: f6546911873ed22865b9d4cdd2cc62d855ab3b991030808d8f49f3e054727a406b80c7dc43c518a450915152f2934dcba180d53bf75807c540eef893b3ca50b8
7
- data.tar.gz: 5e7176eec64956388e72fd3d894db12e006a18edd4aade7eaf13b144381802932d7207ffefac6ad06157c03363f41acec4ca997871fb8abe8efc9e06e2238804
6
+ metadata.gz: 63f49c8dab688ec5cd922304b472ee448aaada1a3e2c113b7a4ddbe2092f3a3d2a83e1fe396066e87971514d8069831486c1a5ad972e604807c6c3289efd8e31
7
+ data.tar.gz: 6a281a23a0abf3244045b3e99af606881f3f85bab7eeabd7b4ca94e36c823d2405d7e85b60048ec62e5a7ace63fceddd268e185a831398cc7dbd00c213198961
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.10'
12
+ gem 'purplelight', '~> 0.1.11'
13
13
  ```
14
14
 
15
15
  Or install directly:
@@ -144,6 +144,29 @@ Purplelight.snapshot(
144
144
  )
145
145
  ```
146
146
 
147
+ Parquet multi-part (rows-based rotation):
148
+
149
+ ```ruby
150
+ Purplelight.snapshot(
151
+ client: client,
152
+ collection: 'users',
153
+ output: '/data/exports',
154
+ format: :parquet,
155
+ # Any mode other than :single_file enables multi-part filenames for Parquet
156
+ sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
157
+ # Split into multiple .parquet files, limiting rows per file
158
+ parquet_max_rows: 100_000,
159
+ # Optional: Parquet row group size (rows)
160
+ parquet_row_group: 10_000,
161
+ resume: { enabled: true }
162
+ )
163
+ ```
164
+
165
+ Notes for Parquet:
166
+ - Parquet multi-part sizing is controlled by rows via `parquet_max_rows`.
167
+ - `--rotate-mb` / `part_bytes` do not affect Parquet part size; they apply to JSONL/CSV.
168
+ - Use `sharding: { mode: :single_file }` to force a single `.parquet` file.
169
+
147
170
  ### Environment variables (optional)
148
171
 
149
172
  CLI flags take precedence, but these environment variables can set sensible defaults:
@@ -199,6 +222,8 @@ bundle exec bin/purplelight \
199
222
  Notes:
200
223
  - Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
201
224
  - `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
225
+ - Parquet multi-part sizing is programmatic via `parquet_max_rows`; there is no CLI flag for it.
226
+ - To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
202
227
 
203
228
  ### Architecture
204
229
 
@@ -236,6 +261,7 @@ Key points:
236
261
  - **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
237
262
  - **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
238
263
  - **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
264
+ - **Parquet parts (rows)**: split Parquet outputs by rows with `parquet_max_rows` (programmatic API). Set `sharding.mode` to anything other than `:single_file` to enable multi-part filenames.
239
265
  - **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
240
266
  - **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
241
267
  - **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
@@ -39,7 +39,8 @@ module Purplelight
39
39
  sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
40
40
  logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
41
41
  no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
42
- compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
42
+ compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil,
43
+ parquet_max_rows: nil)
43
44
  @client = client
44
45
  @collection = client[collection]
45
46
  @output = output
@@ -64,6 +65,7 @@ module Purplelight
64
65
  @writer_threads = writer_threads || 1
65
66
  @write_chunk_bytes = write_chunk_bytes
66
67
  @parquet_row_group = parquet_row_group
68
+ @parquet_max_rows = parquet_max_rows
67
69
 
68
70
  @running = true
69
71
  @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -106,6 +108,7 @@ module Purplelight
106
108
  compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
107
109
  write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
108
110
  parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
111
+ parquet_max_rows: @parquet_max_rows,
109
112
  sharding: @sharding,
110
113
  resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
111
114
  telemetry: @telemetry_enabled
@@ -134,7 +137,8 @@ module Purplelight
134
137
  single_file = @sharding && @sharding[:mode].to_s == 'single_file'
135
138
  row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
136
139
  WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
137
- manifest: manifest, single_file: single_file, row_group_size: row_group)
140
+ manifest: manifest, single_file: single_file, row_group_size: row_group,
141
+ rotate_rows: @parquet_max_rows)
138
142
  else
139
143
  raise ArgumentError, "format not implemented: #{@format}"
140
144
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.10'
4
+ VERSION = '0.1.11'
5
5
  end
@@ -15,7 +15,7 @@ module Purplelight
15
15
  DEFAULT_ROW_GROUP_SIZE = 10_000
16
16
 
17
17
  def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
18
- manifest: nil, single_file: true, schema: nil)
18
+ manifest: nil, single_file: true, schema: nil, rotate_rows: nil)
19
19
  @directory = directory
20
20
  @prefix = prefix
21
21
  @compression = compression
@@ -24,11 +24,13 @@ module Purplelight
24
24
  @manifest = manifest
25
25
  @single_file = single_file
26
26
  @schema = schema
27
+ @rotate_rows = rotate_rows
27
28
 
28
29
  @closed = false
29
30
  @file_seq = 0
30
31
  @part_index = nil
31
32
  @pq_writer = nil
33
+ @rows_in_current_file = 0
32
34
 
33
35
  ensure_dependencies!
34
36
  reset_buffers
@@ -44,9 +46,8 @@ module Purplelight
44
46
  def close
45
47
  return if @closed
46
48
 
47
- ensure_open!
48
49
  flush_all_row_groups
49
- finalize_current_part!
50
+ finalize_current_part! if @writer_path
50
51
  @closed = true
51
52
  end
52
53
 
@@ -70,6 +71,7 @@ module Purplelight
70
71
  FileUtils.mkdir_p(@directory)
71
72
  @writer_path = next_part_path
72
73
  @part_index = @manifest&.open_part!(@writer_path) if @manifest
74
+ @rows_in_current_file = 0
73
75
  end
74
76
 
75
77
  # No-op; we now write once on close for simplicity
@@ -108,6 +110,7 @@ module Purplelight
108
110
  end
109
111
 
110
112
  def finalize_current_part!
113
+ return if @writer_path.nil?
111
114
  if @pq_writer
112
115
  @pq_writer.close
113
116
  @pq_writer = nil
@@ -115,6 +118,8 @@ module Purplelight
115
118
  @manifest&.complete_part!(index: @part_index, checksum: nil)
116
119
  @file_seq += 1 unless @single_file
117
120
  @writer_path = nil
121
+ @part_index = nil
122
+ @rows_in_current_file = 0
118
123
  end
119
124
 
120
125
  def next_part_path
@@ -147,6 +152,7 @@ module Purplelight
147
152
  return if @buffer_docs.empty?
148
153
 
149
154
  while @buffer_docs.length >= @row_group_size
155
+ ensure_open!
150
156
  group = @buffer_docs.shift(@row_group_size)
151
157
  t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
152
158
  table = build_table(group)
@@ -155,6 +161,8 @@ module Purplelight
155
161
  t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
156
162
  write_table(table, @writer_path, append: true)
157
163
  Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
164
+ @rows_in_current_file += group.length
165
+ maybe_rotate!
158
166
  end
159
167
  end
160
168
 
@@ -171,9 +179,21 @@ module Purplelight
171
179
  Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
172
180
 
173
181
  t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
182
+ ensure_open!
174
183
  write_table(table, @writer_path, append: true)
175
184
  Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
176
185
  @buffer_docs.clear
186
+ @rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
187
+ @rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
188
+ maybe_rotate!
189
+ end
190
+
191
+ def maybe_rotate!
192
+ return if @single_file
193
+ return unless @rotate_rows && @rows_in_current_file >= @rotate_rows
194
+
195
+ finalize_current_part!
196
+ # Next write will open a new part
177
197
  end
178
198
  end
179
199
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.10
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson