purplelight 0.1.10 → 0.1.12

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f0f51fd601a59915a2a022831663fd4f2468e781b68b96f59d396359be49adbc
4
- data.tar.gz: c899a18e7ce390bfc05f832dd32248aa8cbdc7b43bccf86197350e2c7929e7a6
3
+ metadata.gz: 920b534dc9ac832d83600031277ddd35da2920cff494e0f96d0ca230652d4ba4
4
+ data.tar.gz: 2fd4476e73efc67d1f4a722dae8d7759ac8b51b5c21546dfb4b06d9fc1cd3934
5
5
  SHA512:
6
- metadata.gz: f6546911873ed22865b9d4cdd2cc62d855ab3b991030808d8f49f3e054727a406b80c7dc43c518a450915152f2934dcba180d53bf75807c540eef893b3ca50b8
7
- data.tar.gz: 5e7176eec64956388e72fd3d894db12e006a18edd4aade7eaf13b144381802932d7207ffefac6ad06157c03363f41acec4ca997871fb8abe8efc9e06e2238804
6
+ metadata.gz: 370660a815b47c4aa4a0725a6188d6e0455074232000a24c4f909a313c3b9c2d5d3219a17edb628df58f167115b317347c159cc93d5bd59e216d6de1ec7ecd77
7
+ data.tar.gz: efec787b1e355af50ec07e45b8c125d0f1054640734afbe7790a3a35181f999900d3c843104911eabcea644d2dc7f4604050571e17469f08fe97d9fa5aec92e9
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.10'
12
+ gem 'purplelight', '~> 0.1.12'
13
13
  ```
14
14
 
15
15
  Or install directly:
@@ -144,6 +144,29 @@ Purplelight.snapshot(
144
144
  )
145
145
  ```
146
146
 
147
+ Parquet multi-part (rows-based rotation):
148
+
149
+ ```ruby
150
+ Purplelight.snapshot(
151
+ client: client,
152
+ collection: 'users',
153
+ output: '/data/exports',
154
+ format: :parquet,
155
+ # Any mode other than :single_file enables multi-part filenames for Parquet
156
+ sharding: { mode: :by_size, prefix: 'users_parquet_parts' },
157
+ # Split into multiple .parquet files, limiting rows per file
158
+ parquet_max_rows: 100_000,
159
+ # Optional: Parquet row group size (rows)
160
+ parquet_row_group: 10_000,
161
+ resume: { enabled: true }
162
+ )
163
+ ```
164
+
165
+ Notes for Parquet:
166
+ - Parquet multi-part sizing is controlled by rows via `parquet_max_rows`.
167
+ - `--rotate-mb` / `part_bytes` do not affect Parquet part size; they apply to JSONL/CSV.
168
+ - Use `sharding: { mode: :single_file }` to force a single `.parquet` file.
169
+
147
170
  ### Environment variables (optional)
148
171
 
149
172
  CLI flags take precedence, but these environment variables can set sensible defaults:
@@ -199,6 +222,8 @@ bundle exec bin/purplelight \
199
222
  Notes:
200
223
  - Compression backend selection order is: requested format → `zstd-ruby` → `zstds` → `gzip`.
201
224
  - `--single-file` and `--by-size` update only the sharding mode/params and preserve any provided `--prefix`.
225
+ - Parquet multi-part sizing is programmatic via `parquet_max_rows`; there is no CLI flag for it.
226
+ - To increase concurrent connections, set `maxPoolSize` on your Mongo URI (used by `--uri`), e.g., `mongodb://.../?maxPoolSize=32`. A good starting point is `maxPoolSize >= --partitions`.
202
227
 
203
228
  ### Architecture
204
229
 
@@ -236,6 +261,7 @@ Key points:
236
261
  - **Rotation size**: larger (512MB–1GB) reduces finalize overhead for many parts. CLI: `--rotate-mb` (and/or `--by-size`).
237
262
  - **JSONL chunking**: tune builder write chunk size for throughput. CLI: `--write-chunk-mb`.
238
263
  - **Parquet row groups**: choose a row group size that fits downstream readers. CLI: `--parquet-row-group`.
264
+ - **Parquet parts (rows)**: split Parquet outputs by rows with `parquet_max_rows` (programmatic API). Set `sharding.mode` to anything other than `:single_file` to enable multi-part filenames.
239
265
  - **Read preference**: offload to secondaries or tagged analytics nodes when available. CLI: `--read-preference`, `--read-tags`.
240
266
  - **Read concern**: pick an appropriate level for consistency/latency trade-offs. CLI: `--read-concern`.
241
267
  - **Cursor timeout**: for very long scans, leave `noCursorTimeout` enabled. CLI: `--no-cursor-timeout true|false`.
@@ -39,7 +39,8 @@ module Purplelight
39
39
  sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
40
40
  logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
41
41
  no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
42
- compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
42
+ compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil,
43
+ parquet_max_rows: nil)
43
44
  @client = client
44
45
  @collection = client[collection]
45
46
  @output = output
@@ -64,6 +65,7 @@ module Purplelight
64
65
  @writer_threads = writer_threads || 1
65
66
  @write_chunk_bytes = write_chunk_bytes
66
67
  @parquet_row_group = parquet_row_group
68
+ @parquet_max_rows = parquet_max_rows
67
69
 
68
70
  @running = true
69
71
  @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -106,6 +108,7 @@ module Purplelight
106
108
  compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
107
109
  write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
108
110
  parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
111
+ parquet_max_rows: @parquet_max_rows,
109
112
  sharding: @sharding,
110
113
  resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
111
114
  telemetry: @telemetry_enabled
@@ -134,7 +137,8 @@ module Purplelight
134
137
  single_file = @sharding && @sharding[:mode].to_s == 'single_file'
135
138
  row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
136
139
  WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
137
- manifest: manifest, single_file: single_file, row_group_size: row_group)
140
+ manifest: manifest, single_file: single_file, row_group_size: row_group,
141
+ rotate_rows: @parquet_max_rows)
138
142
  else
139
143
  raise ArgumentError, "format not implemented: #{@format}"
140
144
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.10'
4
+ VERSION = '0.1.12'
5
5
  end
@@ -15,7 +15,7 @@ module Purplelight
15
15
  DEFAULT_ROW_GROUP_SIZE = 10_000
16
16
 
17
17
  def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
18
- manifest: nil, single_file: true, schema: nil)
18
+ manifest: nil, single_file: true, schema: nil, rotate_rows: nil)
19
19
  @directory = directory
20
20
  @prefix = prefix
21
21
  @compression = compression
@@ -24,11 +24,13 @@ module Purplelight
24
24
  @manifest = manifest
25
25
  @single_file = single_file
26
26
  @schema = schema
27
+ @rotate_rows = rotate_rows
27
28
 
28
29
  @closed = false
29
30
  @file_seq = 0
30
31
  @part_index = nil
31
32
  @pq_writer = nil
33
+ @rows_in_current_file = 0
32
34
 
33
35
  ensure_dependencies!
34
36
  reset_buffers
@@ -38,15 +40,13 @@ module Purplelight
38
40
  ensure_open!
39
41
  array_of_docs.each { |doc| @buffer_docs << doc }
40
42
  flush_row_groups_if_needed
41
- @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
42
43
  end
43
44
 
44
45
  def close
45
46
  return if @closed
46
47
 
47
- ensure_open!
48
48
  flush_all_row_groups
49
- finalize_current_part!
49
+ finalize_current_part! if @writer_path
50
50
  @closed = true
51
51
  end
52
52
 
@@ -70,6 +70,7 @@ module Purplelight
70
70
  FileUtils.mkdir_p(@directory)
71
71
  @writer_path = next_part_path
72
72
  @part_index = @manifest&.open_part!(@writer_path) if @manifest
73
+ @rows_in_current_file = 0
73
74
  end
74
75
 
75
76
  # No-op; we now write once on close for simplicity
@@ -89,7 +90,8 @@ module Purplelight
89
90
  # Stream via ArrowFileWriter when available to avoid building huge tables
90
91
  if defined?(Parquet::ArrowFileWriter)
91
92
  unless @pq_writer
92
- @pq_writer = Parquet::ArrowFileWriter.open(table.schema, path)
93
+ props = build_writer_properties_for_compression(@compression)
94
+ @pq_writer = create_arrow_file_writer(table.schema, path, props)
93
95
  end
94
96
  # Prefer passing row_group_size; fallback to single-arg for older APIs
95
97
  begin
@@ -101,13 +103,19 @@ module Purplelight
101
103
  end
102
104
  # Fallback to one-shot save when streaming API is not available
103
105
  if table.respond_to?(:save)
104
- table.save(path, format: :parquet)
106
+ begin
107
+ table.save(path, format: :parquet, compression: normalize_parquet_compression_name(@compression))
108
+ rescue StandardError
109
+ table.save(path, format: :parquet)
110
+ end
105
111
  return
106
112
  end
107
113
  raise 'Parquet writer not available in this environment'
108
114
  end
109
115
 
110
116
  def finalize_current_part!
117
+ return if @writer_path.nil?
118
+
111
119
  if @pq_writer
112
120
  @pq_writer.close
113
121
  @pq_writer = nil
@@ -115,6 +123,8 @@ module Purplelight
115
123
  @manifest&.complete_part!(index: @part_index, checksum: nil)
116
124
  @file_seq += 1 unless @single_file
117
125
  @writer_path = nil
126
+ @part_index = nil
127
+ @rows_in_current_file = 0
118
128
  end
119
129
 
120
130
  def next_part_path
@@ -147,14 +157,52 @@ module Purplelight
147
157
  return if @buffer_docs.empty?
148
158
 
149
159
  while @buffer_docs.length >= @row_group_size
160
+ ensure_open!
150
161
  group = @buffer_docs.shift(@row_group_size)
151
- t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
152
- table = build_table(group)
153
- Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
162
+ if @rotate_rows && !@single_file && (@rows_in_current_file + group.length) > @rotate_rows
163
+ # Write a partial chunk to fill the current file, then rotate and write the rest
164
+ remaining_allowed = @rotate_rows - @rows_in_current_file
165
+ if remaining_allowed.positive?
166
+ part_a = group.first(remaining_allowed)
167
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
168
+ table_a = build_table(part_a)
169
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
170
+
171
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
172
+ write_table(table_a, @writer_path, append: true)
173
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
174
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_a.length, bytes_delta: 0)
175
+ @rows_in_current_file += part_a.length
176
+ end
177
+
178
+ finalize_current_part!
179
+ ensure_open!
180
+
181
+ part_b = group.drop(remaining_allowed)
182
+ unless part_b.empty?
183
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
184
+ table_b = build_table(part_b)
185
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
186
+
187
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
188
+ write_table(table_b, @writer_path, append: true)
189
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
190
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_b.length, bytes_delta: 0)
191
+ @rows_in_current_file += part_b.length
192
+ maybe_rotate!
193
+ end
194
+ else
195
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
196
+ table = build_table(group)
197
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
154
198
 
155
- t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
156
- write_table(table, @writer_path, append: true)
157
- Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
199
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
200
+ write_table(table, @writer_path, append: true)
201
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
202
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: group.length, bytes_delta: 0)
203
+ @rows_in_current_file += group.length
204
+ maybe_rotate!
205
+ end
158
206
  end
159
207
  end
160
208
 
@@ -166,14 +214,119 @@ module Purplelight
166
214
  return if @buffer_docs.empty?
167
215
 
168
216
  # Flush remaining as a final smaller group
217
+ remaining = @buffer_docs.length
169
218
  t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
170
219
  table = build_table(@buffer_docs)
171
220
  Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
172
221
 
222
+ ensure_open!
223
+ # Pre-rotate to avoid exceeding rotate_rows on this final write
224
+ if @rotate_rows && !@single_file && @rows_in_current_file.positive? && (@rows_in_current_file + remaining) > @rotate_rows
225
+ finalize_current_part!
226
+ ensure_open!
227
+ end
228
+
173
229
  t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
174
230
  write_table(table, @writer_path, append: true)
175
231
  Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
232
+ rows_written = (table.respond_to?(:n_rows) ? table.n_rows : remaining)
233
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows_written, bytes_delta: 0)
234
+ @rows_in_current_file += rows_written
176
235
  @buffer_docs.clear
236
+ maybe_rotate!
237
+ end
238
+
239
+ def maybe_rotate!
240
+ return if @single_file
241
+ return unless @rotate_rows && @rows_in_current_file >= @rotate_rows
242
+
243
+ finalize_current_part!
244
+ # Next write will open a new part
245
+ end
246
+
247
+ def build_writer_properties_for_compression(requested)
248
+ codec_const = parquet_codec_constant(requested)
249
+ return nil unless codec_const
250
+
251
+ # Prefer WriterProperties builder if available
252
+ begin
253
+ if defined?(Parquet::WriterProperties) && Parquet::WriterProperties.respond_to?(:builder)
254
+ builder = Parquet::WriterProperties.builder
255
+ if builder.respond_to?(:compression)
256
+ builder = builder.compression(codec_const)
257
+ elsif builder.respond_to?(:set_compression)
258
+ builder = builder.set_compression(codec_const)
259
+ end
260
+ return builder.build if builder.respond_to?(:build)
261
+ end
262
+ rescue StandardError
263
+ # fall through to other strategies
264
+ end
265
+
266
+ # Alternative builder class naming fallback
267
+ begin
268
+ if defined?(Parquet::WriterPropertiesBuilder)
269
+ b = Parquet::WriterPropertiesBuilder.new
270
+ if b.respond_to?(:compression)
271
+ b.compression(codec_const)
272
+ elsif b.respond_to?(:set_compression)
273
+ b.set_compression(codec_const)
274
+ end
275
+ return b.build if b.respond_to?(:build)
276
+ end
277
+ rescue StandardError
278
+ # ignore
279
+ end
280
+ nil
281
+ end
282
+
283
+ def create_arrow_file_writer(schema, path, props)
284
+ attempts = []
285
+ if props
286
+ attempts << -> { Parquet::ArrowFileWriter.open(schema, path, props) }
287
+ attempts << -> { Parquet::ArrowFileWriter.open(schema, path, properties: props) }
288
+ end
289
+ attempts << -> { Parquet::ArrowFileWriter.open(schema, path) }
290
+
291
+ attempts.each do |call|
292
+ return call.call
293
+ rescue StandardError
294
+ next
295
+ end
296
+ raise 'failed to open Parquet::ArrowFileWriter'
297
+ end
298
+
299
+ def parquet_codec_constant(requested)
300
+ name = normalize_parquet_compression_name(requested)
301
+ return nil unless name
302
+
303
+ up = case name
304
+ when 'zstd', 'zstandard' then 'ZSTD'
305
+ when 'gzip' then 'GZIP'
306
+ when 'snappy' then 'SNAPPY'
307
+ when 'none' then 'UNCOMPRESSED'
308
+ else name.upcase
309
+ end
310
+ candidates = %w[CompressionType Compression CompressionCodec]
311
+ candidates.each do |mod|
312
+ m = Parquet.const_get(mod)
313
+ return m.const_get(up) if m.const_defined?(up)
314
+ rescue StandardError
315
+ next
316
+ end
317
+ nil
318
+ end
319
+
320
+ def normalize_parquet_compression_name(requested)
321
+ return nil if requested.nil?
322
+
323
+ s = requested.to_s.downcase
324
+ return 'none' if s == 'none'
325
+ return 'gzip' if s == 'gzip'
326
+ return 'snappy' if s == 'snappy'
327
+ return 'zstd' if %w[zstd zstandard].include?(s)
328
+
329
+ nil
177
330
  end
178
331
  end
179
332
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.10
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson