purplelight 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80f6e48231b485750fc65529ada74a07758befba9b324b58ef372f077305a144
4
- data.tar.gz: 4d7eed034f90155d2686da45a76caa73928cbe2d080d2031539830f6f4399cfb
3
+ metadata.gz: 920b534dc9ac832d83600031277ddd35da2920cff494e0f96d0ca230652d4ba4
4
+ data.tar.gz: 2fd4476e73efc67d1f4a722dae8d7759ac8b51b5c21546dfb4b06d9fc1cd3934
5
5
  SHA512:
6
- metadata.gz: 63f49c8dab688ec5cd922304b472ee448aaada1a3e2c113b7a4ddbe2092f3a3d2a83e1fe396066e87971514d8069831486c1a5ad972e604807c6c3289efd8e31
7
- data.tar.gz: 6a281a23a0abf3244045b3e99af606881f3f85bab7eeabd7b4ca94e36c823d2405d7e85b60048ec62e5a7ace63fceddd268e185a831398cc7dbd00c213198961
6
+ metadata.gz: 370660a815b47c4aa4a0725a6188d6e0455074232000a24c4f909a313c3b9c2d5d3219a17edb628df58f167115b317347c159cc93d5bd59e216d6de1ec7ecd77
7
+ data.tar.gz: efec787b1e355af50ec07e45b8c125d0f1054640734afbe7790a3a35181f999900d3c843104911eabcea644d2dc7f4604050571e17469f08fe97d9fa5aec92e9
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.11'
12
+ gem 'purplelight', '~> 0.1.12'
13
13
  ```
14
14
 
15
15
  Or install directly:
@@ -137,8 +137,8 @@ module Purplelight
137
137
  single_file = @sharding && @sharding[:mode].to_s == 'single_file'
138
138
  row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
139
139
  WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
140
- manifest: manifest, single_file: single_file, row_group_size: row_group,
141
- rotate_rows: @parquet_max_rows)
140
+ manifest: manifest, single_file: single_file, row_group_size: row_group,
141
+ rotate_rows: @parquet_max_rows)
142
142
  else
143
143
  raise ArgumentError, "format not implemented: #{@format}"
144
144
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.11'
4
+ VERSION = '0.1.12'
5
5
  end
@@ -40,7 +40,6 @@ module Purplelight
40
40
  ensure_open!
41
41
  array_of_docs.each { |doc| @buffer_docs << doc }
42
42
  flush_row_groups_if_needed
43
- @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
44
43
  end
45
44
 
46
45
  def close
@@ -91,7 +90,8 @@ module Purplelight
91
90
  # Stream via ArrowFileWriter when available to avoid building huge tables
92
91
  if defined?(Parquet::ArrowFileWriter)
93
92
  unless @pq_writer
94
- @pq_writer = Parquet::ArrowFileWriter.open(table.schema, path)
93
+ props = build_writer_properties_for_compression(@compression)
94
+ @pq_writer = create_arrow_file_writer(table.schema, path, props)
95
95
  end
96
96
  # Prefer passing row_group_size; fallback to single-arg for older APIs
97
97
  begin
@@ -103,7 +103,11 @@ module Purplelight
103
103
  end
104
104
  # Fallback to one-shot save when streaming API is not available
105
105
  if table.respond_to?(:save)
106
- table.save(path, format: :parquet)
106
+ begin
107
+ table.save(path, format: :parquet, compression: normalize_parquet_compression_name(@compression))
108
+ rescue StandardError
109
+ table.save(path, format: :parquet)
110
+ end
107
111
  return
108
112
  end
109
113
  raise 'Parquet writer not available in this environment'
@@ -111,6 +115,7 @@ module Purplelight
111
115
 
112
116
  def finalize_current_part!
113
117
  return if @writer_path.nil?
118
+
114
119
  if @pq_writer
115
120
  @pq_writer.close
116
121
  @pq_writer = nil
@@ -154,15 +159,50 @@ module Purplelight
154
159
  while @buffer_docs.length >= @row_group_size
155
160
  ensure_open!
156
161
  group = @buffer_docs.shift(@row_group_size)
157
- t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
158
- table = build_table(group)
159
- Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
160
-
161
- t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
162
- write_table(table, @writer_path, append: true)
163
- Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
164
- @rows_in_current_file += group.length
165
- maybe_rotate!
162
+ if @rotate_rows && !@single_file && (@rows_in_current_file + group.length) > @rotate_rows
163
+ # Write a partial chunk to fill the current file, then rotate and write the rest
164
+ remaining_allowed = @rotate_rows - @rows_in_current_file
165
+ if remaining_allowed.positive?
166
+ part_a = group.first(remaining_allowed)
167
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
168
+ table_a = build_table(part_a)
169
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
170
+
171
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
172
+ write_table(table_a, @writer_path, append: true)
173
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
174
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_a.length, bytes_delta: 0)
175
+ @rows_in_current_file += part_a.length
176
+ end
177
+
178
+ finalize_current_part!
179
+ ensure_open!
180
+
181
+ part_b = group.drop(remaining_allowed)
182
+ unless part_b.empty?
183
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
184
+ table_b = build_table(part_b)
185
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
186
+
187
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
188
+ write_table(table_b, @writer_path, append: true)
189
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
190
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_b.length, bytes_delta: 0)
191
+ @rows_in_current_file += part_b.length
192
+ maybe_rotate!
193
+ end
194
+ else
195
+ t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
196
+ table = build_table(group)
197
+ Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
198
+
199
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
200
+ write_table(table, @writer_path, append: true)
201
+ Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
202
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: group.length, bytes_delta: 0)
203
+ @rows_in_current_file += group.length
204
+ maybe_rotate!
205
+ end
166
206
  end
167
207
  end
168
208
 
@@ -174,17 +214,25 @@ module Purplelight
174
214
  return if @buffer_docs.empty?
175
215
 
176
216
  # Flush remaining as a final smaller group
217
+ remaining = @buffer_docs.length
177
218
  t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
178
219
  table = build_table(@buffer_docs)
179
220
  Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
180
221
 
181
- t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
182
222
  ensure_open!
223
+ # Pre-rotate to avoid exceeding rotate_rows on this final write
224
+ if @rotate_rows && !@single_file && @rows_in_current_file.positive? && (@rows_in_current_file + remaining) > @rotate_rows
225
+ finalize_current_part!
226
+ ensure_open!
227
+ end
228
+
229
+ t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
183
230
  write_table(table, @writer_path, append: true)
184
231
  Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
232
+ rows_written = (table.respond_to?(:n_rows) ? table.n_rows : remaining)
233
+ @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows_written, bytes_delta: 0)
234
+ @rows_in_current_file += rows_written
185
235
  @buffer_docs.clear
186
- @rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
187
- @rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
188
236
  maybe_rotate!
189
237
  end
190
238
 
@@ -195,5 +243,90 @@ module Purplelight
195
243
  finalize_current_part!
196
244
  # Next write will open a new part
197
245
  end
246
+
247
# Builds a Parquet writer-properties object configured for the requested
# compression codec, probing the builder APIs that may be present.
#
# requested - compression name (String/Symbol) or nil.
#
# Returns whatever the builder's #build returns, or nil when the codec
# cannot be resolved or no usable builder API is found. Errors raised by a
# builder strategy are swallowed on purpose: this is a best-effort lookup and
# the caller falls back to opening the writer without properties.
def build_writer_properties_for_compression(requested)
  codec_const = parquet_codec_constant(requested)
  return nil unless codec_const

  # Prefer WriterProperties builder if available
  begin
    if defined?(Parquet::WriterProperties) && Parquet::WriterProperties.respond_to?(:builder)
      builder = apply_parquet_compression(Parquet::WriterProperties.builder, codec_const)
      return builder.build if builder.respond_to?(:build)
    end
  rescue StandardError
    # fall through to other strategies
  end

  # Alternative builder class naming fallback
  begin
    if defined?(Parquet::WriterPropertiesBuilder)
      builder = apply_parquet_compression(Parquet::WriterPropertiesBuilder.new, codec_const)
      return builder.build if builder.respond_to?(:build)
    end
  rescue StandardError
    # ignore
  end

  nil
end

# Applies the codec through whichever setter the builder exposes and returns
# the object that should receive #build.
#
# Chainable setters return a builder (possibly a new one), while plain
# setters may return nil or some other value; the setter's result is used
# only when it can itself be built, otherwise the original builder is kept.
def apply_parquet_compression(builder, codec_const)
  setter = %i[compression set_compression].find { |name| builder.respond_to?(name) }
  return builder unless setter

  result = builder.public_send(setter, codec_const)
  result.respond_to?(:build) ? result : builder
end
282
+
283
# Opens a Parquet::ArrowFileWriter, trying several call signatures.
#
# schema - schema object passed through to ArrowFileWriter.open.
# path   - output path passed through to ArrowFileWriter.open.
# props  - writer properties, or nil to skip the property-aware signatures.
#
# When props is given the attempts are, in order:
#   open(schema, path, props)
#   open(schema, path, properties: props)
#   open(schema, path)
# Without props only the last form is tried.
#
# Returns the value of the first attempt that does not raise.
# Raises RuntimeError when every attempt fails; the error of the last attempt
# is attached as the exception's cause so the underlying failure stays visible.
def create_arrow_file_writer(schema, path, props)
  attempts = []
  if props
    attempts << -> { Parquet::ArrowFileWriter.open(schema, path, props) }
    attempts << -> { Parquet::ArrowFileWriter.open(schema, path, properties: props) }
  end
  attempts << -> { Parquet::ArrowFileWriter.open(schema, path) }

  last_error = nil
  attempts.each do |attempt|
    return attempt.call
  rescue StandardError => e
    # Remember why this signature failed, then try the next one.
    last_error = e
  end

  raise 'failed to open Parquet::ArrowFileWriter', cause: last_error
end
298
+
299
# Resolves a compression name to the codec constant exposed under the
# Parquet namespace.
#
# requested - compression name (String/Symbol) or nil; normalized through
#             normalize_parquet_compression_name first.
#
# Returns the constant's value, or nil when the name is not recognised or no
# candidate namespace defines the codec.
def parquet_codec_constant(requested)
  name = normalize_parquet_compression_name(requested)
  return nil unless name

  codec_name = case name
               when 'zstd', 'zstandard' then 'ZSTD'
               when 'gzip' then 'GZIP'
               when 'snappy' then 'SNAPPY'
               when 'none' then 'UNCOMPRESSED'
               else name.upcase
               end

  %w[CompressionType Compression CompressionCodec].each do |namespace_name|
    # inherit=false keeps the lookup inside Parquet (and inside the namespace
    # itself); the default would also search Object and could pick up an
    # unrelated top-level constant that happens to share the name.
    namespace = Parquet.const_get(namespace_name, false)
    return namespace.const_get(codec_name, false) if namespace.const_defined?(codec_name, false)
  rescue StandardError
    # Namespace missing or not a module: try the next candidate.
    next
  end
  nil
end
319
+
320
# Canonicalises a user-supplied compression name.
#
# requested - String, Symbol or nil. Matching is case-insensitive and ignores
#             surrounding whitespace, so a padded value such as " zstd\n" is
#             still recognised instead of silently disabling compression.
#
# Returns 'none', 'gzip', 'snappy' or 'zstd' ('zstandard' is accepted as an
# alias of 'zstd'); nil for nil input or any other name.
def normalize_parquet_compression_name(requested)
  return nil if requested.nil?

  case requested.to_s.strip.downcase
  when 'none' then 'none'
  when 'gzip' then 'gzip'
  when 'snappy' then 'snappy'
  when 'zstd', 'zstandard' then 'zstd'
  end
end
198
331
  end
199
332
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson