purplelight 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/purplelight/snapshot.rb +2 -2
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +148 -15
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 920b534dc9ac832d83600031277ddd35da2920cff494e0f96d0ca230652d4ba4
|
4
|
+
data.tar.gz: 2fd4476e73efc67d1f4a722dae8d7759ac8b51b5c21546dfb4b06d9fc1cd3934
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 370660a815b47c4aa4a0725a6188d6e0455074232000a24c4f909a313c3b9c2d5d3219a17edb628df58f167115b317347c159cc93d5bd59e216d6de1ec7ecd77
|
7
|
+
data.tar.gz: efec787b1e355af50ec07e45b8c125d0f1054640734afbe7790a3a35181f999900d3c843104911eabcea644d2dc7f4604050571e17469f08fe97d9fa5aec92e9
|
data/README.md
CHANGED
data/lib/purplelight/snapshot.rb
CHANGED
@@ -137,8 +137,8 @@ module Purplelight
|
|
137
137
|
single_file = @sharding && @sharding[:mode].to_s == 'single_file'
|
138
138
|
row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
|
139
139
|
WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
|
140
|
-
|
141
|
-
|
140
|
+
manifest: manifest, single_file: single_file, row_group_size: row_group,
|
141
|
+
rotate_rows: @parquet_max_rows)
|
142
142
|
else
|
143
143
|
raise ArgumentError, "format not implemented: #{@format}"
|
144
144
|
end
|
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -40,7 +40,6 @@ module Purplelight
|
|
40
40
|
ensure_open!
|
41
41
|
array_of_docs.each { |doc| @buffer_docs << doc }
|
42
42
|
flush_row_groups_if_needed
|
43
|
-
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.length, bytes_delta: 0)
|
44
43
|
end
|
45
44
|
|
46
45
|
def close
|
@@ -91,7 +90,8 @@ module Purplelight
|
|
91
90
|
# Stream via ArrowFileWriter when available to avoid building huge tables
|
92
91
|
if defined?(Parquet::ArrowFileWriter)
|
93
92
|
unless @pq_writer
|
94
|
-
|
93
|
+
props = build_writer_properties_for_compression(@compression)
|
94
|
+
@pq_writer = create_arrow_file_writer(table.schema, path, props)
|
95
95
|
end
|
96
96
|
# Prefer passing row_group_size; fallback to single-arg for older APIs
|
97
97
|
begin
|
@@ -103,7 +103,11 @@ module Purplelight
|
|
103
103
|
end
|
104
104
|
# Fallback to one-shot save when streaming API is not available
|
105
105
|
if table.respond_to?(:save)
|
106
|
-
|
106
|
+
begin
|
107
|
+
table.save(path, format: :parquet, compression: normalize_parquet_compression_name(@compression))
|
108
|
+
rescue StandardError
|
109
|
+
table.save(path, format: :parquet)
|
110
|
+
end
|
107
111
|
return
|
108
112
|
end
|
109
113
|
raise 'Parquet writer not available in this environment'
|
@@ -111,6 +115,7 @@ module Purplelight
|
|
111
115
|
|
112
116
|
def finalize_current_part!
|
113
117
|
return if @writer_path.nil?
|
118
|
+
|
114
119
|
if @pq_writer
|
115
120
|
@pq_writer.close
|
116
121
|
@pq_writer = nil
|
@@ -154,15 +159,50 @@ module Purplelight
|
|
154
159
|
while @buffer_docs.length >= @row_group_size
|
155
160
|
ensure_open!
|
156
161
|
group = @buffer_docs.shift(@row_group_size)
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
162
|
+
if @rotate_rows && !@single_file && (@rows_in_current_file + group.length) > @rotate_rows
|
163
|
+
# Write a partial chunk to fill the current file, then rotate and write the rest
|
164
|
+
remaining_allowed = @rotate_rows - @rows_in_current_file
|
165
|
+
if remaining_allowed.positive?
|
166
|
+
part_a = group.first(remaining_allowed)
|
167
|
+
t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
|
168
|
+
table_a = build_table(part_a)
|
169
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
|
170
|
+
|
171
|
+
t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
|
172
|
+
write_table(table_a, @writer_path, append: true)
|
173
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
|
174
|
+
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_a.length, bytes_delta: 0)
|
175
|
+
@rows_in_current_file += part_a.length
|
176
|
+
end
|
177
|
+
|
178
|
+
finalize_current_part!
|
179
|
+
ensure_open!
|
180
|
+
|
181
|
+
part_b = group.drop(remaining_allowed)
|
182
|
+
unless part_b.empty?
|
183
|
+
t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
|
184
|
+
table_b = build_table(part_b)
|
185
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
|
186
|
+
|
187
|
+
t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
|
188
|
+
write_table(table_b, @writer_path, append: true)
|
189
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
|
190
|
+
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: part_b.length, bytes_delta: 0)
|
191
|
+
@rows_in_current_file += part_b.length
|
192
|
+
maybe_rotate!
|
193
|
+
end
|
194
|
+
else
|
195
|
+
t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
|
196
|
+
table = build_table(group)
|
197
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
|
198
|
+
|
199
|
+
t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
|
200
|
+
write_table(table, @writer_path, append: true)
|
201
|
+
Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
|
202
|
+
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: group.length, bytes_delta: 0)
|
203
|
+
@rows_in_current_file += group.length
|
204
|
+
maybe_rotate!
|
205
|
+
end
|
166
206
|
end
|
167
207
|
end
|
168
208
|
|
@@ -174,17 +214,25 @@ module Purplelight
|
|
174
214
|
return if @buffer_docs.empty?
|
175
215
|
|
176
216
|
# Flush remaining as a final smaller group
|
217
|
+
remaining = @buffer_docs.length
|
177
218
|
t_tbl = Thread.current[:pl_telemetry]&.start(:parquet_table_build_time)
|
178
219
|
table = build_table(@buffer_docs)
|
179
220
|
Thread.current[:pl_telemetry]&.finish(:parquet_table_build_time, t_tbl)
|
180
221
|
|
181
|
-
t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
|
182
222
|
ensure_open!
|
223
|
+
# Pre-rotate to avoid exceeding rotate_rows on this final write
|
224
|
+
if @rotate_rows && !@single_file && @rows_in_current_file.positive? && (@rows_in_current_file + remaining) > @rotate_rows
|
225
|
+
finalize_current_part!
|
226
|
+
ensure_open!
|
227
|
+
end
|
228
|
+
|
229
|
+
t_w = Thread.current[:pl_telemetry]&.start(:parquet_write_time)
|
183
230
|
write_table(table, @writer_path, append: true)
|
184
231
|
Thread.current[:pl_telemetry]&.finish(:parquet_write_time, t_w)
|
232
|
+
rows_written = (table.respond_to?(:n_rows) ? table.n_rows : remaining)
|
233
|
+
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows_written, bytes_delta: 0)
|
234
|
+
@rows_in_current_file += rows_written
|
185
235
|
@buffer_docs.clear
|
186
|
-
@rows_in_current_file += table.n_rows if table.respond_to?(:n_rows)
|
187
|
-
@rows_in_current_file += @buffer_docs.length unless table.respond_to?(:n_rows)
|
188
236
|
maybe_rotate!
|
189
237
|
end
|
190
238
|
|
@@ -195,5 +243,90 @@ module Purplelight
|
|
195
243
|
finalize_current_part!
|
196
244
|
# Next write will open a new part
|
197
245
|
end
|
246
|
+
|
247
|
+
def build_writer_properties_for_compression(requested)
|
248
|
+
codec_const = parquet_codec_constant(requested)
|
249
|
+
return nil unless codec_const
|
250
|
+
|
251
|
+
# Prefer WriterProperties builder if available
|
252
|
+
begin
|
253
|
+
if defined?(Parquet::WriterProperties) && Parquet::WriterProperties.respond_to?(:builder)
|
254
|
+
builder = Parquet::WriterProperties.builder
|
255
|
+
if builder.respond_to?(:compression)
|
256
|
+
builder = builder.compression(codec_const)
|
257
|
+
elsif builder.respond_to?(:set_compression)
|
258
|
+
builder = builder.set_compression(codec_const)
|
259
|
+
end
|
260
|
+
return builder.build if builder.respond_to?(:build)
|
261
|
+
end
|
262
|
+
rescue StandardError
|
263
|
+
# fall through to other strategies
|
264
|
+
end
|
265
|
+
|
266
|
+
# Alternative builder class naming fallback
|
267
|
+
begin
|
268
|
+
if defined?(Parquet::WriterPropertiesBuilder)
|
269
|
+
b = Parquet::WriterPropertiesBuilder.new
|
270
|
+
if b.respond_to?(:compression)
|
271
|
+
b.compression(codec_const)
|
272
|
+
elsif b.respond_to?(:set_compression)
|
273
|
+
b.set_compression(codec_const)
|
274
|
+
end
|
275
|
+
return b.build if b.respond_to?(:build)
|
276
|
+
end
|
277
|
+
rescue StandardError
|
278
|
+
# ignore
|
279
|
+
end
|
280
|
+
nil
|
281
|
+
end
|
282
|
+
|
283
|
+
def create_arrow_file_writer(schema, path, props)
|
284
|
+
attempts = []
|
285
|
+
if props
|
286
|
+
attempts << -> { Parquet::ArrowFileWriter.open(schema, path, props) }
|
287
|
+
attempts << -> { Parquet::ArrowFileWriter.open(schema, path, properties: props) }
|
288
|
+
end
|
289
|
+
attempts << -> { Parquet::ArrowFileWriter.open(schema, path) }
|
290
|
+
|
291
|
+
attempts.each do |call|
|
292
|
+
return call.call
|
293
|
+
rescue StandardError
|
294
|
+
next
|
295
|
+
end
|
296
|
+
raise 'failed to open Parquet::ArrowFileWriter'
|
297
|
+
end
|
298
|
+
|
299
|
+
def parquet_codec_constant(requested)
|
300
|
+
name = normalize_parquet_compression_name(requested)
|
301
|
+
return nil unless name
|
302
|
+
|
303
|
+
up = case name
|
304
|
+
when 'zstd', 'zstandard' then 'ZSTD'
|
305
|
+
when 'gzip' then 'GZIP'
|
306
|
+
when 'snappy' then 'SNAPPY'
|
307
|
+
when 'none' then 'UNCOMPRESSED'
|
308
|
+
else name.upcase
|
309
|
+
end
|
310
|
+
candidates = %w[CompressionType Compression CompressionCodec]
|
311
|
+
candidates.each do |mod|
|
312
|
+
m = Parquet.const_get(mod)
|
313
|
+
return m.const_get(up) if m.const_defined?(up)
|
314
|
+
rescue StandardError
|
315
|
+
next
|
316
|
+
end
|
317
|
+
nil
|
318
|
+
end
|
319
|
+
|
320
|
+
def normalize_parquet_compression_name(requested)
|
321
|
+
return nil if requested.nil?
|
322
|
+
|
323
|
+
s = requested.to_s.downcase
|
324
|
+
return 'none' if s == 'none'
|
325
|
+
return 'gzip' if s == 'gzip'
|
326
|
+
return 'snappy' if s == 'snappy'
|
327
|
+
return 'zstd' if %w[zstd zstandard].include?(s)
|
328
|
+
|
329
|
+
nil
|
330
|
+
end
|
198
331
|
end
|
199
332
|
end
|