purplelight 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -7
- data/bin/purplelight +63 -13
- data/lib/purplelight/snapshot.rb +33 -7
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +23 -17
- data/lib/purplelight/writer_jsonl.rb +33 -24
- data/lib/purplelight/writer_parquet.rb +2 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b87960253dbd1ab6aae3b60dc790068d851f3798b124c23451bdae96734d6d67
+  data.tar.gz: b1eab05f8580a282b836da8eddb5dfe964ef6cb90a94300304ecd0426f786998
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7bff1db0acebc6416b7dd484fe882947bc74927a6833e99a0fec64d03203babfbf625f44c6a8d6c29cab31a6bc7ccae31de3a7d0b55283d073053a21515faeb3
+  data.tar.gz: b56bd93e12571aafe2ab47a1dc087d3429c4a15a731d50159552fbe70a0f63b40ee2d44fb23bf27752045df9f6e146376af906a00afdfada7e068420a4012925
data/README.md
CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
 Add to your Gemfile:
 
 ```ruby
-gem 'purplelight', '~> 0.1.7'
+gem 'purplelight', '~> 0.1.9'
 ```
 
 Or install directly:
@@ -248,14 +248,17 @@ bundle exec bin/purplelight \
 
 ### Quick Benchmark
 ```
-%
+% BENCH=1 BENCH_PARTITIONS=16 BENCH_BATCH_SIZE=8000 BENCH_QUEUE_MB=512 BENCH_ROTATE_MB=512 BENCH_COMPRESSION=gzip bundle exec rspec spec/benchmark_perf_spec.rb --format doc
 
 Performance benchmark (1M docs, gated by BENCH=1)
-W, [2025-09-03T16:10:40.437304 #33546] WARN -- : MONGODB | Error checking 127.0.0.1:27018: Mongo::Error::SocketError: Errno::ECONNREFUSED: Connection refused - connect(2) for 127.0.0.1:27018 (for 127.0.0.1:27018 (no TLS)) (on 127.0.0.1:27018)
 Benchmark results:
-Inserted: 1000000 docs in 8.
-Exported: 1000000 docs in
-Parts: 1, Bytes:
-Throughput:
+Inserted: 1000000 docs in 8.13s
+Exported: 1000000 docs in 4.03s
+Parts: 1, Bytes: 10625336
+Throughput: 248241.7 docs/s, 2.52 MB/s
 Settings: partitions=16, batch_size=8000, queue_mb=512, rotate_mb=512, compression=gzip
+exports 1,000,000 documents and reports throughput
+
+Finished in 14.02 seconds (files took 0.31974 seconds to load)
+1 example, 0 failures
 ```
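The benchmark above only runs when BENCH=1 is set, so a normal rspec pass skips it. A minimal sketch of that env-gated pattern (names follow the README; the spec body is illustrative, not the gem's actual code):

```ruby
# Illustrative skeleton of an env-gated benchmark spec.
require 'rspec'

RSpec.describe 'Performance benchmark (1M docs, gated by BENCH=1)' do
  it 'exports 1,000,000 documents and reports throughput' do
    skip 'set BENCH=1 to run' unless ENV['BENCH'] == '1'

    partitions = Integer(ENV.fetch('BENCH_PARTITIONS', '16'))
    batch_size = Integer(ENV.fetch('BENCH_BATCH_SIZE', '8000'))
    # ... seed 1M docs, run the export with these settings, then
    # report docs/s and MB/s from the measured elapsed time ...
  end
end
```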
data/bin/purplelight
CHANGED
@@ -18,7 +18,17 @@ options = {
   resume: { enabled: true },
   read_preference: nil,
   read_tags: nil,
-  dry_run: false
+  dry_run: false,
+  queue_size_bytes: nil,
+  rotate_bytes: nil,
+  compression_level: nil,
+  writer_threads: nil,
+  write_chunk_bytes: nil,
+  parquet_row_group: nil,
+  telemetry_flag: nil,
+  read_concern: nil,
+  no_cursor_timeout: nil,
+  resume_overwrite_incompatible: false
 }
 
 parser = OptionParser.new do |opts|
@@ -30,30 +40,40 @@ parser = OptionParser.new do |opts|
   opts.on('-o', '--output PATH', 'Output directory or file (required)') { |v| options[:output] = v }
   opts.on('-f', '--format FORMAT', 'Format: jsonl|csv|parquet (default jsonl)') { |v| options[:format] = v.to_sym }
   opts.on('--compression NAME', 'Compression: zstd|gzip|none') { |v| options[:compression] = v.to_sym }
+  opts.on('--compression-level N', Integer, 'Compression level for zstd/gzip (JSONL/CSV)') { |v| options[:compression_level] = v }
   opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
   opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
+  opts.on('--queue-mb MB', Integer, 'Queue size in MB (default 256)') { |v| options[:queue_size_bytes] = v * 1024 * 1024 }
+  opts.on('--rotate-mb MB', Integer, 'Rotate part size in MB (default 256)') { |v| options[:rotate_bytes] = v * 1024 * 1024 }
   opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
-    options[:sharding]
+    options[:sharding] ||= {}
+    options[:sharding][:mode] = :by_size
+    options[:sharding][:part_bytes] = v
+  end
+  opts.on('--single-file', 'Write a single output file') do
+    options[:sharding] ||= {}
+    options[:sharding][:mode] = :single_file
   end
-  opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
   opts.on('--prefix NAME', 'Output file prefix') do |v|
     options[:sharding] ||= {}
     options[:sharding][:prefix] = v
   end
+  opts.on('--writer-threads N', Integer, 'Number of writer threads (experimental, JSONL only)') { |v| options[:writer_threads] = v }
+  opts.on('--write-chunk-mb MB', Integer, 'JSONL encode/write chunk size in MB') { |v| options[:write_chunk_bytes] = v * 1024 * 1024 }
+  opts.on('--parquet-row-group N', Integer, 'Parquet row group size (rows)') { |v| options[:parquet_row_group] = v }
   opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
-
-
-
-
-
-    options[:query] = JSON.parse(v)
-  end
+    # Prefer BSON Extended JSON to support $date, $oid, etc.
+    options[:query] = BSON::ExtJSON.parse(v)
+  rescue StandardError
+    # Fallback to plain JSON for compatibility
+    options[:query] = JSON.parse(v)
   end
   opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
   opts.on('--read-preference MODE',
           'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
     options[:read_preference] = v.to_sym
   end
+  opts.on('--read-concern LEVEL', 'Read concern: majority|local|linearizable|available|snapshot') { |v| options[:read_concern] = v.to_sym }
   opts.on('--read-tags TAGS',
           'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
     tags = {}
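The --query handler above now parses BSON Extended JSON first, so wrappers like $oid and $date become real BSON types instead of nested hashes, with plain JSON as the fallback. A small sketch of the difference (assumes the bson gem; the ObjectId value is illustrative):

```ruby
require 'bson'
require 'json'

raw = '{"_id": {"$oid": "64a0f1e2c3b4a5d6e7f80912"}}'

query = begin
  BSON::ExtJSON.parse(raw) # "_id" => BSON::ObjectId, matchable by the driver
rescue StandardError
  JSON.parse(raw)          # fallback: "_id" => { "$oid" => "..." } plain Hash
end
```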
@@ -66,6 +86,13 @@ parser = OptionParser.new do |opts|
     options[:read_tags] = tags unless tags.empty?
   end
   opts.on('--dry-run', 'Parse options and print effective read preference JSON, then exit') { options[:dry_run] = true }
+  opts.on('--telemetry MODE', 'Telemetry on|off (overrides PL_TELEMETRY)') { |v| options[:telemetry_flag] = v }
+  opts.on('--no-cursor-timeout BOOL', 'noCursorTimeout true|false (default true)') do |v|
+    options[:no_cursor_timeout] = %w[true 1 yes].include?(v.to_s.downcase)
+  end
+  opts.on('--resume-overwrite-incompatible', 'Overwrite incompatible existing manifest on resume') do
+    options[:resume_overwrite_incompatible] = true
+  end
   opts.on('--version', 'Show version') do
     puts Purplelight::VERSION
     exit 0
@@ -107,7 +134,7 @@ end
 client = Mongo::Client.new(options[:uri])
 options[:partitions] ||= (Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4)
 
-ok = Purplelight.snapshot(
+snapshot_args = {
   client: client.use(options[:db]),
   collection: options[:collection],
   output: options[:output],
@@ -119,8 +146,31 @@ ok = Purplelight.snapshot(
   projection: options[:projection],
   sharding: options[:sharding],
   read_preference: effective_read || options[:read_preference],
-  resume: { enabled: true },
+  resume: { enabled: true, overwrite_incompatible: options[:resume_overwrite_incompatible] },
   on_progress: ->(s) { warn("progress: #{s.to_json}") }
-)
+}
+
+# optional tunables
+snapshot_args[:queue_size_bytes] = options[:queue_size_bytes] if options[:queue_size_bytes]
+snapshot_args[:rotate_bytes] = options[:rotate_bytes] if options[:rotate_bytes]
+snapshot_args[:read_concern] = options[:read_concern] if options[:read_concern]
+snapshot_args[:no_cursor_timeout] = options[:no_cursor_timeout] unless options[:no_cursor_timeout].nil?
+snapshot_args[:compression_level] = options[:compression_level] if options[:compression_level]
+snapshot_args[:writer_threads] = options[:writer_threads] if options[:writer_threads]
+snapshot_args[:write_chunk_bytes] = options[:write_chunk_bytes] if options[:write_chunk_bytes]
+snapshot_args[:parquet_row_group] = options[:parquet_row_group] if options[:parquet_row_group]
+
+# telemetry env override
+if options[:telemetry_flag]
+  ENV['PL_TELEMETRY'] = (options[:telemetry_flag].to_s.downcase == 'on' ? '1' : '0')
+end
+
+# writer-specific overrides via environment for v1 compatibility
+ENV['PL_ZSTD_LEVEL'] = options[:compression_level].to_s if options[:compression_level]
+ENV['PL_WRITE_CHUNK_BYTES'] = options[:write_chunk_bytes].to_s if options[:write_chunk_bytes]
+ENV['PL_PARQUET_ROW_GROUP'] = options[:parquet_row_group].to_s if options[:parquet_row_group]
+ENV['PL_WRITER_THREADS'] = options[:writer_threads].to_s if options[:writer_threads]
+
+ok = Purplelight.snapshot(**snapshot_args)
 
 exit(ok ? 0 : 1)
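Since the CLI ends in Purplelight.snapshot(**snapshot_args), the new flags map directly onto keyword arguments for library users. A hedged sketch of an equivalent programmatic call, using only parameter names visible in this diff (values are examples):

```ruby
require 'mongo'
require 'purplelight'

client = Mongo::Client.new('mongodb://localhost:27017/mydb')

ok = Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/tmp/export',
  format: :jsonl,
  compression: :zstd,
  compression_level: 10,               # --compression-level 10
  queue_size_bytes: 512 * 1024 * 1024, # --queue-mb 512
  rotate_bytes: 512 * 1024 * 1024,     # --rotate-mb 512
  writer_threads: 2,                   # --writer-threads 2 (experimental)
  resume: { enabled: true, overwrite_incompatible: false }
)
exit(ok ? 0 : 1)
```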
data/lib/purplelight/snapshot.rb
CHANGED
@@ -38,7 +38,8 @@ module Purplelight
                    resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
-                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil)
+                   no_cursor_timeout: DEFAULTS[:no_cursor_timeout], telemetry: nil,
+                   compression_level: nil, writer_threads: 1, write_chunk_bytes: nil, parquet_row_group: nil)
       @client = client
       @collection = client[collection]
       @output = output
@@ -59,6 +60,10 @@ module Purplelight
       @read_concern = read_concern
       @read_preference = read_preference
       @no_cursor_timeout = no_cursor_timeout
+      @compression_level = compression_level
+      @writer_threads = writer_threads || 1
+      @write_chunk_bytes = write_chunk_bytes
+      @parquet_row_group = parquet_row_group
 
       @running = true
       @telemetry_enabled = telemetry ? telemetry.enabled? : (ENV['PL_TELEMETRY'] == '1')
@@ -90,7 +95,20 @@ module Purplelight
       end
 
       manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
-                            partitions: @partitions,
+                            partitions: @partitions,
+                            batch_size: @batch_size,
+                            queue_size_bytes: @queue_size_bytes,
+                            rotate_bytes: @rotate_bytes,
+                            hint: @hint,
+                            read_concern: (@read_concern.is_a?(Hash) ? @read_concern : { level: @read_concern }),
+                            no_cursor_timeout: @no_cursor_timeout,
+                            writer_threads: @writer_threads,
+                            compression_level: @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @compression.to_s == 'zstd') || ENV['PL_ZSTD_LEVEL']&.to_i,
+                            write_chunk_bytes: @write_chunk_bytes || ENV['PL_WRITE_CHUNK_BYTES']&.to_i,
+                            parquet_row_group: @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i,
+                            sharding: @sharding,
+                            resume_overwrite_incompatible: @resume && @resume[:overwrite_incompatible] ? true : false,
+                            telemetry: @telemetry_enabled
                           })
       manifest.ensure_partitions!(@partitions)
 
@@ -114,8 +132,9 @@ module Purplelight
                                 logger: @logger, manifest: manifest, single_file: single_file)
                when :parquet
                  single_file = @sharding && @sharding[:mode].to_s == 'single_file'
+                 row_group = @parquet_row_group || ENV['PL_PARQUET_ROW_GROUP']&.to_i || WriterParquet::DEFAULT_ROW_GROUP_SIZE
                  WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
-                                   manifest: manifest, single_file: single_file)
+                                   manifest: manifest, single_file: single_file, row_group_size: row_group)
                else
                  raise ArgumentError, "format not implemented: #{@format}"
                end
@@ -225,6 +244,11 @@ module Purplelight
         string_batch = +''
         buffer = []
         buffer_bytes = 0
+        json_state = if encode_lines
+                       JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false,
+                                                       buffer_initial_length: 4_096)
+                     end
+        size_state = encode_lines ? nil : JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
         last_id = checkpoint
         begin
           cursor.each do |doc|
@@ -232,13 +256,15 @@ module Purplelight
             doc = @mapper.call(doc) if @mapper
             t_ser = telemetry.start(:serialize_time)
             if encode_lines
-
+              json = json_state.generate(doc)
               telemetry.finish(:serialize_time, t_ser)
-
-              string_batch <<
+              string_batch << json
+              string_batch << "\n"
+              bytes = json.bytesize + 1
             else
               # For CSV/Parquet keep raw docs to allow schema/row building
-
+              json = size_state.generate(doc)
+              bytes = json.bytesize + 1
               telemetry.finish(:serialize_time, t_ser)
               buffer << doc
             end
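The reader now builds one JSON::Ext::Generator::State up front and reuses it for every document, rather than paying generator setup on each to_json call. A standalone sketch of the same pattern (options mirror the diff):

```ruby
require 'json'

state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false,
                                        buffer_initial_length: 4_096)

docs = [{ 'a' => 1 }, { 'b' => 2 }]
string_batch = +''
docs.each do |doc|
  json = state.generate(doc) # same output as doc.to_json
  string_batch << json << "\n"
end
```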
data/lib/purplelight/version.rb
CHANGED
-  VERSION = '0.1.7'
+  VERSION = '0.1.9'
data/lib/purplelight/writer_csv.rb
CHANGED
@@ -5,16 +5,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; fallback handled later via gzip
-end
-
 begin
   require 'zstd-ruby'
 rescue LoadError
-
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback used later
+  end
 end
 
 module Purplelight
@@ -31,7 +29,7 @@ module Purplelight
       @logger = logger
       @manifest = manifest
       env_level = ENV['PL_ZSTD_LEVEL']&.to_i
-      @compression_level = (env_level
+      @compression_level = (env_level&.positive? ? env_level : nil)
       @single_file = single_file
 
       @columns = columns&.map(&:to_s)
@@ -123,8 +121,8 @@ module Purplelight
       @io.flush if @io.respond_to?(:flush)
     end
 
-    def method_missing(method_name,
-      @io.send(method_name,
+    def method_missing(method_name, *, &)
+      @io.send(method_name, *, &)
     end
 
     def respond_to_missing?(method_name, include_private = false)
@@ -200,14 +198,22 @@ module Purplelight
     end
 
     def determine_effective_compression(requested)
-
-
-
-
-
-
-      :
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
       end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
     end
 
     def infer_columns(docs)
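Both rewritten determine_effective_compression bodies implement the same fallback order: honor an explicit none/gzip request, otherwise use zstd only when a backend actually loaded (zstd-ruby, detected via ::Zstd::StreamWriter as in the diff, or the zstds gem), else gzip. A condensed restatement of that selection logic:

```ruby
# Condensed restatement of the selection logic in the diff above.
def pick_compression(requested)
  req = requested.to_s
  return :none if req == 'none'
  return :gzip if req == 'gzip'

  # :zstd requested or auto-select: require a loaded backend
  return :zstd if defined?(::Zstd::StreamWriter) || defined?(ZSTDS)

  :gzip
end

pick_compression(:zstd) # => :zstd when a backend is present, :gzip otherwise
```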
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -4,16 +4,14 @@ require 'json'
 require 'zlib'
 require 'fileutils'
 
-begin
-  require 'zstds'
-rescue LoadError
-  # zstd not available; will fallback to gzip
-end
-
 begin
   require 'zstd-ruby'
 rescue LoadError
-
+  begin
+    require 'zstds'
+  rescue LoadError
+    # no zstd backend; gzip fallback
+  end
 end
 
 module Purplelight
@@ -30,7 +28,7 @@ module Purplelight
      @logger = logger
      @manifest = manifest
      env_level = ENV['PL_ZSTD_LEVEL']&.to_i
-     @compression_level = compression_level || (env_level
+     @compression_level = compression_level || (env_level&.positive? ? env_level : nil)
 
      @part_index = nil
      @io = nil
@@ -40,19 +38,20 @@ module Purplelight
      @closed = false
 
      @effective_compression = determine_effective_compression(@compression)
+     @json_state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
      if @logger
        level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
        @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
      end
-
-
-
+     return unless @effective_compression.to_s != @compression.to_s
+
+     @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
    end
 
    def write_many(batch)
      ensure_open!
 
-     chunk_threshold =
+     chunk_threshold = ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024)
      total_bytes = 0
      rows = 0
 
@@ -86,20 +85,22 @@ module Purplelight
       chunk = +''
       chunk_bytes = 0
       batch.each do |doc|
-
+        json = @json_state.generate(doc)
         rows += 1
-
-
+        bytes = json.bytesize + 1
+        chunk << json
+        chunk << "\n"
+        chunk_bytes += bytes
         next unless chunk_bytes >= chunk_threshold
 
         write_buffer(chunk)
-        total_bytes +=
+        total_bytes += chunk_bytes
         chunk = +''
         chunk_bytes = 0
       end
       unless chunk.empty?
         write_buffer(chunk)
-        total_bytes +=
+        total_bytes += chunk_bytes
       end
     end
 
@@ -198,14 +199,22 @@ module Purplelight
     end
 
     def determine_effective_compression(requested)
-
-
-
-
-
-
-      :
+      # Order: explicit request -> zstd-ruby -> zstds -> gzip
+      req = requested.to_s
+      return :none if req == 'none'
+      return :gzip if req == 'gzip'
+
+      if req == 'zstd'
+        return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+        return :zstd if defined?(ZSTDS)
+
+        return :gzip
       end
+      # Default auto-select
+      return :zstd if Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)
+      return :zstd if defined?(ZSTDS)
+
+      :gzip
     end
   end
 end
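write_many above accumulates encoded lines into an in-memory chunk and writes only when the chunk crosses the threshold (PL_WRITE_CHUNK_BYTES, default 8 MiB), trading memory for fewer IO calls. A self-contained sketch of that buffering strategy:

```ruby
require 'json'
require 'stringio'

CHUNK_THRESHOLD = ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024)

def write_jsonl(io, docs)
  state = JSON::Ext::Generator::State.new(ascii_only: false, max_nesting: false)
  chunk = +''
  docs.each do |doc|
    chunk << state.generate(doc) << "\n"
    next unless chunk.bytesize >= CHUNK_THRESHOLD

    io.write(chunk) # one write per ~8 MiB instead of one per line
    chunk = +''
  end
  io.write(chunk) unless chunk.empty?
end

write_jsonl(StringIO.new, [{ 'a' => 1 }, { 'b' => 2 }])
```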
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -134,9 +134,8 @@ module Purplelight
     def extract_value(doc, key)
       value = doc[key] || doc[key.to_sym]
       # Normalize common MongoDB/BSON types to Parquet-friendly values
-      if defined?(BSON) && value.is_a?(BSON::ObjectId)
-
-      end
+      return value.to_s if defined?(BSON) && value.is_a?(BSON::ObjectId)
+
       value
     end
   end
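The Parquet change stringifies BSON::ObjectId values so they land in a plain string column. For example (assumes the bson gem):

```ruby
require 'bson'

id = BSON::ObjectId.new
id.to_s # => 24-character hex string, Parquet-friendly
```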