purplelight 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/purplelight +12 -9
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +4 -4
- data/lib/purplelight/writer_jsonl.rb +6 -6
- data/lib/purplelight/writer_parquet.rb +2 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 89114bd20a65a5a398be619718ae9a92e535d8118d9f928c61735da9a11cb5aa
|
4
|
+
data.tar.gz: 372372e6f5efe1cedd30033d661523fcb45a986c20e6e614ca46ca69def97e70
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17103c062f0c6002ee53fbb1c2eed3179fb0df582c0dd99ef5100f8a26dbb0c56432f02f9a7f62c49bd201a05d414d7797c0e32044551522eb9d625ba9c179bf
|
7
|
+
data.tar.gz: e64bfd67ce31afae0c2209eaf058c731c331c63562f8e23de5e1f86a16a4fd3c8033f5938b7a509b3348daa8f4349afe926acb5e32fba8c7882996bd0d435616
|
data/README.md
CHANGED
data/bin/purplelight
CHANGED
@@ -33,21 +33,24 @@ parser = OptionParser.new do |opts|
|
|
33
33
|
opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
|
34
34
|
opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
|
35
35
|
opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
|
36
|
-
options[:sharding]
|
36
|
+
options[:sharding] ||= {}
|
37
|
+
options[:sharding][:mode] = :by_size
|
38
|
+
options[:sharding][:part_bytes] = v
|
39
|
+
end
|
40
|
+
opts.on('--single-file', 'Write a single output file') do
|
41
|
+
options[:sharding] ||= {}
|
42
|
+
options[:sharding][:mode] = :single_file
|
37
43
|
end
|
38
|
-
opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
|
39
44
|
opts.on('--prefix NAME', 'Output file prefix') do |v|
|
40
45
|
options[:sharding] ||= {}
|
41
46
|
options[:sharding][:prefix] = v
|
42
47
|
end
|
43
48
|
opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
options[:query] = JSON.parse(v)
|
50
|
-
end
|
49
|
+
# Prefer BSON Extended JSON to support $date, $oid, etc.
|
50
|
+
options[:query] = BSON::ExtJSON.parse(v)
|
51
|
+
rescue StandardError
|
52
|
+
# Fallback to plain JSON for compatibility
|
53
|
+
options[:query] = JSON.parse(v)
|
51
54
|
end
|
52
55
|
opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
|
53
56
|
opts.on('--read-preference MODE',
|
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_csv.rb
CHANGED
@@ -31,7 +31,7 @@ module Purplelight
|
|
31
31
|
@logger = logger
|
32
32
|
@manifest = manifest
|
33
33
|
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
34
|
-
@compression_level = (env_level
|
34
|
+
@compression_level = (env_level&.positive? ? env_level : nil)
|
35
35
|
@single_file = single_file
|
36
36
|
|
37
37
|
@columns = columns&.map(&:to_s)
|
@@ -123,8 +123,8 @@ module Purplelight
|
|
123
123
|
@io.flush if @io.respond_to?(:flush)
|
124
124
|
end
|
125
125
|
|
126
|
-
def method_missing(method_name,
|
127
|
-
@io.send(method_name,
|
126
|
+
def method_missing(method_name, *, &)
|
127
|
+
@io.send(method_name, *, &)
|
128
128
|
end
|
129
129
|
|
130
130
|
def respond_to_missing?(method_name, include_private = false)
|
@@ -202,7 +202,7 @@ module Purplelight
|
|
202
202
|
def determine_effective_compression(requested)
|
203
203
|
case requested.to_s
|
204
204
|
when 'zstd'
|
205
|
-
(
|
205
|
+
(defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
|
206
206
|
when 'none'
|
207
207
|
:none
|
208
208
|
else
|
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -30,7 +30,7 @@ module Purplelight
|
|
30
30
|
@logger = logger
|
31
31
|
@manifest = manifest
|
32
32
|
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
33
|
-
@compression_level = compression_level || (env_level
|
33
|
+
@compression_level = compression_level || (env_level&.positive? ? env_level : nil)
|
34
34
|
|
35
35
|
@part_index = nil
|
36
36
|
@io = nil
|
@@ -44,15 +44,15 @@ module Purplelight
|
|
44
44
|
level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
|
45
45
|
@logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
|
46
46
|
end
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
return unless @effective_compression.to_s != @compression.to_s
|
48
|
+
|
49
|
+
@logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
|
50
50
|
end
|
51
51
|
|
52
52
|
def write_many(batch)
|
53
53
|
ensure_open!
|
54
54
|
|
55
|
-
chunk_threshold =
|
55
|
+
chunk_threshold = ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024)
|
56
56
|
total_bytes = 0
|
57
57
|
rows = 0
|
58
58
|
|
@@ -200,7 +200,7 @@ module Purplelight
|
|
200
200
|
def determine_effective_compression(requested)
|
201
201
|
case requested.to_s
|
202
202
|
when 'zstd'
|
203
|
-
(
|
203
|
+
(defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
|
204
204
|
when 'none'
|
205
205
|
:none
|
206
206
|
else
|
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -134,9 +134,8 @@ module Purplelight
|
|
134
134
|
def extract_value(doc, key)
|
135
135
|
value = doc[key] || doc[key.to_sym]
|
136
136
|
# Normalize common MongoDB/BSON types to Parquet-friendly values
|
137
|
-
if defined?(BSON) && value.is_a?(BSON::ObjectId)
|
138
|
-
|
139
|
-
end
|
137
|
+
return value.to_s if defined?(BSON) && value.is_a?(BSON::ObjectId)
|
138
|
+
|
140
139
|
value
|
141
140
|
end
|
142
141
|
end
|