purplelight 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/purplelight +15 -3
- data/lib/purplelight/partitioner.rb +16 -6
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_csv.rb +4 -4
- data/lib/purplelight/writer_jsonl.rb +6 -6
- data/lib/purplelight/writer_parquet.rb +7 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 89114bd20a65a5a398be619718ae9a92e535d8118d9f928c61735da9a11cb5aa
|
4
|
+
data.tar.gz: 372372e6f5efe1cedd30033d661523fcb45a986c20e6e614ca46ca69def97e70
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17103c062f0c6002ee53fbb1c2eed3179fb0df582c0dd99ef5100f8a26dbb0c56432f02f9a7f62c49bd201a05d414d7797c0e32044551522eb9d625ba9c179bf
|
7
|
+
data.tar.gz: e64bfd67ce31afae0c2209eaf058c731c331c63562f8e23de5e1f86a16a4fd3c8033f5938b7a509b3348daa8f4349afe926acb5e32fba8c7882996bd0d435616
|
data/README.md
CHANGED
data/bin/purplelight
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
require 'optparse'
|
5
5
|
require 'json'
|
6
6
|
require 'mongo'
|
7
|
+
require 'time'
|
7
8
|
require_relative '../lib/purplelight'
|
8
9
|
|
9
10
|
options = {
|
@@ -32,14 +33,25 @@ parser = OptionParser.new do |opts|
|
|
32
33
|
opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
|
33
34
|
opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
|
34
35
|
opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
|
35
|
-
options[:sharding]
|
36
|
+
options[:sharding] ||= {}
|
37
|
+
options[:sharding][:mode] = :by_size
|
38
|
+
options[:sharding][:part_bytes] = v
|
39
|
+
end
|
40
|
+
opts.on('--single-file', 'Write a single output file') do
|
41
|
+
options[:sharding] ||= {}
|
42
|
+
options[:sharding][:mode] = :single_file
|
36
43
|
end
|
37
|
-
opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
|
38
44
|
opts.on('--prefix NAME', 'Output file prefix') do |v|
|
39
45
|
options[:sharding] ||= {}
|
40
46
|
options[:sharding][:prefix] = v
|
41
47
|
end
|
42
|
-
opts.on('-q', '--query JSON', 'Filter query as JSON')
|
48
|
+
opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
|
49
|
+
# Prefer BSON Extended JSON to support $date, $oid, etc.
|
50
|
+
options[:query] = BSON::ExtJSON.parse(v)
|
51
|
+
rescue StandardError
|
52
|
+
# Fallback to plain JSON for compatibility
|
53
|
+
options[:query] = JSON.parse(v)
|
54
|
+
end
|
43
55
|
opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
|
44
56
|
opts.on('--read-preference MODE',
|
45
57
|
'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
|
@@ -27,13 +27,23 @@ module Purplelight
|
|
27
27
|
max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
|
28
28
|
return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
|
29
29
|
|
30
|
-
# Create
|
31
|
-
|
32
|
-
|
30
|
+
# Create contiguous ranges using ascending inner boundaries.
|
31
|
+
# We intentionally skip the very first _id so the first range includes the smallest document.
|
32
|
+
inner_boundaries = collection.find(query || {})
|
33
|
+
.projection(_id: 1)
|
34
|
+
.sort(_id: 1)
|
35
|
+
.skip(1)
|
36
|
+
.limit([partitions - 1, 0].max)
|
37
|
+
.to_a
|
38
|
+
.map { |d| d['_id'] }
|
39
|
+
|
33
40
|
ranges = []
|
34
|
-
|
35
|
-
|
41
|
+
prev = nil
|
42
|
+
inner_boundaries.each do |b|
|
43
|
+
ranges << build_range(prev, b)
|
44
|
+
prev = b
|
36
45
|
end
|
46
|
+
ranges << build_range(prev, nil)
|
37
47
|
|
38
48
|
ranges.map do |r|
|
39
49
|
filter = query ? query.dup : {}
|
@@ -54,7 +64,7 @@ module Purplelight
|
|
54
64
|
min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
|
55
65
|
max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
|
56
66
|
|
57
|
-
# Fallback to cursor sampling if _id isn't
|
67
|
+
# Fallback to cursor sampling if _id isn't an ObjectId
|
58
68
|
return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
|
59
69
|
|
60
70
|
step = [(max_ts - min_ts) / partitions, 1].max
|
data/lib/purplelight/version.rb
CHANGED
@@ -31,7 +31,7 @@ module Purplelight
|
|
31
31
|
@logger = logger
|
32
32
|
@manifest = manifest
|
33
33
|
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
34
|
-
@compression_level = (env_level
|
34
|
+
@compression_level = (env_level&.positive? ? env_level : nil)
|
35
35
|
@single_file = single_file
|
36
36
|
|
37
37
|
@columns = columns&.map(&:to_s)
|
@@ -123,8 +123,8 @@ module Purplelight
|
|
123
123
|
@io.flush if @io.respond_to?(:flush)
|
124
124
|
end
|
125
125
|
|
126
|
-
def method_missing(method_name,
|
127
|
-
@io.send(method_name,
|
126
|
+
def method_missing(method_name, *, &)
|
127
|
+
@io.send(method_name, *, &)
|
128
128
|
end
|
129
129
|
|
130
130
|
def respond_to_missing?(method_name, include_private = false)
|
@@ -202,7 +202,7 @@ module Purplelight
|
|
202
202
|
def determine_effective_compression(requested)
|
203
203
|
case requested.to_s
|
204
204
|
when 'zstd'
|
205
|
-
(
|
205
|
+
(defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
|
206
206
|
when 'none'
|
207
207
|
:none
|
208
208
|
else
|
@@ -30,7 +30,7 @@ module Purplelight
|
|
30
30
|
@logger = logger
|
31
31
|
@manifest = manifest
|
32
32
|
env_level = ENV['PL_ZSTD_LEVEL']&.to_i
|
33
|
-
@compression_level = compression_level || (env_level
|
33
|
+
@compression_level = compression_level || (env_level&.positive? ? env_level : nil)
|
34
34
|
|
35
35
|
@part_index = nil
|
36
36
|
@io = nil
|
@@ -44,15 +44,15 @@ module Purplelight
|
|
44
44
|
level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
|
45
45
|
@logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
|
46
46
|
end
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
return unless @effective_compression.to_s != @compression.to_s
|
48
|
+
|
49
|
+
@logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
|
50
50
|
end
|
51
51
|
|
52
52
|
def write_many(batch)
|
53
53
|
ensure_open!
|
54
54
|
|
55
|
-
chunk_threshold =
|
55
|
+
chunk_threshold = ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024)
|
56
56
|
total_bytes = 0
|
57
57
|
rows = 0
|
58
58
|
|
@@ -200,7 +200,7 @@ module Purplelight
|
|
200
200
|
def determine_effective_compression(requested)
|
201
201
|
case requested.to_s
|
202
202
|
when 'zstd'
|
203
|
-
(
|
203
|
+
(defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
|
204
204
|
when 'none'
|
205
205
|
:none
|
206
206
|
else
|
@@ -116,9 +116,9 @@ module Purplelight
|
|
116
116
|
def next_part_path
|
117
117
|
ext = 'parquet'
|
118
118
|
filename = if @single_file
|
119
|
-
|
119
|
+
"#{@prefix}.#{ext}"
|
120
120
|
else
|
121
|
-
format('%<prefix
|
121
|
+
format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
|
122
122
|
end
|
123
123
|
File.join(@directory, filename)
|
124
124
|
end
|
@@ -132,7 +132,11 @@ module Purplelight
|
|
132
132
|
end
|
133
133
|
|
134
134
|
def extract_value(doc, key)
|
135
|
-
doc[key] || doc[key.to_sym]
|
135
|
+
value = doc[key] || doc[key.to_sym]
|
136
|
+
# Normalize common MongoDB/BSON types to Parquet-friendly values
|
137
|
+
return value.to_s if defined?(BSON) && value.is_a?(BSON::ObjectId)
|
138
|
+
|
139
|
+
value
|
136
140
|
end
|
137
141
|
end
|
138
142
|
end
|