purplelight 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/purplelight +10 -1
- data/lib/purplelight/partitioner.rb +16 -6
- data/lib/purplelight/version.rb +1 -1
- data/lib/purplelight/writer_parquet.rb +8 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e3771629528ecab067d858491ec4c5de06a8c88c90c64b56dcdb8658c3a6a89
|
4
|
+
data.tar.gz: 4c0427564bf04b5dc3da6b3af97bfd4e5dc0625b2faa0dd8acec84a81bd1c145
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 871cf3c04dcaa017d1023ff79135a5050254158a7ba20145abb04c39887125c08eedfb8cae4d16ddd3c4fd0acb383ceb9c11efff943b497ff036b68643dbea79
|
7
|
+
data.tar.gz: 8a5e7fe7a1913939966df61173380e6ed853f45bec2ef90315410a6d36d6341c07d04ff62d200379629b2366cf68296c7c291b5060d0a080dc3015175f1e6780
|
data/README.md
CHANGED
data/bin/purplelight
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
require 'optparse'
|
5
5
|
require 'json'
|
6
6
|
require 'mongo'
|
7
|
+
require 'time'
|
7
8
|
require_relative '../lib/purplelight'
|
8
9
|
|
9
10
|
options = {
|
@@ -39,7 +40,15 @@ parser = OptionParser.new do |opts|
|
|
39
40
|
options[:sharding] ||= {}
|
40
41
|
options[:sharding][:prefix] = v
|
41
42
|
end
|
42
|
-
opts.on('-q', '--query JSON', 'Filter query as JSON')
|
43
|
+
opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
|
44
|
+
begin
|
45
|
+
# Prefer BSON Extended JSON to support $date, $oid, etc.
|
46
|
+
options[:query] = BSON::ExtJSON.parse(v)
|
47
|
+
rescue StandardError
|
48
|
+
# Fallback to plain JSON for compatibility
|
49
|
+
options[:query] = JSON.parse(v)
|
50
|
+
end
|
51
|
+
end
|
43
52
|
opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
|
44
53
|
opts.on('--read-preference MODE',
|
45
54
|
'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
|
data/lib/purplelight/partitioner.rb
CHANGED
@@ -27,13 +27,23 @@ module Purplelight
|
|
27
27
|
max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
|
28
28
|
return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
|
29
29
|
|
30
|
-
# Create
|
31
|
-
|
32
|
-
|
30
|
+
# Create contiguous ranges using ascending inner boundaries.
|
31
|
+
# We intentionally skip the very first _id so the first range includes the smallest document.
|
32
|
+
inner_boundaries = collection.find(query || {})
|
33
|
+
.projection(_id: 1)
|
34
|
+
.sort(_id: 1)
|
35
|
+
.skip(1)
|
36
|
+
.limit([partitions - 1, 0].max)
|
37
|
+
.to_a
|
38
|
+
.map { |d| d['_id'] }
|
39
|
+
|
33
40
|
ranges = []
|
34
|
-
|
35
|
-
|
41
|
+
prev = nil
|
42
|
+
inner_boundaries.each do |b|
|
43
|
+
ranges << build_range(prev, b)
|
44
|
+
prev = b
|
36
45
|
end
|
46
|
+
ranges << build_range(prev, nil)
|
37
47
|
|
38
48
|
ranges.map do |r|
|
39
49
|
filter = query ? query.dup : {}
|
@@ -54,7 +64,7 @@ module Purplelight
|
|
54
64
|
min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
|
55
65
|
max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
|
56
66
|
|
57
|
-
# Fallback to cursor sampling if _id isn't
|
67
|
+
# Fallback to cursor sampling if _id isn't an ObjectId
|
58
68
|
return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
|
59
69
|
|
60
70
|
step = [(max_ts - min_ts) / partitions, 1].max
|
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -116,9 +116,9 @@ module Purplelight
|
|
116
116
|
def next_part_path
|
117
117
|
ext = 'parquet'
|
118
118
|
filename = if @single_file
|
119
|
-
|
119
|
+
"#{@prefix}.#{ext}"
|
120
120
|
else
|
121
|
-
format('%<prefix
|
121
|
+
format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
|
122
122
|
end
|
123
123
|
File.join(@directory, filename)
|
124
124
|
end
|
@@ -132,7 +132,12 @@ module Purplelight
|
|
132
132
|
end
|
133
133
|
|
134
134
|
def extract_value(doc, key)
|
135
|
-
doc[key] || doc[key.to_sym]
|
135
|
+
value = doc[key] || doc[key.to_sym]
|
136
|
+
# Normalize common MongoDB/BSON types to Parquet-friendly values
|
137
|
+
if defined?(BSON) && value.is_a?(BSON::ObjectId)
|
138
|
+
return value.to_s
|
139
|
+
end
|
140
|
+
value
|
136
141
|
end
|
137
142
|
end
|
138
143
|
end
|