purplelight 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +48 -0
- data/Rakefile +14 -0
- data/lib/purplelight/errors.rb +11 -0
- data/lib/purplelight/manifest.rb +128 -0
- data/lib/purplelight/partitioner.rb +84 -0
- data/lib/purplelight/queue.rb +55 -0
- data/lib/purplelight/snapshot.rb +205 -0
- data/lib/purplelight/version.rb +7 -0
- data/lib/purplelight/writer_jsonl.rb +119 -0
- data/lib/purplelight.rb +16 -0
- metadata +109 -0
checksums.yaml
ADDED
```yaml
---
SHA256:
  metadata.gz: b62be6d2a3810b6278d43fadcee4647efb1c758c9bd04ddc69051737e66d1716
  data.tar.gz: 2950f98c90869bcc3d6619e00adf68113e12197196f43a4c129dab2c1270e47c
SHA512:
  metadata.gz: 635a9c3114bc1d6a017a8244dfd6b5cca15f82f19a9e79f59480af91c0e1ec61b72e40703393098f584a2a9d29f68cac9b81f8833c14ccf8d42f527c4cb40c2c
  data.tar.gz: f24b2faaff4218481b8fa8345842dd1fadeb9174adb92d449008637e072e81861b13bb8f71cacbcb51254fbcca18ea799ca8dfa5ce5c99f418e82262dd5505c1
```
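These digests let a consumer check a downloaded artifact before trusting it. A minimal verification sketch using only Ruby's standard library; the file names come from the keys above and the paths assume the artifacts sit in the current directory:

```ruby
require 'digest'
require 'yaml'

# Compare a downloaded artifact against checksums.yaml (illustrative paths).
sums = YAML.safe_load(File.read('checksums.yaml'))
actual = Digest::SHA256.file('data.tar.gz').hexdigest
raise 'checksum mismatch' unless actual == sums['SHA256']['data.tar.gz']
```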
data/README.md
ADDED
````markdown
# purplelight

Snapshot MongoDB collections efficiently from Ruby with resumable, partitioned exports to JSONL/CSV/Parquet. Defaults to zstd compression and size-based multi-part outputs. Supports MongoDB 7 and 8.

### Install

Add to your Gemfile:

```ruby
gem 'purplelight'
```

### Quick start

```ruby
require 'mongo'
require 'purplelight'

client = Mongo::Client.new(ENV.fetch('MONGO_URL'))

Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  format: :jsonl,
  compression: :zstd, # default
  partitions: 8,
  batch_size: 2000,
  query: { active: true },
  sharding: { mode: :by_size, part_bytes: 256 * 1024 * 1024, prefix: 'users' },
  resume: { enabled: true },
  on_progress: ->(s) { puts s.inspect }
)
```

Outputs files like:

```
/data/exports/
  users-part-000000.jsonl.zst
  users-part-000001.jsonl.zst
  users.manifest.json
```

### Status

Phase 1 (JSONL + zstd, partitioning, resume, size-based sharding) is in progress.
````
data/lib/purplelight/manifest.rb
ADDED
```ruby
# frozen_string_literal: true

require 'json'
require 'time'
require 'securerandom'
require 'digest'
require 'fileutils'

module Purplelight
  class Manifest
    DEFAULT_VERSION = 1

    attr_reader :path, :data

    def self.query_digest(query, projection)
      payload = { query: query, projection: projection }
      Digest::SHA256.hexdigest(JSON.generate(payload))
    end

    def initialize(path:, data: nil)
      @path = path
      @data = data || {
        'version' => DEFAULT_VERSION,
        'run_id' => SecureRandom.uuid,
        'created_at' => Time.now.utc.iso8601,
        'collection' => nil,
        'format' => nil,
        'compression' => nil,
        'query_digest' => nil,
        'options' => {},
        'parts' => [],
        'partitions' => []
      }
      @mutex = Mutex.new
    end

    def self.load(path)
      data = JSON.parse(File.read(path))
      new(path: path, data: data)
    end

    # Atomic save: write to a temp file, then rename over the target.
    def save!
      FileUtils.mkdir_p(File.dirname(path))
      tmp = "#{path}.tmp"
      File.open(tmp, 'w') { |f| f.write(JSON.pretty_generate(@data)) }
      FileUtils.mv(tmp, path)
    end

    def configure!(collection:, format:, compression:, query_digest:, options: {})
      @data['collection'] = collection
      @data['format'] = format.to_s
      @data['compression'] = compression.to_s
      @data['query_digest'] = query_digest
      @data['options'] = options
      save!
    end

    def compatible_with?(collection:, format:, compression:, query_digest:)
      @data['collection'] == collection &&
        @data['format'] == format.to_s &&
        @data['compression'] == compression.to_s &&
        @data['query_digest'] == query_digest
    end

    def ensure_partitions!(count)
      @mutex.synchronize do
        if @data['partitions'].empty?
          @data['partitions'] = Array.new(count) { |i| { 'index' => i, 'last_id_exclusive' => nil, 'completed' => false } }
          save!
        end
      end
    end

    def update_partition_checkpoint!(index, last_id_exclusive)
      @mutex.synchronize do
        part = @data['partitions'][index]
        part['last_id_exclusive'] = last_id_exclusive
        save!
      end
    end

    def mark_partition_complete!(index)
      @mutex.synchronize do
        part = @data['partitions'][index]
        part['completed'] = true
        save!
      end
    end

    def open_part!(path)
      @mutex.synchronize do
        idx = @data['parts'].size
        @data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false, 'checksum' => nil }
        save!
        idx
      end
    end

    def add_progress_to_part!(index:, rows_delta:, bytes_delta:)
      @mutex.synchronize do
        part = @data['parts'][index]
        part['rows'] += rows_delta
        part['bytes'] += bytes_delta
        save!
      end
    end

    def complete_part!(index:, checksum: nil)
      @mutex.synchronize do
        part = @data['parts'][index]
        part['complete'] = true
        part['checksum'] = checksum
        save!
      end
    end

    def parts
      @data['parts']
    end

    def partitions
      @data['partitions']
    end
  end
end
```
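The atomic save! (write to a temp file, then rename) plus per-partition checkpoints are what make resume safe. For orientation, a manifest produced by the methods above has roughly this shape (all values illustrative):

```json
{
  "version": 1,
  "run_id": "3f1c0d2e-uuid",
  "created_at": "2025-01-01T00:00:00Z",
  "collection": "users",
  "format": "jsonl",
  "compression": "zstd",
  "query_digest": "sha256-hex-digest",
  "options": { "partitions": 8, "batch_size": 2000, "rotate_bytes": 268435456 },
  "parts": [
    { "index": 0, "path": "/data/exports/users-part-000000.jsonl.zst", "bytes": 1048576, "rows": 2000, "complete": true, "checksum": null }
  ],
  "partitions": [
    { "index": 0, "last_id_exclusive": "65a0f3aa0000000000000000", "completed": false }
  ]
}
```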
data/lib/purplelight/partitioner.rb
ADDED

```ruby
# frozen_string_literal: true

require 'mongo'

module Purplelight
  class Partitioner
    # Builds contiguous _id range filters for N partitions.
    # For ObjectId _id, we sample approximate quantiles to split into near-equal document counts.
    def self.object_id_partitions(collection:, query:, partitions:)
      # Fast path: for small datasets, fall back to simple endpoint-based ranges.
      total = collection.estimated_document_count
      return simple_ranges(collection: collection, query: query, partitions: partitions) if total <= partitions * 5_000

      # Sample boundaries: take approximate quantiles by stepping through an _id-sorted cursor.
      step = [total / partitions, 1].max
      boundaries = []
      cursor = collection.find(query || {}, sort: { _id: 1 }).projection(_id: 1).batch_size(1_000).no_cursor_timeout
      i = 0
      cursor.each do |doc|
        boundaries << doc['_id'] if (i % step).zero?
        i += 1
        break if boundaries.size >= partitions
      end

      ranges = []
      prev = nil
      boundaries.each_with_index do |b, idx|
        # The first boundary is the minimum _id; leave the first range open below it.
        if idx.zero?
          prev = nil
          next
        end
        ranges << build_range(prev, b)
        prev = b
      end
      ranges << build_range(prev, nil)

      ranges.map do |r|
        filter = query ? query.dup : {}
        filter['_id'] = r
        { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
      end
    end

    def self.simple_ranges(collection:, query:, partitions:)
      # Split by _id using min/max endpoints.
      min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
      max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
      return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?

      # Use the first `partitions - 1` ids as interior boundaries, with open-ended outer
      # ranges so the minimum and maximum _id documents are both covered.
      # Guard partitions == 1: limit(0) would mean "no limit" to the driver.
      interior = if partitions > 1
                   collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
                 else
                   []
                 end
      boundaries = [nil] + interior + [nil]
      ranges = []
      boundaries.each_cons(2) do |a, b|
        ranges << build_range(a, b)
      end

      ranges.map do |r|
        filter = query ? query.dup : {}
        filter['_id'] = r
        { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
      end
    end

    def self.build_range(from_id, to_id)
      if from_id && to_id
        { '$gt' => from_id, '$lte' => to_id }
      elsif from_id
        { '$gt' => from_id }
      elsif to_id
        { '$lte' => to_id }
      else
        {}
      end
    end
  end
end
```
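Concretely, with partitions: 3 and sampled boundaries b1 < b2, the specs returned by object_id_partitions look like this (b1 and b2 stand in for real ObjectIds); the half-open $gt/$lte ranges keep the partitions disjoint and exhaustive:

```ruby
# Illustrative return value for partitions: 3 (b1, b2 are hypothetical ObjectIds).
[
  { filter: { '_id' => { '$lte' => b1 } },              sort: { _id: 1 }, hint: { _id: 1 } },
  { filter: { '_id' => { '$gt' => b1, '$lte' => b2 } }, sort: { _id: 1 }, hint: { _id: 1 } },
  { filter: { '_id' => { '$gt' => b2 } },               sort: { _id: 1 }, hint: { _id: 1 } }
]
```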
data/lib/purplelight/queue.rb
ADDED

```ruby
# frozen_string_literal: true

module Purplelight
  # Sized queue that tracks bytes to apply backpressure.
  class ByteQueue
    def initialize(max_bytes: 128 * 1024 * 1024)
      @max_bytes = max_bytes
      @queue = []
      @bytes = 0
      @closed = false
      @mutex = Mutex.new
      @cv = ConditionVariable.new
    end

    def push(item, bytes:)
      @mutex.synchronize do
        raise ClosedQueueError, 'queue closed' if @closed

        # Wait while the queue is over budget. Admit an oversized item into an
        # empty queue so a single item larger than max_bytes cannot deadlock.
        @cv.wait(@mutex) while !@queue.empty? && (@bytes + bytes) > @max_bytes
        @queue << [item, bytes]
        @bytes += bytes
        @cv.broadcast
      end
    end

    def pop
      @mutex.synchronize do
        while @queue.empty?
          return nil if @closed

          @cv.wait(@mutex)
        end
        item, bytes = @queue.shift
        @bytes -= bytes
        @cv.broadcast
        item
      end
    end

    def close
      @mutex.synchronize do
        @closed = true
        @cv.broadcast
      end
    end

    def size_bytes
      @mutex.synchronize { @bytes }
    end
  end
end
```
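A minimal sketch of the backpressure contract: push blocks while the queue is over its byte budget, and pop returns nil once the queue is closed and drained.

```ruby
queue = Purplelight::ByteQueue.new(max_bytes: 1024)

producer = Thread.new do
  10.times do |i|
    payload = "item-#{i}"
    queue.push(payload, bytes: payload.bytesize) # blocks while over budget
  end
  queue.close
end

consumer = Thread.new do
  while (item = queue.pop) # nil signals closed and drained
    puts item
  end
end

[producer, consumer].each(&:join)
```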
data/lib/purplelight/snapshot.rb
ADDED

```ruby
# frozen_string_literal: true

require 'mongo'
require 'oj'
require 'etc'
require 'fileutils'
require_relative 'partitioner'
require_relative 'queue'
require_relative 'writer_jsonl'
require_relative 'manifest'
require_relative 'errors'

module Purplelight
  class Snapshot
    DEFAULTS = {
      format: :jsonl,
      compression: :zstd,
      batch_size: 2_000,
      # Twice the core count, clamped to the range 4..32.
      partitions: [Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4, 32].min,
      queue_size_bytes: 128 * 1024 * 1024,
      rotate_bytes: 256 * 1024 * 1024,
      read_concern: :majority,
      read_preference: :primary,
      no_cursor_timeout: true
    }.freeze

    def self.snapshot(**options)
      new(**options).run
    end

    def initialize(client:, collection:, output:, format: DEFAULTS[:format], compression: DEFAULTS[:compression],
                   partitions: DEFAULTS[:partitions], batch_size: DEFAULTS[:batch_size],
                   queue_size_bytes: DEFAULTS[:queue_size_bytes], rotate_bytes: DEFAULTS[:rotate_bytes],
                   query: {}, projection: nil, hint: nil, mapper: nil,
                   resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
                   sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
                   logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern],
                   read_preference: DEFAULTS[:read_preference], no_cursor_timeout: DEFAULTS[:no_cursor_timeout])
      @client = client
      @collection = client[collection]
      @output = output
      @format = format.to_sym
      @compression = compression.to_sym
      @partitions = partitions
      @batch_size = batch_size
      @queue_size_bytes = queue_size_bytes
      @rotate_bytes = rotate_bytes
      @query = query || {}
      @projection = projection
      @hint = hint
      @mapper = mapper
      @resume = resume || { enabled: true }
      @sharding = sharding || { mode: :by_size }
      @logger = logger
      @on_progress = on_progress
      @read_concern = read_concern
      @read_preference = read_preference
      @no_cursor_timeout = no_cursor_timeout

      @running = true
    end

    def run
      dir, prefix = resolve_output(@output, @format)
      manifest_path = File.join(dir, "#{prefix}.manifest.json")
      query_digest = Manifest.query_digest(@query, @projection)

      manifest = if @resume && @resume[:enabled] && File.exist?(manifest_path)
                   m = Manifest.load(manifest_path)
                   unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest)
                     if @resume[:overwrite_incompatible]
                       m = Manifest.new(path: manifest_path)
                     else
                       raise IncompatibleResumeError, 'existing manifest incompatible with request; pass overwrite_incompatible: true to reset'
                     end
                   end
                   m
                 else
                   Manifest.new(path: manifest_path)
                 end

      manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
                            partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes
                          })
      manifest.ensure_partitions!(@partitions)

      # Plan partitions
      partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query, partitions: @partitions)

      # Reader queue
      queue = ByteQueue.new(max_bytes: @queue_size_bytes)

      # Writer
      case @format
      when :jsonl
        writer = WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest)
      else
        raise ArgumentError, "format not implemented: #{@format}"
      end

      # Start reader threads
      readers = partition_filters.each_with_index.map do |pf, idx|
        Thread.new do
          read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest)
        end
      end

      # Writer loop
      writer_thread = Thread.new do
        loop do
          batch = queue.pop
          break if batch.nil?

          writer.write_many(batch)
        end
      ensure
        writer.close
      end

      progress_thread = Thread.new do
        loop do
          sleep 2
          break unless @running

          @on_progress&.call({ queue_bytes: queue.size_bytes })
        end
      end

      # Join readers, then drain the queue and shut down.
      readers.each(&:join)
      queue.close
      writer_thread.join
      @running = false
      progress_thread.join
      true
    end

    private

    def resolve_output(output, format)
      if File.directory?(output) || output.end_with?('/')
        dir = output
        prefix = @sharding[:prefix] || @collection.name
      else
        dir = File.dirname(output)
        basename = File.basename(output)
        prefix = basename.sub(/\.(jsonl|csv|parquet)(\.(zst|gz))?\z/, '')
      end
      FileUtils.mkdir_p(dir)
      [dir, prefix]
    end

    def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:)
      filter = filter_spec[:filter]
      sort = filter_spec[:sort] || { _id: 1 }
      hint = filter_spec[:hint] || { _id: 1 }

      # Resume from checkpoint if present
      checkpoint = manifest.partitions[idx] && manifest.partitions[idx]['last_id_exclusive']
      if checkpoint
        filter = filter.dup
        filter['_id'] = (filter['_id'] || {}).merge({ '$gt' => checkpoint })
      end

      opts = { sort: sort, hint: hint }
      opts[:projection] = @projection if @projection
      opts[:batch_size] = batch_size if batch_size
      opts[:no_cursor_timeout] = @no_cursor_timeout
      opts[:read] = { mode: @read_preference }
      opts[:read_concern] = { level: @read_concern } # the driver expects a hash, e.g. { level: :majority }

      cursor = @collection.find(filter, opts)

      buffer = []
      buffer_bytes = 0
      last_id = checkpoint
      begin
        cursor.each do |doc|
          last_id = doc['_id']
          doc = @mapper.call(doc) if @mapper
          json = Oj.dump(doc, mode: :compat)
          bytes = json.bytesize + 1 # newline added by the writer
          buffer << doc
          buffer_bytes += bytes
          if buffer.length >= batch_size || buffer_bytes >= 1_000_000
            queue.push(buffer, bytes: buffer_bytes)
            manifest.update_partition_checkpoint!(idx, last_id)
            buffer = []
            buffer_bytes = 0
          end
        end
        unless buffer.empty?
          queue.push(buffer, bytes: buffer_bytes)
          manifest.update_partition_checkpoint!(idx, last_id)
        end
        manifest.mark_partition_complete!(idx)
      rescue StandardError
        # Re-raise to fail the thread; retry/backoff could be implemented here.
        raise
      end
    end
  end
end
```
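Because each partition checkpoints its last pushed _id to the manifest, rerunning the same call after an interruption resumes rather than restarts. A sketch, assuming the manifest from the first run is still on disk:

```ruby
# First run interrupted (crash, Ctrl-C). Rerunning with identical arguments
# loads the manifest, verifies collection/format/compression/query digest,
# and restarts each partition with an extra { '_id' => { '$gt' => checkpoint } }.
Purplelight.snapshot(client: client, collection: 'users',
                     output: '/data/exports', resume: { enabled: true })
```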
data/lib/purplelight/writer_jsonl.rb
ADDED

```ruby
# frozen_string_literal: true

require 'oj'
require 'zlib'
require 'fileutils'

begin
  require 'zstds'
rescue LoadError
  # zstd not available; will fall back to gzip
end

module Purplelight
  class WriterJSONL
    DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024

    def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil, manifest: nil)
      @directory = directory
      @prefix = prefix
      @compression = compression
      @rotate_bytes = rotate_bytes
      @logger = logger
      @manifest = manifest

      @part_index = nil
      @io = nil
      @bytes_written = 0
      @rows_written = 0
      @file_seq = 0
      @closed = false
    end

    def write_many(array_of_docs)
      ensure_open!
      buffer = array_of_docs.map { |doc| Oj.dump(doc, mode: :compat) + "\n" }.join
      write_buffer(buffer)
      @rows_written += array_of_docs.size
      @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.size, bytes_delta: buffer.bytesize)
    end

    def rotate_if_needed
      return if @rotate_bytes.nil?
      return if @bytes_written < @rotate_bytes

      rotate!
    end

    def close
      return if @closed

      if @io
        finalize_current_part!
        @io.close
      end
      @closed = true
    end

    private

    def ensure_open!
      return if @io

      FileUtils.mkdir_p(@directory)
      path = next_part_path
      @part_index = @manifest.open_part!(path) if @manifest
      raw = File.open(path, 'wb')
      @io = build_compressed_io(raw)
      @bytes_written = 0
      @rows_written = 0
    end

    def build_compressed_io(raw)
      case @compression.to_s
      when 'zstd'
        if defined?(ZSTDS)
          # ZSTDS::Writer supports an IO-like interface
          ZSTDS::Writer.open(raw, level: 10)
        else
          @logger&.warn('zstd not available, falling back to gzip')
          Zlib::GzipWriter.new(raw)
        end
      when 'gzip'
        Zlib::GzipWriter.new(raw)
      when 'none'
        raw
      else
        raise ArgumentError, "unknown compression: #{@compression}"
      end
    end

    def write_buffer(buffer)
      @io.write(buffer)
      @bytes_written += buffer.bytesize # uncompressed bytes; rotation thresholds are pre-compression
      rotate_if_needed
    end

    def rotate!
      return unless @io

      finalize_current_part!
      @io.close
      @io = nil
      ensure_open!
    end

    def finalize_current_part!
      @io.flush if @io.respond_to?(:flush)
      # Checksums could be maintained on the fly or computed by re-reading; omitted for v1.
      @manifest&.complete_part!(index: @part_index, checksum: nil)
      @file_seq += 1
    end

    def next_part_path
      ext = 'jsonl'
      filename = format('%s-part-%06d.%s', @prefix, @file_seq, ext)
      filename += '.zst' if @compression.to_s == 'zstd'
      filename += '.gz' if @compression.to_s == 'gzip'
      File.join(@directory, filename)
    end
  end
end
```
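Reading a part back is plain JSONL once decompressed. A sketch for the gzip fallback path using only the standard library (zstd parts would need the optional zstds reader instead):

```ruby
require 'zlib'
require 'json'

# Stream documents out of a part written with compression: :gzip.
Zlib::GzipReader.open('/data/exports/users-part-000000.jsonl.gz') do |gz|
  gz.each_line do |line|
    doc = JSON.parse(line)
    # process doc...
  end
end
```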
data/lib/purplelight.rb
ADDED
```ruby
# frozen_string_literal: true

require_relative "purplelight/version"
require_relative "purplelight/errors"
require_relative "purplelight/manifest"
require_relative "purplelight/snapshot"

module Purplelight
  # Convenience top-level API.
  # See Purplelight::Snapshot for options.
  def self.snapshot(**options)
    Snapshot.snapshot(**options)
  end
end
```
metadata
ADDED
```yaml
--- !ruby/object:Gem::Specification
name: purplelight
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Purplelight Authors
bindir: bin
cert_chain: []
date: 1980-01-02 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: mongo
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '2.19'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '2.19'
- !ruby/object:Gem::Dependency
  name: oj
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '3.16'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '3.16'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '3.12'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '3.12'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '13.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '13.0'
description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
  multi-threaded readers, and size-based sharded outputs.
email:
- devnull@example.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- README.md
- Rakefile
- lib/purplelight.rb
- lib/purplelight/errors.rb
- lib/purplelight/manifest.rb
- lib/purplelight/partitioner.rb
- lib/purplelight/queue.rb
- lib/purplelight/snapshot.rb
- lib/purplelight/version.rb
- lib/purplelight/writer_jsonl.rb
licenses:
- MIT
metadata:
  homepage_uri: https://github.com/example/purplelight
  source_code_uri: https://github.com/example/purplelight
  changelog_uri: https://github.com/example/purplelight/releases
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '3.2'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.6.7
specification_version: 4
summary: Snapshot MongoDB collections efficiently to JSONL/CSV/Parquet
test_files: []
```