purplelight 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: b62be6d2a3810b6278d43fadcee4647efb1c758c9bd04ddc69051737e66d1716
+   data.tar.gz: 2950f98c90869bcc3d6619e00adf68113e12197196f43a4c129dab2c1270e47c
+ SHA512:
+   metadata.gz: 635a9c3114bc1d6a017a8244dfd6b5cca15f82f19a9e79f59480af91c0e1ec61b72e40703393098f584a2a9d29f68cac9b81f8833c14ccf8d42f527c4cb40c2c
+   data.tar.gz: f24b2faaff4218481b8fa8345842dd1fadeb9174adb92d449008637e072e81861b13bb8f71cacbcb51254fbcca18ea799ca8dfa5ce5c99f418e82262dd5505c1
data/README.md ADDED
@@ -0,0 +1,48 @@
+ # purplelight
+
+ Snapshot MongoDB collections efficiently from Ruby with resumable, partitioned exports to JSONL/CSV/Parquet. Defaults to zstd compression and size-based multi-part outputs. Supports MongoDB 7 and 8.
+
+ ### Install
+
+ Add to your Gemfile:
+
+ ```ruby
+ gem 'purplelight'
+ ```
+
+ ### Quick start
+
+ ```ruby
+ require 'mongo'
+ require 'purplelight'
+
+ client = Mongo::Client.new(ENV.fetch('MONGO_URL'))
+
+ Purplelight.snapshot(
+   client: client,
+   collection: 'users',
+   output: '/data/exports',
+   format: :jsonl,
+   compression: :zstd, # default
+   partitions: 8,
+   batch_size: 2000,
+   query: { active: true },
+   sharding: { mode: :by_size, part_bytes: 256 * 1024 * 1024, prefix: 'users' },
+   resume: { enabled: true },
+   on_progress: ->(s) { puts s.inspect }
+ )
+ ```
+
+ Outputs files like:
+
+ ```
+ /data/exports/
+   users-part-000000.jsonl.zst
+   users-part-000001.jsonl.zst
+   users.manifest.json
+ ```
+
+ ### Status
+
+ Phase 1 (JSONL + zstd, partitioning, resume, size-based sharding) in progress.
+
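A note on consuming the export described above: the sketch below is not part of the gem; it assumes the README's `/data/exports` layout and switches to `compression: :gzip` so the parts can be read back with the Ruby standard library alone.

```ruby
require 'zlib'
require 'json'
require 'mongo'
require 'purplelight'

client = Mongo::Client.new(ENV.fetch('MONGO_URL'))

# Export with gzip instead of the default zstd so stdlib can read the parts back.
Purplelight.snapshot(
  client: client,
  collection: 'users',
  output: '/data/exports',
  compression: :gzip
)

# Stream every exported document back out of the rotated parts.
Dir.glob('/data/exports/users-part-*.jsonl.gz').sort.each do |part|
  Zlib::GzipReader.open(part) do |gz|
    gz.each_line { |line| process(JSON.parse(line)) } # process is your own handler
  end
end
```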
data/Rakefile ADDED
@@ -0,0 +1,14 @@
+ require 'rake/testtask'
+
+ task default: [:spec]
+
+ begin
+   require 'rspec/core/rake_task'
+   RSpec::Core::RakeTask.new(:spec)
+ rescue LoadError
+   task :spec do
+     sh 'echo "RSpec not installed"'
+   end
+ end
+
+
data/lib/purplelight/errors.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ module Purplelight
+   class Error < StandardError; end
+
+   class IncompatibleResumeError < Error; end
+   class OutputExistsError < Error; end
+   class WriterClosedError < Error; end
+ end
+
+
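The error classes above are raised by the snapshot runner; `IncompatibleResumeError` in particular signals that an existing manifest was written with different options. A minimal sketch of handling it, assuming the README's example paths:

```ruby
require 'mongo'
require 'purplelight'

client = Mongo::Client.new(ENV.fetch('MONGO_URL'))

begin
  Purplelight.snapshot(client: client, collection: 'users', output: '/data/exports')
rescue Purplelight::IncompatibleResumeError
  # The manifest on disk was written with a different query/format/compression;
  # start a fresh snapshot deliberately rather than resuming it.
  Purplelight.snapshot(
    client: client,
    collection: 'users',
    output: '/data/exports',
    resume: { enabled: true, overwrite_incompatible: true }
  )
end
```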
data/lib/purplelight/manifest.rb ADDED
@@ -0,0 +1,128 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'time'
+ require 'securerandom'
+ require 'digest'
+ require 'fileutils'
+
+ module Purplelight
+   class Manifest
+     DEFAULT_VERSION = 1
+
+     attr_reader :path, :data
+
+     def self.query_digest(query, projection)
+       payload = { query: query, projection: projection }
+       Digest::SHA256.hexdigest(JSON.generate(payload))
+     end
+
+     def initialize(path:, data: nil)
+       @path = path
+       @data = data || {
+         'version' => DEFAULT_VERSION,
+         'run_id' => SecureRandom.uuid,
+         'created_at' => Time.now.utc.iso8601,
+         'collection' => nil,
+         'format' => nil,
+         'compression' => nil,
+         'query_digest' => nil,
+         'options' => {},
+         'parts' => [],
+         'partitions' => []
+       }
+       @mutex = Mutex.new
+     end
+
+     def self.load(path)
+       data = JSON.parse(File.read(path))
+       new(path: path, data: data)
+     end
+
+     def save!
+       dir = File.dirname(path)
+       FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
+       tmp = path + '.tmp'
+       File.open(tmp, 'w') { |f| f.write(JSON.pretty_generate(@data)) }
+       FileUtils.mv(tmp, path)
+     end
+
+     def configure!(collection:, format:, compression:, query_digest:, options: {})
+       @data['collection'] = collection
+       @data['format'] = format.to_s
+       @data['compression'] = compression.to_s
+       @data['query_digest'] = query_digest
+       @data['options'] = options
+       save!
+     end
+
+     def compatible_with?(collection:, format:, compression:, query_digest:)
+       @data['collection'] == collection &&
+         @data['format'] == format.to_s &&
+         @data['compression'] == compression.to_s &&
+         @data['query_digest'] == query_digest
+     end
+
+     def ensure_partitions!(count)
+       @mutex.synchronize do
+         if @data['partitions'].empty?
+           @data['partitions'] = Array.new(count) { |i| { 'index' => i, 'last_id_exclusive' => nil, 'completed' => false } }
+           save!
+         end
+       end
+     end
+
+     def update_partition_checkpoint!(index, last_id_exclusive)
+       @mutex.synchronize do
+         part = @data['partitions'][index]
+         part['last_id_exclusive'] = last_id_exclusive
+         save!
+       end
+     end
+
+     def mark_partition_complete!(index)
+       @mutex.synchronize do
+         part = @data['partitions'][index]
+         part['completed'] = true
+         save!
+       end
+     end
+
+     def open_part!(path)
+       @mutex.synchronize do
+         idx = @data['parts'].size
+         @data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false, 'checksum' => nil }
+         save!
+         idx
+       end
+     end
+
+     def add_progress_to_part!(index:, rows_delta:, bytes_delta:)
+       @mutex.synchronize do
+         part = @data['parts'][index]
+         part['rows'] += rows_delta
+         part['bytes'] += bytes_delta
+         save!
+       end
+     end
+
+     def complete_part!(index:, checksum: nil)
+       @mutex.synchronize do
+         part = @data['parts'][index]
+         part['complete'] = true
+         part['checksum'] = checksum
+         save!
+       end
+     end
+
+     def parts
+       @data['parts']
+     end
+
+     def partitions
+       @data['partitions']
+     end
+   end
+ end
+
+
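A sketch of driving the manifest above in isolation, with illustrative paths; `query_digest` is what ties a resume attempt back to the original query and projection:

```ruby
require 'purplelight'

digest = Purplelight::Manifest.query_digest({ active: true }, nil)

manifest = Purplelight::Manifest.new(path: '/data/exports/users.manifest.json')
manifest.configure!(collection: 'users', format: :jsonl, compression: :zstd, query_digest: digest)
manifest.ensure_partitions!(8)

# A later run can load the same file and decide whether resuming is safe.
reloaded = Purplelight::Manifest.load('/data/exports/users.manifest.json')
reloaded.compatible_with?(collection: 'users', format: :jsonl, compression: :zstd,
                          query_digest: digest) # => true
```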
data/lib/purplelight/partitioner.rb ADDED
@@ -0,0 +1,84 @@
+ # frozen_string_literal: true
+
+ require 'mongo'
+
+ module Purplelight
+   class Partitioner
+     # Builds contiguous _id range filters for N partitions.
+     # For ObjectId _id, we sample quantiles to split into near-equal document counts.
+     def self.object_id_partitions(collection:, query:, partitions:)
+       # Ensure sort order for sampling
+       base_query = collection.find(query || {}, sort: { _id: 1 })
+
+       # Fast path: if small dataset, just chunk by count
+       total = collection.estimated_document_count
+       return simple_ranges(collection: collection, query: query, partitions: partitions) if total <= partitions * 5_000
+
+       # Sample boundaries: take approx quantiles by skipping
+       step = [total / partitions, 1].max
+       boundaries = []
+       cursor = base_query.projection(_id: 1).batch_size(1_000).no_cursor_timeout
+       i = 0
+       cursor.each do |doc|
+         if (i % step).zero?
+           boundaries << doc['_id']
+         end
+         i += 1
+         break if boundaries.size >= partitions
+       end
+
+       ranges = []
+       prev = nil
+       boundaries.each_with_index do |b, idx|
+         if idx == 0
+           prev = nil
+           next
+         end
+         ranges << build_range(prev, b)
+         prev = b
+       end
+       ranges << build_range(prev, nil)
+
+       ranges.map do |r|
+         filter = query ? query.dup : {}
+         filter['_id'] = r
+         { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+       end
+     end
+
+     def self.simple_ranges(collection:, query:, partitions:)
+       # Split by _id quantiles using min/max endpoints
+       min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
+       max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
+       return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
+
+       # Create numeric-ish interpolation by sampling
+       ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
+       boundaries = [nil] + ids + [max_id] # nil lower bound keeps the minimum _id inside the first range
+       ranges = []
+       boundaries.each_cons(2) do |a, b|
+         ranges << build_range(a, b)
+       end
+
+       ranges.map do |r|
+         filter = query ? query.dup : {}
+         filter['_id'] = r
+         { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
+       end
+     end
+
+     def self.build_range(from_id, to_id)
+       if from_id && to_id
+         { '$gt' => from_id, '$lte' => to_id }
+       elsif from_id && !to_id
+         { '$gt' => from_id }
+       elsif !from_id && to_id
+         { '$lte' => to_id }
+       else
+         {}
+       end
+     end
+   end
+ end
+
+
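To illustrate the partition specs the class above produces: `build_range` returns the `_id` bound hash that ends up under `filter['_id']`. The ObjectIds below are placeholders.

```ruby
require 'bson'
require 'purplelight'

lower = BSON::ObjectId.new
upper = BSON::ObjectId.new

Purplelight::Partitioner.build_range(lower, upper) # => { "$gt" => lower, "$lte" => upper }
Purplelight::Partitioner.build_range(nil, upper)   # => { "$lte" => upper }  (first partition)
Purplelight::Partitioner.build_range(lower, nil)   # => { "$gt" => lower }   (last partition)

# Each reader thread receives one spec of this shape from object_id_partitions:
# { filter: { 'active' => true, '_id' => { '$gt' => lower, '$lte' => upper } },
#   sort: { _id: 1 }, hint: { _id: 1 } }
```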
data/lib/purplelight/queue.rb ADDED
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ module Purplelight
+   # Sized queue that tracks bytes to apply backpressure.
+   class ByteQueue
+     def initialize(max_bytes: 128 * 1024 * 1024)
+       @max_bytes = max_bytes
+       @queue = []
+       @bytes = 0
+       @closed = false
+       @mutex = Mutex.new
+       @cv = ConditionVariable.new
+     end
+
+     def push(item, bytes:)
+       @mutex.synchronize do
+         raise "queue closed" if @closed
+         while !@queue.empty? && (@bytes + bytes) > @max_bytes # admit one oversized item when empty to avoid deadlock
+           @cv.wait(@mutex)
+         end
+         @queue << [item, bytes]
+         @bytes += bytes
+         @cv.broadcast
+       end
+     end
+
+     def pop
+       @mutex.synchronize do
+         while @queue.empty?
+           if @closed
+             return nil
+           end
+           @cv.wait(@mutex)
+         end
+         item, bytes = @queue.shift
+         @bytes -= bytes
+         @cv.broadcast
+         item
+       end
+     end
+
+     def close
+       @mutex.synchronize do
+         @closed = true
+         @cv.broadcast
+       end
+     end
+
+     def size_bytes
+       @mutex.synchronize { @bytes }
+     end
+   end
+ end
+
+
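A self-contained sketch of the backpressure behaviour of `ByteQueue` above, with a deliberately tiny byte budget so the producer blocks until the consumer drains:

```ruby
require 'purplelight'

queue = Purplelight::ByteQueue.new(max_bytes: 1024)

producer = Thread.new do
  10.times do |i|
    payload = "document-#{i} " * 20
    queue.push(payload, bytes: payload.bytesize) # blocks once ~1 KiB is buffered
  end
  queue.close
end

consumer = Thread.new do
  while (item = queue.pop) # nil once the queue is closed and drained
    puts "got #{item.bytesize} bytes"
  end
end

[producer, consumer].each(&:join)
```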
data/lib/purplelight/snapshot.rb ADDED
@@ -0,0 +1,205 @@
+ # frozen_string_literal: true
+
+ require 'mongo'
+ require 'etc'
+ require 'fileutils'
+ require_relative 'partitioner'
+ require_relative 'queue'
+ require_relative 'writer_jsonl'
+ require_relative 'manifest'
+ require_relative 'errors'
+
+ module Purplelight
+   class Snapshot
+     DEFAULTS = {
+       format: :jsonl,
+       compression: :zstd,
+       batch_size: 2_000,
+       partitions: [Etc.respond_to?(:nprocessors) ? [Etc.nprocessors * 2, 4].max : 4, 32].min,
+       queue_size_bytes: 128 * 1024 * 1024,
+       rotate_bytes: 256 * 1024 * 1024,
+       read_concern: :majority,
+       read_preference: :primary,
+       no_cursor_timeout: true
+     }
+
+     def self.snapshot(**options)
+       new(**options).run
+     end
+
+     def initialize(client:, collection:, output:, format: DEFAULTS[:format], compression: DEFAULTS[:compression],
+                    partitions: DEFAULTS[:partitions], batch_size: DEFAULTS[:batch_size],
+                    queue_size_bytes: DEFAULTS[:queue_size_bytes], rotate_bytes: DEFAULTS[:rotate_bytes],
+                    query: {}, projection: nil, hint: nil, mapper: nil,
+                    resume: { enabled: true, state_path: nil, overwrite_incompatible: false },
+                    sharding: { mode: :by_size, part_bytes: DEFAULTS[:rotate_bytes], prefix: nil },
+                    logger: nil, on_progress: nil, read_concern: DEFAULTS[:read_concern], read_preference: DEFAULTS[:read_preference],
+                    no_cursor_timeout: DEFAULTS[:no_cursor_timeout])
+       @client = client
+       @collection = client[collection]
+       @output = output
+       @format = format.to_sym
+       @compression = compression.to_sym
+       @partitions = partitions
+       @batch_size = batch_size
+       @queue_size_bytes = queue_size_bytes
+       @rotate_bytes = rotate_bytes
+       @query = query || {}
+       @projection = projection
+       @hint = hint
+       @mapper = mapper
+       @resume = resume || { enabled: true }
+       @sharding = sharding || { mode: :by_size }
+       @logger = logger
+       @on_progress = on_progress
+       @read_concern = read_concern
+       @read_preference = read_preference
+       @no_cursor_timeout = no_cursor_timeout
+
+       @running = true
+     end
+
+     def run
+       dir, prefix = resolve_output(@output, @format)
+       manifest_path = File.join(dir, "#{prefix}.manifest.json")
+       query_digest = Manifest.query_digest(@query, @projection)
+
+       manifest = if @resume && @resume[:enabled] && File.exist?(manifest_path)
+         m = Manifest.load(manifest_path)
+         unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest)
+           if @resume[:overwrite_incompatible]
+             m = Manifest.new(path: manifest_path)
+           else
+             raise IncompatibleResumeError, "existing manifest incompatible with request; pass overwrite_incompatible: true to reset"
+           end
+         end
+         m
+       else
+         Manifest.new(path: manifest_path)
+       end
+
+       manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
+         partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes
+       })
+       manifest.ensure_partitions!(@partitions)
+
+       # Plan partitions
+       partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query, partitions: @partitions)
+
+       # Reader queue
+       queue = ByteQueue.new(max_bytes: @queue_size_bytes)
+
+       # Writer
+       case @format
+       when :jsonl
+         writer = WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest)
+       else
+         raise ArgumentError, "format not implemented: #{@format}"
+       end
+
+       # Start reader threads
+       readers = partition_filters.each_with_index.map do |pf, idx|
+         Thread.new do
+           read_partition(idx: idx, filter_spec: pf, queue: queue, batch_size: @batch_size, manifest: manifest)
+         end
+       end
+
+       # Writer loop
+       writer_thread = Thread.new do
+         loop do
+           batch = queue.pop
+           break if batch.nil?
+           writer.write_many(batch)
+         end
+       ensure
+         writer.close
+       end
+
+       progress_thread = Thread.new do
+         last = Time.now
+         loop do
+           sleep 2
+           break unless @running
+           @on_progress&.call({ queue_bytes: queue.size_bytes })
+         end
+       end
+
+       # Join readers
+       readers.each(&:join)
+       queue.close
+       writer_thread.join
+       @running = false
+       progress_thread.join
+       true
+     end
+
+     private
+
+     def resolve_output(output, format)
+       if File.directory?(output) || output.end_with?("/")
+         dir = output
+         prefix = @sharding[:prefix] || @collection.name
+       else
+         dir = File.dirname(output)
+         basename = File.basename(output)
+         prefix = basename.sub(/\.(jsonl|csv|parquet)(\.(zst|gz))?\z/, '')
+       end
+       FileUtils.mkdir_p(dir)
+       [dir, prefix]
+     end
+
+     def read_partition(idx:, filter_spec:, queue:, batch_size:, manifest:)
+       filter = filter_spec[:filter]
+       sort = filter_spec[:sort] || { _id: 1 }
+       hint = filter_spec[:hint] || { _id: 1 }
+
+       # Resume from checkpoint if present
+       checkpoint = manifest.partitions[idx] && manifest.partitions[idx]['last_id_exclusive']
+       if checkpoint
+         filter = filter.dup
+         filter['_id'] = (filter['_id'] || {}).merge({ '$gt' => checkpoint })
+       end
+
+       opts = { sort: sort, hint: hint }
+       opts[:projection] = @projection if @projection
+       opts[:batch_size] = batch_size if batch_size
+       opts[:no_cursor_timeout] = @no_cursor_timeout
+       opts[:read] = { mode: @read_preference }
+       opts[:read_concern] = @read_concern
+
+       cursor = @collection.find(filter, opts)
+
+       buffer = []
+       buffer_bytes = 0
+       last_id = checkpoint
+       begin
+         cursor.each do |doc|
+           last_id = doc['_id']
+           doc = @mapper.call(doc) if @mapper
+           json = Oj.dump(doc, mode: :compat)
+           bytes = json.bytesize + 1 # newline later
+           buffer << doc
+           buffer_bytes += bytes
+           if buffer.length >= batch_size || buffer_bytes >= 1_000_000
+             queue.push(buffer, bytes: buffer_bytes)
+             manifest.update_partition_checkpoint!(idx, last_id)
+             buffer = []
+             buffer_bytes = 0
+           end
+         end
+         unless buffer.empty?
+           queue.push(buffer, bytes: buffer_bytes)
+           manifest.update_partition_checkpoint!(idx, last_id)
+           buffer = []
+           buffer_bytes = 0
+         end
+         manifest.mark_partition_complete!(idx)
+       rescue => e
+         # Re-raise to fail the thread; could implement retry/backoff
+         raise e
+       end
+     end
+   end
+ end
+
+
data/lib/purplelight/version.rb ADDED
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+
+ module Purplelight
+   VERSION = "0.1.0"
+ end
+
+
data/lib/purplelight/writer_jsonl.rb ADDED
@@ -0,0 +1,119 @@
+ # frozen_string_literal: true
+
+ require 'oj'
+ require 'zlib'
+ require 'fileutils'
+
+ begin
+   require 'zstds'
+ rescue LoadError
+   # zstd not available; will fallback to gzip
+ end
+
+ module Purplelight
+   class WriterJSONL
+     DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024
+
+     def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil, manifest: nil)
+       @directory = directory
+       @prefix = prefix
+       @compression = compression
+       @rotate_bytes = rotate_bytes
+       @logger = logger
+       @manifest = manifest
+
+       @part_index = nil
+       @io = nil
+       @bytes_written = 0
+       @rows_written = 0
+       @file_seq = 0
+       @closed = false
+     end
+
+     def write_many(array_of_docs)
+       ensure_open!
+       buffer = array_of_docs.map { |doc| Oj.dump(doc, mode: :compat) + "\n" }.join
+       write_buffer(buffer)
+       @rows_written += array_of_docs.size
+       @manifest&.add_progress_to_part!(index: @part_index, rows_delta: array_of_docs.size, bytes_delta: buffer.bytesize)
+     end
+
+     def rotate_if_needed
+       return if @rotate_bytes.nil?
+       return if @bytes_written < @rotate_bytes
+       rotate!
+     end
+
+     def close
+       return if @closed
+       if @io
+         finalize_current_part!
+         @io.close
+       end
+       @closed = true
+     end
+
+     private
+
+     def ensure_open!
+       return if @io
+       FileUtils.mkdir_p(@directory)
+       path = next_part_path
+       @part_index = @manifest&.open_part!(path) if @manifest
+       raw = File.open(path, 'wb')
+       @io = build_compressed_io(raw)
+       @bytes_written = 0
+       @rows_written = 0
+     end
+
+     def build_compressed_io(raw)
+       case @compression.to_s
+       when 'zstd'
+         if defined?(ZSTDS)
+           # ZSTDS::Writer supports IO-like interface
+           return ZSTDS::Writer.open(raw, level: 10)
+         else
+           @logger&.warn("zstd not available, falling back to gzip")
+           return Zlib::GzipWriter.new(raw)
+         end
+       when 'gzip'
+         return Zlib::GzipWriter.new(raw)
+       when 'none'
+         return raw
+       else
+         raise ArgumentError, "unknown compression: #{@compression}"
+       end
+     end
+
+     def write_buffer(buffer)
+       @io.write(buffer)
+       @bytes_written += buffer.bytesize
+       rotate_if_needed
+     end
+
+     def rotate!
+       return unless @io
+       finalize_current_part!
+       @io.close
+       @io = nil
+       ensure_open!
+     end
+
+     def finalize_current_part!
+       @io.flush if @io.respond_to?(:flush)
+       # Could compute checksum here by re-reading, or maintain on the fly; omit for v1
+       @manifest&.complete_part!(index: @part_index, checksum: nil)
+       @file_seq += 1
+     end
+
+     def next_part_path
+       ext = 'jsonl'
+       filename = format("%s-part-%06d.%s", @prefix, @file_seq, ext)
+       filename += ".zst" if @compression.to_s == 'zstd' && defined?(ZSTDS)
+       filename += ".gz" if @compression.to_s == 'gzip' || (@compression.to_s == 'zstd' && !defined?(ZSTDS)) # gzip fallback keeps the extension honest
+       File.join(@directory, filename)
+     end
+   end
+ end
+
+
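The writer above can also be driven directly, without a manifest; a sketch using gzip so it does not depend on a zstd binding being installed, with an illustrative output directory:

```ruby
require 'purplelight'

writer = Purplelight::WriterJSONL.new(
  directory: '/tmp/purplelight-demo',
  prefix: 'demo',
  compression: :gzip,        # avoids requiring a zstd binding for this demo
  rotate_bytes: 1024 * 1024  # rotate to a new part after ~1 MiB
)
writer.write_many([{ 'name' => 'ada' }, { 'name' => 'grace' }])
writer.close
# -> /tmp/purplelight-demo/demo-part-000000.jsonl.gz
```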
data/lib/purplelight.rb ADDED
@@ -0,0 +1,16 @@
+ # frozen_string_literal: true
+
+ require_relative "purplelight/version"
+ require_relative "purplelight/errors"
+ require_relative "purplelight/manifest"
+ require_relative "purplelight/snapshot"
+
+ module Purplelight
+   # Convenience top-level API.
+   # See Purplelight::Snapshot for options.
+   def self.snapshot(**options)
+     Snapshot.snapshot(**options)
+   end
+ end
+
+
metadata ADDED
@@ -0,0 +1,109 @@
+ --- !ruby/object:Gem::Specification
+ name: purplelight
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Purplelight Authors
+ bindir: bin
+ cert_chain: []
+ date: 1980-01-02 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mongo
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '2.19'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '2.19'
+ - !ruby/object:Gem::Dependency
+   name: oj
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '3.16'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '3.16'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '3.12'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '3.12'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
+   multi-threaded readers, and size-based sharded outputs.
+ email:
+ - devnull@example.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - Rakefile
+ - lib/purplelight.rb
+ - lib/purplelight/errors.rb
+ - lib/purplelight/manifest.rb
+ - lib/purplelight/partitioner.rb
+ - lib/purplelight/queue.rb
+ - lib/purplelight/snapshot.rb
+ - lib/purplelight/version.rb
+ - lib/purplelight/writer_jsonl.rb
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/example/purplelight
+   source_code_uri: https://github.com/example/purplelight
+   changelog_uri: https://github.com/example/purplelight/releases
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '3.2'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.6.7
+ specification_version: 4
+ summary: Snapshot MongoDB collections efficiently to JSONL/CSV/Parquet
+ test_files: []