purplelight 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +2 -2
- data/bin/purplelight +19 -13
- data/lib/purplelight/errors.rb +0 -2
- data/lib/purplelight/manifest.rb +17 -11
- data/lib/purplelight/partitioner.rb +7 -6
- data/lib/purplelight/queue.rb +5 -9
- data/lib/purplelight/snapshot.rb +34 -28
- data/lib/purplelight/version.rb +1 -3
- data/lib/purplelight/writer_csv.rb +31 -29
- data/lib/purplelight/writer_jsonl.rb +27 -26
- data/lib/purplelight/writer_parquet.rb +15 -19
- data/lib/purplelight.rb +10 -8
- metadata +10 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5964231e634be4a743207679e349623c275e0b20771b492bbc54c4261238e352
|
4
|
+
data.tar.gz: 82448e1f4b5ffb8e9846938653b16a4d7008aa29e722d9417863b80362370168
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b8ac2ba82501978bcd1ae1b9db888f7dab387de76a7e473c5b0993a526398a727a029a0597a64c77bbbed976980e6adaad7a08f7e3967c9cfe6c7afa2d996ac
|
7
|
+
data.tar.gz: 15f536cfc05a0b70f7cdeb6bc7985e56b2e6dad6b2db53922e7aab12a821b8bb1294217390ff00c095b2cacb46c23e027d24d90a083e9e415a0df6e26fec9b59
|
data/README.md
CHANGED
data/Rakefile
CHANGED
data/bin/purplelight
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'optparse'
|
4
5
|
require 'json'
|
@@ -20,7 +21,7 @@ options = {
|
|
20
21
|
}
|
21
22
|
|
22
23
|
parser = OptionParser.new do |opts|
|
23
|
-
opts.banner =
|
24
|
+
opts.banner = 'Usage: purplelight snapshot [options]'
|
24
25
|
|
25
26
|
opts.on('-u', '--uri URI', 'MongoDB connection URI (required)') { |v| options[:uri] = v }
|
26
27
|
opts.on('-d', '--db NAME', 'Database name (required)') { |v| options[:db] = v }
|
@@ -30,19 +31,26 @@ parser = OptionParser.new do |opts|
|
|
30
31
|
opts.on('--compression NAME', 'Compression: zstd|gzip|none') { |v| options[:compression] = v.to_sym }
|
31
32
|
opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
|
32
33
|
opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
|
33
|
-
opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456')
|
34
|
+
opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
|
35
|
+
options[:sharding] = { mode: :by_size, part_bytes: v }
|
36
|
+
end
|
34
37
|
opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
|
35
38
|
opts.on('--prefix NAME', 'Output file prefix') do |v|
|
36
39
|
options[:sharding] ||= {}
|
37
40
|
options[:sharding][:prefix] = v
|
38
41
|
end
|
39
42
|
opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
|
40
|
-
opts.on('--read-preference MODE',
|
41
|
-
|
43
|
+
opts.on('--read-preference MODE',
|
44
|
+
'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
|
45
|
+
options[:read_preference] = v.to_sym
|
46
|
+
end
|
47
|
+
opts.on('--read-tags TAGS',
|
48
|
+
'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
|
42
49
|
tags = {}
|
43
50
|
v.split(',').each do |pair|
|
44
51
|
k, val = pair.split('=', 2)
|
45
52
|
next if k.nil? || val.nil?
|
53
|
+
|
46
54
|
tags[k] = val
|
47
55
|
end
|
48
56
|
options[:read_tags] = tags unless tags.empty?
|
@@ -67,16 +75,16 @@ rescue OptionParser::ParseError => e
|
|
67
75
|
end
|
68
76
|
|
69
77
|
%i[uri db collection output].each do |k|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
78
|
+
next unless options[k].nil? || options[k].to_s.empty?
|
79
|
+
|
80
|
+
warn "Missing required option: --#{k}"
|
81
|
+
warn parser
|
82
|
+
exit 1
|
75
83
|
end
|
76
84
|
|
77
85
|
effective_read = nil
|
78
86
|
if options[:read_tags]
|
79
|
-
effective_read = { mode:
|
87
|
+
effective_read = { mode: options[:read_preference] || :secondary, tag_sets: [options[:read_tags]] }
|
80
88
|
elsif options[:read_preference]
|
81
89
|
effective_read = { mode: options[:read_preference] }
|
82
90
|
end
|
@@ -101,9 +109,7 @@ ok = Purplelight.snapshot(
|
|
101
109
|
sharding: options[:sharding],
|
102
110
|
read_preference: effective_read || options[:read_preference],
|
103
111
|
resume: { enabled: true },
|
104
|
-
on_progress: ->(s) {
|
112
|
+
on_progress: ->(s) { warn("progress: #{s.to_json}") }
|
105
113
|
)
|
106
114
|
|
107
115
|
exit(ok ? 0 : 1)
|
108
|
-
|
109
|
-
|
data/lib/purplelight/errors.rb
CHANGED
data/lib/purplelight/manifest.rb
CHANGED
@@ -7,6 +7,11 @@ require 'digest'
|
|
7
7
|
require 'fileutils'
|
8
8
|
|
9
9
|
module Purplelight
|
10
|
+
# Manifest persists snapshot run metadata and progress to a JSON file.
|
11
|
+
#
|
12
|
+
# It records configuration, partition checkpoints, and per-part byte/row
|
13
|
+
# counts so interrupted runs can resume safely and completed runs are
|
14
|
+
# reproducible. Methods are thread-safe where mutation occurs.
|
10
15
|
class Manifest
|
11
16
|
DEFAULT_VERSION = 1
|
12
17
|
|
@@ -42,9 +47,9 @@ module Purplelight
|
|
42
47
|
|
43
48
|
def save!
|
44
49
|
dir = File.dirname(path)
|
45
|
-
FileUtils.mkdir_p(dir)
|
46
|
-
tmp = path
|
47
|
-
File.
|
50
|
+
FileUtils.mkdir_p(dir)
|
51
|
+
tmp = "#{path}.tmp"
|
52
|
+
File.write(tmp, JSON.pretty_generate(@data))
|
48
53
|
FileUtils.mv(tmp, path)
|
49
54
|
end
|
50
55
|
|
@@ -67,7 +72,9 @@ module Purplelight
|
|
67
72
|
def ensure_partitions!(count)
|
68
73
|
@mutex.synchronize do
|
69
74
|
if @data['partitions'].empty?
|
70
|
-
@data['partitions'] = Array.new(count)
|
75
|
+
@data['partitions'] = Array.new(count) do |i|
|
76
|
+
{ 'index' => i, 'last_id_exclusive' => nil, 'completed' => false }
|
77
|
+
end
|
71
78
|
save!
|
72
79
|
end
|
73
80
|
end
|
@@ -92,7 +99,8 @@ module Purplelight
|
|
92
99
|
def open_part!(path)
|
93
100
|
@mutex.synchronize do
|
94
101
|
idx = @data['parts'].size
|
95
|
-
@data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false,
|
102
|
+
@data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false,
|
103
|
+
'checksum' => nil }
|
96
104
|
save!
|
97
105
|
idx
|
98
106
|
end
|
@@ -128,12 +136,10 @@ module Purplelight
|
|
128
136
|
|
129
137
|
def save_maybe!(interval_seconds: 2.0)
|
130
138
|
now = Time.now
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
139
|
+
return unless (now - @last_save_at) >= interval_seconds
|
140
|
+
|
141
|
+
save!
|
142
|
+
@last_save_at = now
|
135
143
|
end
|
136
144
|
end
|
137
145
|
end
|
138
|
-
|
139
|
-
|
data/lib/purplelight/partitioner.rb
CHANGED
@@ -3,6 +3,11 @@
|
|
3
3
|
require 'mongo'
|
4
4
|
|
5
5
|
module Purplelight
|
6
|
+
# Partitioner builds MongoDB range filters to split work across workers.
|
7
|
+
#
|
8
|
+
# Given a Mongo collection and an optional base query, it returns N
|
9
|
+
# contiguous `_id` ranges that can be processed independently while
|
10
|
+
# maintaining ascending order. Optimized for ObjectId-based `_id`.
|
6
11
|
class Partitioner
|
7
12
|
# Builds contiguous _id range filters for N partitions.
|
8
13
|
# For ObjectId _id, we sample quantiles to split into near-equal document counts.
|
@@ -20,9 +25,7 @@ module Purplelight
|
|
20
25
|
cursor = base_query.projection(_id: 1).batch_size(1_000).no_cursor_timeout
|
21
26
|
i = 0
|
22
27
|
cursor.each do |doc|
|
23
|
-
if (i % step).zero?
|
24
|
-
boundaries << doc['_id']
|
25
|
-
end
|
28
|
+
boundaries << doc['_id'] if (i % step).zero?
|
26
29
|
i += 1
|
27
30
|
break if boundaries.size >= partitions
|
28
31
|
end
|
@@ -30,7 +33,7 @@ module Purplelight
|
|
30
33
|
ranges = []
|
31
34
|
prev = nil
|
32
35
|
boundaries.each_with_index do |b, idx|
|
33
|
-
if idx
|
36
|
+
if idx.zero?
|
34
37
|
prev = nil
|
35
38
|
next
|
36
39
|
end
|
@@ -80,5 +83,3 @@ module Purplelight
|
|
80
83
|
end
|
81
84
|
end
|
82
85
|
end
|
83
|
-
|
84
|
-
|
data/lib/purplelight/queue.rb
CHANGED
@@ -14,10 +14,9 @@ module Purplelight
|
|
14
14
|
|
15
15
|
def push(item, bytes:)
|
16
16
|
@mutex.synchronize do
|
17
|
-
raise
|
18
|
-
|
19
|
-
|
20
|
-
end
|
17
|
+
raise 'queue closed' if @closed
|
18
|
+
|
19
|
+
@cv.wait(@mutex) while (@bytes + bytes) > @max_bytes
|
21
20
|
@queue << [item, bytes]
|
22
21
|
@bytes += bytes
|
23
22
|
@cv.broadcast
|
@@ -27,9 +26,8 @@ module Purplelight
|
|
27
26
|
def pop
|
28
27
|
@mutex.synchronize do
|
29
28
|
while @queue.empty?
|
30
|
-
if @closed
|
31
|
-
|
32
|
-
end
|
29
|
+
return nil if @closed
|
30
|
+
|
33
31
|
@cv.wait(@mutex)
|
34
32
|
end
|
35
33
|
item, bytes = @queue.shift
|
@@ -51,5 +49,3 @@ module Purplelight
|
|
51
49
|
end
|
52
50
|
end
|
53
51
|
end
|
54
|
-
|
55
|
-
|
data/lib/purplelight/snapshot.rb
CHANGED
@@ -12,6 +12,7 @@ require_relative 'manifest'
|
|
12
12
|
require_relative 'errors'
|
13
13
|
|
14
14
|
module Purplelight
|
15
|
+
# Snapshot orchestrates partition planning, parallel reads, and writing.
|
15
16
|
class Snapshot
|
16
17
|
DEFAULTS = {
|
17
18
|
format: :jsonl,
|
@@ -23,10 +24,10 @@ module Purplelight
|
|
23
24
|
read_concern: { level: :majority },
|
24
25
|
read_preference: :primary,
|
25
26
|
no_cursor_timeout: true
|
26
|
-
}
|
27
|
+
}.freeze
|
27
28
|
|
28
|
-
def self.snapshot(
|
29
|
-
new(
|
29
|
+
def self.snapshot(...)
|
30
|
+
new(...).run
|
30
31
|
end
|
31
32
|
|
32
33
|
def initialize(client:, collection:, output:, format: DEFAULTS[:format], compression: DEFAULTS[:compression],
|
@@ -61,6 +62,7 @@ module Purplelight
|
|
61
62
|
@running = true
|
62
63
|
end
|
63
64
|
|
65
|
+
# rubocop:disable Naming/PredicateMethod
|
64
66
|
def run
|
65
67
|
dir, prefix = resolve_output(@output, @format)
|
66
68
|
manifest_path = File.join(dir, "#{prefix}.manifest.json")
|
@@ -68,11 +70,13 @@ module Purplelight
|
|
68
70
|
|
69
71
|
manifest = if @resume && @resume[:enabled] && File.exist?(manifest_path)
|
70
72
|
m = Manifest.load(manifest_path)
|
71
|
-
unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression,
|
73
|
+
unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression,
|
74
|
+
query_digest: query_digest)
|
72
75
|
if @resume[:overwrite_incompatible]
|
73
76
|
m = Manifest.new(path: manifest_path)
|
74
77
|
else
|
75
|
-
raise IncompatibleResumeError,
|
78
|
+
raise IncompatibleResumeError,
|
79
|
+
'existing manifest incompatible with request; pass overwrite_incompatible: true to reset'
|
76
80
|
end
|
77
81
|
end
|
78
82
|
m
|
@@ -81,12 +85,13 @@ module Purplelight
|
|
81
85
|
end
|
82
86
|
|
83
87
|
manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
|
84
|
-
|
85
|
-
|
88
|
+
partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes, hint: @hint
|
89
|
+
})
|
86
90
|
manifest.ensure_partitions!(@partitions)
|
87
91
|
|
88
92
|
# Plan partitions
|
89
|
-
partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query,
|
93
|
+
partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query,
|
94
|
+
partitions: @partitions)
|
90
95
|
|
91
96
|
# Reader queue
|
92
97
|
queue = ByteQueue.new(max_bytes: @queue_size_bytes)
|
@@ -94,13 +99,16 @@ module Purplelight
|
|
94
99
|
# Writer
|
95
100
|
writer = case @format
|
96
101
|
when :jsonl
|
97
|
-
WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression,
|
102
|
+
WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression,
|
103
|
+
rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest)
|
98
104
|
when :csv
|
99
|
-
single_file =
|
100
|
-
WriterCSV.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes,
|
105
|
+
single_file = @sharding && @sharding[:mode].to_s == 'single_file'
|
106
|
+
WriterCSV.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes,
|
107
|
+
logger: @logger, manifest: manifest, single_file: single_file)
|
101
108
|
when :parquet
|
102
|
-
single_file =
|
103
|
-
WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
|
109
|
+
single_file = @sharding && @sharding[:mode].to_s == 'single_file'
|
110
|
+
WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
|
111
|
+
manifest: manifest, single_file: single_file)
|
104
112
|
else
|
105
113
|
raise ArgumentError, "format not implemented: #{@format}"
|
106
114
|
end
|
@@ -117,6 +125,7 @@ module Purplelight
|
|
117
125
|
loop do
|
118
126
|
batch = queue.pop
|
119
127
|
break if batch.nil?
|
128
|
+
|
120
129
|
writer.write_many(batch)
|
121
130
|
end
|
122
131
|
ensure
|
@@ -124,10 +133,11 @@ module Purplelight
|
|
124
133
|
end
|
125
134
|
|
126
135
|
progress_thread = Thread.new do
|
127
|
-
|
136
|
+
Time.now
|
128
137
|
loop do
|
129
138
|
sleep 2
|
130
139
|
break unless @running
|
140
|
+
|
131
141
|
@on_progress&.call({ queue_bytes: queue.size_bytes })
|
132
142
|
end
|
133
143
|
end
|
@@ -140,11 +150,12 @@ module Purplelight
|
|
140
150
|
progress_thread.join
|
141
151
|
true
|
142
152
|
end
|
153
|
+
# rubocop:enable Naming/PredicateMethod
|
143
154
|
|
144
155
|
private
|
145
156
|
|
146
|
-
def resolve_output(output,
|
147
|
-
if File.directory?(output) || output.end_with?(
|
157
|
+
def resolve_output(output, _format)
|
158
|
+
if File.directory?(output) || output.end_with?('/')
|
148
159
|
dir = output
|
149
160
|
prefix = @sharding[:prefix] || @collection.name
|
150
161
|
else
|
@@ -192,7 +203,7 @@ module Purplelight
|
|
192
203
|
last_id = doc['_id']
|
193
204
|
doc = @mapper.call(doc) if @mapper
|
194
205
|
if encode_lines
|
195
|
-
line = Oj.dump(doc, mode: :compat)
|
206
|
+
line = "#{Oj.dump(doc, mode: :compat)}\n"
|
196
207
|
bytes = line.bytesize
|
197
208
|
buffer << line
|
198
209
|
else
|
@@ -201,12 +212,12 @@ module Purplelight
|
|
201
212
|
buffer << doc
|
202
213
|
end
|
203
214
|
buffer_bytes += bytes
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
215
|
+
next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
|
216
|
+
|
217
|
+
queue.push(buffer, bytes: buffer_bytes)
|
218
|
+
manifest.update_partition_checkpoint!(idx, last_id)
|
219
|
+
buffer = []
|
220
|
+
buffer_bytes = 0
|
210
221
|
end
|
211
222
|
unless buffer.empty?
|
212
223
|
queue.push(buffer, bytes: buffer_bytes)
|
@@ -215,12 +226,7 @@ module Purplelight
|
|
215
226
|
buffer_bytes = 0
|
216
227
|
end
|
217
228
|
manifest.mark_partition_complete!(idx)
|
218
|
-
rescue => e
|
219
|
-
# Re-raise to fail the thread; could implement retry/backoff
|
220
|
-
raise e
|
221
229
|
end
|
222
230
|
end
|
223
231
|
end
|
224
232
|
end
|
225
|
-
|
226
|
-
|
data/lib/purplelight/version.rb
CHANGED
data/lib/purplelight/writer_csv.rb
CHANGED
@@ -8,13 +8,16 @@ require 'fileutils'
|
|
8
8
|
begin
|
9
9
|
require 'zstds'
|
10
10
|
rescue LoadError
|
11
|
+
# zstd not available; fallback handled later via gzip
|
11
12
|
end
|
12
13
|
|
13
14
|
module Purplelight
|
15
|
+
# WriterCSV writes documents to CSV files with optional compression.
|
14
16
|
class WriterCSV
|
15
17
|
DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024
|
16
18
|
|
17
|
-
def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
|
19
|
+
def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
|
20
|
+
manifest: nil, single_file: false, columns: nil, headers: true)
|
18
21
|
@directory = directory
|
19
22
|
@prefix = prefix
|
20
23
|
@compression = compression
|
@@ -35,9 +38,9 @@ module Purplelight
|
|
35
38
|
@closed = false
|
36
39
|
|
37
40
|
@effective_compression = determine_effective_compression(@compression)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
+
return unless @effective_compression.to_s != @compression.to_s
|
42
|
+
|
43
|
+
@logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
|
41
44
|
end
|
42
45
|
|
43
46
|
def write_many(array_of_docs)
|
@@ -53,6 +56,7 @@ module Purplelight
|
|
53
56
|
|
54
57
|
array_of_docs.each do |doc|
|
55
58
|
next if doc.is_a?(String)
|
59
|
+
|
56
60
|
row = @columns.map { |k| extract_value(doc, k) }
|
57
61
|
@csv << row
|
58
62
|
@rows_written += 1
|
@@ -65,16 +69,17 @@ module Purplelight
|
|
65
69
|
def rotate_if_needed
|
66
70
|
return if @single_file
|
67
71
|
return if @rotate_bytes.nil?
|
72
|
+
|
68
73
|
raw_bytes = @io.respond_to?(:pos) ? @io.pos : @bytes_written
|
69
74
|
return if raw_bytes < @rotate_bytes
|
75
|
+
|
70
76
|
rotate!
|
71
77
|
end
|
72
78
|
|
73
79
|
def close
|
74
80
|
return if @closed
|
75
|
-
|
76
|
-
|
77
|
-
end
|
81
|
+
|
82
|
+
@csv&.flush
|
78
83
|
if @io
|
79
84
|
finalize_current_part!
|
80
85
|
@io.close
|
@@ -86,6 +91,7 @@ module Purplelight
|
|
86
91
|
|
87
92
|
def ensure_open!
|
88
93
|
return if @io
|
94
|
+
|
89
95
|
FileUtils.mkdir_p(@directory)
|
90
96
|
path = next_part_path
|
91
97
|
@part_index = @manifest&.open_part!(path) if @manifest
|
@@ -99,16 +105,15 @@ module Purplelight
|
|
99
105
|
def build_compressed_io(raw)
|
100
106
|
case @effective_compression.to_s
|
101
107
|
when 'zstd'
|
102
|
-
if defined?(ZSTDS)
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
end
|
108
|
+
return ZSTDS::Writer.open(raw, level: 10) if defined?(ZSTDS)
|
109
|
+
|
110
|
+
@logger&.warn('zstd gem not loaded; using gzip')
|
111
|
+
Zlib::GzipWriter.new(raw)
|
112
|
+
|
108
113
|
when 'gzip'
|
109
|
-
|
114
|
+
Zlib::GzipWriter.new(raw)
|
110
115
|
when 'none'
|
111
|
-
|
116
|
+
raw
|
112
117
|
else
|
113
118
|
raise ArgumentError, "unknown compression: #{@effective_compression}"
|
114
119
|
end
|
@@ -116,6 +121,7 @@ module Purplelight
|
|
116
121
|
|
117
122
|
def rotate!
|
118
123
|
return unless @io
|
124
|
+
|
119
125
|
finalize_current_part!
|
120
126
|
@io.close
|
121
127
|
@io = nil
|
@@ -131,26 +137,24 @@ module Purplelight
|
|
131
137
|
|
132
138
|
def next_part_path
|
133
139
|
ext = 'csv'
|
134
|
-
if @single_file
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
filename +=
|
140
|
-
filename +=
|
140
|
+
filename = if @single_file
|
141
|
+
format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
|
142
|
+
else
|
143
|
+
format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
|
144
|
+
end
|
145
|
+
filename += '.zst' if @effective_compression.to_s == 'zstd'
|
146
|
+
filename += '.gz' if @effective_compression.to_s == 'gzip'
|
141
147
|
File.join(@directory, filename)
|
142
148
|
end
|
143
149
|
|
144
150
|
def determine_effective_compression(requested)
|
145
151
|
case requested.to_s
|
146
152
|
when 'zstd'
|
147
|
-
|
148
|
-
when 'gzip'
|
149
|
-
return :gzip
|
153
|
+
(defined?(ZSTDS) ? :zstd : :gzip)
|
150
154
|
when 'none'
|
151
|
-
|
155
|
+
:none
|
152
156
|
else
|
153
|
-
|
157
|
+
:gzip
|
154
158
|
end
|
155
159
|
end
|
156
160
|
|
@@ -176,5 +180,3 @@ module Purplelight
|
|
176
180
|
end
|
177
181
|
end
|
178
182
|
end
|
179
|
-
|
180
|
-
|
data/lib/purplelight/writer_jsonl.rb
CHANGED
@@ -11,10 +11,12 @@ rescue LoadError
|
|
11
11
|
end
|
12
12
|
|
13
13
|
module Purplelight
|
14
|
+
# WriterJSONL writes newline-delimited JSON with optional compression.
|
14
15
|
class WriterJSONL
|
15
16
|
DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024
|
16
17
|
|
17
|
-
def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
|
18
|
+
def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
|
19
|
+
manifest: nil, compression_level: nil)
|
18
20
|
@directory = directory
|
19
21
|
@prefix = prefix
|
20
22
|
@compression = compression
|
@@ -31,21 +33,20 @@ module Purplelight
|
|
31
33
|
@closed = false
|
32
34
|
|
33
35
|
@effective_compression = determine_effective_compression(@compression)
|
34
|
-
|
35
|
-
|
36
|
-
|
36
|
+
return unless @effective_compression.to_s != @compression.to_s
|
37
|
+
|
38
|
+
@logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
|
37
39
|
end
|
38
40
|
|
39
41
|
def write_many(array_of_docs)
|
40
42
|
ensure_open!
|
41
43
|
# If upstream already produced newline-terminated strings, join fast.
|
42
|
-
if array_of_docs.first.is_a?(String)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
end
|
44
|
+
buffer = if array_of_docs.first.is_a?(String)
|
45
|
+
array_of_docs.join
|
46
|
+
else
|
47
|
+
array_of_docs.map { |doc| "#{Oj.dump(doc, mode: :compat)}\n" }.join
|
48
|
+
end
|
49
|
+
rows = array_of_docs.size
|
49
50
|
write_buffer(buffer)
|
50
51
|
@rows_written += rows
|
51
52
|
@manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: buffer.bytesize)
|
@@ -54,11 +55,13 @@ module Purplelight
|
|
54
55
|
def rotate_if_needed
|
55
56
|
return if @rotate_bytes.nil?
|
56
57
|
return if @bytes_written < @rotate_bytes
|
58
|
+
|
57
59
|
rotate!
|
58
60
|
end
|
59
61
|
|
60
62
|
def close
|
61
63
|
return if @closed
|
64
|
+
|
62
65
|
if @io
|
63
66
|
finalize_current_part!
|
64
67
|
@io.close
|
@@ -70,6 +73,7 @@ module Purplelight
|
|
70
73
|
|
71
74
|
def ensure_open!
|
72
75
|
return if @io
|
76
|
+
|
73
77
|
FileUtils.mkdir_p(@directory)
|
74
78
|
path = next_part_path
|
75
79
|
@part_index = @manifest&.open_part!(path) if @manifest
|
@@ -85,17 +89,17 @@ module Purplelight
|
|
85
89
|
if defined?(ZSTDS)
|
86
90
|
# ZSTDS::Writer supports IO-like interface
|
87
91
|
level = @compression_level || 3
|
88
|
-
|
92
|
+
ZSTDS::Writer.open(raw, level: level)
|
89
93
|
else
|
90
|
-
@logger&.warn(
|
94
|
+
@logger&.warn('zstd gem not loaded; this should have been handled earlier')
|
91
95
|
level = @compression_level || Zlib::DEFAULT_COMPRESSION
|
92
|
-
|
96
|
+
Zlib::GzipWriter.new(raw, level)
|
93
97
|
end
|
94
98
|
when 'gzip'
|
95
99
|
level = @compression_level || 1
|
96
|
-
|
100
|
+
Zlib::GzipWriter.new(raw, level)
|
97
101
|
when 'none'
|
98
|
-
|
102
|
+
raw
|
99
103
|
else
|
100
104
|
raise ArgumentError, "unknown compression: #{@compression}"
|
101
105
|
end
|
@@ -109,6 +113,7 @@ module Purplelight
|
|
109
113
|
|
110
114
|
def rotate!
|
111
115
|
return unless @io
|
116
|
+
|
112
117
|
finalize_current_part!
|
113
118
|
@io.close
|
114
119
|
@io = nil
|
@@ -124,25 +129,21 @@ module Purplelight
|
|
124
129
|
|
125
130
|
def next_part_path
|
126
131
|
ext = 'jsonl'
|
127
|
-
filename = format(
|
128
|
-
filename +=
|
129
|
-
filename +=
|
132
|
+
filename = format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
|
133
|
+
filename += '.zst' if @effective_compression.to_s == 'zstd'
|
134
|
+
filename += '.gz' if @effective_compression.to_s == 'gzip'
|
130
135
|
File.join(@directory, filename)
|
131
136
|
end
|
132
137
|
|
133
138
|
def determine_effective_compression(requested)
|
134
139
|
case requested.to_s
|
135
140
|
when 'zstd'
|
136
|
-
|
137
|
-
when 'gzip'
|
138
|
-
return :gzip
|
141
|
+
(defined?(ZSTDS) ? :zstd : :gzip)
|
139
142
|
when 'none'
|
140
|
-
|
143
|
+
:none
|
141
144
|
else
|
142
|
-
|
145
|
+
:gzip
|
143
146
|
end
|
144
147
|
end
|
145
148
|
end
|
146
149
|
end
|
147
|
-
|
148
|
-
|
data/lib/purplelight/writer_parquet.rb
CHANGED
@@ -10,10 +10,12 @@ end
|
|
10
10
|
require 'fileutils'
|
11
11
|
|
12
12
|
module Purplelight
|
13
|
+
# WriterParquet writes Parquet files via Apache Arrow when available.
|
13
14
|
class WriterParquet
|
14
15
|
DEFAULT_ROW_GROUP_SIZE = 10_000
|
15
16
|
|
16
|
-
def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
|
17
|
+
def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
|
18
|
+
manifest: nil, single_file: true, schema: nil)
|
17
19
|
@directory = directory
|
18
20
|
@prefix = prefix
|
19
21
|
@compression = compression
|
@@ -39,8 +41,9 @@ module Purplelight
|
|
39
41
|
|
40
42
|
def close
|
41
43
|
return if @closed
|
44
|
+
|
42
45
|
ensure_open!
|
43
|
-
|
46
|
+
unless @buffer_docs.empty?
|
44
47
|
table = build_table(@buffer_docs)
|
45
48
|
write_table(table, @writer_path, append: false)
|
46
49
|
end
|
@@ -51,9 +54,9 @@ module Purplelight
|
|
51
54
|
private
|
52
55
|
|
53
56
|
def ensure_dependencies!
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
+
return if defined?(Arrow) && defined?(Parquet)
|
58
|
+
|
59
|
+
raise ArgumentError, 'Parquet support requires gems: red-arrow and red-parquet. Add them to your Gemfile.'
|
57
60
|
end
|
58
61
|
|
59
62
|
def reset_buffers
|
@@ -64,6 +67,7 @@ module Purplelight
|
|
64
67
|
|
65
68
|
def ensure_open!
|
66
69
|
return if @writer_path
|
70
|
+
|
67
71
|
FileUtils.mkdir_p(@directory)
|
68
72
|
@writer_path = next_part_path
|
69
73
|
@part_index = @manifest&.open_part!(@writer_path) if @manifest
|
@@ -82,7 +86,7 @@ module Purplelight
|
|
82
86
|
Arrow::Table.new(columns)
|
83
87
|
end
|
84
88
|
|
85
|
-
def write_table(table, path, append: false)
|
89
|
+
def write_table(table, path, append: false) # rubocop:disable Lint/UnusedMethodArgument
|
86
90
|
# Prefer Arrow's save with explicit parquet format; compression defaults per build.
|
87
91
|
if table.respond_to?(:save)
|
88
92
|
table.save(path, format: :parquet)
|
@@ -95,7 +99,7 @@ module Purplelight
|
|
95
99
|
writer.close
|
96
100
|
return
|
97
101
|
end
|
98
|
-
raise
|
102
|
+
raise 'Parquet writer not available in this environment'
|
99
103
|
end
|
100
104
|
|
101
105
|
def finalize_current_part!
|
@@ -107,9 +111,9 @@ module Purplelight
|
|
107
111
|
def next_part_path
|
108
112
|
ext = 'parquet'
|
109
113
|
filename = if @single_file
|
110
|
-
format(
|
114
|
+
format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
|
111
115
|
else
|
112
|
-
format(
|
116
|
+
format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
|
113
117
|
end
|
114
118
|
File.join(@directory, filename)
|
115
119
|
end
|
@@ -117,21 +121,13 @@ module Purplelight
|
|
117
121
|
def infer_columns(docs)
|
118
122
|
keys = {}
|
119
123
|
docs.each do |d|
|
120
|
-
d.
|
124
|
+
d.each_key { |k| keys[k.to_s] = true }
|
121
125
|
end
|
122
126
|
keys.keys.sort
|
123
127
|
end
|
124
128
|
|
125
129
|
def extract_value(doc, key)
|
126
|
-
|
127
|
-
case val
|
128
|
-
when Time
|
129
|
-
val
|
130
|
-
else
|
131
|
-
val
|
132
|
-
end
|
130
|
+
doc[key] || doc[key.to_sym]
|
133
131
|
end
|
134
132
|
end
|
135
133
|
end
|
136
|
-
|
137
|
-
|
data/lib/purplelight.rb
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require_relative
|
6
|
-
require_relative
|
3
|
+
require_relative 'purplelight/version'
|
4
|
+
require_relative 'purplelight/errors'
|
5
|
+
require_relative 'purplelight/manifest'
|
6
|
+
require_relative 'purplelight/snapshot'
|
7
7
|
|
8
|
+
# Purplelight is a lightweight toolkit for extracting and snapshotting data.
|
9
|
+
#
|
10
|
+
# The top-level module exposes a convenience API entrypoint via `.snapshot`.
|
11
|
+
# See `Purplelight::Snapshot` for supported options and formats.
|
8
12
|
module Purplelight
|
9
13
|
# Convenience top-level API.
|
10
14
|
# See Purplelight::Snapshot for options.
|
11
|
-
def self.snapshot(
|
12
|
-
Snapshot.snapshot(
|
15
|
+
def self.snapshot(...)
|
16
|
+
Snapshot.snapshot(...)
|
13
17
|
end
|
14
18
|
end
|
15
|
-
|
16
|
-
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: purplelight
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alexander Nicholson
|
@@ -9,34 +9,6 @@ bindir: bin
|
|
9
9
|
cert_chain: []
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
|
-
- !ruby/object:Gem::Dependency
|
13
|
-
name: mongo
|
14
|
-
requirement: !ruby/object:Gem::Requirement
|
15
|
-
requirements:
|
16
|
-
- - ">="
|
17
|
-
- !ruby/object:Gem::Version
|
18
|
-
version: '2.19'
|
19
|
-
type: :runtime
|
20
|
-
prerelease: false
|
21
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
-
requirements:
|
23
|
-
- - ">="
|
24
|
-
- !ruby/object:Gem::Version
|
25
|
-
version: '2.19'
|
26
|
-
- !ruby/object:Gem::Dependency
|
27
|
-
name: oj
|
28
|
-
requirement: !ruby/object:Gem::Requirement
|
29
|
-
requirements:
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '3.16'
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: !ruby/object:Gem::Requirement
|
36
|
-
requirements:
|
37
|
-
- - ">="
|
38
|
-
- !ruby/object:Gem::Version
|
39
|
-
version: '3.16'
|
40
12
|
- !ruby/object:Gem::Dependency
|
41
13
|
name: csv
|
42
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,33 +38,33 @@ dependencies:
|
|
66
38
|
- !ruby/object:Gem::Version
|
67
39
|
version: '1.6'
|
68
40
|
- !ruby/object:Gem::Dependency
|
69
|
-
name:
|
41
|
+
name: mongo
|
70
42
|
requirement: !ruby/object:Gem::Requirement
|
71
43
|
requirements:
|
72
44
|
- - ">="
|
73
45
|
- !ruby/object:Gem::Version
|
74
|
-
version: '
|
75
|
-
type: :
|
46
|
+
version: '2.19'
|
47
|
+
type: :runtime
|
76
48
|
prerelease: false
|
77
49
|
version_requirements: !ruby/object:Gem::Requirement
|
78
50
|
requirements:
|
79
51
|
- - ">="
|
80
52
|
- !ruby/object:Gem::Version
|
81
|
-
version: '
|
53
|
+
version: '2.19'
|
82
54
|
- !ruby/object:Gem::Dependency
|
83
|
-
name:
|
55
|
+
name: oj
|
84
56
|
requirement: !ruby/object:Gem::Requirement
|
85
57
|
requirements:
|
86
58
|
- - ">="
|
87
59
|
- !ruby/object:Gem::Version
|
88
|
-
version: '
|
89
|
-
type: :
|
60
|
+
version: '3.16'
|
61
|
+
type: :runtime
|
90
62
|
prerelease: false
|
91
63
|
version_requirements: !ruby/object:Gem::Requirement
|
92
64
|
requirements:
|
93
65
|
- - ">="
|
94
66
|
- !ruby/object:Gem::Version
|
95
|
-
version: '
|
67
|
+
version: '3.16'
|
96
68
|
description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
|
97
69
|
multi-threaded readers, and size-based sharded outputs.
|
98
70
|
email:
|
@@ -118,6 +90,7 @@ files:
|
|
118
90
|
licenses:
|
119
91
|
- MIT
|
120
92
|
metadata:
|
93
|
+
rubygems_mfa_required: 'true'
|
121
94
|
homepage_uri: https://github.com/alexandernicholson/purplelight
|
122
95
|
source_code_uri: https://github.com/alexandernicholson/purplelight
|
123
96
|
changelog_uri: https://github.com/alexandernicholson/purplelight/releases
|