purplelight 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '07534009e367f28d3374708991cb870f5fa168ee11a95142af8d357885af7abc'
4
- data.tar.gz: e665d587dea94999326c0c42e88d2bcfd99bae01e305aee9e3051d3ddcd266e2
3
+ metadata.gz: 5964231e634be4a743207679e349623c275e0b20771b492bbc54c4261238e352
4
+ data.tar.gz: 82448e1f4b5ffb8e9846938653b16a4d7008aa29e722d9417863b80362370168
5
5
  SHA512:
6
- metadata.gz: e4cabf4d438a8afa0d00902aa07b01320013f4e8588630fb2d5c4f9b2432e1910ac94c19ccce78bdb396a415ac3ea949527b83d22eadfbc036520656c4273869
7
- data.tar.gz: b038e1fa40f36e985571019d7b4d7fe9c5013ea17314640f8d484ca9cbbb68292af08c5f91b1a006f34c473a24f3b83b03aece33d333685afb749437ac920ca4
6
+ metadata.gz: 4b8ac2ba82501978bcd1ae1b9db888f7dab387de76a7e473c5b0993a526398a727a029a0597a64c77bbbed976980e6adaad7a08f7e3967c9cfe6c7afa2d996ac
7
+ data.tar.gz: 15f536cfc05a0b70f7cdeb6bc7985e56b2e6dad6b2db53922e7aab12a821b8bb1294217390ff00c095b2cacb46c23e027d24d90a083e9e415a0df6e26fec9b59
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.2'
12
+ gem 'purplelight', '~> 0.1.3'
13
13
  ```
14
14
 
15
15
  Or install directly:
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rake/testtask'
2
4
 
3
5
  task default: [:spec]
@@ -10,5 +12,3 @@ rescue LoadError
10
12
  sh 'echo "RSpec not installed"'
11
13
  end
12
14
  end
13
-
14
-
data/bin/purplelight CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'optparse'
4
5
  require 'json'
@@ -20,7 +21,7 @@ options = {
20
21
  }
21
22
 
22
23
  parser = OptionParser.new do |opts|
23
- opts.banner = "Usage: purplelight snapshot [options]"
24
+ opts.banner = 'Usage: purplelight snapshot [options]'
24
25
 
25
26
  opts.on('-u', '--uri URI', 'MongoDB connection URI (required)') { |v| options[:uri] = v }
26
27
  opts.on('-d', '--db NAME', 'Database name (required)') { |v| options[:db] = v }
@@ -30,19 +31,26 @@ parser = OptionParser.new do |opts|
30
31
  opts.on('--compression NAME', 'Compression: zstd|gzip|none') { |v| options[:compression] = v.to_sym }
31
32
  opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
32
33
  opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
33
- opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') { |v| options[:sharding] = { mode: :by_size, part_bytes: v } }
34
+ opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
35
+ options[:sharding] = { mode: :by_size, part_bytes: v }
36
+ end
34
37
  opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
35
38
  opts.on('--prefix NAME', 'Output file prefix') do |v|
36
39
  options[:sharding] ||= {}
37
40
  options[:sharding][:prefix] = v
38
41
  end
39
42
  opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
40
- opts.on('--read-preference MODE', 'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') { |v| options[:read_preference] = v.to_sym }
41
- opts.on('--read-tags TAGS', 'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
43
+ opts.on('--read-preference MODE',
44
+ 'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
45
+ options[:read_preference] = v.to_sym
46
+ end
47
+ opts.on('--read-tags TAGS',
48
+ 'Comma-separated key=value list to target tagged nodes (e.g., nodeType=ANALYTICS,region=EAST)') do |v|
42
49
  tags = {}
43
50
  v.split(',').each do |pair|
44
51
  k, val = pair.split('=', 2)
45
52
  next if k.nil? || val.nil?
53
+
46
54
  tags[k] = val
47
55
  end
48
56
  options[:read_tags] = tags unless tags.empty?
@@ -67,16 +75,16 @@ rescue OptionParser::ParseError => e
67
75
  end
68
76
 
69
77
  %i[uri db collection output].each do |k|
70
- if options[k].nil? || options[k].to_s.empty?
71
- warn "Missing required option: --#{k}"
72
- warn parser
73
- exit 1
74
- end
78
+ next unless options[k].nil? || options[k].to_s.empty?
79
+
80
+ warn "Missing required option: --#{k}"
81
+ warn parser
82
+ exit 1
75
83
  end
76
84
 
77
85
  effective_read = nil
78
86
  if options[:read_tags]
79
- effective_read = { mode: (options[:read_preference] || :secondary), tag_sets: [options[:read_tags]] }
87
+ effective_read = { mode: options[:read_preference] || :secondary, tag_sets: [options[:read_tags]] }
80
88
  elsif options[:read_preference]
81
89
  effective_read = { mode: options[:read_preference] }
82
90
  end
@@ -101,9 +109,7 @@ ok = Purplelight.snapshot(
101
109
  sharding: options[:sharding],
102
110
  read_preference: effective_read || options[:read_preference],
103
111
  resume: { enabled: true },
104
- on_progress: ->(s) { $stderr.puts("progress: #{s.to_json}") }
112
+ on_progress: ->(s) { warn("progress: #{s.to_json}") }
105
113
  )
106
114
 
107
115
  exit(ok ? 0 : 1)
108
-
109
-
@@ -7,5 +7,3 @@ module Purplelight
7
7
  class OutputExistsError < Error; end
8
8
  class WriterClosedError < Error; end
9
9
  end
10
-
11
-
@@ -7,6 +7,11 @@ require 'digest'
7
7
  require 'fileutils'
8
8
 
9
9
  module Purplelight
10
+ # Manifest persists snapshot run metadata and progress to a JSON file.
11
+ #
12
+ # It records configuration, partition checkpoints, and per-part byte/row
13
+ # counts so interrupted runs can resume safely and completed runs are
14
+ # reproducible. Methods are thread-safe where mutation occurs.
10
15
  class Manifest
11
16
  DEFAULT_VERSION = 1
12
17
 
@@ -42,9 +47,9 @@ module Purplelight
42
47
 
43
48
  def save!
44
49
  dir = File.dirname(path)
45
- FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
46
- tmp = path + '.tmp'
47
- File.open(tmp, 'w') { |f| f.write(JSON.pretty_generate(@data)) }
50
+ FileUtils.mkdir_p(dir)
51
+ tmp = "#{path}.tmp"
52
+ File.write(tmp, JSON.pretty_generate(@data))
48
53
  FileUtils.mv(tmp, path)
49
54
  end
50
55
 
@@ -67,7 +72,9 @@ module Purplelight
67
72
  def ensure_partitions!(count)
68
73
  @mutex.synchronize do
69
74
  if @data['partitions'].empty?
70
- @data['partitions'] = Array.new(count) { |i| { 'index' => i, 'last_id_exclusive' => nil, 'completed' => false } }
75
+ @data['partitions'] = Array.new(count) do |i|
76
+ { 'index' => i, 'last_id_exclusive' => nil, 'completed' => false }
77
+ end
71
78
  save!
72
79
  end
73
80
  end
@@ -92,7 +99,8 @@ module Purplelight
92
99
  def open_part!(path)
93
100
  @mutex.synchronize do
94
101
  idx = @data['parts'].size
95
- @data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false, 'checksum' => nil }
102
+ @data['parts'] << { 'index' => idx, 'path' => path, 'bytes' => 0, 'rows' => 0, 'complete' => false,
103
+ 'checksum' => nil }
96
104
  save!
97
105
  idx
98
106
  end
@@ -128,12 +136,10 @@ module Purplelight
128
136
 
129
137
  def save_maybe!(interval_seconds: 2.0)
130
138
  now = Time.now
131
- if (now - @last_save_at) >= interval_seconds
132
- save!
133
- @last_save_at = now
134
- end
139
+ return unless (now - @last_save_at) >= interval_seconds
140
+
141
+ save!
142
+ @last_save_at = now
135
143
  end
136
144
  end
137
145
  end
138
-
139
-
@@ -3,6 +3,11 @@
3
3
  require 'mongo'
4
4
 
5
5
  module Purplelight
6
+ # Partitioner builds MongoDB range filters to split work across workers.
7
+ #
8
+ # Given a Mongo collection and an optional base query, it returns N
9
+ # contiguous `_id` ranges that can be processed independently while
10
+ # maintaining ascending order. Optimized for ObjectId-based `_id`.
6
11
  class Partitioner
7
12
  # Builds contiguous _id range filters for N partitions.
8
13
  # For ObjectId _id, we sample quantiles to split into near-equal document counts.
@@ -20,9 +25,7 @@ module Purplelight
20
25
  cursor = base_query.projection(_id: 1).batch_size(1_000).no_cursor_timeout
21
26
  i = 0
22
27
  cursor.each do |doc|
23
- if (i % step).zero?
24
- boundaries << doc['_id']
25
- end
28
+ boundaries << doc['_id'] if (i % step).zero?
26
29
  i += 1
27
30
  break if boundaries.size >= partitions
28
31
  end
@@ -30,7 +33,7 @@ module Purplelight
30
33
  ranges = []
31
34
  prev = nil
32
35
  boundaries.each_with_index do |b, idx|
33
- if idx == 0
36
+ if idx.zero?
34
37
  prev = nil
35
38
  next
36
39
  end
@@ -80,5 +83,3 @@ module Purplelight
80
83
  end
81
84
  end
82
85
  end
83
-
84
-
@@ -14,10 +14,9 @@ module Purplelight
14
14
 
15
15
  def push(item, bytes:)
16
16
  @mutex.synchronize do
17
- raise "queue closed" if @closed
18
- while (@bytes + bytes) > @max_bytes
19
- @cv.wait(@mutex)
20
- end
17
+ raise 'queue closed' if @closed
18
+
19
+ @cv.wait(@mutex) while (@bytes + bytes) > @max_bytes
21
20
  @queue << [item, bytes]
22
21
  @bytes += bytes
23
22
  @cv.broadcast
@@ -27,9 +26,8 @@ module Purplelight
27
26
  def pop
28
27
  @mutex.synchronize do
29
28
  while @queue.empty?
30
- if @closed
31
- return nil
32
- end
29
+ return nil if @closed
30
+
33
31
  @cv.wait(@mutex)
34
32
  end
35
33
  item, bytes = @queue.shift
@@ -51,5 +49,3 @@ module Purplelight
51
49
  end
52
50
  end
53
51
  end
54
-
55
-
@@ -12,6 +12,7 @@ require_relative 'manifest'
12
12
  require_relative 'errors'
13
13
 
14
14
  module Purplelight
15
+ # Snapshot orchestrates partition planning, parallel reads, and writing.
15
16
  class Snapshot
16
17
  DEFAULTS = {
17
18
  format: :jsonl,
@@ -23,10 +24,10 @@ module Purplelight
23
24
  read_concern: { level: :majority },
24
25
  read_preference: :primary,
25
26
  no_cursor_timeout: true
26
- }
27
+ }.freeze
27
28
 
28
- def self.snapshot(**options)
29
- new(**options).run
29
+ def self.snapshot(...)
30
+ new(...).run
30
31
  end
31
32
 
32
33
  def initialize(client:, collection:, output:, format: DEFAULTS[:format], compression: DEFAULTS[:compression],
@@ -61,6 +62,7 @@ module Purplelight
61
62
  @running = true
62
63
  end
63
64
 
65
+ # rubocop:disable Naming/PredicateMethod
64
66
  def run
65
67
  dir, prefix = resolve_output(@output, @format)
66
68
  manifest_path = File.join(dir, "#{prefix}.manifest.json")
@@ -68,11 +70,13 @@ module Purplelight
68
70
 
69
71
  manifest = if @resume && @resume[:enabled] && File.exist?(manifest_path)
70
72
  m = Manifest.load(manifest_path)
71
- unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest)
73
+ unless m.compatible_with?(collection: @collection.name, format: @format, compression: @compression,
74
+ query_digest: query_digest)
72
75
  if @resume[:overwrite_incompatible]
73
76
  m = Manifest.new(path: manifest_path)
74
77
  else
75
- raise IncompatibleResumeError, "existing manifest incompatible with request; pass overwrite_incompatible: true to reset"
78
+ raise IncompatibleResumeError,
79
+ 'existing manifest incompatible with request; pass overwrite_incompatible: true to reset'
76
80
  end
77
81
  end
78
82
  m
@@ -81,12 +85,13 @@ module Purplelight
81
85
  end
82
86
 
83
87
  manifest.configure!(collection: @collection.name, format: @format, compression: @compression, query_digest: query_digest, options: {
84
- partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes, hint: @hint
85
- })
88
+ partitions: @partitions, batch_size: @batch_size, rotate_bytes: @rotate_bytes, hint: @hint
89
+ })
86
90
  manifest.ensure_partitions!(@partitions)
87
91
 
88
92
  # Plan partitions
89
- partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query, partitions: @partitions)
93
+ partition_filters = Partitioner.object_id_partitions(collection: @collection, query: @query,
94
+ partitions: @partitions)
90
95
 
91
96
  # Reader queue
92
97
  queue = ByteQueue.new(max_bytes: @queue_size_bytes)
@@ -94,13 +99,16 @@ module Purplelight
94
99
  # Writer
95
100
  writer = case @format
96
101
  when :jsonl
97
- WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest)
102
+ WriterJSONL.new(directory: dir, prefix: prefix, compression: @compression,
103
+ rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest)
98
104
  when :csv
99
- single_file = (@sharding && @sharding[:mode].to_s == 'single_file')
100
- WriterCSV.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes, logger: @logger, manifest: manifest, single_file: single_file)
105
+ single_file = @sharding && @sharding[:mode].to_s == 'single_file'
106
+ WriterCSV.new(directory: dir, prefix: prefix, compression: @compression, rotate_bytes: @rotate_bytes,
107
+ logger: @logger, manifest: manifest, single_file: single_file)
101
108
  when :parquet
102
- single_file = (@sharding && @sharding[:mode].to_s == 'single_file')
103
- WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger, manifest: manifest, single_file: single_file)
109
+ single_file = @sharding && @sharding[:mode].to_s == 'single_file'
110
+ WriterParquet.new(directory: dir, prefix: prefix, compression: @compression, logger: @logger,
111
+ manifest: manifest, single_file: single_file)
104
112
  else
105
113
  raise ArgumentError, "format not implemented: #{@format}"
106
114
  end
@@ -117,6 +125,7 @@ module Purplelight
117
125
  loop do
118
126
  batch = queue.pop
119
127
  break if batch.nil?
128
+
120
129
  writer.write_many(batch)
121
130
  end
122
131
  ensure
@@ -124,10 +133,11 @@ module Purplelight
124
133
  end
125
134
 
126
135
  progress_thread = Thread.new do
127
- last = Time.now
136
+ Time.now
128
137
  loop do
129
138
  sleep 2
130
139
  break unless @running
140
+
131
141
  @on_progress&.call({ queue_bytes: queue.size_bytes })
132
142
  end
133
143
  end
@@ -140,11 +150,12 @@ module Purplelight
140
150
  progress_thread.join
141
151
  true
142
152
  end
153
+ # rubocop:enable Naming/PredicateMethod
143
154
 
144
155
  private
145
156
 
146
- def resolve_output(output, format)
147
- if File.directory?(output) || output.end_with?("/")
157
+ def resolve_output(output, _format)
158
+ if File.directory?(output) || output.end_with?('/')
148
159
  dir = output
149
160
  prefix = @sharding[:prefix] || @collection.name
150
161
  else
@@ -192,7 +203,7 @@ module Purplelight
192
203
  last_id = doc['_id']
193
204
  doc = @mapper.call(doc) if @mapper
194
205
  if encode_lines
195
- line = Oj.dump(doc, mode: :compat) + "\n"
206
+ line = "#{Oj.dump(doc, mode: :compat)}\n"
196
207
  bytes = line.bytesize
197
208
  buffer << line
198
209
  else
@@ -201,12 +212,12 @@ module Purplelight
201
212
  buffer << doc
202
213
  end
203
214
  buffer_bytes += bytes
204
- if buffer.length >= batch_size || buffer_bytes >= 1_000_000
205
- queue.push(buffer, bytes: buffer_bytes)
206
- manifest.update_partition_checkpoint!(idx, last_id)
207
- buffer = []
208
- buffer_bytes = 0
209
- end
215
+ next unless buffer.length >= batch_size || buffer_bytes >= 1_000_000
216
+
217
+ queue.push(buffer, bytes: buffer_bytes)
218
+ manifest.update_partition_checkpoint!(idx, last_id)
219
+ buffer = []
220
+ buffer_bytes = 0
210
221
  end
211
222
  unless buffer.empty?
212
223
  queue.push(buffer, bytes: buffer_bytes)
@@ -215,12 +226,7 @@ module Purplelight
215
226
  buffer_bytes = 0
216
227
  end
217
228
  manifest.mark_partition_complete!(idx)
218
- rescue => e
219
- # Re-raise to fail the thread; could implement retry/backoff
220
- raise e
221
229
  end
222
230
  end
223
231
  end
224
232
  end
225
-
226
-
@@ -1,7 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = "0.1.2"
4
+ VERSION = '0.1.3'
5
5
  end
6
-
7
-
@@ -8,13 +8,16 @@ require 'fileutils'
8
8
  begin
9
9
  require 'zstds'
10
10
  rescue LoadError
11
+ # zstd not available; fallback handled later via gzip
11
12
  end
12
13
 
13
14
  module Purplelight
15
+ # WriterCSV writes documents to CSV files with optional compression.
14
16
  class WriterCSV
15
17
  DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024
16
18
 
17
- def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil, manifest: nil, single_file: false, columns: nil, headers: true)
19
+ def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
20
+ manifest: nil, single_file: false, columns: nil, headers: true)
18
21
  @directory = directory
19
22
  @prefix = prefix
20
23
  @compression = compression
@@ -35,9 +38,9 @@ module Purplelight
35
38
  @closed = false
36
39
 
37
40
  @effective_compression = determine_effective_compression(@compression)
38
- if @effective_compression.to_s != @compression.to_s
39
- @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
40
- end
41
+ return unless @effective_compression.to_s != @compression.to_s
42
+
43
+ @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
41
44
  end
42
45
 
43
46
  def write_many(array_of_docs)
@@ -53,6 +56,7 @@ module Purplelight
53
56
 
54
57
  array_of_docs.each do |doc|
55
58
  next if doc.is_a?(String)
59
+
56
60
  row = @columns.map { |k| extract_value(doc, k) }
57
61
  @csv << row
58
62
  @rows_written += 1
@@ -65,16 +69,17 @@ module Purplelight
65
69
  def rotate_if_needed
66
70
  return if @single_file
67
71
  return if @rotate_bytes.nil?
72
+
68
73
  raw_bytes = @io.respond_to?(:pos) ? @io.pos : @bytes_written
69
74
  return if raw_bytes < @rotate_bytes
75
+
70
76
  rotate!
71
77
  end
72
78
 
73
79
  def close
74
80
  return if @closed
75
- if @csv
76
- @csv.flush
77
- end
81
+
82
+ @csv&.flush
78
83
  if @io
79
84
  finalize_current_part!
80
85
  @io.close
@@ -86,6 +91,7 @@ module Purplelight
86
91
 
87
92
  def ensure_open!
88
93
  return if @io
94
+
89
95
  FileUtils.mkdir_p(@directory)
90
96
  path = next_part_path
91
97
  @part_index = @manifest&.open_part!(path) if @manifest
@@ -99,16 +105,15 @@ module Purplelight
99
105
  def build_compressed_io(raw)
100
106
  case @effective_compression.to_s
101
107
  when 'zstd'
102
- if defined?(ZSTDS)
103
- return ZSTDS::Writer.open(raw, level: 10)
104
- else
105
- @logger&.warn("zstd gem not loaded; using gzip")
106
- return Zlib::GzipWriter.new(raw)
107
- end
108
+ return ZSTDS::Writer.open(raw, level: 10) if defined?(ZSTDS)
109
+
110
+ @logger&.warn('zstd gem not loaded; using gzip')
111
+ Zlib::GzipWriter.new(raw)
112
+
108
113
  when 'gzip'
109
- return Zlib::GzipWriter.new(raw)
114
+ Zlib::GzipWriter.new(raw)
110
115
  when 'none'
111
- return raw
116
+ raw
112
117
  else
113
118
  raise ArgumentError, "unknown compression: #{@effective_compression}"
114
119
  end
@@ -116,6 +121,7 @@ module Purplelight
116
121
 
117
122
  def rotate!
118
123
  return unless @io
124
+
119
125
  finalize_current_part!
120
126
  @io.close
121
127
  @io = nil
@@ -131,26 +137,24 @@ module Purplelight
131
137
 
132
138
  def next_part_path
133
139
  ext = 'csv'
134
- if @single_file
135
- filename = format("%s.%s", @prefix, ext)
136
- else
137
- filename = format("%s-part-%06d.%s", @prefix, @file_seq, ext)
138
- end
139
- filename += ".zst" if @effective_compression.to_s == 'zstd'
140
- filename += ".gz" if @effective_compression.to_s == 'gzip'
140
+ filename = if @single_file
141
+ format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
142
+ else
143
+ format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
144
+ end
145
+ filename += '.zst' if @effective_compression.to_s == 'zstd'
146
+ filename += '.gz' if @effective_compression.to_s == 'gzip'
141
147
  File.join(@directory, filename)
142
148
  end
143
149
 
144
150
  def determine_effective_compression(requested)
145
151
  case requested.to_s
146
152
  when 'zstd'
147
- return (defined?(ZSTDS) ? :zstd : :gzip)
148
- when 'gzip'
149
- return :gzip
153
+ (defined?(ZSTDS) ? :zstd : :gzip)
150
154
  when 'none'
151
- return :none
155
+ :none
152
156
  else
153
- return :gzip
157
+ :gzip
154
158
  end
155
159
  end
156
160
 
@@ -176,5 +180,3 @@ module Purplelight
176
180
  end
177
181
  end
178
182
  end
179
-
180
-
@@ -11,10 +11,12 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Purplelight
14
+ # WriterJSONL writes newline-delimited JSON with optional compression.
14
15
  class WriterJSONL
15
16
  DEFAULT_ROTATE_BYTES = 256 * 1024 * 1024
16
17
 
17
- def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil, manifest: nil, compression_level: nil)
18
+ def initialize(directory:, prefix:, compression: :zstd, rotate_bytes: DEFAULT_ROTATE_BYTES, logger: nil,
19
+ manifest: nil, compression_level: nil)
18
20
  @directory = directory
19
21
  @prefix = prefix
20
22
  @compression = compression
@@ -31,21 +33,20 @@ module Purplelight
31
33
  @closed = false
32
34
 
33
35
  @effective_compression = determine_effective_compression(@compression)
34
- if @effective_compression.to_s != @compression.to_s
35
- @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
36
- end
36
+ return unless @effective_compression.to_s != @compression.to_s
37
+
38
+ @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
37
39
  end
38
40
 
39
41
  def write_many(array_of_docs)
40
42
  ensure_open!
41
43
  # If upstream already produced newline-terminated strings, join fast.
42
- if array_of_docs.first.is_a?(String)
43
- buffer = array_of_docs.join
44
- rows = array_of_docs.size
45
- else
46
- buffer = array_of_docs.map { |doc| Oj.dump(doc, mode: :compat) + "\n" }.join
47
- rows = array_of_docs.size
48
- end
44
+ buffer = if array_of_docs.first.is_a?(String)
45
+ array_of_docs.join
46
+ else
47
+ array_of_docs.map { |doc| "#{Oj.dump(doc, mode: :compat)}\n" }.join
48
+ end
49
+ rows = array_of_docs.size
49
50
  write_buffer(buffer)
50
51
  @rows_written += rows
51
52
  @manifest&.add_progress_to_part!(index: @part_index, rows_delta: rows, bytes_delta: buffer.bytesize)
@@ -54,11 +55,13 @@ module Purplelight
54
55
  def rotate_if_needed
55
56
  return if @rotate_bytes.nil?
56
57
  return if @bytes_written < @rotate_bytes
58
+
57
59
  rotate!
58
60
  end
59
61
 
60
62
  def close
61
63
  return if @closed
64
+
62
65
  if @io
63
66
  finalize_current_part!
64
67
  @io.close
@@ -70,6 +73,7 @@ module Purplelight
70
73
 
71
74
  def ensure_open!
72
75
  return if @io
76
+
73
77
  FileUtils.mkdir_p(@directory)
74
78
  path = next_part_path
75
79
  @part_index = @manifest&.open_part!(path) if @manifest
@@ -85,17 +89,17 @@ module Purplelight
85
89
  if defined?(ZSTDS)
86
90
  # ZSTDS::Writer supports IO-like interface
87
91
  level = @compression_level || 3
88
- return ZSTDS::Writer.open(raw, level: level)
92
+ ZSTDS::Writer.open(raw, level: level)
89
93
  else
90
- @logger&.warn("zstd gem not loaded; this should have been handled earlier")
94
+ @logger&.warn('zstd gem not loaded; this should have been handled earlier')
91
95
  level = @compression_level || Zlib::DEFAULT_COMPRESSION
92
- return Zlib::GzipWriter.new(raw, level)
96
+ Zlib::GzipWriter.new(raw, level)
93
97
  end
94
98
  when 'gzip'
95
99
  level = @compression_level || 1
96
- return Zlib::GzipWriter.new(raw, level)
100
+ Zlib::GzipWriter.new(raw, level)
97
101
  when 'none'
98
- return raw
102
+ raw
99
103
  else
100
104
  raise ArgumentError, "unknown compression: #{@compression}"
101
105
  end
@@ -109,6 +113,7 @@ module Purplelight
109
113
 
110
114
  def rotate!
111
115
  return unless @io
116
+
112
117
  finalize_current_part!
113
118
  @io.close
114
119
  @io = nil
@@ -124,25 +129,21 @@ module Purplelight
124
129
 
125
130
  def next_part_path
126
131
  ext = 'jsonl'
127
- filename = format("%s-part-%06d.%s", @prefix, @file_seq, ext)
128
- filename += ".zst" if @effective_compression.to_s == 'zstd'
129
- filename += ".gz" if @effective_compression.to_s == 'gzip'
132
+ filename = format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
133
+ filename += '.zst' if @effective_compression.to_s == 'zstd'
134
+ filename += '.gz' if @effective_compression.to_s == 'gzip'
130
135
  File.join(@directory, filename)
131
136
  end
132
137
 
133
138
  def determine_effective_compression(requested)
134
139
  case requested.to_s
135
140
  when 'zstd'
136
- return (defined?(ZSTDS) ? :zstd : :gzip)
137
- when 'gzip'
138
- return :gzip
141
+ (defined?(ZSTDS) ? :zstd : :gzip)
139
142
  when 'none'
140
- return :none
143
+ :none
141
144
  else
142
- return :gzip
145
+ :gzip
143
146
  end
144
147
  end
145
148
  end
146
149
  end
147
-
148
-
@@ -10,10 +10,12 @@ end
10
10
  require 'fileutils'
11
11
 
12
12
  module Purplelight
13
+ # WriterParquet writes Parquet files via Apache Arrow when available.
13
14
  class WriterParquet
14
15
  DEFAULT_ROW_GROUP_SIZE = 10_000
15
16
 
16
- def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil, manifest: nil, single_file: true, schema: nil)
17
+ def initialize(directory:, prefix:, compression: :zstd, row_group_size: DEFAULT_ROW_GROUP_SIZE, logger: nil,
18
+ manifest: nil, single_file: true, schema: nil)
17
19
  @directory = directory
18
20
  @prefix = prefix
19
21
  @compression = compression
@@ -39,8 +41,9 @@ module Purplelight
39
41
 
40
42
  def close
41
43
  return if @closed
44
+
42
45
  ensure_open!
43
- if !@buffer_docs.empty?
46
+ unless @buffer_docs.empty?
44
47
  table = build_table(@buffer_docs)
45
48
  write_table(table, @writer_path, append: false)
46
49
  end
@@ -51,9 +54,9 @@ module Purplelight
51
54
  private
52
55
 
53
56
  def ensure_dependencies!
54
- unless defined?(Arrow) && defined?(Parquet)
55
- raise ArgumentError, "Parquet support requires gems: red-arrow and red-parquet. Add them to your Gemfile."
56
- end
57
+ return if defined?(Arrow) && defined?(Parquet)
58
+
59
+ raise ArgumentError, 'Parquet support requires gems: red-arrow and red-parquet. Add them to your Gemfile.'
57
60
  end
58
61
 
59
62
  def reset_buffers
@@ -64,6 +67,7 @@ module Purplelight
64
67
 
65
68
  def ensure_open!
66
69
  return if @writer_path
70
+
67
71
  FileUtils.mkdir_p(@directory)
68
72
  @writer_path = next_part_path
69
73
  @part_index = @manifest&.open_part!(@writer_path) if @manifest
@@ -82,7 +86,7 @@ module Purplelight
82
86
  Arrow::Table.new(columns)
83
87
  end
84
88
 
85
- def write_table(table, path, append: false)
89
+ def write_table(table, path, append: false) # rubocop:disable Lint/UnusedMethodArgument
86
90
  # Prefer Arrow's save with explicit parquet format; compression defaults per build.
87
91
  if table.respond_to?(:save)
88
92
  table.save(path, format: :parquet)
@@ -95,7 +99,7 @@ module Purplelight
95
99
  writer.close
96
100
  return
97
101
  end
98
- raise "Parquet writer not available in this environment"
102
+ raise 'Parquet writer not available in this environment'
99
103
  end
100
104
 
101
105
  def finalize_current_part!
@@ -107,9 +111,9 @@ module Purplelight
107
111
  def next_part_path
108
112
  ext = 'parquet'
109
113
  filename = if @single_file
110
- format("%s.%s", @prefix, ext)
114
+ format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
111
115
  else
112
- format("%s-part-%06d.%s", @prefix, @file_seq, ext)
116
+ format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
113
117
  end
114
118
  File.join(@directory, filename)
115
119
  end
@@ -117,21 +121,13 @@ module Purplelight
117
121
  def infer_columns(docs)
118
122
  keys = {}
119
123
  docs.each do |d|
120
- d.keys.each { |k| keys[k.to_s] = true }
124
+ d.each_key { |k| keys[k.to_s] = true }
121
125
  end
122
126
  keys.keys.sort
123
127
  end
124
128
 
125
129
  def extract_value(doc, key)
126
- val = doc[key] || doc[key.to_sym]
127
- case val
128
- when Time
129
- val
130
- else
131
- val
132
- end
130
+ doc[key] || doc[key.to_sym]
133
131
  end
134
132
  end
135
133
  end
136
-
137
-
data/lib/purplelight.rb CHANGED
@@ -1,16 +1,18 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "purplelight/version"
4
- require_relative "purplelight/errors"
5
- require_relative "purplelight/manifest"
6
- require_relative "purplelight/snapshot"
3
+ require_relative 'purplelight/version'
4
+ require_relative 'purplelight/errors'
5
+ require_relative 'purplelight/manifest'
6
+ require_relative 'purplelight/snapshot'
7
7
 
8
+ # Purplelight is a lightweight toolkit for extracting and snapshotting data.
9
+ #
10
+ # The top-level module exposes a convenience API entrypoint via `.snapshot`.
11
+ # See `Purplelight::Snapshot` for supported options and formats.
8
12
  module Purplelight
9
13
  # Convenience top-level API.
10
14
  # See Purplelight::Snapshot for options.
11
- def self.snapshot(**options)
12
- Snapshot.snapshot(**options)
15
+ def self.snapshot(...)
16
+ Snapshot.snapshot(...)
13
17
  end
14
18
  end
15
-
16
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson
@@ -9,34 +9,6 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
- - !ruby/object:Gem::Dependency
13
- name: mongo
14
- requirement: !ruby/object:Gem::Requirement
15
- requirements:
16
- - - ">="
17
- - !ruby/object:Gem::Version
18
- version: '2.19'
19
- type: :runtime
20
- prerelease: false
21
- version_requirements: !ruby/object:Gem::Requirement
22
- requirements:
23
- - - ">="
24
- - !ruby/object:Gem::Version
25
- version: '2.19'
26
- - !ruby/object:Gem::Dependency
27
- name: oj
28
- requirement: !ruby/object:Gem::Requirement
29
- requirements:
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: '3.16'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - ">="
38
- - !ruby/object:Gem::Version
39
- version: '3.16'
40
12
  - !ruby/object:Gem::Dependency
41
13
  name: csv
42
14
  requirement: !ruby/object:Gem::Requirement
@@ -66,33 +38,33 @@ dependencies:
66
38
  - !ruby/object:Gem::Version
67
39
  version: '1.6'
68
40
  - !ruby/object:Gem::Dependency
69
- name: rspec
41
+ name: mongo
70
42
  requirement: !ruby/object:Gem::Requirement
71
43
  requirements:
72
44
  - - ">="
73
45
  - !ruby/object:Gem::Version
74
- version: '3.12'
75
- type: :development
46
+ version: '2.19'
47
+ type: :runtime
76
48
  prerelease: false
77
49
  version_requirements: !ruby/object:Gem::Requirement
78
50
  requirements:
79
51
  - - ">="
80
52
  - !ruby/object:Gem::Version
81
- version: '3.12'
53
+ version: '2.19'
82
54
  - !ruby/object:Gem::Dependency
83
- name: rake
55
+ name: oj
84
56
  requirement: !ruby/object:Gem::Requirement
85
57
  requirements:
86
58
  - - ">="
87
59
  - !ruby/object:Gem::Version
88
- version: '13.0'
89
- type: :development
60
+ version: '3.16'
61
+ type: :runtime
90
62
  prerelease: false
91
63
  version_requirements: !ruby/object:Gem::Requirement
92
64
  requirements:
93
65
  - - ">="
94
66
  - !ruby/object:Gem::Version
95
- version: '13.0'
67
+ version: '3.16'
96
68
  description: High-throughput, resumable snapshots of MongoDB collections with partitioning,
97
69
  multi-threaded readers, and size-based sharded outputs.
98
70
  email:
@@ -118,6 +90,7 @@ files:
118
90
  licenses:
119
91
  - MIT
120
92
  metadata:
93
+ rubygems_mfa_required: 'true'
121
94
  homepage_uri: https://github.com/alexandernicholson/purplelight
122
95
  source_code_uri: https://github.com/alexandernicholson/purplelight
123
96
  changelog_uri: https://github.com/alexandernicholson/purplelight/releases