purplelight 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
4
- data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
3
+ metadata.gz: 89114bd20a65a5a398be619718ae9a92e535d8118d9f928c61735da9a11cb5aa
4
+ data.tar.gz: 372372e6f5efe1cedd30033d661523fcb45a986c20e6e614ca46ca69def97e70
5
5
  SHA512:
6
- metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
7
- data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
6
+ metadata.gz: 17103c062f0c6002ee53fbb1c2eed3179fb0df582c0dd99ef5100f8a26dbb0c56432f02f9a7f62c49bd201a05d414d7797c0e32044551522eb9d625ba9c179bf
7
+ data.tar.gz: e64bfd67ce31afae0c2209eaf058c731c331c63562f8e23de5e1f86a16a4fd3c8033f5938b7a509b3348daa8f4349afe926acb5e32fba8c7882996bd0d435616
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.6'
12
+ gem 'purplelight', '~> 0.1.8'
13
13
  ```
14
14
 
15
15
  Or install directly:
data/bin/purplelight CHANGED
@@ -4,6 +4,7 @@
4
4
  require 'optparse'
5
5
  require 'json'
6
6
  require 'mongo'
7
+ require 'time'
7
8
  require_relative '../lib/purplelight'
8
9
 
9
10
  options = {
@@ -32,14 +33,25 @@ parser = OptionParser.new do |opts|
32
33
  opts.on('--partitions N', Integer, 'Number of partitions') { |v| options[:partitions] = v }
33
34
  opts.on('--batch-size N', Integer, 'Mongo batch size (default 2000)') { |v| options[:batch_size] = v }
34
35
  opts.on('--by-size BYTES', Integer, 'Shard by size (bytes); default 268435456') do |v|
35
- options[:sharding] = { mode: :by_size, part_bytes: v }
36
+ options[:sharding] ||= {}
37
+ options[:sharding][:mode] = :by_size
38
+ options[:sharding][:part_bytes] = v
39
+ end
40
+ opts.on('--single-file', 'Write a single output file') do
41
+ options[:sharding] ||= {}
42
+ options[:sharding][:mode] = :single_file
36
43
  end
37
- opts.on('--single-file', 'Write a single output file') { options[:sharding] = { mode: :single_file } }
38
44
  opts.on('--prefix NAME', 'Output file prefix') do |v|
39
45
  options[:sharding] ||= {}
40
46
  options[:sharding][:prefix] = v
41
47
  end
42
- opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
48
+ opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
49
+ # Prefer BSON Extended JSON to support $date, $oid, etc.
50
+ options[:query] = BSON::ExtJSON.parse(v)
51
+ rescue StandardError
52
+ # Fallback to plain JSON for compatibility
53
+ options[:query] = JSON.parse(v)
54
+ end
43
55
  opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
44
56
  opts.on('--read-preference MODE',
45
57
  'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
@@ -27,13 +27,23 @@ module Purplelight
27
27
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
28
28
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
29
29
 
30
- # Create numeric-ish interpolation by sampling
31
- ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
32
- boundaries = [min_id] + ids + [max_id]
30
+ # Create contiguous ranges using ascending inner boundaries.
31
+ # We intentionally skip the very first _id so the first range includes the smallest document.
32
+ inner_boundaries = collection.find(query || {})
33
+ .projection(_id: 1)
34
+ .sort(_id: 1)
35
+ .skip(1)
36
+ .limit([partitions - 1, 0].max)
37
+ .to_a
38
+ .map { |d| d['_id'] }
39
+
33
40
  ranges = []
34
- boundaries.each_cons(2) do |a, b|
35
- ranges << build_range(a, b)
41
+ prev = nil
42
+ inner_boundaries.each do |b|
43
+ ranges << build_range(prev, b)
44
+ prev = b
36
45
  end
46
+ ranges << build_range(prev, nil)
37
47
 
38
48
  ranges.map do |r|
39
49
  filter = query ? query.dup : {}
@@ -54,7 +64,7 @@ module Purplelight
54
64
  min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
55
65
  max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
56
66
 
57
- # Fallback to cursor sampling if _id isn't an ObjectId
67
+ # Fallback to cursor sampling if _id isn't an ObjectId
58
68
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
59
69
 
60
70
  step = [(max_ts - min_ts) / partitions, 1].max
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.6'
4
+ VERSION = '0.1.8'
5
5
  end
@@ -31,7 +31,7 @@ module Purplelight
31
31
  @logger = logger
32
32
  @manifest = manifest
33
33
  env_level = ENV['PL_ZSTD_LEVEL']&.to_i
34
- @compression_level = (env_level && env_level > 0 ? env_level : nil)
34
+ @compression_level = (env_level&.positive? ? env_level : nil)
35
35
  @single_file = single_file
36
36
 
37
37
  @columns = columns&.map(&:to_s)
@@ -123,8 +123,8 @@ module Purplelight
123
123
  @io.flush if @io.respond_to?(:flush)
124
124
  end
125
125
 
126
- def method_missing(method_name, *args, &block)
127
- @io.send(method_name, *args, &block)
126
+ def method_missing(method_name, *, &)
127
+ @io.send(method_name, *, &)
128
128
  end
129
129
 
130
130
  def respond_to_missing?(method_name, include_private = false)
@@ -202,7 +202,7 @@ module Purplelight
202
202
  def determine_effective_compression(requested)
203
203
  case requested.to_s
204
204
  when 'zstd'
205
- ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
205
+ (defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
206
206
  when 'none'
207
207
  :none
208
208
  else
@@ -30,7 +30,7 @@ module Purplelight
30
30
  @logger = logger
31
31
  @manifest = manifest
32
32
  env_level = ENV['PL_ZSTD_LEVEL']&.to_i
33
- @compression_level = compression_level || (env_level && env_level > 0 ? env_level : nil)
33
+ @compression_level = compression_level || (env_level&.positive? ? env_level : nil)
34
34
 
35
35
  @part_index = nil
36
36
  @io = nil
@@ -44,15 +44,15 @@ module Purplelight
44
44
  level_disp = @compression_level || (ENV['PL_ZSTD_LEVEL']&.to_i if @effective_compression.to_s == 'zstd')
45
45
  @logger.info("WriterJSONL using compression='#{@effective_compression}' level='#{level_disp || 'default'}'")
46
46
  end
47
- if @effective_compression.to_s != @compression.to_s
48
- @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
49
- end
47
+ return unless @effective_compression.to_s != @compression.to_s
48
+
49
+ @logger&.warn("requested compression '#{@compression}' not available; using '#{@effective_compression}'")
50
50
  end
51
51
 
52
52
  def write_many(batch)
53
53
  ensure_open!
54
54
 
55
- chunk_threshold = (ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024))
55
+ chunk_threshold = ENV['PL_WRITE_CHUNK_BYTES']&.to_i || (8 * 1024 * 1024)
56
56
  total_bytes = 0
57
57
  rows = 0
58
58
 
@@ -200,7 +200,7 @@ module Purplelight
200
200
  def determine_effective_compression(requested)
201
201
  case requested.to_s
202
202
  when 'zstd'
203
- ((defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter))) ? :zstd : :gzip)
203
+ (defined?(ZSTDS) || (Object.const_defined?(:Zstd) && defined?(::Zstd::StreamWriter)) ? :zstd : :gzip)
204
204
  when 'none'
205
205
  :none
206
206
  else
@@ -116,9 +116,9 @@ module Purplelight
116
116
  def next_part_path
117
117
  ext = 'parquet'
118
118
  filename = if @single_file
119
- format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
119
+ "#{@prefix}.#{ext}"
120
120
  else
121
- format('%<prefix}s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
121
+ format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
122
122
  end
123
123
  File.join(@directory, filename)
124
124
  end
@@ -132,7 +132,11 @@ module Purplelight
132
132
  end
133
133
 
134
134
  def extract_value(doc, key)
135
- doc[key] || doc[key.to_sym]
135
+ value = doc[key] || doc[key.to_sym]
136
+ # Normalize common MongoDB/BSON types to Parquet-friendly values
137
+ return value.to_s if defined?(BSON) && value.is_a?(BSON::ObjectId)
138
+
139
+ value
136
140
  end
137
141
  end
138
142
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson