purplelight 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 941ef04ede346a29c0afe19a02d69bd9a25d08ce241e21c6c47960498f4a42c6
4
- data.tar.gz: c34b089d2842082f5d4be60c96d58e208e079f92712d397726a964a7361ae895
3
+ metadata.gz: 6e3771629528ecab067d858491ec4c5de06a8c88c90c64b56dcdb8658c3a6a89
4
+ data.tar.gz: 4c0427564bf04b5dc3da6b3af97bfd4e5dc0625b2faa0dd8acec84a81bd1c145
5
5
  SHA512:
6
- metadata.gz: 8d666cb565283e6410fb0412d8ed369db18b0eaa0e80c573d677295910319b809396e99fca6627214d15942d531c9ac378c7d5a850842a7124d2ff4c9b03d7d8
7
- data.tar.gz: 9188cd0b55e0d3e54fb2d209b85eeeacfb1c0d8daa73b1ace0fbb3bf4a2af865d6dd5c125f1e487458f096077a70cc93e7c0191c1222f6edd58701487f5479da
6
+ metadata.gz: 871cf3c04dcaa017d1023ff79135a5050254158a7ba20145abb04c39887125c08eedfb8cae4d16ddd3c4fd0acb383ceb9c11efff943b497ff036b68643dbea79
7
+ data.tar.gz: 8a5e7fe7a1913939966df61173380e6ed853f45bec2ef90315410a6d36d6341c07d04ff62d200379629b2366cf68296c7c291b5060d0a080dc3015175f1e6780
data/README.md CHANGED
@@ -9,7 +9,7 @@ Purplelight is published on RubyGems: [purplelight on RubyGems](https://rubygems
9
9
  Add to your Gemfile:
10
10
 
11
11
  ```ruby
12
- gem 'purplelight', '~> 0.1.6'
12
+ gem 'purplelight', '~> 0.1.7'
13
13
  ```
14
14
 
15
15
  Or install directly:
data/bin/purplelight CHANGED
@@ -4,6 +4,7 @@
4
4
  require 'optparse'
5
5
  require 'json'
6
6
  require 'mongo'
7
+ require 'time'
7
8
  require_relative '../lib/purplelight'
8
9
 
9
10
  options = {
@@ -39,7 +40,15 @@ parser = OptionParser.new do |opts|
39
40
  options[:sharding] ||= {}
40
41
  options[:sharding][:prefix] = v
41
42
  end
42
- opts.on('-q', '--query JSON', 'Filter query as JSON') { |v| options[:query] = JSON.parse(v) }
43
+ opts.on('-q', '--query JSON', 'Filter query as JSON (Extended JSON supported)') do |v|
44
+ begin
45
+ # Prefer BSON Extended JSON to support $date, $oid, etc.
46
+ options[:query] = BSON::ExtJSON.parse(v)
47
+ rescue StandardError
48
+ # Fallback to plain JSON for compatibility
49
+ options[:query] = JSON.parse(v)
50
+ end
51
+ end
43
52
  opts.on('--projection JSON', 'Projection as JSON (e.g., {"_id":1,"field":1})') { |v| options[:projection] = JSON.parse(v) }
44
53
  opts.on('--read-preference MODE',
45
54
  'Read preference mode: primary|primary_preferred|secondary|secondary_preferred|nearest') do |v|
@@ -27,13 +27,23 @@ module Purplelight
27
27
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
28
28
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?
29
29
 
30
- # Create numeric-ish interpolation by sampling
31
- ids = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(partitions - 1).to_a.map { |d| d['_id'] }
32
- boundaries = [min_id] + ids + [max_id]
30
+ # Create contiguous ranges using ascending inner boundaries.
31
+ # We intentionally skip the very first _id so the first range includes the smallest document.
32
+ inner_boundaries = collection.find(query || {})
33
+ .projection(_id: 1)
34
+ .sort(_id: 1)
35
+ .skip(1)
36
+ .limit([partitions - 1, 0].max)
37
+ .to_a
38
+ .map { |d| d['_id'] }
39
+
33
40
  ranges = []
34
- boundaries.each_cons(2) do |a, b|
35
- ranges << build_range(a, b)
41
+ prev = nil
42
+ inner_boundaries.each do |b|
43
+ ranges << build_range(prev, b)
44
+ prev = b
36
45
  end
46
+ ranges << build_range(prev, nil)
37
47
 
38
48
  ranges.map do |r|
39
49
  filter = query ? query.dup : {}
@@ -54,7 +64,7 @@ module Purplelight
54
64
  min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
55
65
  max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
56
66
 
57
- # Fallback to cursor sampling if _id isn't an ObjectId
67
+ # Fallback to cursor sampling if _id isn't an ObjectId
58
68
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts
59
69
 
60
70
  step = [(max_ts - min_ts) / partitions, 1].max
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Purplelight
4
- VERSION = '0.1.6'
4
+ VERSION = '0.1.7'
5
5
  end
@@ -116,9 +116,9 @@ module Purplelight
116
116
  def next_part_path
117
117
  ext = 'parquet'
118
118
  filename = if @single_file
119
- format('%<prefix>s.%<ext>s', prefix: @prefix, ext: ext)
119
+ "#{@prefix}.#{ext}"
120
120
  else
121
- format('%<prefix}s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
121
+ format('%<prefix>s-part-%<seq>06d.%<ext>s', prefix: @prefix, seq: @file_seq, ext: ext)
122
122
  end
123
123
  File.join(@directory, filename)
124
124
  end
@@ -132,7 +132,12 @@ module Purplelight
132
132
  end
133
133
 
134
134
  def extract_value(doc, key)
135
- doc[key] || doc[key.to_sym]
135
+ value = doc[key] || doc[key.to_sym]
136
+ # Normalize common MongoDB/BSON types to Parquet-friendly values
137
+ if defined?(BSON) && value.is_a?(BSON::ObjectId)
138
+ return value.to_s
139
+ end
140
+ value
136
141
  end
137
142
  end
138
143
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purplelight
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Nicholson