iso-offline-sort 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4e6bc96d9982a9a958eddb6f6a9ffa2b669c2adfea523cded397e79402488b87
4
+ data.tar.gz: 5aec83f3929cf85de14dcb28e904f6e6785b17a6c7f221d780fddc9175acc938
5
+ SHA512:
6
+ metadata.gz: 34d5e7072aabe27a3325cdab2de78528628d595fc17c19317d0d4df9ac499015ca83734369b7e59ed7035a40dc75bde385340196c703f9715422870867c3526b
7
+ data.tar.gz: 9ee20fd481891bb48a3265ffa7b27fa83c980517b1d34edc04d6623bf7078efdf56bd636405b4e17b05a4a36d7aae24a07db19dd6f09b0a5f9210f63c85de9d1
@@ -0,0 +1,84 @@
1
+ version: 2.1
2
+ jobs:
3
+ lint:
4
+ docker:
5
+ - image: cimg/ruby:2.6.6
6
+ working_directory: ~/offline-sort
7
+ steps:
8
+ - checkout
9
+ - restore_cache:
10
+ keys:
11
+ - v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
12
+ - v1-gems-ruby-2.6.6-
13
+ - run:
14
+ name: Install Gems
15
+ command: |
16
+ if ! bundle check --path=vendor/bundle; then
17
+ bundle install --path=vendor/bundle --jobs=4 --retry=3
18
+ bundle clean
19
+ fi
20
+ - save_cache:
21
+ key: v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
22
+ paths:
23
+ - "vendor/bundle"
24
+ - "gemfiles/vendor/bundle"
25
+ - run:
26
+ name: Run Rubocop
27
+ command: bundle exec rubocop
28
+ test:
29
+ parameters:
30
+ gemfile:
31
+ type: string
32
+ ruby_version:
33
+ type: string
34
+ docker:
35
+ - image: cimg/ruby:<< parameters.ruby_version >>
36
+ environment:
37
+ CIRCLE_TEST_REPORTS: "test-results"
38
+ BUNDLE_GEMFILE: << parameters.gemfile >>
39
+ working_directory: ~/offline-sort
40
+ steps:
41
+ - checkout
42
+ - unless:
43
+ condition:
44
+ equal: ["gemfiles/rails_edge.gemfile", << parameters.gemfile >>]
45
+ steps:
46
+ - restore_cache:
47
+ keys:
48
+ - v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
49
+ - v1-gems-ruby-<< parameters.ruby_version >>-
50
+ - run:
51
+ name: Install Gems
52
+ command: |
53
+ if ! bundle check --path=vendor/bundle; then
54
+ bundle install --path=vendor/bundle --jobs=4 --retry=3
55
+ bundle clean
56
+ fi
57
+ - unless:
58
+ condition:
59
+ equal: [ "gemfiles/rails_edge.gemfile", << parameters.gemfile >> ]
60
+ steps:
61
+ - save_cache:
62
+ key: v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
63
+ paths:
64
+ - "vendor/bundle"
65
+ - "gemfiles/vendor/bundle"
66
+ - run:
67
+ name: Run Tests
68
+ command: |
69
+ bundle exec rspec --format RspecJunitFormatter --out $CIRCLE_TEST_REPORTS/rspec/junit.xml --format progress spec
70
+ - store_test_results:
71
+ path: "test-results"
72
+ workflows:
73
+ build:
74
+ jobs:
75
+ - lint
76
+ - test:
77
+ matrix:
78
+ parameters:
79
+ gemfile:
80
+ - "Gemfile"
81
+ ruby_version:
82
+ - "2.6.8"
83
+ - "2.7.4"
84
+ - "3.0.2"
@@ -0,0 +1 @@
1
+ * @salsify/pim-core-backend
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ .idea/
14
+ *.iml
15
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,15 @@
1
+ inherit_gem:
2
+ salsify_rubocop: conf/rubocop.yml
3
+
4
+ AllCops:
5
+ TargetRubyVersion: 2.6
6
+ Exclude:
7
+ - 'vendor/**/*'
8
+ - 'gemfiles/**/*'
9
+
10
+ # Offense count: 9
11
+ # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
12
+ # AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
13
+ Naming/MethodParameterName:
14
+ Exclude:
15
+ - 'lib/offline_sort/fixed_size_min_heap.rb'
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ # Changelog
2
+
3
+ ### 0.2.0
4
+ - Add testing for ruby: 2.6, 2.7 & 3.0
5
+ - Add ruby 3 support
6
+ - Require ruby >= 2.6
7
+ - Add rubocop
8
+
9
+ ### 0.1.1
10
+ - Initial implementation
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in offline-sort.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Matthew Cross
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # offline-sort
2
+
3
+ Sort arbitrarily large collections of data with limited memory usage. Given an enumerable and a `sort_by` proc, this gem will break the input data into sorted chunks, persist the chunks, and return an `Enumerator`. Data read from this enumerator will be in its final sorted order.
4
+
5
+ The size of the chunks and the strategy for serializing and deserializing the data are configurable. The gem comes with builtin strategies for `Marshal`, `MessagePack` and `YAML`.
6
+
7
+ The development of this gem is documented in this [post](http://blog.salsify.com/engineering/ruby-scalable-offline-sort) from the Salsify Engineering Blog.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'iso-offline-sort', require: 'offline_sort'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install iso-offline-sort
22
+
23
+ ## Usage
24
+ ```ruby
25
+ arrays = [ [4,5,6], [7,8,9], [1,2,3] ]
26
+
27
+ sorter = OfflineSort.sort_by(arrays, chunk_size: 1) do |array|
28
+ array.first
29
+ end
30
+
31
+ # Create a sorted enumerator
32
+ sorter.sort do |sorted|
33
+ # Stream results in sorted order
34
+ sorted.each do |entry|
35
+ # e.g. write to a file
36
+ end
37
+ end
38
+ ```
39
+ The example above will create 3 files with 1 array each, then output them in sorted order. You should try different values of `chunk_size` to find the best speed/memory combination for your use case. In general larger chunk sizes will use more memory but run faster.
40
+
41
+ Sorting is not limited to arrays. You can use anything that can be expressed in an `Enumerable#sort_by` block.
42
+
43
+ ## Using MessagePack
44
+
45
+ Message pack serialization is faster than the default Ruby `Marshal` strategy. To enable message pack serialization follow these steps.
46
+
47
+ `gem install msgpack`
48
+
49
+ `require 'msgpack'`
50
+
51
+ Pass `OfflineSort::Chunk::InputOutput::MessagePack` as the `chunk_input_output_class:` option of `OfflineSort.sort_by`.
52
+
53
+ ### Limitations
54
+
55
+ The MessagePack serialize/deserialize process stringifies hash keys so it is important to write your sort_by in terms of string keys.
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it
60
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
61
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
62
+ 4. Push to the branch (`git push origin my-new-feature`)
63
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  module Chunk
    module InputOutput

      # Abstract reader/writer for one chunk of serialized entries stored in
      # an IO object. Subclasses supply the serialization format by overriding
      # #read_entry and #write_entry; everything else is implemented in terms
      # of those two methods and the wrapped IO.
      class Base
        # Raised by the default #read_entry / #write_entry to signal that a
        # subclass did not provide its own implementation.
        MethodNotImplementedError = Class.new(StandardError)

        # The underlying IO object entries are read from and written to.
        attr_reader :io

        # @param io [IO] object that backs this chunk; it must respond to
        #   #flush, #rewind and #close
        def initialize(io)
          @io = io
        end

        # Reads and returns the next entry. Implementations are expected to
        # raise EOFError once the IO is exhausted, because #each stops on it.
        #
        # @raise [MethodNotImplementedError] unless overridden
        def read_entry
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Serializes a single entry to the IO.
        #
        # @raise [MethodNotImplementedError] unless overridden
        def write_entry(_entry)
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Writes every element of +entries+ in iteration order via #write_entry.
        def write_entries(entries)
          entries.each { |entry| write_entry(entry) }
        end

        # Flushes buffered writes of the underlying IO.
        def flush
          io.flush
        end

        # Flushes pending writes and moves the IO back to its start so the
        # entries that were just written can be read back.
        def rewind
          flush
          io.rewind
        end

        # Closes the underlying IO.
        def close
          io.close
        end

        # Iterates over the remaining entries until #read_entry raises EOFError.
        #
        # Without a block an Enumerator is returned (the historical behavior,
        # which supports external iteration with #next). With a block each
        # entry is yielded to it; previously a supplied block was silently
        # ignored.
        #
        # @yield [entry] each entry read from the IO
        # @return [Enumerator, self] an Enumerator when no block is given
        def each
          return enum_for(:each) unless block_given?

          loop do
            yield read_entry
          rescue EOFError
            break
          end

          self
        end
      end

    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk serializer that stores entries with Ruby's built-in Marshal
      # format, one dumped object after another in the underlying IO.
      class Marshal < OfflineSort::Chunk::InputOutput::Base
        # Appends the Marshal dump of +entry+ to the IO.
        def write_entry(entry)
          serialized = ::Marshal.dump(entry)
          io.write(serialized)
        end

        # Loads and returns the next marshalled object from the IO.
        def read_entry
          ::Marshal.load(io) # rubocop:disable Security/MarshalLoad, this is loading from a trusted source
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'msgpack'
4
+ require 'offline_sort/chunk/input_output/base'
5
+
6
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk serializer that streams entries through the msgpack gem's
      # Packer (writing) and Unpacker (reading), both bound to the same IO.
      class MessagePack < OfflineSort::Chunk::InputOutput::Base
        attr_reader :packer, :unpacker

        # @param io [IO] object shared by the packer and the unpacker
        def initialize(io)
          super(io)
          @packer = ::MessagePack::Packer.new(io)
          @unpacker = ::MessagePack::Unpacker.new(io)
        end

        # Hands +entry+ to the packer; see #flush for pushing it to the IO.
        def write_entry(entry)
          packer.write(entry)
        end

        # Returns the next object decoded by the unpacker.
        def read_entry
          unpacker.read
        end

        # Flushes the packer first and then the IO itself (via Base#flush).
        def flush
          packer.flush
          super
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+
5
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk serializer that stores each entry as its own YAML document.
      class Yaml < OfflineSort::Chunk::InputOutput::Base
        # Parses and returns the next YAML document from the IO.
        #
        # The yaml parser does not expose a document enumerator that we can
        # call next on without loading the entire file, so documents are cut
        # out of the stream by hand in #next_document.
        #
        # NOTE(review): with Psych 4+ YAML.load restricts the permitted
        # classes; entries containing symbols or custom objects may need
        # YAML.unsafe_load there - confirm against the supported Ruby versions.
        #
        # @raise [EOFError] when no further document is available
        def read_entry
          YAML.load(next_document) # rubocop:disable Security/YAMLLoad, this is loading from a trusted source
        end

        # Appends the YAML dump of +entry+ to the IO.
        def write_entry(entry)
          io.write(YAML.dump(entry))
        end

        private

        # Reads lines up to (but not including) the next line that starts a
        # new document ('---') and returns them as one string.
        #
        # @raise [EOFError] when the IO has no more content
        def next_document
          sio = StringIO.new
          document_count = 0

          while (line = io.gets)
            document_count += 1 if line.start_with?('---')

            if document_count > 1
              # This line opens the following document: step back so the next
              # call starts on it. IO positions are byte offsets, so the line
              # is measured in bytes rather than characters.
              io.seek(io.pos - line.bytesize, IO::SEEK_SET)
              break
            end

            sio.write(line)
          end

          raise EOFError unless sio.size > 0 # rubocop:disable Style/ZeroLengthPredicate

          sio.string
        end
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'offline_sort/chunk/input_output/base'
4
+ require 'offline_sort/chunk/input_output/marshal'
5
+ require 'offline_sort/chunk/input_output/message_pack' if defined?(MessagePack)
6
+ require 'offline_sort/chunk/input_output/yaml'
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  # Binary min-heap that lives inside the array handed to the constructor.
  #
  # The capacity is fixed to the size of that array: elements can be popped
  # and new ones pushed into the freed slots, but the heap never grows past
  # its initial size. Ordering is determined by the value the +sort_by+ block
  # returns for each element (the element itself when no block is given).
  class FixedSizeMinHeap
    attr_accessor :array
    attr_reader :sort_by, :size_limit, :heap_end

    # @param array [Array] initial elements; reordered in place into a heap
    # @yield [item] optional block returning the sort key for an element
    def initialize(array, &sort_by)
      @array = array
      @sort_by = sort_by || Proc.new { |item| item }
      @size_limit = array.size
      @heap_end = array.size - 1
      # Bottom-up construction: sift down every node that has children.
      ((array.size / 2) - 1).downto(0) { |i| heapify(i) }
    end

    # Adds +item+ to the heap.
    #
    # @raise [RuntimeError] when every slot is already occupied
    def push(item)
      grow_heap
      array[heap_end] = item
      sift_up(heap_end)
    end

    # Removes and returns the smallest element, or nil when the heap is empty.
    #
    # Emptiness is decided from +heap_end+, so popping an empty heap no longer
    # writes into (and thereby grows) the backing array.
    def pop
      return nil if heap_end.negative?

      item = array[0]
      # Move the last element to the root and release its old slot before
      # restoring the heap order.
      array[0] = array[heap_end]
      shrink_heap
      heapify(0)
      item
    end

    private

    # Clears the last occupied slot and marks it as free.
    def shrink_heap
      array[heap_end] = nil
      @heap_end -= 1
    end

    # Claims the next free slot.
    def grow_heap
      raise "Heap Size (#{size_limit}) Exceeded" if heap_end == (size_limit - 1)

      @heap_end += 1
    end

    # True when the element at index i sorts strictly before the one at j.
    def compare(i, j)
      (sort_by.call(array[i]) <=> sort_by.call(array[j])) == -1
    end

    # Swap elements in the array
    def swap(i, j)
      array[i], array[j] = array[j], array[i]
    end

    # Get the parent of the node i > 0.
    def parent(i)
      (i - 1) / 2
    end

    # Get the node left of node i >= 0
    def left(i)
      (2 * i) + 1
    end

    # Get the node right of node i >= 0
    def right(i)
      (2 * i) + 2
    end

    # Sifts the element at index i down until neither child sorts before it,
    # keeping the smallest element on top.
    def heapify(i)
      l = left(i)
      r = right(i)

      top = i
      top = l if l <= heap_end && compare(l, top)
      top = r if r <= heap_end && compare(r, top)
      return if top == i

      swap(i, top)
      heapify(top)
    end

    # Moves the element at index i up while it sorts before its parent.
    def sift_up(i)
      return unless i > 0

      p = parent(i)
      return unless compare(i, p)

      swap(i, p)
      sift_up(p)
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
require 'tempfile'

require 'offline_sort/chunk'
require 'offline_sort/fixed_size_min_heap'
5
+
6
module OfflineSort
  # Convenience constructor: forwards all arguments and the sort block to
  # Sorter.new and returns the new Sorter.
  def self.sort_by(*args, **kwargs, &sort_by)
    Sorter.new(*args, **kwargs, &sort_by)
  end

  # External merge sort: the input is cut into chunks of at most +chunk_size+
  # entries, each chunk is sorted in memory and written to an unlinked
  # temporary file, and the chunks are merged back through a min-heap.
  class Sorter
    DEFAULT_CHUNK_IO_CLASS = Chunk::InputOutput::Marshal
    DEFAULT_CHUNK_SIZE = 1000

    attr_reader :enumerable, :sort_by, :chunk_size, :chunk_input_output_class

    # @param enumerable [#each] the entries to sort
    # @param chunk_input_output_class [Class] serializer wrapped around each
    #   temporary file; instances must respond to #write_entry, #rewind,
    #   #each and #close
    # @param chunk_size [Integer] number of entries sorted in memory at once
    # @yield [entry] returns the sort key for an entry; entries are compared
    #   directly when no block is given (same default as FixedSizeMinHeap)
    def initialize(enumerable,
                   chunk_input_output_class: DEFAULT_CHUNK_IO_CLASS,
                   chunk_size: DEFAULT_CHUNK_SIZE,
                   &sort_by)
      @enumerable = enumerable
      @chunk_input_output_class = chunk_input_output_class
      @chunk_size = chunk_size
      @sort_by = sort_by || Proc.new { |item| item }
      @temp_files = []
    end

    # Sorts the input and yields an Enumerator that produces the entries in
    # sorted order. The temporary files are closed when the block returns, so
    # the enumerator must be consumed inside the block.
    #
    # @yield [Enumerator] lazily merged, sorted entries
    # @raise [LocalJumpError] when called without a block
    def sort
      # Fail before any chunk is written instead of after all the work.
      raise LocalJumpError, 'no block given (yield)' unless block_given?

      enumerator = merge(split)
      yield enumerator
    ensure
      @temp_files.each(&:close)
      # Drop the closed handles so repeated #sort calls do not accumulate them.
      @temp_files.clear
    end

    private

    # Builds the merging Enumerator: the heap always holds the current head
    # entry of every chunk that still has data.
    #
    # TODO: optimization for when there is less than a single full chunk of data
    def merge(sorted_chunk_ios)
      chunk_enumerators = sorted_chunk_ios.map(&:each)

      head_entries = chunk_enumerators.each_with_index.map do |chunk, index|
        ChunkEntry.new(index, chunk.next)
      end

      pq = FixedSizeMinHeap.new(head_entries) { |entry| sort_by.call(entry.data) }

      Enumerator.new do |yielder|
        while (item = pq.pop)
          yielder.yield(item.data)

          begin
            # Refill the heap from the chunk the popped entry came from.
            entry = chunk_enumerators[item.chunk_number].next
            pq.push(ChunkEntry.new(item.chunk_number, entry))
          rescue StopIteration
            # That chunk is exhausted; its file is no longer needed.
            sorted_chunk_ios[item.chunk_number].close
          end
        end
      end
    end

    # Consumes the input and returns one rewound chunk IO per sorted chunk.
    def split
      sorted_chunks = []
      chunk_entries = []

      enumerable.each do |entry|
        chunk_entries << entry
        next unless chunk_entries.size == chunk_size

        sorted_chunks << write_sorted_chunk(chunk_entries)
        chunk_entries.clear
      end

      # Remaining entries form a final, partially filled chunk.
      sorted_chunks << write_sorted_chunk(chunk_entries) unless chunk_entries.empty?

      sorted_chunks
    end

    # Sorts +entries+ in memory, writes them to a fresh temporary file and
    # returns the chunk IO positioned at the start of that file.
    def write_sorted_chunk(entries)
      file = Tempfile.open('sort-chunk-')
      @temp_files << file
      # The open handle keeps the data readable after the path is removed.
      file.unlink
      file.binmode

      chunk_io = chunk_input_output_class.new(file)
      entries.sort_by(&sort_by).each { |entry| chunk_io.write_entry(entry) }

      chunk_io.rewind
      chunk_io
    end

    # Pairs an entry with the index of the chunk it was read from.
    class ChunkEntry
      attr_reader :chunk_number, :data

      def initialize(chunk_number, data)
        @chunk_number = chunk_number
        @data = data
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OfflineSort
4
+ VERSION = '0.3.0'
5
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'offline_sort/version'
4
+ require 'offline_sort/chunk'
5
+ require 'offline_sort/offline_sort'
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'offline_sort/version'

Gem::Specification.new do |spec|
  spec.name          = 'iso-offline-sort'
  spec.version       = OfflineSort::VERSION
  spec.authors       = ['Isometric']
  spec.email         = ['andy@iso.io']
  spec.description   = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.summary       = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.homepage      = 'https://github.com/salsify/offline-sort'
  spec.license       = 'MIT'

  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = 'https://rubygems.org'
    spec.metadata['rubygems_mfa_required'] = 'true'
  else
    raise 'RubyGems 2.0 or newer is required to set allowed_push_host.'
  end

  # NUL-delimited output keeps paths containing whitespace intact and does not
  # depend on $INPUT_RECORD_SEPARATOR, which is only defined after
  # `require 'English'` (never required by this file).
  spec.files = `git ls-files -z`.split("\x0")

  spec.required_ruby_version = '>= 2.6'

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'msgpack'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rspec_junit_formatter'
  spec.add_development_dependency 'salsify_rubocop', '~> 1.0.1'
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ shared_examples "a valid chunk input output" do
6
+ let(:count) { 1000 }
7
+
8
+ let(:arrays) do
9
+ Array.new(count) do |index|
10
+ [SecureRandom.hex, index, SecureRandom.hex]
11
+ end
12
+ end
13
+
14
+ let(:hashes) do
15
+ Array.new(count) do |index|
16
+ { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
17
+ end
18
+ end
19
+
20
+ let(:tempfile) do
21
+ t = Tempfile.open('chunk-input-output')
22
+ t.binmode
23
+ t
24
+ end
25
+
26
+ let(:chunk_class) {}
27
+ let(:chunk_io) { chunk_class.new(tempfile) }
28
+
29
+ describe "#rewind" do
30
+ before do
31
+ allow(chunk_io).to receive(:flush)
32
+ allow(tempfile).to receive(:rewind)
33
+ chunk_io.rewind
34
+ end
35
+
36
+ it "rewinds the io" do
37
+ expect(tempfile).to have_received(:rewind)
38
+ end
39
+
40
+ it "flushes the io" do
41
+ expect(chunk_io).to have_received(:flush)
42
+ end
43
+ end
44
+
45
+ describe "#flush" do
46
+ before do
47
+ allow(tempfile).to receive(:flush)
48
+ chunk_io.flush
49
+ end
50
+
51
+ it "flushes the io" do
52
+ expect(tempfile).to have_received(:flush)
53
+ end
54
+ end
55
+
56
+ shared_examples "a valid integration test" do
57
+ let(:enumerable) {}
58
+
59
+ it "writes the data and reads it back" do
60
+ expect { chunk_io.write_entries(enumerable) }.not_to raise_error
61
+
62
+ chunk_io.rewind
63
+
64
+ expect(tempfile.size).not_to eq(0)
65
+ expect(chunk_io.each.to_a).to match_array(enumerable)
66
+ end
67
+ end
68
+
69
+ context "arrays" do
70
+ it_behaves_like "a valid integration test" do
71
+ let(:enumerable) { arrays }
72
+ end
73
+ end
74
+
75
+ context "hashes" do
76
+ it_behaves_like "a valid integration test" do
77
+ let(:enumerable) { hashes }
78
+ end
79
+ end
80
+ end
81
+
82
+ describe OfflineSort::Chunk::InputOutput::Base do
83
+ let(:io) { Tempfile.new('chunk') }
84
+ let(:chunk_io) { OfflineSort::Chunk::InputOutput::Base.new(io) }
85
+ let(:expected_error_klass) { OfflineSort::Chunk::InputOutput::Base::MethodNotImplementedError }
86
+
87
+ describe "#read_entry" do
88
+ it "raises when read_entry is called" do
89
+ expect { chunk_io.read_entry }.to raise_error(expected_error_klass)
90
+ end
91
+ end
92
+
93
+ describe "#write_entry" do
94
+ it "raises when write_entry is called" do
95
+ expect { chunk_io.write_entry({}) }.to raise_error(expected_error_klass)
96
+ end
97
+ end
98
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ describe OfflineSort::Chunk::InputOutput::Marshal do
6
+ it_behaves_like "a valid chunk input output" do
7
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::Marshal }
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ describe OfflineSort::Chunk::InputOutput::MessagePack do
6
+ it_behaves_like "a valid chunk input output" do
7
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::MessagePack }
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ describe OfflineSort::Chunk::InputOutput::Yaml do
6
+ it_behaves_like "a valid chunk input output" do
7
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::Yaml }
8
+ end
9
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ describe OfflineSort::FixedSizeMinHeap do
6
+ let(:array) { (1..10).to_a.shuffle }
7
+ let(:heap) { OfflineSort::FixedSizeMinHeap.new(array.dup) }
8
+
9
+ describe "#initialize" do
10
+ it "is a a heap" do
11
+ expect { assert_min_heap(heap.array) }.not_to raise_error
12
+ end
13
+ end
14
+
15
+ describe "#push" do
16
+ context "with a full array" do
17
+ it "raises an exception" do
18
+ expect { heap.push(rand(20)) }.to raise_error("Heap Size (#{array.size}) Exceeded")
19
+ end
20
+ end
21
+
22
+ context "with one space" do
23
+ before do
24
+ heap.pop
25
+ end
26
+
27
+ it "adds to the heap" do
28
+ expect { heap.push(1) }.not_to raise_error
29
+ end
30
+ end
31
+
32
+ context "with more than one space" do
33
+ before do
34
+ 5.times { heap.pop }
35
+ end
36
+
37
+ it "adds to the heap" do
38
+ 5.times do
39
+ expect { heap.push(1) }.not_to raise_error
40
+ end
41
+ end
42
+ end
43
+ end
44
+
45
+ describe "#pop" do
46
+ context "with empty array" do
47
+ before do
48
+ array.size.times { heap.pop }
49
+ end
50
+
51
+ it "is nil" do
52
+ expect(heap.pop).to be nil
53
+ end
54
+ end
55
+
56
+ context "until empty" do
57
+ it "is sorted" do
58
+ last = -1
59
+ array.size.times do
60
+ popped = heap.pop
61
+ expect(popped).to be > (last)
62
+ last = popped
63
+ end
64
+ end
65
+ end
66
+ end
67
+
68
+ context "integration test" do
69
+ it "is always heap ordered" do
70
+ 100.times do
71
+ heap.pop
72
+ heap.push(rand(100))
73
+ expect { assert_min_heap(heap.array) }.not_to raise_error
74
+ end
75
+ end
76
+ end
77
+
78
+ def assert_min_heap(array)
79
+ array.each_with_index do |e, index|
80
+ left = (2 * index) + 1
81
+ right = (2 * index) + 2
82
+
83
+ if left < array.size && array[left] < e
84
+ puts "left #{e} #{array}"
85
+ raise 'not a heap'
86
+ end
87
+
88
+ next unless right < array.size
89
+
90
+ unless array[right] >= e
91
+ puts "right #{e} #{array}"
92
+ raise 'not a heap'
93
+ end
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ describe OfflineSort::Sorter do
6
+
7
+ shared_examples "a correct offline sort" do
8
+ let(:count) { 10000 }
9
+ let(:entries_per_chunk) { 900 }
10
+ let(:enumerable) {}
11
+ let(:sort) {}
12
+
13
+ before do
14
+ @unsorted = enumerable.dup
15
+ r = Benchmark.measure do
16
+ sorter = OfflineSort.sort_by(enumerable, chunk_size: entries_per_chunk, &sort)
17
+
18
+ sorter.sort do |result|
19
+ @sorted = result.map do |entry|
20
+ entry
21
+ end
22
+ end
23
+ end
24
+ puts r
25
+ end
26
+
27
+ it "produces the same sorted result as an in-memory sort" do
28
+ expect(@unsorted).to match_array(enumerable)
29
+ expect do
30
+ last = nil
31
+ entry_count = 0
32
+ @sorted.each do |entry|
33
+ if last.nil?
34
+ last = entry
35
+ entry_count += 1
36
+ next
37
+ end
38
+
39
+ raise "Out of order at line #{entry_count}" unless (sort.call(last) <=> sort.call(entry)) == -1
40
+
41
+ last = entry
42
+ entry_count += 1
43
+ end
44
+ end.not_to raise_error
45
+ expect(@sorted).to match_array(enumerable.sort_by(&sort))
46
+ end
47
+ end
48
+
49
+ let(:arrays) do
50
+ Array.new(count) do |index|
51
+ [SecureRandom.hex, index, SecureRandom.hex]
52
+ end
53
+ end
54
+
55
+ let(:array_sort_index) { 2 }
56
+ let(:array_sort) { Proc.new { |arr| arr[array_sort_index] } }
57
+
58
+ let(:hashes) do
59
+ Array.new(count) do |index|
60
+ { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
61
+ end
62
+ end
63
+
64
+ let(:hash_sort_key) { 'c' }
65
+ let(:hash_sort) { Proc.new { |hash| hash[hash_sort_key] } }
66
+
67
+
68
+ context "with arrays" do
69
+ it_behaves_like "a correct offline sort" do
70
+ let(:enumerable) { arrays }
71
+ let(:sort) { array_sort }
72
+ end
73
+
74
+ context "with multiple sort keys" do
75
+ it_behaves_like "a correct offline sort" do
76
+ let(:enumerable) do
77
+ Array.new(count) do |index|
78
+ [index.round(-1), index, SecureRandom.hex]
79
+ end.shuffle
80
+ end
81
+ let(:sort) { Proc.new { |arr| [arr[0], arr[1]] } }
82
+ end
83
+ end
84
+ end
85
+
86
+ context "hashes" do
87
+ it_behaves_like "a correct offline sort" do
88
+ let(:enumerable) { hashes }
89
+ let(:sort) { hash_sort }
90
+ end
91
+
92
+ context "with multiple sort keys" do
93
+ it_behaves_like "a correct offline sort" do
94
+ let(:enumerable) do
95
+ Array.new(count) do |index|
96
+ { 'a' => index.round(-1), 'b' => index, 'c' => SecureRandom.hex }
97
+ end.shuffle
98
+ end
99
+ let(:sort) { Proc.new { |hash| [hash['a'], hash['c']] } }
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
4
+ require 'securerandom'
5
+ require 'benchmark'
6
+ require 'msgpack'
7
+ require 'tempfile'
8
+
9
+ require 'offline_sort'
metadata ADDED
@@ -0,0 +1,163 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iso-offline-sort
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Isometric
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-10-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: msgpack
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec_junit_formatter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: salsify_rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 1.0.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 1.0.1
97
+ description: Offline sort for any enumerable with pluggable serialization strategies
98
+ email:
99
+ - andy@iso.io
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".circleci/config.yml"
105
+ - ".github/CODEOWNERS"
106
+ - ".gitignore"
107
+ - ".rspec"
108
+ - ".rubocop.yml"
109
+ - CHANGELOG.md
110
+ - Gemfile
111
+ - LICENSE.txt
112
+ - README.md
113
+ - Rakefile
114
+ - lib/offline_sort.rb
115
+ - lib/offline_sort/chunk.rb
116
+ - lib/offline_sort/chunk/input_output/base.rb
117
+ - lib/offline_sort/chunk/input_output/marshal.rb
118
+ - lib/offline_sort/chunk/input_output/message_pack.rb
119
+ - lib/offline_sort/chunk/input_output/yaml.rb
120
+ - lib/offline_sort/fixed_size_min_heap.rb
121
+ - lib/offline_sort/offline_sort.rb
122
+ - lib/offline_sort/version.rb
123
+ - offline-sort.gemspec
124
+ - spec/offline_sort/chunk/input_output/base_spec.rb
125
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
126
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
127
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
128
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
129
+ - spec/offline_sort/sorter_spec.rb
130
+ - spec/spec_helper.rb
131
+ homepage: https://github.com/salsify/offline-sort
132
+ licenses:
133
+ - MIT
134
+ metadata:
135
+ allowed_push_host: https://rubygems.org
136
+ rubygems_mfa_required: 'true'
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '2.6'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ requirements: []
152
+ rubygems_version: 3.3.26
153
+ signing_key:
154
+ specification_version: 4
155
+ summary: Offline sort for any enumerable with pluggable serialization strategies
156
+ test_files:
157
+ - spec/offline_sort/chunk/input_output/base_spec.rb
158
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
159
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
160
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
161
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
162
+ - spec/offline_sort/sorter_spec.rb
163
+ - spec/spec_helper.rb