iso-offline-sort 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4e6bc96d9982a9a958eddb6f6a9ffa2b669c2adfea523cded397e79402488b87
4
+ data.tar.gz: 5aec83f3929cf85de14dcb28e904f6e6785b17a6c7f221d780fddc9175acc938
5
+ SHA512:
6
+ metadata.gz: 34d5e7072aabe27a3325cdab2de78528628d595fc17c19317d0d4df9ac499015ca83734369b7e59ed7035a40dc75bde385340196c703f9715422870867c3526b
7
+ data.tar.gz: 9ee20fd481891bb48a3265ffa7b27fa83c980517b1d34edc04d6623bf7078efdf56bd636405b4e17b05a4a36d7aae24a07db19dd6f09b0a5f9210f63c85de9d1
@@ -0,0 +1,84 @@
1
---
# CircleCI pipeline: one lint job plus a test job fanned out over a Ruby matrix.
version: 2.1

jobs:
  # Runs RuboCop once, on a single fixed Ruby image.
  lint:
    docker:
      - image: cimg/ruby:2.6.6
    working_directory: ~/offline-sort
    steps:
      - checkout
      - restore_cache:
          keys:
            # Exact match first, then fall back to any cache for this Ruby.
            - v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
            - v1-gems-ruby-2.6.6-
      - run:
          name: Install Gems
          command: |
            if ! bundle check --path=vendor/bundle; then
              bundle install --path=vendor/bundle --jobs=4 --retry=3
              bundle clean
            fi
      - save_cache:
          key: v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
          paths:
            - "vendor/bundle"
            - "gemfiles/vendor/bundle"
      - run:
          name: Run Rubocop
          command: bundle exec rubocop

  # Runs the RSpec suite for one (gemfile, ruby_version) combination.
  test:
    parameters:
      gemfile:
        type: string
      ruby_version:
        type: string
    docker:
      - image: "cimg/ruby:<< parameters.ruby_version >>"
        environment:
          CIRCLE_TEST_REPORTS: "test-results"
          BUNDLE_GEMFILE: "<< parameters.gemfile >>"
    working_directory: ~/offline-sort
    steps:
      - checkout
      # The gem cache is skipped for the rails_edge gemfile.
      - unless:
          condition:
            equal: ["gemfiles/rails_edge.gemfile", "<< parameters.gemfile >>"]
          steps:
            - restore_cache:
                keys:
                  - v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
                  - v1-gems-ruby-<< parameters.ruby_version >>-
      - run:
          name: Install Gems
          command: |
            if ! bundle check --path=vendor/bundle; then
              bundle install --path=vendor/bundle --jobs=4 --retry=3
              bundle clean
            fi
      - unless:
          condition:
            equal: ["gemfiles/rails_edge.gemfile", "<< parameters.gemfile >>"]
          steps:
            - save_cache:
                key: v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
                paths:
                  - "vendor/bundle"
                  - "gemfiles/vendor/bundle"
      - run:
          name: Run Tests
          command: |
            bundle exec rspec --format RspecJunitFormatter --out $CIRCLE_TEST_REPORTS/rspec/junit.xml --format progress spec
      - store_test_results:
          path: "test-results"

workflows:
  build:
    jobs:
      - lint
      - test:
          matrix:
            parameters:
              gemfile:
                - "Gemfile"
              # Quoted so versions stay strings rather than floats.
              ruby_version:
                - "2.6.8"
                - "2.7.4"
                - "3.0.2"
@@ -0,0 +1 @@
1
+ * @salsify/pim-core-backend
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ .idea/
14
+ *.iml
15
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,15 @@
1
+ inherit_gem:
2
+ salsify_rubocop: conf/rubocop.yml
3
+
4
+ AllCops:
5
+ TargetRubyVersion: 2.6
6
+ Exclude:
7
+ - 'vendor/**/*'
8
+ - 'gemfiles/**/*'
9
+
10
+ # Offense count: 9
11
+ # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
12
+ # AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
13
+ Naming/MethodParameterName:
14
+ Exclude:
15
+ - 'lib/offline_sort/fixed_size_min_heap.rb'
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ # Changelog
2
+
3
+ ### 0.2.0
4
+ - Add testing for ruby: 2.6, 2.7 & 3.0
5
+ - Add ruby 3 support
6
+ - Require ruby >= 2.6
7
+ - Add rubocop
8
+
9
+ ### 0.1.1
10
+ - Initial implementation
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in offline-sort.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Matthew Cross
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # offline-sort
2
+
3
+ Sort arbitrarily large collections of data with limited memory usage. Given an enumerable and a `sort_by` proc, this gem will break the input data into sorted chunks, persist the chunks, and return an `Enumerator`. Data read from this enumerator will be in its final sorted order.
4
+
5
+ The size of the chunks and the strategy for serializing and deserializing the data are configurable. The gem comes with builtin strategies for `Marshal`, `MessagePack` and `YAML`.
6
+
7
+ The development of this gem is documented in this [post](http://blog.salsify.com/engineering/ruby-scalable-offline-sort) from the Salsify Engineering Blog.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'offline-sort'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install offline-sort
22
+
23
+ ## Usage
24
+ ```ruby
25
+ arrays = [ [4,5,6], [7,8,9], [1,2,3] ]
26
+
27
+ sorter = OfflineSort.sort_by(arrays, chunk_size: 1) do |array|
28
+ array.first
29
+ end
30
+
31
+ # Create a sorted enumerator
32
+ sorter.sort do |sorted|
33
+ # Stream results in sorted order
34
+ sorted.each do |entry|
35
+ # e.g. write to a file
36
+ end
37
+ end
38
+ ```
39
+ The example above will create 3 files with 1 array each, then output them in sorted order. You should try different values of `chunk_size` to find the best speed/memory combination for your use case. In general larger chunk sizes will use more memory but run faster.
40
+
41
+ Sorting is not limited to arrays. You can use anything that can be expressed in an `Enumerable#sort_by` block.
42
+
43
+ ## Using MessagePack
44
+
45
+ MessagePack serialization is faster than the default Ruby `Marshal` strategy. To enable MessagePack serialization, follow these steps:
46
+
47
+ `gem install msgpack`
48
+
49
+ `require 'msgpack'`
50
+
51
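+ Pass `OfflineSort::Chunk::InputOutput::MessagePack` as the `chunk_input_output_class:` keyword argument, e.g. `OfflineSort.sort_by(data, chunk_input_output_class: OfflineSort::Chunk::InputOutput::MessagePack) { |entry| ... }`.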
52
+
53
+ ### Limitations
54
+
55
+ The MessagePack serialize/deserialize process stringifies hash keys, so it is important to write your `sort_by` block in terms of string keys.
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it
60
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
61
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
62
+ 4. Push to the branch (`git push origin my-new-feature`)
63
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  module Chunk
    module InputOutput
      # Abstract reader/writer for one chunk of entries stored on an IO.
      #
      # Subclasses supply the serialization format by overriding #read_entry
      # and #write_entry; everything else (bulk writes, flushing, rewinding,
      # closing and iteration) is implemented here in terms of those two.
      class Base
        # Raised by the default #read_entry / #write_entry implementations.
        MethodNotImplementedError = Class.new(StandardError)

        # The underlying IO object the chunk is read from / written to.
        attr_reader :io

        # @param io [IO] the object entries are read from and written to
        def initialize(io)
          @io = io
        end

        # Reads and returns the next entry. Implementations are expected to
        # raise EOFError when no entry is left, because #each stops on it.
        #
        # @raise [MethodNotImplementedError] unless overridden
        def read_entry
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Serializes a single entry to the IO.
        #
        # @raise [MethodNotImplementedError] unless overridden
        def write_entry(_entry)
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Writes every entry of +entries+ in iteration order via #write_entry.
        def write_entries(entries)
          entries.each { |entry| write_entry(entry) }
        end

        # Flushes the underlying IO.
        def flush
          io.flush
        end

        # Flushes pending writes, then moves the IO back to its start so the
        # entries written so far can be read.
        def rewind
          flush
          io.rewind
        end

        # Closes the underlying IO.
        def close
          io.close
        end

        # Iterates over the remaining entries, stopping at the first EOFError.
        #
        # Without a block an Enumerator is returned (usable for external
        # iteration with #next). With a block each entry is yielded to it;
        # previously a supplied block was silently ignored.
        #
        # @yieldparam entry the next deserialized entry
        # @return [Enumerator] when no block is given
        def each(&block)
          entries = Enumerator.new do |yielder|
            loop do
              yielder.yield(read_entry)
            rescue EOFError
              break
            end
          end

          block ? entries.each(&block) : entries
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk strategy that stores entries with Ruby's built-in Marshal format.
      class Marshal < OfflineSort::Chunk::InputOutput::Base
        # Deserializes the next marshalled object directly from the IO.
        def read_entry
          ::Marshal.load(io) # rubocop:disable Security/MarshalLoad, this is loading from a trusted source
        end

        # Marshals +entry+ and appends the resulting bytes to the IO.
        def write_entry(entry)
          serialized = ::Marshal.dump(entry)
          io.write(serialized)
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'msgpack'
4
+ require 'offline_sort/chunk/input_output/base'
5
+
6
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk strategy backed by the msgpack gem: one Packer for writing and
      # one Unpacker for reading, both bound to the same IO.
      class MessagePack < OfflineSort::Chunk::InputOutput::Base
        attr_reader :packer, :unpacker

        # @param io [IO] shared by the packer and the unpacker
        def initialize(io)
          super(io)
          @packer = ::MessagePack::Packer.new(io)
          @unpacker = ::MessagePack::Unpacker.new(io)
        end

        # Returns the next object decoded by the unpacker.
        def read_entry
          unpacker.read
        end

        # Hands +entry+ to the packer for serialization.
        def write_entry(entry)
          packer.write(entry)
        end

        # Flushes the packer first, then the underlying IO via Base#flush.
        def flush
          packer.flush
          super
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+
5
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk strategy that stores each entry as its own YAML document.
      class Yaml < OfflineSort::Chunk::InputOutput::Base
        # The yaml parser does not expose a document enumerator that we can call next on without loading the entire file,
        # so documents are cut out of the stream by hand (see #next_document).
        #
        # @raise [EOFError] when the IO holds no further document
        # NOTE(review): newer Psych releases may treat YAML.load as a safe load,
        # restricting the classes that round-trip - confirm against the Psych version in use.
        def read_entry
          YAML.load(next_document) # rubocop:disable Security/YAMLLoad, this is loading from a trusted source
        end

        # Appends +entry+ to the IO as a YAML document.
        def write_entry(entry)
          io.write(YAML.dump(entry))
        end

        private

        # Reads lines up to (but excluding) the second line that starts with
        # '---', i.e. exactly one document, and returns them as a String.
        #
        # @raise [EOFError] when nothing could be read
        def next_document
          sio = StringIO.new
          separators_seen = 0

          while (line = io.gets)
            if line.start_with?('---')
              separators_seen += 1

              if separators_seen > 1
                # This line opens the next document: push it back so the next
                # call starts there. IO positions are byte offsets, so the
                # line's byte size (not its character count) must be used.
                io.seek(io.pos - line.bytesize, IO::SEEK_SET)
                break
              end
            end

            sio.write(line)
          end

          raise EOFError unless sio.size > 0 # rubocop:disable Style/ZeroLengthPredicate

          sio.string
        end
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'offline_sort/chunk/input_output/base'
4
+ require 'offline_sort/chunk/input_output/marshal'
5
+ require 'offline_sort/chunk/input_output/message_pack' if defined?(MessagePack)
6
+ require 'offline_sort/chunk/input_output/yaml'
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  # Binary min-heap that lives inside the array it is given and can never hold
  # more elements than that array had at construction time.
  #
  # Ordering is decided by comparing `sort_by.call(element)` values with <=>;
  # without a block the elements themselves are compared.
  class FixedSizeMinHeap
    attr_accessor :array
    attr_reader :sort_by, :size_limit, :heap_end

    # Heapifies +array+ in place.
    #
    # @param array [Array] initial contents; its size becomes the capacity
    # @yieldparam item an element of the heap
    # @yieldreturn the key the element is ordered by
    def initialize(array, &sort_by)
      @array = array
      @sort_by = sort_by || Proc.new { |item| item }
      @size_limit = array.size
      # Index of the last live element; -1 means the heap is empty.
      @heap_end = array.size - 1
      # Sift down every node that has at least one child, bottom-up.
      ((array.size / 2) - 1).downto(0) { |i| heapify(i) }
    end

    # Adds +item+ to the heap.
    #
    # @raise [RuntimeError] when the heap already holds size_limit elements
    def push(item)
      grow_heap
      array[heap_end] = item
      sift_up(heap_end)
    end

    # Removes and returns the smallest element, or nil when the heap is empty.
    #
    # The empty case returns early so the backing array is left untouched
    # (previously `array[0] = array[-1]` grew an empty array to `[nil]`).
    def pop
      return nil if heap_end.negative?

      item = array[0]
      array[0] = array[heap_end]
      # Shrink first so the stale copy of the moved element is outside the
      # range that heapify inspects.
      shrink_heap
      heapify(0)
      item
    end

    private

    # Clears the last live slot and excludes it from the heap.
    def shrink_heap
      array[heap_end] = nil
      @heap_end -= 1
    end

    # Extends the live range by one slot, enforcing the fixed capacity.
    def grow_heap
      raise "Heap Size (#{size_limit}) Exceeded" if heap_end == (size_limit - 1)

      @heap_end += 1
    end

    # True when the element at index i sorts strictly before the one at j.
    def compare(i, j)
      (sort_by.call(array[i]) <=> sort_by.call(array[j])) == -1
    end

    # Swap elements in the array
    def swap(i, j)
      array[i], array[j] = array[j], array[i]
    end

    # Get the parent of the node i > 0.
    def parent(i)
      (i - 1) / 2
    end

    # Get the node left of node i >= 0
    def left(i)
      (2 * i) + 1
    end

    # Get the node right of node i >= 0
    def right(i)
      (2 * i) + 2
    end

    # Sifts the element at index i down until neither live child sorts
    # before it, restoring the min-heap property below i.
    def heapify(i)
      smallest = i

      [left(i), right(i)].each do |child|
        smallest = child if child <= heap_end && compare(child, smallest)
      end

      return if smallest == i

      swap(i, smallest)
      heapify(smallest)
    end

    # Moves the element at index i up while it sorts before its parent.
    def sift_up(i)
      return unless i > 0

      p = parent(i)
      return unless compare(i, p)

      swap(i, p)
      sift_up(p)
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'offline_sort/chunk'
4
+ require 'offline_sort/fixed_size_min_heap'
5
+
6
+ module OfflineSort
7
+ def self.sort_by(*args, **kwargs, &sort_by)
8
+ Sorter.new(*args, **kwargs, &sort_by)
9
+ end
10
+
11
+ class Sorter
12
+ DEFAULT_CHUNK_IO_CLASS = Chunk::InputOutput::Marshal
13
+ DEFAULT_CHUNK_SIZE = 1000
14
+
15
+ attr_reader :enumerable, :sort_by, :chunk_size, :chunk_input_output_class
16
+
17
+ def initialize(enumerable,
18
+ chunk_input_output_class: DEFAULT_CHUNK_IO_CLASS,
19
+ chunk_size: DEFAULT_CHUNK_SIZE,
20
+ &sort_by)
21
+ @enumerable = enumerable
22
+ @chunk_input_output_class = chunk_input_output_class
23
+ @chunk_size = chunk_size
24
+ @sort_by = sort_by
25
+ @temp_files = []
26
+ end
27
+
28
+ def sort
29
+ enumerator = merge(split)
30
+ yield enumerator
31
+ ensure
32
+ @temp_files.each(&:close)
33
+ end
34
+
35
+ private
36
+
37
+ # TODO: optimization for when there is less than a single full chunk of data
38
+ def merge(sorted_chunk_ios)
39
+ pq = []
40
+ chunk_enumerators = sorted_chunk_ios.map(&:each)
41
+
42
+ chunk_enumerators.each_with_index do |chunk, index|
43
+ entry = chunk.next
44
+ pq.push(ChunkEntry.new(index, entry))
45
+ end
46
+
47
+ entry_sort_by = Proc.new { |entry| sort_by.call(entry.data) }
48
+ pq = FixedSizeMinHeap.new(pq, &entry_sort_by)
49
+
50
+ Enumerator.new do |yielder|
51
+ while (item = pq.pop)
52
+ yielder.yield(item.data)
53
+
54
+ begin
55
+ entry = chunk_enumerators[item.chunk_number].next
56
+ pq.push(ChunkEntry.new(item.chunk_number, entry))
57
+ rescue StopIteration
58
+ sorted_chunk_ios[item.chunk_number].close
59
+ end
60
+ end
61
+ end
62
+ end
63
+
64
+ def split
65
+ sorted_chunks = []
66
+ chunk_entries = []
67
+
68
+ enumerable.each do |entry|
69
+ chunk_entries << entry
70
+
71
+ if chunk_entries.size == chunk_size
72
+ sorted_chunks << write_sorted_chunk(chunk_entries)
73
+ chunk_entries.clear
74
+ end
75
+ end
76
+
77
+ sorted_chunks << write_sorted_chunk(chunk_entries) unless chunk_entries.empty?
78
+
79
+ sorted_chunks
80
+ end
81
+
82
+ def write_sorted_chunk(entries)
83
+ file = Tempfile.open('sort-chunk-')
84
+ @temp_files << file
85
+ file.unlink
86
+ file.binmode
87
+
88
+ chunk_io = chunk_input_output_class.new(file)
89
+ entries.sort_by(&sort_by).each { |entry| chunk_io.write_entry(entry) }
90
+
91
+ chunk_io.rewind
92
+ chunk_io
93
+ end
94
+
95
+ class ChunkEntry
96
+ attr_reader :chunk_number, :data
97
+
98
+ def initialize(chunk_number, data)
99
+ @chunk_number = chunk_number
100
+ @data = data
101
+ end
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module OfflineSort
  # Release number of the gem.
  VERSION = '0.3.0'
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'offline_sort/version'
4
+ require 'offline_sort/chunk'
5
+ require 'offline_sort/offline_sort'
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path('lib', __dir__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require 'offline_sort/version'
6
+
7
Gem::Specification.new do |spec|
  spec.name = 'iso-offline-sort'
  spec.version = OfflineSort::VERSION
  spec.authors = ['Isometric']
  spec.email = ['andy@iso.io']
  spec.description = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.summary = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.homepage = 'https://github.com/salsify/offline-sort'
  spec.license = 'MIT'

  # Restrict pushes to rubygems.org and require MFA for privileged operations.
  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = 'https://rubygems.org'
    spec.metadata['rubygems_mfa_required'] = 'true'
  else
    raise 'RubyGems 2.0 or newer is required to set allowed_push_host.'
  end

  # NUL-separated listing with an explicit separator. The previous
  # `$INPUT_RECORD_SEPARATOR` is only defined after `require 'English'`;
  # without it the value is nil and split falls back to whitespace, which
  # breaks any path containing a space.
  spec.files = `git ls-files -z`.split("\x0")

  spec.required_ruby_version = '>= 2.6'

  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'msgpack'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rspec_junit_formatter'
  spec.add_development_dependency 'salsify_rubocop', '~> 1.0.1'
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
# Contract shared by every chunk input/output strategy. Including groups must
# override `chunk_class` with the class under test.
shared_examples "a valid chunk input output" do
  let(:count) { 1000 }

  let(:arrays) do
    Array.new(count) do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:hashes) do
    Array.new(count) do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:tempfile) do
    t = Tempfile.open('chunk-input-output')
    t.binmode
    t
  end

  let(:chunk_class) {}
  let(:chunk_io) { chunk_class.new(tempfile) }

  # Close and delete the temp file so examples do not leak file handles.
  after do
    tempfile.close!
  end

  describe "#rewind" do
    before do
      allow(chunk_io).to receive(:flush)
      allow(tempfile).to receive(:rewind)
      chunk_io.rewind
    end

    it "rewinds the io" do
      expect(tempfile).to have_received(:rewind)
    end

    it "flushes the io" do
      expect(chunk_io).to have_received(:flush)
    end
  end

  describe "#flush" do
    before do
      allow(tempfile).to receive(:flush)
      chunk_io.flush
    end

    it "flushes the io" do
      expect(tempfile).to have_received(:flush)
    end
  end

  # Round trip: write every entry, rewind, read everything back.
  shared_examples "a valid integration test" do
    let(:enumerable) {}

    it "writes the data and reads it back" do
      expect { chunk_io.write_entries(enumerable) }.not_to raise_error

      chunk_io.rewind

      expect(tempfile.size).not_to eq(0)
      expect(chunk_io.each.to_a).to match_array(enumerable)
    end
  end

  context "arrays" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { arrays }
    end
  end

  context "hashes" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { hashes }
    end
  end
end
81
+
82
describe OfflineSort::Chunk::InputOutput::Base do
  let(:io) { Tempfile.new('chunk') }
  let(:chunk_io) { OfflineSort::Chunk::InputOutput::Base.new(io) }
  let(:expected_error_klass) { OfflineSort::Chunk::InputOutput::Base::MethodNotImplementedError }

  # Close and delete the temp file so examples do not leak file handles.
  after do
    io.close!
  end

  describe "#read_entry" do
    it "raises when read_entry is called" do
      expect { chunk_io.read_entry }.to raise_error(expected_error_klass)
    end
  end

  describe "#write_entry" do
    it "raises when write_entry is called" do
      expect { chunk_io.write_entry({}) }.to raise_error(expected_error_klass)
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
# Runs the shared chunk input/output contract against the Marshal strategy.
describe OfflineSort::Chunk::InputOutput::Marshal do
  it_behaves_like "a valid chunk input output" do
    let(:chunk_class) { described_class }
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
# Runs the shared chunk input/output contract against the MessagePack strategy.
describe OfflineSort::Chunk::InputOutput::MessagePack do
  it_behaves_like "a valid chunk input output" do
    let(:chunk_class) { described_class }
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
# Runs the shared chunk input/output contract against the YAML strategy.
describe OfflineSort::Chunk::InputOutput::Yaml do
  it_behaves_like "a valid chunk input output" do
    let(:chunk_class) { described_class }
  end
end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
describe OfflineSort::FixedSizeMinHeap do
  let(:array) { (1..10).to_a.shuffle }
  let(:heap) { OfflineSort::FixedSizeMinHeap.new(array.dup) }

  describe "#initialize" do
    it "is a heap" do
      expect { assert_min_heap(heap.array) }.not_to raise_error
    end
  end

  describe "#push" do
    context "with a full array" do
      it "raises an exception" do
        expect { heap.push(rand(20)) }.to raise_error("Heap Size (#{array.size}) Exceeded")
      end
    end

    context "with one space" do
      before do
        heap.pop
      end

      it "adds to the heap" do
        expect { heap.push(1) }.not_to raise_error
      end
    end

    context "with more than one space" do
      before do
        5.times { heap.pop }
      end

      it "adds to the heap" do
        5.times do
          expect { heap.push(1) }.not_to raise_error
        end
      end
    end
  end

  describe "#pop" do
    context "with empty array" do
      before do
        array.size.times { heap.pop }
      end

      it "is nil" do
        expect(heap.pop).to be nil
      end
    end

    context "until empty" do
      it "is sorted" do
        last = -1
        array.size.times do
          popped = heap.pop
          expect(popped).to be > last
          last = popped
        end
      end
    end
  end

  context "integration test" do
    it "is always heap ordered" do
      100.times do
        heap.pop
        heap.push(rand(100))
        expect { assert_min_heap(heap.array) }.not_to raise_error
      end
    end
  end

  # Raises unless every element is <= both of its children. The failure
  # details go into the exception message instead of being printed to stdout.
  def assert_min_heap(array)
    array.each_with_index do |e, index|
      { 'left' => (2 * index) + 1, 'right' => (2 * index) + 2 }.each do |side, child|
        next unless child < array.size
        next if array[child] >= e

        raise "not a heap: #{side} child #{array[child]} is smaller than parent #{e} in #{array}"
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
describe OfflineSort::Sorter do
  # Sorts `enumerable` offline with the `sort` proc and compares the outcome
  # with an in-memory sort. Including groups override `enumerable` and `sort`.
  shared_examples "a correct offline sort" do
    let(:count) { 10000 }
    let(:entries_per_chunk) { 900 }
    let(:enumerable) {}
    let(:sort) {}

    before do
      @unsorted = enumerable.dup
      r = Benchmark.measure do
        sorter = OfflineSort.sort_by(enumerable, chunk_size: entries_per_chunk, &sort)

        # The enumerator is only readable inside the block, so materialize it here.
        sorter.sort do |result|
          @sorted = result.to_a
        end
      end
      puts r
    end

    it "produces the same sorted result as an in-memory sort" do
      # The input itself must be left untouched, order included.
      expect(@unsorted).to eq(enumerable)

      expect do
        @sorted.each_cons(2).with_index(1) do |(previous, current), position|
          in_order = (sort.call(previous) <=> sort.call(current)) == -1
          raise "Out of order at line #{position}" unless in_order
        end
      end.not_to raise_error

      # `eq` (not `match_array`, which ignores order) so the position of every
      # entry is verified; all sort keys used in this file are unique, which
      # makes the in-memory order deterministic.
      expect(@sorted).to eq(enumerable.sort_by(&sort))
    end
  end

  let(:arrays) do
    Array.new(count) do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:array_sort_index) { 2 }
  let(:array_sort) { Proc.new { |arr| arr[array_sort_index] } }

  let(:hashes) do
    Array.new(count) do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:hash_sort_key) { 'c' }
  let(:hash_sort) { Proc.new { |hash| hash[hash_sort_key] } }

  context "with arrays" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { arrays }
      let(:sort) { array_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          Array.new(count) do |index|
            [index.round(-1), index, SecureRandom.hex]
          end.shuffle
        end
        let(:sort) { Proc.new { |arr| [arr[0], arr[1]] } }
      end
    end
  end

  context "hashes" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { hashes }
      let(:sort) { hash_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          Array.new(count) do |index|
            { 'a' => index.round(-1), 'b' => index, 'c' => SecureRandom.hex }
          end.shuffle
        end
        let(:sort) { Proc.new { |hash| [hash['a'], hash['c']] } }
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
4
+ require 'securerandom'
5
+ require 'benchmark'
6
+ require 'msgpack'
7
+ require 'tempfile'
8
+
9
+ require 'offline_sort'
metadata ADDED
@@ -0,0 +1,163 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iso-offline-sort
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Isometric
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-10-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: msgpack
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec_junit_formatter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: salsify_rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 1.0.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 1.0.1
97
+ description: Offline sort for any enumerable with pluggable serialization strategies
98
+ email:
99
+ - andy@iso.io
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".circleci/config.yml"
105
+ - ".github/CODEOWNERS"
106
+ - ".gitignore"
107
+ - ".rspec"
108
+ - ".rubocop.yml"
109
+ - CHANGELOG.md
110
+ - Gemfile
111
+ - LICENSE.txt
112
+ - README.md
113
+ - Rakefile
114
+ - lib/offline_sort.rb
115
+ - lib/offline_sort/chunk.rb
116
+ - lib/offline_sort/chunk/input_output/base.rb
117
+ - lib/offline_sort/chunk/input_output/marshal.rb
118
+ - lib/offline_sort/chunk/input_output/message_pack.rb
119
+ - lib/offline_sort/chunk/input_output/yaml.rb
120
+ - lib/offline_sort/fixed_size_min_heap.rb
121
+ - lib/offline_sort/offline_sort.rb
122
+ - lib/offline_sort/version.rb
123
+ - offline-sort.gemspec
124
+ - spec/offline_sort/chunk/input_output/base_spec.rb
125
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
126
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
127
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
128
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
129
+ - spec/offline_sort/sorter_spec.rb
130
+ - spec/spec_helper.rb
131
+ homepage: https://github.com/salsify/offline-sort
132
+ licenses:
133
+ - MIT
134
+ metadata:
135
+ allowed_push_host: https://rubygems.org
136
+ rubygems_mfa_required: 'true'
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '2.6'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ requirements: []
152
+ rubygems_version: 3.3.26
153
+ signing_key:
154
+ specification_version: 4
155
+ summary: Offline sort for any enumerable with pluggable serialization strategies
156
+ test_files:
157
+ - spec/offline_sort/chunk/input_output/base_spec.rb
158
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
159
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
160
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
161
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
162
+ - spec/offline_sort/sorter_spec.rb
163
+ - spec/spec_helper.rb