iso-offline-sort 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.circleci/config.yml +84 -0
- data/.github/CODEOWNERS +1 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.rubocop.yml +15 -0
- data/CHANGELOG.md +10 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +63 -0
- data/Rakefile +3 -0
- data/lib/offline_sort/chunk/input_output/base.rb +54 -0
- data/lib/offline_sort/chunk/input_output/marshal.rb +17 -0
- data/lib/offline_sort/chunk/input_output/message_pack.rb +33 -0
- data/lib/offline_sort/chunk/input_output/yaml.rb +44 -0
- data/lib/offline_sort/chunk.rb +6 -0
- data/lib/offline_sort/fixed_size_min_heap.rb +95 -0
- data/lib/offline_sort/offline_sort.rb +104 -0
- data/lib/offline_sort/version.rb +5 -0
- data/lib/offline_sort.rb +5 -0
- data/offline-sort.gemspec +38 -0
- data/spec/offline_sort/chunk/input_output/base_spec.rb +98 -0
- data/spec/offline_sort/chunk/input_output/marshal_spec.rb +9 -0
- data/spec/offline_sort/chunk/input_output/message_pack_spec.rb +9 -0
- data/spec/offline_sort/chunk/input_output/yaml_spec.rb +9 -0
- data/spec/offline_sort/fixed_size_min_heap_spec.rb +96 -0
- data/spec/offline_sort/sorter_spec.rb +103 -0
- data/spec/spec_helper.rb +9 -0
- metadata +163 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4e6bc96d9982a9a958eddb6f6a9ffa2b669c2adfea523cded397e79402488b87
|
4
|
+
data.tar.gz: 5aec83f3929cf85de14dcb28e904f6e6785b17a6c7f221d780fddc9175acc938
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 34d5e7072aabe27a3325cdab2de78528628d595fc17c19317d0d4df9ac499015ca83734369b7e59ed7035a40dc75bde385340196c703f9715422870867c3526b
|
7
|
+
data.tar.gz: 9ee20fd481891bb48a3265ffa7b27fa83c980517b1d34edc04d6623bf7078efdf56bd636405b4e17b05a4a36d7aae24a07db19dd6f09b0a5f9210f63c85de9d1
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# CircleCI pipeline: one lint job plus a test job that is fanned out over a
# Gemfile x Ruby-version matrix by the `build` workflow.
version: 2.1

jobs:
  # Runs Rubocop against a single, fixed Ruby image.
  lint:
    docker:
      - image: cimg/ruby:2.6.6
    working_directory: ~/offline-sort
    steps:
      - checkout
      # Try the exact gemspec/Gemfile cache first, then any cache for this Ruby.
      - restore_cache:
          keys:
            - v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
            - v1-gems-ruby-2.6.6-
      - run:
          name: Install Gems
          command: |
            if ! bundle check --path=vendor/bundle; then
              bundle install --path=vendor/bundle --jobs=4 --retry=3
              bundle clean
            fi
      - save_cache:
          key: v1-gems-ruby-2.6.6-{{ checksum "offline-sort.gemspec" }}-{{ checksum "Gemfile" }}
          paths:
            - 'vendor/bundle'
            - 'gemfiles/vendor/bundle'
      - run:
          name: Run Rubocop
          command: bundle exec rubocop

  # Runs the RSpec suite for one (gemfile, ruby_version) combination.
  test:
    parameters:
      gemfile:
        type: string
      ruby_version:
        type: string
    docker:
      - image: cimg/ruby:<< parameters.ruby_version >>
    # NOTE(review): `environment` is assumed to be job-level; confirm the
    # nesting against the repository copy of this file.
    environment:
      CIRCLE_TEST_REPORTS: 'test-results'
      BUNDLE_GEMFILE: << parameters.gemfile >>
    working_directory: ~/offline-sort
    steps:
      - checkout
      # Gem caching is skipped for the rails_edge gemfile.
      - unless:
          condition:
            equal: ["gemfiles/rails_edge.gemfile", << parameters.gemfile >>]
          steps:
            - restore_cache:
                keys:
                  - v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
                  - v1-gems-ruby-<< parameters.ruby_version >>-
      - run:
          name: Install Gems
          command: |
            if ! bundle check --path=vendor/bundle; then
              bundle install --path=vendor/bundle --jobs=4 --retry=3
              bundle clean
            fi
      - unless:
          condition:
            equal: ["gemfiles/rails_edge.gemfile", << parameters.gemfile >>]
          steps:
            - save_cache:
                key: v1-gems-ruby-<< parameters.ruby_version >>-{{ checksum "offline-sort.gemspec" }}-{{ checksum "<< parameters.gemfile >>" }}
                paths:
                  - 'vendor/bundle'
                  - 'gemfiles/vendor/bundle'
      - run:
          name: Run Tests
          command: |
            bundle exec rspec --format RspecJunitFormatter --out $CIRCLE_TEST_REPORTS/rspec/junit.xml --format progress spec
      - store_test_results:
          path: 'test-results'

workflows:
  build:
    jobs:
      - lint
      - test:
          matrix:
            parameters:
              gemfile:
                - 'Gemfile'
              # Quoted so the versions stay strings rather than floats.
              ruby_version:
                - '2.6.8'
                - '2.7.4'
                - '3.0.2'
|
data/.github/CODEOWNERS
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
* @salsify/pim-core-backend
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Base rules are inherited from the salsify_rubocop gem.
inherit_gem:
  salsify_rubocop: conf/rubocop.yml

AllCops:
  TargetRubyVersion: 2.6
  # Vendored gems and generated gemfiles are not linted.
  Exclude:
    - "vendor/**/*"
    - "gemfiles/**/*"

# Offense count: 9
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
# AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
Naming/MethodParameterName:
  Exclude:
    - "lib/offline_sort/fixed_size_min_heap.rb"
|
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Matthew Cross
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# offline-sort
|
2
|
+
|
3
|
+
Sort arbitrarily large collections of data with limited memory usage. Given an enumerable and a `sort_by` proc, this gem will break the input data into sorted chunks, persist the chunks, and return an `Enumerator`. Data read from this enumerator will be in its final sorted order.
|
4
|
+
|
5
|
+
The size of the chunks and the strategy for serializing and deserializing the data are configurable. The gem comes with builtin strategies for `Marshal`, `MessagePack` and `YAML`.
|
6
|
+
|
7
|
+
The development of this gem is documented in this [post](http://blog.salsify.com/engineering/ruby-scalable-offline-sort) from the Salsify Engineering Blog.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'offline-sort'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install offline-sort
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
```ruby
|
25
|
+
arrays = [ [4,5,6], [7,8,9], [1,2,3] ]
|
26
|
+
|
27
|
+
sorter = OfflineSort.sort_by(arrays, chunk_size: 1) do |array|
|
28
|
+
array.first
|
29
|
+
end
|
30
|
+
|
31
|
+
# Create a sorted enumerator
|
32
|
+
sorter.sort do |sorted|
|
33
|
+
# Stream results in sorted order
|
34
|
+
sorted.each do |entry|
|
35
|
+
# e.g. write to a file
|
36
|
+
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
The example above will create 3 files with 1 array each, then output them in sorted order. You should try different values of `chunk_size` to find the best speed/memory combination for your use case. In general larger chunk sizes will use more memory but run faster.
|
40
|
+
|
41
|
+
Sorting is not limited to arrays. You can use anything that can be expressed in an `Enumerable#sort_by` block.
|
42
|
+
|
43
|
+
## Using MessagePack
|
44
|
+
|
45
|
+
Message pack serialization is faster than the default Ruby `Marshal` strategy. To enable message pack serialization follow these steps.
|
46
|
+
|
47
|
+
`gem install msgpack`
|
48
|
+
|
49
|
+
`require 'msgpack'`
|
50
|
+
|
51
|
+
Pass `OfflineSort::Chunk::InputOutput::MessagePack` as the `chunk_input_output_class:` option of `OfflineSort.sort_by`.
|
52
|
+
|
53
|
+
### Limitations
|
54
|
+
|
55
|
+
The MessagePack serialize/deserialize process stringifies hash keys so it is important to write your sort_by in terms of string keys.
|
56
|
+
|
57
|
+
## Contributing
|
58
|
+
|
59
|
+
1. Fork it
|
60
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
61
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
62
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
63
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OfflineSort
  module Chunk
    module InputOutput
      # Abstract base for chunk readers/writers.
      #
      # Wraps an IO object and defines the interface used to write entries to
      # a chunk and read them back. Subclasses must implement +read_entry+
      # (raising EOFError once the IO is exhausted) and +write_entry+.
      class Base
        # Raised when a subclass has not overridden a required method.
        MethodNotImplementedError = Class.new(StandardError)

        # The underlying IO object entries are read from and written to.
        attr_reader :io

        # @param io [IO] the IO object backing this chunk
        def initialize(io)
          @io = io
        end

        # Reads and returns the next entry from the IO.
        #
        # @raise [MethodNotImplementedError] always; subclasses must override
        def read_entry
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Writes a single entry to the IO.
        #
        # @raise [MethodNotImplementedError] always; subclasses must override
        def write_entry(_entry)
          raise MethodNotImplementedError.new("#{__method__} must be overridden by #{self.class}")
        end

        # Writes every entry of +entries+ via +write_entry+, in iteration order.
        def write_entries(entries)
          entries.each { |entry| write_entry(entry) }
        end

        # Flushes the underlying IO.
        def flush
          io.flush
        end

        # Flushes pending writes, then repositions the IO at its beginning.
        def rewind
          flush
          io.rewind
        end

        # Closes the underlying IO.
        def close
          io.close
        end

        # Iterates over the remaining entries until +read_entry+ raises EOFError.
        #
        # With a block, yields each entry and returns +self+. Without a block,
        # returns an Enumerator over the entries (the historical behaviour, so
        # callers using +each.next+ or +each.to_a+ keep working).
        #
        # @yieldparam entry [Object] the next entry read from the IO
        # @return [Enumerator, self]
        def each
          return enum_for(:each) unless block_given?

          loop do
            yield read_entry
          rescue EOFError
            break
          end

          self
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk reader/writer that serializes entries with Ruby's built-in
      # Marshal format.
      class Marshal < OfflineSort::Chunk::InputOutput::Base
        # Serializes +entry+ and appends the resulting bytes to the IO.
        def write_entry(entry)
          payload = ::Marshal.dump(entry)
          io.write(payload)
        end

        # Deserializes and returns the next entry from the IO.
        # The data was written by this process, so it is a trusted source.
        def read_entry
          ::Marshal.load(io) # rubocop:disable Security/MarshalLoad
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'msgpack'
|
4
|
+
require 'offline_sort/chunk/input_output/base'
|
5
|
+
|
6
|
+
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk reader/writer that serializes entries with MessagePack.
      #
      # A packer and an unpacker are both bound to the same IO: the packer is
      # used by +write_entry+, the unpacker by +read_entry+.
      class MessagePack < OfflineSort::Chunk::InputOutput::Base
        attr_reader :packer, :unpacker

        # @param io [IO] the IO object backing this chunk
        def initialize(io)
          super(io)
          @unpacker = ::MessagePack::Unpacker.new(io)
          @packer = ::MessagePack::Packer.new(io)
        end

        # Packs +entry+ through the packer bound to the IO.
        def write_entry(entry)
          packer.write(entry)
        end

        # Reads the next entry through the unpacker bound to the IO.
        def read_entry
          unpacker.read
        end

        # Flushes the packer first, then the IO itself.
        def flush
          packer.flush
          super
        end
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module OfflineSort
  module Chunk
    module InputOutput
      # Chunk reader/writer that stores each entry as one YAML document.
      #
      # The yaml parser does not expose a document enumerator that we can call
      # next on without loading the entire file, so documents are split by
      # scanning for lines that begin with the document start marker.
      class Yaml < OfflineSort::Chunk::InputOutput::Base
        # Prefix of the line that starts a new YAML document.
        DOCUMENT_START = '---'

        # Reads the next YAML document from the IO and deserializes it.
        #
        # On Psych 4+ `YAML.load` is a safe load that rejects aliases and most
        # classes, which would break reading back what `YAML.dump` wrote. The
        # data was written by this process (a trusted source), so the explicit
        # unsafe loader is used when it exists.
        #
        # @raise [EOFError] when no document is left in the IO
        def read_entry
          document = next_document

          if YAML.respond_to?(:unsafe_load)
            YAML.unsafe_load(document)
          else
            YAML.load(document) # rubocop:disable Security/YAMLLoad, this is loading from a trusted source
          end
        end

        # Appends +entry+ to the IO as a YAML document.
        def write_entry(entry)
          io.write(YAML.dump(entry))
        end

        private

        # Returns the text of the next document, leaving the IO positioned at
        # the start of the following one.
        #
        # @raise [EOFError] when the IO holds no further content
        def next_document
          sio = StringIO.new

          loop do
            # Remember where this line starts so the IO can be put back
            # exactly there; computing the offset from the line's character
            # count is wrong for multibyte text.
            line_start = io.pos
            line = io.gets
            break if line.nil?

            # A start marker after content has been collected belongs to the
            # next document: un-read it and stop without copying it.
            if line.start_with?(DOCUMENT_START) && sio.size > 0 # rubocop:disable Style/ZeroLengthPredicate
              io.seek(line_start, IO::SEEK_SET)
              break
            end

            sio.write(line)
          end

          raise EOFError unless sio.size > 0 # rubocop:disable Style/ZeroLengthPredicate

          sio.string
        end
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OfflineSort
  # Binary min-heap stored in place inside a caller-supplied array.
  #
  # The capacity is fixed to the size of the array given at construction.
  # Elements are ordered by the value the +sort_by+ block returns for them
  # (the element itself when no block is given). Slots past +heap_end+ hold
  # nil.
  class FixedSizeMinHeap
    attr_accessor :array
    attr_reader :sort_by, :size_limit, :heap_end

    # Heap-orders +array+ in place.
    #
    # @param array [Array] initial contents; its size becomes the capacity
    # @yieldparam item [Object] an element of the heap
    # @yieldreturn [Comparable] the key the element is ordered by
    def initialize(array, &sort_by)
      @array = array
      @sort_by = sort_by || proc { |element| element }
      @size_limit = array.size
      @heap_end = array.size - 1

      # Sift down every node that can have children, deepest first.
      last_parent = (array.size / 2) - 1
      last_parent.downto(0) { |index| heapify(index) }
    end

    # Adds +item+ to the heap.
    #
    # @raise [RuntimeError] when the heap is already at capacity
    def push(item)
      grow_heap
      array[heap_end] = item
      sift_up(heap_end)
    end

    # Removes and returns the smallest element, or nil when the root slot is
    # empty.
    def pop
      root = array[0]
      # Move the last element to the root and restore heap order; the stale
      # copy in the last slot is cleared when the heap shrinks.
      array[0] = array[heap_end]
      heapify(0)
      shrink_heap unless root.nil?
      root
    end

    private

    # Clears the last occupied slot and moves the end marker back by one.
    def shrink_heap
      array[heap_end] = nil
      @heap_end -= 1
    end

    # Moves the end marker forward by one, refusing to exceed the capacity.
    def grow_heap
      raise "Heap Size (#{size_limit}) Exceeded" if heap_end == (size_limit - 1)

      @heap_end += 1
    end

    # True when the element at index +i+ sorts strictly before the one at +j+.
    def compare(i, j)
      key_i = sort_by.call(array[i])
      key_j = sort_by.call(array[j])
      (key_i <=> key_j) == -1
    end

    # Exchanges the elements at indices +i+ and +j+.
    def swap(i, j)
      array[i], array[j] = array[j], array[i]
    end

    # Index of the parent of node i > 0.
    def parent(i)
      (i - 1) / 2
    end

    # Index of the left child of node i >= 0.
    def left(i)
      (2 * i) + 1
    end

    # Index of the right child of node i >= 0.
    def right(i)
      (2 * i) + 2
    end

    # Sifts the element at index +i+ down until neither child sorts before it.
    def heapify(i)
      loop do
        smallest = i

        left_child = left(i)
        smallest = left_child if (left_child <= heap_end) && compare(left_child, smallest)

        right_child = right(i)
        smallest = right_child if (right_child <= heap_end) && compare(right_child, smallest)

        break if smallest == i

        swap(i, smallest)
        i = smallest
      end
    end

    # Sifts the element at index +i+ up while it sorts before its parent.
    def sift_up(i)
      while i > 0
        above = parent(i)
        break unless above && compare(i, above)

        swap(i, above)
        i = above
      end
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'offline_sort/chunk'
|
4
|
+
require 'offline_sort/fixed_size_min_heap'
|
5
|
+
|
6
|
+
module OfflineSort
  # Convenience constructor; forwards all arguments and the block to
  # Sorter.new.
  def self.sort_by(*args, **kwargs, &sort_by)
    Sorter.new(*args, **kwargs, &sort_by)
  end

  # External merge sort: splits the input into sorted chunks persisted to
  # temporary files, then merges the chunks through a min-heap.
  class Sorter
    DEFAULT_CHUNK_IO_CLASS = Chunk::InputOutput::Marshal
    DEFAULT_CHUNK_SIZE = 1000

    attr_reader :enumerable, :sort_by, :chunk_size, :chunk_input_output_class

    # @param enumerable [#each] the entries to sort
    # @param chunk_input_output_class [Class] chunk serialization strategy,
    #   instantiated with the temp file of each chunk
    # @param chunk_size [Integer] number of entries written per chunk
    # @yieldparam entry [Object] an entry of +enumerable+
    # @yieldreturn [Comparable] the key the entry is sorted by; when no block
    #   is given, entries are sorted by their own natural order
    def initialize(enumerable,
                   chunk_input_output_class: DEFAULT_CHUNK_IO_CLASS,
                   chunk_size: DEFAULT_CHUNK_SIZE,
                   &sort_by)
      @enumerable = enumerable
      @chunk_input_output_class = chunk_input_output_class
      @chunk_size = chunk_size
      # Without this default a missing block left sort_by nil, which wrote
      # chunks unsorted and raised NoMethodError during the merge.
      @sort_by = sort_by || Proc.new { |entry| entry }
      @temp_files = []
    end

    # Splits and sorts the input, then yields an Enumerator that produces the
    # entries in sorted order. The chunk files are closed when the block
    # returns, so the enumerator must be consumed inside the block.
    #
    # @yieldparam enumerator [Enumerator] entries in sorted order
    def sort
      enumerator = merge(split)
      yield enumerator
    ensure
      @temp_files.each(&:close)
      # Drop the closed handles so repeated calls do not accumulate them.
      @temp_files.clear
    end

    private

    # Merges already-sorted chunks into a single sorted Enumerator.
    #
    # The heap holds at most one pending entry per chunk; each time an entry
    # is emitted, the next entry of the same chunk takes its place.
    #
    # TODO: optimization for when there is less than a single full chunk of data
    def merge(sorted_chunk_ios)
      pq = []
      chunk_enumerators = sorted_chunk_ios.map(&:each)

      # Seed the queue with the first entry of every chunk.
      chunk_enumerators.each_with_index do |chunk, index|
        entry = chunk.next
        pq.push(ChunkEntry.new(index, entry))
      end

      entry_sort_by = Proc.new { |entry| sort_by.call(entry.data) }
      pq = FixedSizeMinHeap.new(pq, &entry_sort_by)

      Enumerator.new do |yielder|
        while (item = pq.pop)
          yielder.yield(item.data)

          begin
            entry = chunk_enumerators[item.chunk_number].next
            pq.push(ChunkEntry.new(item.chunk_number, entry))
          rescue StopIteration
            # This chunk is exhausted; release it right away.
            sorted_chunk_ios[item.chunk_number].close
          end
        end
      end
    end

    # Consumes the input and writes it out as sorted chunks of up to
    # +chunk_size+ entries. Returns the chunk IO objects, rewound for reading.
    def split
      sorted_chunks = []
      chunk_entries = []

      enumerable.each do |entry|
        chunk_entries << entry

        if chunk_entries.size == chunk_size
          sorted_chunks << write_sorted_chunk(chunk_entries)
          chunk_entries.clear
        end
      end

      # Flush the final, partially filled chunk.
      sorted_chunks << write_sorted_chunk(chunk_entries) unless chunk_entries.empty?

      sorted_chunks
    end

    # Sorts +entries+ in memory and writes them to a fresh temp file.
    # Returns the chunk IO wrapping that file, rewound to the beginning.
    def write_sorted_chunk(entries)
      file = Tempfile.open('sort-chunk-')
      @temp_files << file
      # Unlink immediately; the open handle keeps the data readable.
      file.unlink
      file.binmode

      chunk_io = chunk_input_output_class.new(file)
      entries.sort_by(&sort_by).each { |entry| chunk_io.write_entry(entry) }

      chunk_io.rewind
      chunk_io
    end

    # An entry waiting in the merge queue, tagged with the chunk it came from.
    class ChunkEntry
      attr_reader :chunk_number, :data

      def initialize(chunk_number, data)
        @chunk_number = chunk_number
        @data = data
      end
    end
  end
end
|
data/lib/offline_sort.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'offline_sort/version'
|
6
|
+
|
7
|
+
# Gem specification for iso-offline-sort.
Gem::Specification.new do |spec|
  spec.name = 'iso-offline-sort'
  spec.version = OfflineSort::VERSION
  spec.authors = ['Isometric']
  spec.email = ['andy@iso.io']
  spec.description = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.summary = 'Offline sort for any enumerable with pluggable serialization strategies'
  spec.homepage = 'https://github.com/salsify/offline-sort'
  spec.license = 'MIT'

  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = 'https://rubygems.org'
    spec.metadata['rubygems_mfa_required'] = 'true'
  else
    raise 'RubyGems 2.0 or newer is required to set allowed_push_host.'
  end

  # List tracked files NUL-separated. The previous `$INPUT_RECORD_SEPARATOR`
  # is only defined after `require 'English'`; without it the global is nil
  # and the listing is split on any whitespace, breaking paths with spaces.
  spec.files = `git ls-files -z`.split("\x0")

  spec.required_ruby_version = '>= 2.6'

  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'msgpack'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rspec_junit_formatter'
  spec.add_development_dependency 'salsify_rubocop', '~> 1.0.1'
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Shared contract for every chunk input/output strategy. Including groups
# must override +chunk_class+ with the class under test.
shared_examples "a valid chunk input output" do
  let(:count) { 1000 }

  let(:arrays) do
    Array.new(count) do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:hashes) do
    Array.new(count) do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:tempfile) do
    t = Tempfile.open('chunk-input-output')
    t.binmode
    t
  end

  let(:chunk_class) {}
  let(:chunk_io) { chunk_class.new(tempfile) }

  # Close and delete the temp file so examples do not leak file handles.
  after do
    tempfile.close!
  end

  describe "#rewind" do
    before do
      allow(chunk_io).to receive(:flush)
      allow(tempfile).to receive(:rewind)
      chunk_io.rewind
    end

    it "rewinds the io" do
      expect(tempfile).to have_received(:rewind)
    end

    it "flushes the io" do
      expect(chunk_io).to have_received(:flush)
    end
  end

  describe "#flush" do
    before do
      allow(tempfile).to receive(:flush)
      chunk_io.flush
    end

    it "flushes the io" do
      expect(tempfile).to have_received(:flush)
    end
  end

  # Round-trips +enumerable+ through the chunk io and compares the result.
  shared_examples "a valid integration test" do
    let(:enumerable) {}

    it "writes the data and reads it back" do
      expect { chunk_io.write_entries(enumerable) }.not_to raise_error

      chunk_io.rewind

      expect(tempfile.size).not_to eq(0)
      expect(chunk_io.each.to_a).to match_array(enumerable)
    end
  end

  context "arrays" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { arrays }
    end
  end

  context "hashes" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { hashes }
    end
  end
end
|
81
|
+
|
82
|
+
# The abstract base must refuse to read or write entries itself.
describe OfflineSort::Chunk::InputOutput::Base do
  let(:io) { Tempfile.new('chunk') }
  let(:chunk_io) { OfflineSort::Chunk::InputOutput::Base.new(io) }
  let(:expected_error_klass) { OfflineSort::Chunk::InputOutput::Base::MethodNotImplementedError }

  # Close and delete the temp file so examples do not leak file handles.
  after do
    io.close!
  end

  describe "#read_entry" do
    it "raises when read_entry is called" do
      expect { chunk_io.read_entry }.to raise_error(expected_error_klass)
    end
  end

  describe "#write_entry" do
    it "raises when write_entry is called" do
      expect { chunk_io.write_entry({}) }.to raise_error(expected_error_klass)
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe OfflineSort::FixedSizeMinHeap do
  let(:array) { (1..10).to_a.shuffle }
  let(:heap) { OfflineSort::FixedSizeMinHeap.new(array.dup) }

  describe "#initialize" do
    it "is a heap" do
      expect { assert_min_heap(heap.array) }.not_to raise_error
    end
  end

  describe "#push" do
    context "with a full array" do
      it "raises an exception" do
        expect { heap.push(rand(20)) }.to raise_error("Heap Size (#{array.size}) Exceeded")
      end
    end

    context "with one space" do
      before do
        heap.pop
      end

      it "adds to the heap" do
        expect { heap.push(1) }.not_to raise_error
      end
    end

    context "with more than one space" do
      before do
        5.times { heap.pop }
      end

      it "adds to the heap" do
        5.times do
          expect { heap.push(1) }.not_to raise_error
        end
      end
    end
  end

  describe "#pop" do
    context "with empty array" do
      before do
        array.size.times { heap.pop }
      end

      it "is nil" do
        expect(heap.pop).to be nil
      end
    end

    context "until empty" do
      it "is sorted" do
        last = -1
        array.size.times do
          popped = heap.pop
          expect(popped).to be > (last)
          last = popped
        end
      end
    end
  end

  context "integration test" do
    it "is always heap ordered" do
      100.times do
        heap.pop
        heap.push(rand(100))
        expect { assert_min_heap(heap.array) }.not_to raise_error
      end
    end
  end

  # Raises unless every element of +array+ is <= both of its children.
  # The offending element and the whole array go into the error message so
  # the failure output explains itself without printing to stdout.
  def assert_min_heap(array)
    array.each_with_index do |e, index|
      left = (2 * index) + 1
      right = (2 * index) + 2

      raise "not a heap: left child of #{e} in #{array}" if left < array.size && array[left] < e

      next unless right < array.size

      raise "not a heap: right child of #{e} in #{array}" unless array[right] >= e
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe OfflineSort::Sorter do
  # Sorts +enumerable+ offline by +sort+ and compares against an in-memory
  # sort. Including groups must override +enumerable+ and +sort+.
  shared_examples "a correct offline sort" do
    let(:count) { 10000 }
    let(:entries_per_chunk) { 900 }
    let(:enumerable) {}
    let(:sort) {}

    before do
      # Keep a copy to verify that sorting leaves the input untouched.
      @unsorted = enumerable.dup
      r = Benchmark.measure do
        sorter = OfflineSort.sort_by(enumerable, chunk_size: entries_per_chunk, &sort)

        sorter.sort do |result|
          @sorted = result.to_a
        end
      end
      puts r
    end

    it "produces the same sorted result as an in-memory sort" do
      expect(@unsorted).to match_array(enumerable)
      expect do
        last = nil
        entry_count = 0
        @sorted.each do |entry|
          if last.nil?
            last = entry
            entry_count += 1
            next
          end

          raise "Out of order at line #{entry_count}" unless (sort.call(last) <=> sort.call(entry)) == -1

          last = entry
          entry_count += 1
        end
      end.not_to raise_error
      # `eq` (not `match_array`) so the order of the result is verified too.
      # The sort keys used by these examples are unique, so the in-memory
      # sort has a single valid ordering.
      expect(@sorted).to eq(enumerable.sort_by(&sort))
    end
  end

  let(:arrays) do
    Array.new(count) do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:array_sort_index) { 2 }
  let(:array_sort) { Proc.new { |arr| arr[array_sort_index] } }

  let(:hashes) do
    Array.new(count) do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:hash_sort_key) { 'c' }
  let(:hash_sort) { Proc.new { |hash| hash[hash_sort_key] } }

  context "with arrays" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { arrays }
      let(:sort) { array_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          Array.new(count) do |index|
            [index.round(-1), index, SecureRandom.hex]
          end.shuffle
        end
        let(:sort) { Proc.new { |arr| [arr[0], arr[1]] } }
      end
    end
  end

  context "hashes" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { hashes }
      let(:sort) { hash_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          Array.new(count) do |index|
            { 'a' => index.round(-1), 'b' => index, 'c' => SecureRandom.hex }
          end.shuffle
        end
        let(:sort) { Proc.new { |hash| [hash['a'], hash['c']] } }
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iso-offline-sort
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Isometric
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-10-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: msgpack
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec_junit_formatter
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: salsify_rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.0.1
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.0.1
|
97
|
+
description: Offline sort for any enumerable with pluggable serialization strategies
|
98
|
+
email:
|
99
|
+
- andy@iso.io
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".circleci/config.yml"
|
105
|
+
- ".github/CODEOWNERS"
|
106
|
+
- ".gitignore"
|
107
|
+
- ".rspec"
|
108
|
+
- ".rubocop.yml"
|
109
|
+
- CHANGELOG.md
|
110
|
+
- Gemfile
|
111
|
+
- LICENSE.txt
|
112
|
+
- README.md
|
113
|
+
- Rakefile
|
114
|
+
- lib/offline_sort.rb
|
115
|
+
- lib/offline_sort/chunk.rb
|
116
|
+
- lib/offline_sort/chunk/input_output/base.rb
|
117
|
+
- lib/offline_sort/chunk/input_output/marshal.rb
|
118
|
+
- lib/offline_sort/chunk/input_output/message_pack.rb
|
119
|
+
- lib/offline_sort/chunk/input_output/yaml.rb
|
120
|
+
- lib/offline_sort/fixed_size_min_heap.rb
|
121
|
+
- lib/offline_sort/offline_sort.rb
|
122
|
+
- lib/offline_sort/version.rb
|
123
|
+
- offline-sort.gemspec
|
124
|
+
- spec/offline_sort/chunk/input_output/base_spec.rb
|
125
|
+
- spec/offline_sort/chunk/input_output/marshal_spec.rb
|
126
|
+
- spec/offline_sort/chunk/input_output/message_pack_spec.rb
|
127
|
+
- spec/offline_sort/chunk/input_output/yaml_spec.rb
|
128
|
+
- spec/offline_sort/fixed_size_min_heap_spec.rb
|
129
|
+
- spec/offline_sort/sorter_spec.rb
|
130
|
+
- spec/spec_helper.rb
|
131
|
+
homepage: https://github.com/salsify/offline-sort
|
132
|
+
licenses:
|
133
|
+
- MIT
|
134
|
+
metadata:
|
135
|
+
allowed_push_host: https://rubygems.org
|
136
|
+
rubygems_mfa_required: 'true'
|
137
|
+
post_install_message:
|
138
|
+
rdoc_options: []
|
139
|
+
require_paths:
|
140
|
+
- lib
|
141
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '2.6'
|
146
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
147
|
+
requirements:
|
148
|
+
- - ">="
|
149
|
+
- !ruby/object:Gem::Version
|
150
|
+
version: '0'
|
151
|
+
requirements: []
|
152
|
+
rubygems_version: 3.3.26
|
153
|
+
signing_key:
|
154
|
+
specification_version: 4
|
155
|
+
summary: Offline sort for any enumerable with pluggable serialization strategies
|
156
|
+
test_files:
|
157
|
+
- spec/offline_sort/chunk/input_output/base_spec.rb
|
158
|
+
- spec/offline_sort/chunk/input_output/marshal_spec.rb
|
159
|
+
- spec/offline_sort/chunk/input_output/message_pack_spec.rb
|
160
|
+
- spec/offline_sort/chunk/input_output/yaml_spec.rb
|
161
|
+
- spec/offline_sort/fixed_size_min_heap_spec.rb
|
162
|
+
- spec/offline_sort/sorter_spec.rb
|
163
|
+
- spec/spec_helper.rb
|