offline-sort 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4cc629685e889a5efc942757a27d66552ca71d20
4
+ data.tar.gz: de90414c864b38c29a810a19b52fa99ae25adf7b
5
+ SHA512:
6
+ metadata.gz: 4ce8bbfde9c3b183959ac4c204b0c56f1aa6f2162acffac1794903be81b9aa6e76f75a7ad92079da029b8f6c1b06e84d729075c5af6ac6fa348c3d8caea0d5b5
7
+ data.tar.gz: 3f24c42d692b7be89a55c89efcc80f582af04b89405aed64e3820567fbc80def0f30738957fa0906220e8e4e396872597eefe289fdcd913f00ff60d7f97d3299
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in offline-sort.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Matthew Cross
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # offline-sort
2
+
3
+ Sort arbitrarily large collections of data with limited memory usage. Given an enumerable and a `sort_by` proc, this gem will break the input data into sorted chunks, persist the chunks, and return an `Enumerator`. Data read from this enumerator will be in its final sorted order.
4
+
5
+ The size of the chunks and the strategy for serializing and deserializing the data are configurable. The gem comes with builtin strategies for `Marshal`, `MessagePack` and `YAML`.
6
+
7
+ The development of this gem is documented in this [post](http://blog.salsify.com/engineering/ruby-scalable-offline-sort) from the Salsify Engineering Blog.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'offline-sort'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install offline-sort
22
+
23
+ ## Usage
24
+ ```ruby
25
+ arrays = [ [4,5,6], [7,8,9], [1,2,3] ]
26
+
27
+ # Create a sorted enumerator
28
+ sorted = OfflineSort.sort(arrays, chunk_size: 1) do |array|
29
+ array.first
30
+ end
31
+
32
+ # Stream results in sorted order
33
+ sorted.each do |entry|
34
+ # e.g. write to a file
35
+ end
36
+ ```
37
+ The example above will create 3 files with 1 array each, then output them in sorted order. You should try different values of `chunk_size` to find the best speed/memory combination for your use case. In general larger chunk sizes will use more memory but run faster.
38
+
39
+ Sorting is not limited to arrays. You can use anything that can be expressed in an `Enumerable#sort_by` block.
40
+
41
+ ## Using MessagePack
42
+
43
+ MessagePack serialization is faster than the default Ruby `Marshal` strategy. To enable MessagePack serialization, follow these steps.
44
+
45
+ `gem install msgpack`
46
+
47
+ `require 'msgpack'`
48
+
49
+ Requiring MessagePack before you require `offline_sort` will automatically enable MessagePack serialization in the gem.
50
+
51
+ ### Limitations
52
+
53
+ The MessagePack serialize/deserialize process stringifies hash keys, so it is important to write your `sort_by` block in terms of string keys.
54
+
55
+ ## Contributing
56
+
57
+ 1. Fork it
58
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
59
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
60
+ 4. Push to the branch (`git push origin my-new-feature`)
61
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,55 @@
1
module OfflineSort
  module Chunk
    module InputOutput

      # Abstract wrapper around an IO object that holds one chunk of entries.
      #
      # Subclasses choose the serialization format by overriding #read_entry
      # and #write_entry. Bulk writing, flushing, rewinding, closing and
      # iteration are implemented here on top of those two methods.
      class Base
        # Raised by the default #read_entry / #write_entry implementations.
        MethodNotImplementedError = Class.new(StandardError)

        # The wrapped IO object, exactly as passed to #initialize.
        attr_reader :io

        # @param io [#flush, #rewind, #close] stream that entries are read
        #   from and written to
        def initialize(io)
          @io = io
        end

        # Reads the next entry from the stream. #each relies on this raising
        # EOFError once the stream is exhausted.
        #
        # @raise [MethodNotImplementedError] unless overridden by a subclass
        def read_entry
          raise(MethodNotImplementedError, "#{__method__} must be overridden by #{self.class}")
        end

        # Serializes one entry to the stream.
        #
        # @raise [MethodNotImplementedError] unless overridden by a subclass
        def write_entry(entry)
          raise(MethodNotImplementedError, "#{__method__} must be overridden by #{self.class}")
        end

        # Writes every entry of +entries+ in iteration order via #write_entry.
        def write_entries(entries)
          entries.each { |entry| write_entry(entry) }
        end

        # Flushes the underlying IO.
        def flush
          io.flush
        end

        # Flushes pending writes, then moves the IO back to its start so the
        # entries just written can be read.
        def rewind
          flush
          io.rewind
        end

        # Closes the underlying IO.
        def close
          io.close
        end

        # Iterates over the remaining entries until #read_entry raises
        # EOFError.
        #
        # With a block, each entry is yielded and +self+ is returned. Without
        # a block an Enumerator is returned, as before, so existing callers
        # such as +chunk_io.each.to_a+ or +chunk_io.each.next+ keep working.
        #
        # @yield [entry] each entry read from the stream
        # @return [Enumerator, self]
        def each
          return to_enum(:each) unless block_given?

          # A plain `while` is used rather than `loop`, because `loop` would
          # silently swallow a StopIteration raised by the consumer's block.
          while true
            begin
              entry = read_entry
            rescue EOFError
              # Only end-of-stream from the read itself ends the iteration;
              # an EOFError raised by the consumer is no longer swallowed.
              break
            end

            yield entry
          end

          self
        end
      end

    end
  end
end
+
@@ -0,0 +1,15 @@
1
# Loaded explicitly so this file can be required on its own, the same way
# message_pack.rb requires its base class.
require 'offline_sort/chunk/input_output/base'

module OfflineSort
  module Chunk
    module InputOutput
      # Chunk IO strategy that serializes entries with Ruby's built-in Marshal.
      class Marshal < OfflineSort::Chunk::InputOutput::Base
        # Reads the next marshalled entry directly from the wrapped IO.
        # NOTE(review): Marshal.load must only be used on trusted data; the
        # Sorter in this gem only feeds it chunk files it wrote itself.
        def read_entry
          ::Marshal.load(io)
        end

        # Appends the marshalled form of +entry+ to the wrapped IO.
        def write_entry(entry)
          io.write(::Marshal.dump(entry))
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
require 'msgpack'
require 'offline_sort/chunk/input_output/base'

module OfflineSort
  module Chunk
    module InputOutput
      # Chunk IO strategy that serializes entries with MessagePack.
      #
      # One Packer and one Unpacker are created over the same IO: the packer
      # is used for writes and the unpacker for reads.
      class MessagePack < OfflineSort::Chunk::InputOutput::Base
        attr_reader :packer, :unpacker

        # @param io [IO] stream shared by the packer and the unpacker
        def initialize(io)
          super
          @packer = ::MessagePack::Packer.new(io)
          @unpacker = ::MessagePack::Unpacker.new(io)
        end

        # Reads the next entry through the unpacker.
        def read_entry
          unpacker.read
        end

        # Queues +entry+ on the packer; it reaches the IO on #flush.
        def write_entry(entry)
          packer.write(entry)
        end

        # Pushes the packer's buffered bytes to the IO, then flushes the IO.
        def flush
          packer.flush
          super
        end

        # Rewinds the IO and discards anything the unpacker has buffered, so a
        # rewind after a partial read starts again from the first entry
        # instead of continuing from stale buffered bytes.
        #
        # @return whatever Base#rewind returns
        def rewind
          result = super
          unpacker.reset
          result
        end
      end
    end
  end
end
32
+
@@ -0,0 +1,45 @@
1
require 'stringio'
require 'yaml'

module OfflineSort
  module Chunk
    module InputOutput
      # Chunk IO strategy that stores each entry as its own YAML document.
      #
      # The YAML parser does not expose a document enumerator that can be
      # advanced without loading the entire file, so documents are split
      # manually on lines that start with the '---' marker.
      class Yaml < OfflineSort::Chunk::InputOutput::Base
        # Prefix of the line that opens every document in the stream.
        DOCUMENT_MARKER = '---'.freeze

        # Parses and returns the next document in the stream.
        #
        # NOTE(review): on Psych 4 and later YAML.load only permits basic
        # types; confirm that is sufficient for the entries being sorted.
        #
        # @raise [EOFError] when no further document is available
        def read_entry
          YAML.load(next_document)
        end

        # Appends +entry+ to the stream as a YAML document.
        def write_entry(entry)
          io.write(YAML.dump(entry))
        end

        private

        # Collects the raw text of the next document: every line up to, but
        # not including, the marker line that opens the following document.
        #
        # @return [String] text of exactly one YAML document
        # @raise [EOFError] if the stream holds no more lines
        def next_document
          buffer = StringIO.new
          markers_seen = 0

          while (line = io.gets)
            if line.start_with?(DOCUMENT_MARKER)
              markers_seen += 1

              if markers_seen > 1
                # This line belongs to the next document: step the stream back
                # so the next call re-reads it. The offset is measured in
                # bytes, because a marker line such as "--- héllo" has fewer
                # characters than bytes on a non-binary stream.
                io.seek(io.pos - line.bytesize, IO::SEEK_SET)
                break
              end
            end

            buffer.write(line)
          end

          raise EOFError unless buffer.size > 0

          buffer.string
        end
      end
    end
  end
end
45
+
@@ -0,0 +1,4 @@
1
+ require 'offline_sort/chunk/input_output/base'
2
+ require 'offline_sort/chunk/input_output/marshal'
3
+ require 'offline_sort/chunk/input_output/message_pack' if defined?(MessagePack)
4
+ require 'offline_sort/chunk/input_output/yaml'
@@ -0,0 +1,90 @@
1
+ module OfflineSort
2
+ class FixedSizeMinHeap
3
+ attr_accessor :array
4
+ attr_reader :sort_by
5
+ attr_reader :size_limit
6
+ attr_reader :heap_end
7
+
8
+ def initialize(array, &sort_by)
9
+ @array = array
10
+ @sort_by = sort_by || Proc.new { |item| item }
11
+ @size_limit = array.size
12
+ @heap_end = array.size - 1
13
+ ((array.size * 0.5) - 1).to_i.downto(0) { |i| heapify(i) }
14
+ end
15
+
16
+ def push(item)
17
+ grow_heap
18
+ array[heap_end] = item
19
+ sift_up(heap_end)
20
+ end
21
+
22
+ def pop
23
+ item = array[0]
24
+ array[0] = array[heap_end]
25
+ heapify(0)
26
+ shrink_heap unless item.nil?
27
+ item
28
+ end
29
+
30
+ private
31
+
32
+ def shrink_heap
33
+ array[heap_end] = nil
34
+ @heap_end -= 1
35
+ end
36
+
37
+ def grow_heap
38
+ raise "Heap Size (#{size_limit}) Exceeded" if heap_end == (size_limit - 1)
39
+ @heap_end += 1
40
+ end
41
+
42
+ # Compare elements at the supplied indices
43
+ def compare(i,j)
44
+ (sort_by.call(array[i]) <=> sort_by.call(array[j])) == -1
45
+ end
46
+
47
+ # Swap elements in the array
48
+ def swap(i,j)
49
+ temp = array[i]
50
+ array[i] = array[j]
51
+ array[j] = temp
52
+ end
53
+
54
+ # Get the parent of the node i > 0.
55
+ def parent(i)
56
+ (i - 1) / 2
57
+ end
58
+ # Get the node left of node i >= 0
59
+ def left(i)
60
+ (2 * i) + 1
61
+ end
62
+ # Get the node right of node i >= 0
63
+ def right(i)
64
+ (2 * i) + 2
65
+ end
66
+
67
+ # Keeps an heap sorted with the smallest (largest) element on top
68
+ def heapify(i)
69
+ l = left(i)
70
+ top = ((l <= heap_end) && compare(l,i)) ? l : i
71
+
72
+ r = right(i)
73
+ top = ((r <= heap_end) && compare(r,top)) ? r : top
74
+
75
+ if top != i
76
+ swap(i, top)
77
+ heapify(top)
78
+ end
79
+ end
80
+
81
+ def sift_up(i)
82
+ if i > 0 && p = parent(i)
83
+ if compare(i,p)
84
+ swap(i,p);
85
+ sift_up(p)
86
+ end
87
+ end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,95 @@
1
require 'tempfile'

require 'offline_sort/chunk'
require 'offline_sort/fixed_size_min_heap'

module OfflineSort
  # Convenience entry point: builds a Sorter from the given arguments and
  # returns its sorted Enumerator.
  #
  # Keyword options are forwarded explicitly so they still reach
  # Sorter#initialize as keywords on Ruby 3, where a hash captured by *args
  # is no longer converted back into keyword arguments.
  #
  # @yield [entry] block returning the sort key for an entry
  # @return [Enumerator] entries in sorted order
  def self.sort(*args, **options, &sort_by)
    Sorter.new(*args, **options, &sort_by).sort
  end

  # Sorts an enumerable using bounded memory: the input is cut into chunks of
  # at most +chunk_size+ entries, each chunk is sorted in memory and written
  # to its own temp file, and the chunk files are then merged with a min-heap
  # that holds one entry per chunk.
  class Sorter
    DEFAULT_CHUNK_IO_CLASS = defined?(::MessagePack) ? Chunk::InputOutput::MessagePack : Chunk::InputOutput::Marshal
    DEFAULT_CHUNK_SIZE = 1000

    attr_reader :enumerable, :sort_by, :chunk_size, :chunk_input_output_class

    # @param enumerable [#each] entries to sort
    # @param chunk_input_output_class [Class] serialization strategy used for
    #   the chunk files
    # @param chunk_size [Integer] maximum number of entries per chunk
    # @yield [entry] block returning the sort key for an entry; when omitted
    #   the entry itself is the key (previously a missing block failed with
    #   NoMethodError during the merge)
    def initialize(enumerable, chunk_input_output_class: DEFAULT_CHUNK_IO_CLASS, chunk_size: DEFAULT_CHUNK_SIZE, &sort_by)
      @enumerable = enumerable
      @chunk_input_output_class = chunk_input_output_class
      @chunk_size = chunk_size
      @sort_by = sort_by || Proc.new { |entry| entry }
    end

    # Splits the input into sorted chunk files and returns an Enumerator over
    # the merged result. The input is consumed eagerly, when this is called.
    #
    # @return [Enumerator] single-pass enumerator of entries in sorted order
    def sort
      merge(split)
    end

    private

    #TODO optimization for when there is less than a single full chunk of data
    #
    # Merges already-sorted chunk IOs. The heap always holds the next unread
    # entry of every chunk that still has entries; popping the smallest and
    # refilling from the same chunk yields the globally sorted order.
    #
    # The returned Enumerator is single-pass: the heap and the chunk files
    # are consumed as it is iterated.
    def merge(sorted_chunk_ios)
      chunk_enumerators = sorted_chunk_ios.map(&:each)
      closed = Array.new(sorted_chunk_ios.size, false)

      heads = chunk_enumerators.each_with_index.map do |chunk, index|
        ChunkEntry.new(index, chunk.next)
      end

      entry_sort_by = Proc.new { |entry| sort_by.call(entry.data) }
      pq = FixedSizeMinHeap.new(heads, &entry_sort_by)

      Enumerator.new do |yielder|
        begin
          while (item = pq.pop)
            yielder.yield(item.data)

            begin
              entry = chunk_enumerators[item.chunk_number].next
              pq.push(ChunkEntry.new(item.chunk_number, entry))
            rescue StopIteration
              # This chunk is exhausted; release its file right away.
              sorted_chunk_ios[item.chunk_number].close
              closed[item.chunk_number] = true
            end
          end
        ensure
          # If the consumer stops early (break, first(n), an exception), the
          # remaining chunk files would otherwise stay open.
          sorted_chunk_ios.each_with_index do |chunk_io, index|
            next if closed[index]

            chunk_io.close
            closed[index] = true
          end
        end
      end
    end

    # Reads the whole input, writing one sorted chunk file for every
    # +chunk_size+ entries plus one for any remainder.
    #
    # @return [Array] chunk IOs, each rewound and ready for reading
    def split
      sorted_chunks = []
      chunk_entries = []

      enumerable.each do |entry|
        chunk_entries << entry
        next unless chunk_entries.size == chunk_size

        sorted_chunks << write_sorted_chunk(chunk_entries)
        chunk_entries.clear
      end

      sorted_chunks << write_sorted_chunk(chunk_entries) unless chunk_entries.empty?

      sorted_chunks
    end

    # Sorts +entries+ in memory, writes them to a new binary-mode temp file
    # through the configured chunk IO class, and rewinds it for reading.
    def write_sorted_chunk(entries)
      file = Tempfile.open('sort-chunk-')
      file.binmode

      chunk_io = chunk_input_output_class.new(file)
      entries.sort_by(&sort_by).each { |entry| chunk_io.write_entry(entry) }

      chunk_io.rewind
      chunk_io
    end

    # Heap element: an entry together with the index of the chunk it came
    # from, so the heap can be refilled from that same chunk.
    class ChunkEntry
      attr_reader :chunk_number, :data

      def initialize(chunk_number, data)
        @chunk_number = chunk_number
        @data = data
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module OfflineSort
  # Version of the gem; read by the gemspec. Frozen so the constant cannot be
  # mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,3 @@
1
+ require 'offline_sort/version'
2
+ require 'offline_sort/chunk'
3
+ require 'offline_sort/offline_sort'
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for offline-sort.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'offline_sort/version'

Gem::Specification.new do |spec|
  spec.name          = "offline-sort"
  spec.version       = OfflineSort::VERSION
  spec.authors       = ["Matthew Cross"]
  spec.email         = ["mcross@salsify.com"]
  spec.description   = %q{Offline sort for any enumerable with pluggable serialization strategies}
  spec.summary       = %q{Offline sort for any enumerable with pluggable serialization strategies}
  spec.homepage      = "https://github.com/salsify/offline-sort"
  spec.license       = "MIT"

  # NUL-delimited output keeps file names containing whitespace or newlines
  # intact, which splitting on the record separator does not.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # No runtime dependencies: the merge uses the bundled FixedSizeMinHeap and
  # nothing under lib/ requires the previously declared "pqueue" gem.
  # MessagePack support is optional and enabled by requiring msgpack first.

  spec.add_development_dependency "msgpack"
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
end
@@ -0,0 +1,95 @@
1
require 'spec_helper'
require 'tempfile'

# Shared contract for every chunk IO strategy. Including specs must override
# +chunk_class+ with the class under test.
shared_examples "a valid chunk input output" do
  let(:count) { 1000 }

  let(:arrays) do
    count.times.map do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:hashes) do
    count.times.map do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:tempfile) do
    t = Tempfile.open('chunk-input-output')
    t.binmode
    t
  end

  let(:chunk_class) { }
  let(:chunk_io) { chunk_class.new(tempfile) }

  # Close and unlink the temp file so examples do not leak file handles.
  after { tempfile.close! }

  describe "#rewind" do
    before do
      allow(chunk_io).to receive(:flush)
      allow(tempfile).to receive(:rewind)
      chunk_io.rewind
    end

    it "rewinds the io" do
      expect(tempfile).to have_received(:rewind)
    end

    it "flushes the io" do
      expect(chunk_io).to have_received(:flush)
    end
  end

  describe "#flush" do
    before do
      allow(tempfile).to receive(:flush)
      chunk_io.flush
    end

    it "flushes the io" do
      expect(tempfile).to have_received(:flush)
    end
  end

  shared_examples "a valid integration test" do
    let(:enumerable) {}

    it "writes the data and reads it back" do
      expect { chunk_io.write_entries(enumerable) }.not_to raise_error

      chunk_io.rewind

      expect(tempfile.size).not_to eq(0)
      # Entries must come back in the order they were written, so compare
      # with eq rather than the order-insensitive match_array.
      expect(chunk_io.each.to_a).to eq(enumerable)
    end
  end

  context "arrays" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { arrays }
    end
  end

  context "hashes" do
    it_behaves_like "a valid integration test" do
      let(:enumerable) { hashes }
    end
  end
end

describe OfflineSort::Chunk::InputOutput::Base do
  let(:io) { Tempfile.new('chunk') }
  let(:chunk_io) { OfflineSort::Chunk::InputOutput::Base.new(io) }

  after { io.close! }

  describe "#read_entry" do
    it "raises when read_entry is called" do
      expect { chunk_io.read_entry }.to raise_error(OfflineSort::Chunk::InputOutput::Base::MethodNotImplementedError)
    end
  end

  describe "#write_entry" do
    it "raises when write_entry is called" do
      expect { chunk_io.write_entry({}) }.to raise_error(OfflineSort::Chunk::InputOutput::Base::MethodNotImplementedError)
    end
  end
end
@@ -0,0 +1,7 @@
1
# Runs the shared "a valid chunk input output" examples with chunk_class set
# to the Marshal-backed chunk IO.
+ require 'spec_helper'
2
+
3
+ describe OfflineSort::Chunk::InputOutput::Marshal do
4
+ it_behaves_like "a valid chunk input output" do
5
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::Marshal }
6
+ end
7
+ end
@@ -0,0 +1,7 @@
1
# Runs the shared "a valid chunk input output" examples with chunk_class set
# to the MessagePack-backed chunk IO.
+ require 'spec_helper'
2
+
3
+ describe OfflineSort::Chunk::InputOutput::MessagePack do
4
+ it_behaves_like "a valid chunk input output" do
5
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::MessagePack }
6
+ end
7
+ end
@@ -0,0 +1,7 @@
1
# Runs the shared "a valid chunk input output" examples with chunk_class set
# to the YAML-backed chunk IO.
+ require 'spec_helper'
2
+
3
+ describe OfflineSort::Chunk::InputOutput::Yaml do
4
+ it_behaves_like "a valid chunk input output" do
5
+ let(:chunk_class) { OfflineSort::Chunk::InputOutput::Yaml }
6
+ end
7
+ end
@@ -0,0 +1,96 @@
1
require 'spec_helper'

describe OfflineSort::FixedSizeMinHeap do
  let(:array) { (1..10).to_a.shuffle }
  let(:heap) { OfflineSort::FixedSizeMinHeap.new(array.dup) }

  describe "#initialize" do
    it "is a heap" do
      expect{ assert_min_heap(heap.array) }.not_to raise_error
    end
  end

  describe "#push" do
    context "with a full array" do
      it "raises an exception" do
        expect{ heap.push(rand(20)) }.to raise_error("Heap Size (#{array.size}) Exceeded")
      end
    end

    context "with one space" do
      before do
        heap.pop
      end

      it "adds to the heap" do
        expect{ heap.push(1) }.not_to raise_error
      end
    end

    context "with more than one space" do
      before do
        5.times { heap.pop }
      end

      it "adds to the heap" do
        5.times do
          expect{ heap.push(1) }.not_to raise_error
        end
      end
    end
  end

  describe "#pop" do
    context "with empty array" do
      before do
        array.size.times { heap.pop }
      end

      it "is nil" do
        expect(heap.pop).to be nil
      end
    end

    context "until empty" do
      it "is sorted" do
        last = -1
        array.size.times do
          popped = heap.pop
          expect(popped).to be > (last)
          last = popped
        end
      end
    end
  end

  context "integration test" do
    it "is always heap ordered" do
      100.times do
        heap.pop
        heap.push(rand(100))
        expect{ assert_min_heap(heap.array) }.not_to raise_error
      end
    end
  end

  # Raises unless every element of +array+ is <= both of its children, i.e.
  # the array is laid out as a binary min-heap. The offending parent and the
  # whole array are put in the error message instead of being printed.
  def assert_min_heap(array)
    array.each_with_index do |e, index|
      left = (2 * index) + 1
      right = (2 * index) + 2

      if left < array.size && !(array[left] >= e)
        raise "not a heap: left child of #{e} in #{array}"
      end

      if right < array.size && !(array[right] >= e)
        raise "not a heap: right child of #{e} in #{array}"
      end
    end
  end
end
@@ -0,0 +1,102 @@
1
require 'spec_helper'

describe OfflineSort::Sorter do

  # Including contexts must override +enumerable+ (the input) and +sort+
  # (the sort-key proc). Every fixture below produces unique sort keys, so
  # the expected order is unambiguous.
  shared_examples "a correct offline sort" do
    let(:count) { 10000 }
    let(:entries_per_chunk) { 900 }
    let(:enumerable) {}
    let(:sort) {}

    before do
      @unsorted = enumerable.dup
      @sorted = OfflineSort.sort(enumerable, chunk_size: entries_per_chunk, &sort).to_a
    end

    it "produces the same sorted result as an in-memory sort" do
      # The input itself must not be modified by sorting.
      expect(enumerable).to eq(@unsorted)

      # Keys must come out in ascending order.
      keys = @sorted.map(&sort)
      expect(keys).to eq(keys.sort)

      # eq (not match_array) so that the order is actually verified against
      # the in-memory sort.
      expect(@sorted).to eq(enumerable.sort_by(&sort))
    end
  end

  let(:arrays) do
    count.times.map do |index|
      [SecureRandom.hex, index, SecureRandom.hex]
    end
  end

  let(:array_sort_index) { 2 }
  let(:array_sort) { Proc.new { |arr| arr[array_sort_index] } }

  let(:hashes) do
    count.times.map do |index|
      { 'a' => SecureRandom.hex, 'b' => index, 'c' => SecureRandom.hex }
    end
  end

  let(:hash_sort_key) { 'c' }
  let(:hash_sort) { Proc.new { |hash| hash[hash_sort_key] } }


  context "with arrays" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { arrays }
      let(:sort) { array_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          count.times.map do |index|
            [index.round(-1), index, SecureRandom.hex]
          end.shuffle
        end
        let(:sort) { Proc.new { |arr| [arr[0], arr[1]] } }
      end
    end
  end

  context "hashes" do
    it_behaves_like "a correct offline sort" do
      let(:enumerable) { hashes }
      let(:sort) { hash_sort }
    end

    context "with multiple sort keys" do
      it_behaves_like "a correct offline sort" do
        let(:enumerable) do
          count.times.map do |index|
            { 'a' => index.round(-1), 'b' => index, 'c' => SecureRandom.hex }
          end.shuffle
        end
        let(:sort) { Proc.new { |hash| [hash['a'], hash['c']] } }
      end
    end
  end
end
102
+
@@ -0,0 +1,7 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'securerandom'
3
+ require 'benchmark'
4
+ require 'msgpack'
5
+
6
+ require 'offline_sort'
7
+
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: offline-sort
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew Cross
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-11-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pqueue
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: msgpack
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Offline sort for any enumerable with pluggable serialization strategies
84
+ email:
85
+ - mcross@salsify.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - Gemfile
91
+ - LICENSE.txt
92
+ - README.md
93
+ - Rakefile
94
+ - lib/offline_sort.rb
95
+ - lib/offline_sort/chunk.rb
96
+ - lib/offline_sort/chunk/input_output/base.rb
97
+ - lib/offline_sort/chunk/input_output/marshal.rb
98
+ - lib/offline_sort/chunk/input_output/message_pack.rb
99
+ - lib/offline_sort/chunk/input_output/yaml.rb
100
+ - lib/offline_sort/fixed_size_min_heap.rb
101
+ - lib/offline_sort/offline_sort.rb
102
+ - lib/offline_sort/version.rb
103
+ - offline-sort.gemspec
104
+ - spec/offline_sort/chunk/input_output/base_spec.rb
105
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
106
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
107
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
108
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
109
+ - spec/offline_sort/offline_sort_spec.rb
110
+ - spec/spec_helper.rb
111
+ homepage: https://github.com/salsify/offline-sort
112
+ licenses:
113
+ - MIT
114
+ metadata: {}
115
+ post_install_message:
116
+ rdoc_options: []
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 2.4.5.1
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: Offline sort for any enumerable with pluggable serialization strategies
135
+ test_files:
136
+ - spec/offline_sort/chunk/input_output/base_spec.rb
137
+ - spec/offline_sort/chunk/input_output/marshal_spec.rb
138
+ - spec/offline_sort/chunk/input_output/message_pack_spec.rb
139
+ - spec/offline_sort/chunk/input_output/yaml_spec.rb
140
+ - spec/offline_sort/fixed_size_min_heap_spec.rb
141
+ - spec/offline_sort/offline_sort_spec.rb
142
+ - spec/spec_helper.rb