d_heap 0.3.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5a20fe814944fa8945bc2cc0ce87f810c8bf8d21102b8c454ae5d639f0548576
4
- data.tar.gz: 65a1af345ae84c7f6e5da8af89a00730ffc4cbdb6bb2cac59461c3728eb0ad28
3
+ metadata.gz: 5b51ed52baf74b585a7ab7799f92a446aef5852431ba10e146658b419657ffbe
4
+ data.tar.gz: cc7c6786eee78ec13214582b8701448d312f59fb723d12676fb673447ab409a7
5
5
  SHA512:
6
- metadata.gz: 27c240634013397033925ee258de08135587c31c7a046a902c028842680dadec20e2ccc0f76570e6f7db34d2292e5dae2ae7d17449436f890498697de4352c91
7
- data.tar.gz: 8eb2cba120747cc7788c42f264d2109ef1afee8a28caa7effb8bcb1ff36ed7c99a0556b48b7d9e9479f6e8be8945eea3bf1ddb7f608c13c63252684643154179
6
+ metadata.gz: 5de98f8c9084b30694fff5f8154a6e42e7e67d76518c25136ab4fb0c0afb047ad3c923f4544dcf613ded4c3b01417729aa796c973100faaa7ee93051fa630c7d
7
+ data.tar.gz: e5dbcc90da7adfba7ef45cd9a2da5fd1781a2bd489002a5ffc0a764915c035c178db30ae9b8431a8fc810cfa6f03a1b38ec0a50cbf23c2e1ba5dfc36549c0609
data/.clang-format ADDED
@@ -0,0 +1,21 @@
1
+ ---
2
+ BasedOnStyle: mozilla
3
+ IndentWidth: 4
4
+ PointerAlignment: Right
5
+ AlignAfterOpenBracket: Align
6
+ AlignConsecutiveAssignments: true
7
+ AlignConsecutiveDeclarations: true
8
+ AlignConsecutiveBitFields: true
9
+ AlignConsecutiveMacros: true
10
+ AlignEscapedNewlines: Right
11
+ AlignOperands: true
12
+
13
+ AllowAllConstructorInitializersOnNextLine: false
14
+ AllowShortIfStatementsOnASingleLine: WithoutElse
15
+
16
+ IndentCaseLabels: false
17
+ IndentPPDirectives: AfterHash
18
+
19
+ ForEachMacros:
20
+ - WHILE_PEEK_LT_P
21
+ ...
@@ -1,4 +1,4 @@
1
- name: Ruby
1
+ name: CI
2
2
 
3
3
  on: [push,pull_request]
4
4
 
@@ -7,7 +7,7 @@ jobs:
7
7
  strategy:
8
8
  fail-fast: false
9
9
  matrix:
10
- ruby: [2.5, 2.6, 2.7, 3.0]
10
+ ruby: [2.4, 2.5, 2.6, 2.7, 3.0]
11
11
  os: [ubuntu, macos]
12
12
  experimental: [false]
13
13
  runs-on: ${{ matrix.os }}-latest
@@ -23,4 +23,19 @@ jobs:
23
23
  run: |
24
24
  gem install bundler -v 2.2.3
25
25
  bundle install
26
- bundle exec rake
26
+ bundle exec rake ci
27
+
28
+ benchmarks:
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v2
32
+ - name: Set up Ruby
33
+ uses: ruby/setup-ruby@v1
34
+ with:
35
+ ruby-version: 2.7
36
+ bundler-cache: true
37
+ - name: Run the benchmarks
38
+ run: |
39
+ gem install bundler -v 2.2.3
40
+ bundle install
41
+ bundle exec rake ci:benchmarks
data/.gitignore CHANGED
@@ -10,6 +10,7 @@
10
10
  *.so
11
11
  *.o
12
12
  *.a
13
+ compile_commands.json
13
14
  mkmf.log
14
15
 
15
16
  # rspec failure tracking
data/.rubocop.yml CHANGED
@@ -3,9 +3,10 @@ inherit_mode:
3
3
  - Exclude
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.5
6
+ TargetRubyVersion: 2.4
7
7
  NewCops: disable
8
8
  Exclude:
9
+ - bin/benchmark-driver
9
10
  - bin/rake
10
11
  - bin/rspec
11
12
  - bin/rubocop
@@ -106,26 +107,50 @@ Naming/RescuedExceptionsVariableName: { Enabled: false }
106
107
  ###########################################################################
107
108
  # Matrics:
108
109
 
110
+ Metrics/CyclomaticComplexity:
111
+ Max: 10
112
+
109
113
  # Although it may be better to split specs into multiple files...?
110
114
  Metrics/BlockLength:
111
115
  Exclude:
112
116
  - "spec/**/*_spec.rb"
117
+ CountAsOne:
118
+ - array
119
+ - hash
120
+ - heredoc
121
+
122
+ Metrics/ClassLength:
123
+ Max: 200
124
+ CountAsOne:
125
+ - array
126
+ - hash
127
+ - heredoc
113
128
 
114
129
  ###########################################################################
115
130
  # Style...
116
131
 
117
132
  Style/AccessorGrouping: { Enabled: false }
118
133
  Style/AsciiComments: { Enabled: false } # 👮 can't stop our 🎉🥳🎊🥳!
134
+ Style/ClassAndModuleChildren: { Enabled: false }
119
135
  Style/EachWithObject: { Enabled: false }
120
136
  Style/FormatStringToken: { Enabled: false }
121
137
  Style/FloatDivision: { Enabled: false }
138
+ Style/GuardClause: { Enabled: false } # usually nice to do, but...
139
+ Style/IfUnlessModifier: { Enabled: false }
140
+ Style/IfWithSemicolon: { Enabled: false }
122
141
  Style/Lambda: { Enabled: false }
123
142
  Style/LineEndConcatenation: { Enabled: false }
124
143
  Style/MixinGrouping: { Enabled: false }
144
+ Style/MultilineBlockChain: { Enabled: false }
125
145
  Style/PerlBackrefs: { Enabled: false } # use occasionally/sparingly
126
146
  Style/RescueStandardError: { Enabled: false }
147
+ Style/Semicolon: { Enabled: false }
127
148
  Style/SingleLineMethods: { Enabled: false }
128
149
  Style/StabbyLambdaParentheses: { Enabled: false }
150
+ Style/WhenThen : { Enabled: false }
151
+
152
+ # I require trailing commas elsewhere, but these are optional
153
+ Style/TrailingCommaInArguments: { Enabled: false }
129
154
 
130
155
  # If rubocop had an option to only enforce this on constants and literals (e.g.
131
156
  # strings, regexp, range), I'd agree.
@@ -149,7 +174,9 @@ Style/BlockDelimiters:
149
174
  EnforcedStyle: semantic
150
175
  AllowBracesOnProceduralOneLiners: true
151
176
  IgnoredMethods:
152
- - expect
177
+ - expect # rspec
178
+ - profile # ruby-prof
179
+ - ips # benchmark-ips
153
180
 
154
181
 
155
182
  Style/FormatString:
@@ -168,3 +195,6 @@ Style/TrailingCommaInHashLiteral:
168
195
 
169
196
  Style/TrailingCommaInArrayLiteral:
170
197
  EnforcedStyleForMultiline: consistent_comma
198
+
199
+ Style/YodaCondition:
200
+ EnforcedStyle: forbid_for_equality_operators_only
data/.yardopts ADDED
@@ -0,0 +1,10 @@
1
+ -o doc
2
+ --embed-mixins
3
+ --hide-void-return
4
+ --no-private
5
+ --asset images:images
6
+ --exclude lib/benchmark_driver
7
+ --exclude lib/d_heap/benchmarks*
8
+ -
9
+ CHANGELOG.md
10
+ CODE_OF_CONDUCT.md
data/CHANGELOG.md ADDED
@@ -0,0 +1,93 @@
1
+ ## Current/Unreleased
2
+
3
+ ## Release v0.7.0 (2021-01-24)
4
+
5
+ * 💥⚡️ **BREAKING**: Uses `double` for _all_ scores.
6
+ * 💥 Integers larger than a double mantissa (53-bits) will lose some
7
+ precision.
8
+ * ⚡️ big speed up
9
+ * ⚡️ Much better memory usage
10
+ * ⚡️ Simplifies score conversion between ruby and C
11
+ * ✨ Added `DHeap::Map` for ensuring values can only be added once, by `#hash`.
12
+ * Adding again will update the score.
13
+ * Adds `DHeap::Map#[]` for quick lookup of existing scores
14
+ * Adds `DHeap::Map#[]=` for adjustments of existing scores
15
+ * TODO: `DHeap::Map#delete`
16
+ * 📝📈 SO MANY BENCHMARKS
17
+ * ⚡️ Set DEFAULT_D to 6, based on benchmarks.
18
+ * 🐛♻️ convert all `long` indexes to `size_t`
19
+
20
+ ## Release v0.6.1 (2021-01-24)
21
+
22
+ * 📝 Fix link to CHANGELOG.md in gemspec
23
+
24
+ ## Release v0.6.0 (2021-01-24)
25
+
26
+ * 🔥 **Breaking**: `#initialize` uses a keyword argument for `d`
27
+ * ✨ Added `#initialize(capacity: capa)` to set initial capacity.
28
+ * ✨ Added `peek_with_score` and `peek_score`
29
+ * ✨ Added `pop_with_score` and `each_pop(with_score: true)`
30
+ * ✨ Added `pop_all_below(max_score, array = [])`
31
+ * ✨ Added aliases for `shift` and `next`
32
+ * 📈 Added benchmark charts to README, and `bin/bench_charts` to generate them.
33
+ * requires `gruff` which requires `rmagick` which requires `imagemagick`
34
+ * 📝 Many documentation updates and fixes.
35
+
36
+ ## Release v0.5.0 (2021-01-17)
37
+
38
+ * 🔥 **Breaking**: reversed order of `#push` arguments to `value, score`.
39
+ * ✨ Added `#insert(score, value)` to replace earlier version of `#push`.
40
+ * ✨ Added `#each_pop` enumerator.
41
+ * ✨ Added aliases for `deq`, `enq`, `first`, `pop_below`, `length`, and
42
+ `count`, to mimic other classes in ruby's stdlib.
43
+ * ⚡️♻️ More performance improvements:
44
+ * Created an `ENTRY` struct and store both the score and the value pointer in
45
+ the same `ENTRY *entries` array.
46
+ * Reduced unnecessary allocations or copies in both sift loops. A similar
47
+ refactoring also sped up the pure ruby benchmark implementation.
48
+ * Compiling with `-O3`.
49
+ * 📝 Updated (and in some cases, fixed) yardoc
50
+ * ♻️ Moved aliases and less performance sensitive code into ruby.
51
+ * ♻️ DRY up push/insert methods
52
+
53
+ ## Release v0.4.0 (2021-01-12)
54
+
55
+ * 🔥 **Breaking**: Scores must be `Integer` or convertible to `Float`
56
+ * ⚠️ `Integer` scores must fit in `-ULONG_LONG_MAX` to `+ULONG_LONG_MAX`.
57
+ * ⚡️ Big performance improvements, by using C `long double *cscores` array
58
+ * ⚡️ many many (so many) updates to benchmarks
59
+ * ✨ Added `DHeap#clear`
60
+ * 🐛 Fixed `DHeap#initialize_copy` and `#freeze`
61
+ * ♻️ significant refactoring
62
+ * 📝 Updated docs (mostly adding benchmarks)
63
+
64
+ ## Release v0.3.0 (2020-12-29)
65
+
66
+ * 🔥 **Breaking**: Removed class methods that operated directly on an array.
67
+ They weren't compatible with the performance improvements.
68
+ * ⚡️ Big performance improvements, by converting to a `T_DATA` struct.
69
+ * ♻️ Major refactoring/rewriting of dheap.c
70
+ * ✅ Added benchmark specs
71
+
72
+ ## Release v0.2.2 (2020-12-27)
73
+
74
+ * 🐛 fix `optimized_cmp`, avoiding internal symbols
75
+ * 📝 Update documentation
76
+ * 💚 fix macos CI
77
+ * ➕ Add rubocop 👮🎨
78
+
79
+ ## Release v0.2.1 (2020-12-26)
80
+
81
+ * ⬆️ Upgraded rake (and bundler) to support ruby 3.0
82
+
83
+ ## Release v0.2.0 (2020-12-24)
84
+
85
+ * ✨ Add ability to push separate score and value
86
+ * ⚡️ Big performance gain, by storing scores separately and using ruby's
87
+ internal `OPTIMIZED_CMP` instead of always directly calling `<=>`
88
+
89
+ ## Release v0.1.0 (2020-12-22)
90
+
91
+ 🎉 initial release 🎉
92
+
93
+ * ✨ Add basic d-ary Heap implementation
data/D ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/sh
2
+ set -eu
3
+
4
+ export BENCH_D="$1"
5
+ shift
6
+
7
+ exec ruby "$@"
data/README.md CHANGED
@@ -1,199 +1,461 @@
1
- # DHeap
1
+ # DHeap - Fast d-ary heap for ruby
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/d_heap.svg)](https://badge.fury.io/rb/d_heap)
4
+ [![Build Status](https://github.com/nevans/d_heap/workflows/CI/badge.svg)](https://github.com/nevans/d_heap/actions?query=workflow%3ACI)
5
+ [![Maintainability](https://api.codeclimate.com/v1/badges/ff274acd0683c99c03e1/maintainability)](https://codeclimate.com/github/nevans/d_heap/maintainability)
6
+
7
+ A fast [_d_-ary heap][d-ary heap] [priority queue] implementation for ruby,
8
+ implemented as a C extension.
9
+
10
+ A regular queue has "FIFO" behavior: first in, first out. A stack is "LIFO":
11
+ last in first out. A priority queue pushes each element with a score and pops
12
+ out in order by score. Priority queues are often used in algorithms for e.g.
13
+ [scheduling] of timers or bandwidth management, for [Huffman coding], and for
14
+ various graph search algorithms such as [Dijkstra's algorithm], [A* search], or
15
+ [Prim's algorithm].
16
+
17
+ From [wikipedia](https://en.wikipedia.org/wiki/Heap_(data_structure)):
18
+ > A heap is a specialized tree-based data structure which is essentially an
19
+ > almost complete tree that satisfies the heap property: in a min heap, for any
20
+ > given node C, if P is a parent node of C, then the key (the value) of P is
21
+ > less than or equal to the key of C. The node at the "top" of the heap (with no
22
+ > parents) is called the root node.
23
+
24
+ ![tree representation of a min heap](images/wikipedia-min-heap.png)
25
+
26
+ The _d_-ary heap data structure is a generalization of a [binary heap] in which
27
+ each node has _d_ children instead of 2. This speeds up "push" or "decrease
28
+ priority" operations (`O(log n / log d)`) with the tradeoff of slower "pop" or
29
+ "increase priority" (`O(d log n / log d)`). Additionally, _d_-ary heaps can
30
+ have better memory cache behavior than binary heaps, letting them run more
31
+ quickly in practice.
32
+
33
+ Although the default _d_ value will usually perform best (see the time
34
+ complexity analysis below), it's always advisable to benchmark your specific
35
+ use-case. In particular, if you push items more than you pop, higher values for
36
+ _d_ can give a faster total runtime.
37
+
38
+ [d-ary heap]: https://en.wikipedia.org/wiki/D-ary_heap
39
+ [priority queue]: https://en.wikipedia.org/wiki/Priority_queue
40
+ [binary heap]: https://en.wikipedia.org/wiki/Binary_heap
41
+ [scheduling]: https://en.wikipedia.org/wiki/Scheduling_(computing)
42
+ [Huffman coding]: https://en.wikipedia.org/wiki/Huffman_coding#Compression
43
+ [Dijkstra's algorithm]: https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Using_a_priority_queue
44
+ [A* search]: https://en.wikipedia.org/wiki/A*_search_algorithm#Description
45
+ [Prim's algorithm]: https://en.wikipedia.org/wiki/Prim%27s_algorithm
2
46
 
3
- A fast _d_-ary heap implementation for ruby, useful in priority queues and graph
4
- algorithms.
47
+ ## Installation
5
48
 
6
- The _d_-ary heap data structure is a generalization of the binary heap, in which
7
- the nodes have _d_ children instead of 2. This allows for "decrease priority"
8
- operations to be performed more quickly with the tradeoff of slower delete
9
- minimum. Additionally, _d_-ary heaps can have better memory cache behavior than
10
- binary heaps, allowing them to run more quickly in practice despite slower
11
- worst-case time complexity. In the worst case, a _d_-ary heap requires only
12
- `O(log n / log d)` to push, with the tradeoff that pop is `O(d log n / log d)`.
49
+ Add this line to your application's Gemfile:
13
50
 
14
- Although you should probably just stick with the default _d_ value of `4`, it
15
- may be worthwhile to benchmark your specific scenario.
51
+ ```ruby
52
+ gem 'd_heap'
53
+ ```
54
+
55
+ And then execute:
56
+
57
+ $ bundle install
58
+
59
+ Or install it yourself as:
60
+
61
+ $ gem install d_heap
16
62
 
17
63
  ## Usage
18
64
 
19
- The simplest way to use it is simply with `#push` and `#pop`. Push takes a
20
- score and a value, and pop returns the value with the current minimum score.
65
+ The basic API is `#push(object, score)` and `#pop`. Please read the [full
66
+ documentation] for more details. The score must be convertible to a `Float` via
67
+ `Float(score)` (i.e. it should properly implement `#to_f`).
68
+
69
+ Quick reference for the most common methods:
70
+
71
+ * `heap << object` adds a value, using `Float(object)` as its intrinsic score.
72
+ * `heap.push(object, score)` adds a value with an extrinsic score.
73
+ * `heap.peek` to view the minimum value without popping it.
74
+ * `heap.pop` removes and returns the value with the minimum score.
75
+ * `heap.pop_below(max_score)` pops only if the next score is `<` the argument.
76
+ * `heap.clear` to remove all items from the heap.
77
+ * `heap.empty?` returns true if the heap is empty.
78
+ * `heap.size` returns the number of items in the heap.
79
+
80
+ ### Examples
21
81
 
22
82
  ```ruby
83
+ # create some example objects to place in our heap
84
+ Task = Struct.new(:id, :time) do
85
+ def to_f; time.to_f end
86
+ end
87
+ t1 = Task.new(1, Time.now + 5*60)
88
+ t2 = Task.new(2, Time.now + 50)
89
+ t3 = Task.new(3, Time.now + 60)
90
+ t4 = Task.new(4, Time.now + 5)
91
+
92
+ # create the heap
23
93
  require "d_heap"
94
+ heap = DHeap.new
24
95
 
25
- heap = DHeap.new # defaults to a 4-ary heap
96
+ # push with an explicit score (which might be extrinsic to the value)
97
+ heap.push t1, t1.to_f
26
98
 
27
- # storing [score, value] tuples
28
- heap.push Time.now + 5*60, Task.new(1)
29
- heap.push Time.now + 30, Task.new(2)
30
- heap.push Time.now + 60, Task.new(3)
31
- heap.push Time.now + 5, Task.new(4)
99
+ # the score will be implicitly cast with Float, so any object with #to_f
100
+ heap.push t2, t2
32
101
 
33
- # peeking and popping (using last to get the task and ignore the time)
34
- heap.pop.last # => Task[4]
35
- heap.pop.last # => Task[2]
36
- heap.peak.last # => Task[3], but don't pop it
37
- heap.pop.last # => Task[3]
38
- heap.pop.last # => Task[1]
39
- ```
102
+ # if the object has an intrinsic score via #to_f, "<<" is the simplest API
103
+ heap << t3 << t4
40
104
 
41
- Read the `rdoc` for more detailed documentation and examples.
105
+ # pop returns the lowest scored item, and removes it from the heap
106
+ heap.pop # => #<struct Task id=4, time=2021-01-17 17:02:22.5574 -0500>
107
+ heap.pop # => #<struct Task id=2, time=2021-01-17 17:03:07.5574 -0500>
42
108
 
43
- ## Installation
109
+ # peek returns the lowest scored item, without removing it from the heap
110
+ heap.peek # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
111
+ heap.pop # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
44
112
 
45
- Add this line to your application's Gemfile:
113
+ # pop_lte handles the common "h.pop if h.peek_score < max" pattern
114
+ heap.pop_lte(Time.now + 65) # => nil
46
115
 
47
- ```ruby
48
- gem 'd_heap'
116
+ # the heap size can be inspected with size and empty?
117
+ heap.empty? # => false
118
+ heap.size # => 1
119
+ heap.pop # => #<struct Task id=1, time=2021-01-17 17:07:17.5574 -0500>
120
+ heap.empty? # => true
121
+ heap.size # => 0
122
+
123
+ # popping from an empty heap returns nil
124
+ heap.pop # => nil
49
125
  ```
50
126
 
51
- And then execute:
127
+ Please see the [full documentation] for more methods and more examples.
52
128
 
53
- $ bundle install
129
+ [full documentation]: https://rubydoc.info/gems/d_heap/DHeap
54
130
 
55
- Or install it yourself as:
131
+ ### DHeap::Map
56
132
 
57
- $ gem install d_heap
133
+ `DHeap::Map` augments the heap with an internal `Hash`, mapping objects to their
134
+ index in the heap. For simple push/pop this is a bit slower than a normal `DHeap`
135
+ heap, but it can enable huge speed-ups for algorithms that need to adjust scores
136
+ after they've been added, e.g. [Dijkstra's algorithm]. It adds the following:
58
137
 
59
- ## Motivation
60
-
61
- Sometimes you just need a priority queue, right? With a regular queue, you
62
- expect "FIFO" behavior: first in, first out. With a priority queue, you push
63
- with a score (or your elements are comparable), and you want to be able to
64
- efficiently pop off the minimum (or maximum) element.
65
-
66
- One obvious approach is to simply maintain an array in sorted order. And
67
- ruby's Array class makes it simple to maintain a sorted array by combining
68
- `#bsearch_index` with `#insert`. With certain insert/remove workloads that can
69
- perform very well, but in the worst-case an insert or delete can result in O(n),
70
- since `#insert` may need to `memcpy` or `memmove` a significant portion of the
71
- array.
72
-
73
- But the standard way to efficiently and simply solve this problem is using a
74
- binary heap. Although it increases the time for `pop`, it converts the
75
- amortized time per push + pop from `O(n)` to `O(d log n / log d)`.
76
-
77
- I was surprised to find that, at least under certain benchmarks, my pure ruby
78
- heap implementation was usually slower than inserting into a fully sorted
79
- array. While this is a testament to ruby's fine-tuned Array implementation, a
80
- heap implemented in C should easily perform faster than `Array#insert`.
81
-
82
- The biggest issue is that it just takes far too much time to call `<=>` from
83
- ruby code: A sorted array only requires `log n / log 2` comparisons to insert
84
- and no comparisons to pop. However a _d_-ary heap requires `log n / log d` to
85
- insert plus an additional `d log n / log d` to pop. If your queue contains only
86
- a few hundred items at once, the overhead of those extra calls to `<=>` is far
87
- more than occasionally calling `memcpy`.
88
-
89
- It's likely that MJIT will eventually make the C-extension completely
90
- unnecessary. This is definitely hotspot code, and the basic ruby implementation
91
- would work fine, if not for that `<=>` overhead. Until then... this gem gets
92
- the job done.
93
-
94
- ## TODOs...
95
-
96
- _TODO:_ In addition to a basic _d_-ary heap class (`DHeap`), this library
97
- ~~includes~~ _will include_ extensions to `Array`, allowing an Array to be
98
- directly handled as a priority queue. These extension methods are meant to be
99
- used similarly to how `#bsearch` and `#bsearch_index` might be used.
100
-
101
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Set`, which augments the
102
- basic heap with an internal `Hash`, which maps a set of values to scores.
103
- loosely inspired by go's timers. e.g: It lazily sifts its heap after deletion
104
- and adjustments, to achieve faster average runtime for *add* and *cancel*
105
- operations.
106
-
107
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Timers`, which contains some
108
- features that are loosely inspired by go's timers. e.g: It lazily sifts its
109
- heap after deletion and adjustments, to achieve faster average runtime for *add*
110
- and *cancel* operations.
111
-
112
- Additionally, I was inspired by reading go's "timer.go" implementation to
113
- experiment with a 4-ary heap instead of the traditional binary heap. In the
114
- case of timers, new timers are usually scheduled to run after most of the
115
- existing timers. And timers are usually canceled before they have a chance to
116
- run. While a binary heap holds 50% of its elements in its last layer, 75% of a
117
- 4-ary heap will have no children. That diminishes the extra comparison overhead
118
- during sift-down.
138
+ * a uniqueness constraint, by `#hash` value
139
+ * `#[obj] # => score` or `#score(obj)` in `O(1)`
140
+ * `#[obj] = new_score` or `#rescore(obj, score)` in `O(d log n / log d)`
141
+ * TODO:
142
+ * optionally unique by object identity
143
+ * `#delete(obj)` in `O(d log n / log d)` (TODO)
119
144
 
120
- ## Benchmarks
145
+ ## Scores
121
146
 
122
- _TODO: put benchmarks here._
147
+ If a score changes while the object is still in the heap, it will not be
148
+ re-evaluated again.
123
149
 
124
- ## Analysis
150
+ Constraining scores to `Float` gives enormous performance benefits. n.b.
151
+ very large `Integer` values will lose precision when converted to `Float`. This
152
+ is compiler and architecture dependent but with gcc on an IA-64 system, `Float`
153
+ is 64 bits with a 53-bit mantissa, which gives a range of -9,007,199,254,740,991
154
+ to +9,007,199,254,740,991, which is _not_ enough to store the precise POSIX
155
+ time since the epoch in nanoseconds. This can be worked around by adding a
156
+ bias, but probably it's good enough for most usage.
125
157
 
126
- ### Time complexity
158
+ _Comparing arbitrary objects via_ `a <=> b` _was the original design and may be
159
+ added back in a future version,_ if (and only if) _it can be done without
160
+ impacting the speed of numeric comparisons._
127
161
 
128
- Both sift operations can perform (log[d] n = log n / log d) swaps.
129
- Swap up performs only a single comparison per swap: O(1).
130
- Swap down performs as many as d comparisons per swap: O(d).
162
+ ## Thread safety
131
163
 
132
- Inserting an item is O(log n / log d).
133
- Deleting the root is O(d log n / log d).
164
+ `DHeap` is _not_ thread-safe, so concurrent access from multiple threads needs to
165
+ take precautions such as locking access behind a mutex.
134
166
 
135
- Assuming every inserted item is eventually deleted from the root, d=4 requires
136
- the fewest comparisons for combined insert and delete:
137
- * (1 + 2) lg 2 = 4.328085
138
- * (1 + 3) lg 3 = 3.640957
139
- * (1 + 4) lg 4 = 3.606738
140
- * (1 + 5) lg 5 = 3.728010
141
- * (1 + 6) lg 6 = 3.906774
142
- * etc...
167
+ ## Benchmarks
143
168
 
144
- Leaf nodes require no comparisons to shift down, and higher values for d have
145
- higher percentage of leaf nodes:
146
- * d=2 has ~50% leaves,
147
- * d=3 has ~67% leaves,
148
- * d=4 has ~75% leaves,
149
- * and so on...
169
+ _See full benchmark output in subdirs of `benchmarks`. See also for updated
170
+ results. These benchmarks were measured with an Intel Core i7-1065G7 8x3.9GHz
171
+ with d_heap v0.5.0 and ruby 2.7.2 without MJIT enabled._
172
+
173
+ ### Implementations
174
+
175
+ * **findmin** -
176
+ A very fast `O(1)` push using `Array#push` onto an unsorted Array, but a
177
+ very slow `O(n)` pop using `Array#min`, `Array#rindex(min)` and
178
+ `Array#delete_at(min_index)`. Push + pop is still fast for `n < 100`, but
179
+ unusably slow for `n > 1000`.
180
+
181
+ * **bsearch** -
182
+ A simple implementation with a slow `O(n)` push using `Array#bsearch` +
183
+ `Array#insert` to maintain a sorted Array, but a very fast `O(1)` pop with
184
+ `Array#pop`. It is still relatively fast for `n < 10000`, but its linear
185
+ time complexity really destroys it after that.
186
+
187
+ * **rb_heap** -
188
+ A pure ruby binary min-heap that has been tuned for performance by making
189
+ few method calls and allocating and assigning as few variables as possible.
190
+ It runs in `O(log n)` for both push and pop, although pop is slower than
191
+ push by a constant factor. Its much higher constant factors makes it lose
192
+ to `bsearch` push + pop for `n < 10000` but it holds steady with very little
193
+ slowdown even with `n > 10000000`.
194
+
195
+ * **c++ stl** -
196
+ A thin wrapper around the [priority_queue_cxx gem] which uses the [C++ STL
197
+ priority_queue]. The wrapper is simply to provide compatibility with the
198
+ other benchmarked implementations, but it should be possible to speed this
199
+ up a little bit by benchmarking the `priority_queue_cxx` API directly. It
200
+ has the same time complexity as rb_heap but its much lower constant
201
+ factors allow it to easily outperform `bsearch`.
202
+
203
+ * **c_dheap** -
204
+ A {DHeap} instance with the default `d` value of `4`. It has the same time
205
+ complexity as `rb_heap` and `c++ stl`, but is faster than both in every
206
+ benchmarked scenario.
207
+
208
+ [priority_queue_cxx gem]: https://rubygems.org/gems/priority_queue_cxx
209
+ [C++ STL priority_queue]: http://www.cplusplus.com/reference/queue/priority_queue/
210
+
211
+ ### Scenarios
212
+
213
+ Each benchmark increases N exponentially, either by √1̅0̅ or approximating
214
+ (alternating between x3 and x3.333) in order to simplify keeping loop counts
215
+ evenly divisible by N.
216
+
217
+ #### push N items
218
+
219
+ This measures the _average time per insert_ to create a queue of size N
220
+ (clearing the queue once it reaches that size). Use cases which push (or
221
+ decrease) more values than they pop, e.g. [Dijkstra's algorithm] or [Prim's
222
+ algorithm] when the graph has more edges than verticies, may want to pay more
223
+ attention to this benchmark.
224
+
225
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n.png)
226
+
227
+ == push N (N=100) ==========================================================
228
+ push N (c_dheap): 10522662.6 i/s
229
+ push N (findmin): 9980622.3 i/s - 1.05x slower
230
+ push N (c++ stl): 7991608.3 i/s - 1.32x slower
231
+ push N (rb_heap): 4607849.4 i/s - 2.28x slower
232
+ push N (bsearch): 2769106.2 i/s - 3.80x slower
233
+ == push N (N=10,000) =======================================================
234
+ push N (c_dheap): 10444588.3 i/s
235
+ push N (findmin): 10191797.4 i/s - 1.02x slower
236
+ push N (c++ stl): 8210895.4 i/s - 1.27x slower
237
+ push N (rb_heap): 4369252.9 i/s - 2.39x slower
238
+ push N (bsearch): 1213580.4 i/s - 8.61x slower
239
+ == push N (N=1,000,000) ====================================================
240
+ push N (c_dheap): 10342183.7 i/s
241
+ push N (findmin): 9963898.8 i/s - 1.04x slower
242
+ push N (c++ stl): 7891924.8 i/s - 1.31x slower
243
+ push N (rb_heap): 4350116.0 i/s - 2.38x slower
244
+
245
+ All three heap implementations have little to no perceptible slowdown for `N >
246
+ 100`. But `DHeap` runs faster than `Array#push` to an unsorted array (findmin)!
247
+
248
+ #### push then pop N items
249
+
250
+ This measures the _average_ for a push **or** a pop, filling up a queue with N
251
+ items and then draining that queue until empty. It represents the amortized
252
+ cost of balanced pushes and pops to fill a heap and drain it.
253
+
254
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n_pop_n.png)
255
+
256
+ == push N then pop N (N=100) ===============================================
257
+ push N + pop N (c_dheap): 10954469.2 i/s
258
+ push N + pop N (c++ stl): 9317140.2 i/s - 1.18x slower
259
+ push N + pop N (bsearch): 4808770.2 i/s - 2.28x slower
260
+ push N + pop N (findmin): 4321411.9 i/s - 2.53x slower
261
+ push N + pop N (rb_heap): 2467417.0 i/s - 4.44x slower
262
+ == push N then pop N (N=10,000) ============================================
263
+ push N + pop N (c_dheap): 8083962.7 i/s
264
+ push N + pop N (c++ stl): 7365661.8 i/s - 1.10x slower
265
+ push N + pop N (bsearch): 2257047.9 i/s - 3.58x slower
266
+ push N + pop N (rb_heap): 1439204.3 i/s - 5.62x slower
267
+ == push N then pop N (N=1,000,000) =========================================
268
+ push N + pop N (c++ stl): 5274657.5 i/s
269
+ push N + pop N (c_dheap): 4731117.9 i/s - 1.11x slower
270
+ push N + pop N (rb_heap): 976688.6 i/s - 5.40x slower
271
+
272
+ At N=100 findmin still beats a pure-ruby heap. But above that it slows down too
273
+ much to be useful. At N=10k, bsearch still beats a pure ruby heap, but above
274
+ 30k it slows down too much to be useful. `DHeap` consistently runs 4.5-5.5x
275
+ faster than the pure ruby heap.
276
+
277
+ #### push & pop on N-item heap
278
+
279
+ This measures the combined time to push once and pop once, which is done
280
+ repeatedly while keeping a stable heap size of N. It's an approximation for
281
+ scenarios which reach a stable size and then plateau with balanced pushes and
282
+ pops. E.g. timers and timeouts will often reschedule themselves or replace
283
+ themselves with new timers or timeouts, maintaining a roughly stable total count
284
+ of timers.
285
+
286
+ ![bar graph for push_pop benchmarks](./images/push_pop.png)
287
+
288
+ push + pop (findmin)
289
+ N 10: 5480288.0 i/s
290
+ N 100: 2595178.8 i/s - 2.11x slower
291
+ N 1000: 224813.9 i/s - 24.38x slower
292
+ N 10000: 12630.7 i/s - 433.89x slower
293
+ N 100000: 1097.3 i/s - 4994.31x slower
294
+ N 1000000: 135.9 i/s - 40313.05x slower
295
+ N 10000000: 12.9 i/s - 425838.01x slower
296
+
297
+ push + pop (bsearch)
298
+ N 10: 3931408.4 i/s
299
+ N 100: 2904181.8 i/s - 1.35x slower
300
+ N 1000: 2203157.1 i/s - 1.78x slower
301
+ N 10000: 1209584.9 i/s - 3.25x slower
302
+ N 100000: 81121.4 i/s - 48.46x slower
303
+ N 1000000: 5356.0 i/s - 734.02x slower
304
+ N 10000000: 281.9 i/s - 13946.33x slower
305
+
306
+ push + pop (rb_heap)
307
+ N 10: 2325816.5 i/s
308
+ N 100: 1603540.3 i/s - 1.45x slower
309
+ N 1000: 1262515.2 i/s - 1.84x slower
310
+ N 10000: 950389.3 i/s - 2.45x slower
311
+ N 100000: 732548.8 i/s - 3.17x slower
312
+ N 1000000: 673577.8 i/s - 3.45x slower
313
+ N 10000000: 467512.3 i/s - 4.97x slower
314
+
315
+ push + pop (c++ stl)
316
+ N 10: 7706818.6 i/s - 1.01x slower
317
+ N 100: 7393127.3 i/s - 1.05x slower
318
+ N 1000: 6898781.3 i/s - 1.13x slower
319
+ N 10000: 5731130.5 i/s - 1.36x slower
320
+ N 100000: 4842393.2 i/s - 1.60x slower
321
+ N 1000000: 4170936.4 i/s - 1.86x slower
322
+ N 10000000: 2737146.6 i/s - 2.84x slower
323
+
324
+ push + pop (c_dheap)
325
+ N 10: 10196454.1 i/s
326
+ N 100: 9668679.8 i/s - 1.05x slower
327
+ N 1000: 9339557.0 i/s - 1.09x slower
328
+ N 10000: 8045103.0 i/s - 1.27x slower
329
+ N 100000: 7150276.7 i/s - 1.43x slower
330
+ N 1000000: 6490261.6 i/s - 1.57x slower
331
+ N 10000000: 3734856.5 i/s - 2.73x slower
332
+
333
+ ## Time complexity analysis
334
+
335
+ There are two fundamental heap operations: sift-up (used by push or decrease
336
+ score) and sift-down (used by pop or delete or increase score). Each sift
337
+ bubbles an item to its correct location in the tree.
338
+
339
+ * A _d_-ary heap has `log n / log d` layers, so either sift performs as many as
340
+ `log n / log d` writes, when a member sifts the entire length of the tree.
341
+ * Sift-up needs one comparison per layer: `O(log n / log d)`.
342
+ * Sift-down needs d comparisons per layer: `O(d log n / log d)`.
343
+
344
+ So, in the case of a balanced push then pop, as many as `(1 + d) log n / log d`
345
+ comparisons are made. Looking only at this worst case combo, `d=4` requires the
346
+ fewest comparisons for a combined push and pop:
347
+
348
+ * `(1 + 2) log n / log 2 ≈ 4.328085 log n`
349
+ * `(1 + 3) log n / log 3 ≈ 3.640957 log n`
350
+ * `(1 + 4) log n / log 4 ≈ 3.606738 log n`
351
+ * `(1 + 5) log n / log 5 ≈ 3.728010 log n`
352
+ * `(1 + 6) log n / log 6 ≈ 3.906774 log n`
353
+ * `(1 + 7) log n / log 7 ≈ 4.111187 log n`
354
+ * `(1 + 8) log n / log 8 ≈ 4.328085 log n`
355
+ * `(1 + 9) log n / log 9 ≈ 4.551196 log n`
356
+ * `(1 + 10) log n / log 10 ≈ 4.777239 log n`
357
+ * etc...
150
358
 
151
359
  See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
152
360
 
153
- ### Space complexity
361
+ However, what this simple count of comparisons misses is the extent to which
362
+ modern compilers can optimize code (e.g. by unrolling the comparison loop to
363
+ execute on registers) and more importantly how well modern processors are at
364
+ pipelined speculative execution using branch prediction, etc. Benchmarks should
365
+ be run on the _exact same_ hardware platform that production code will use,
366
+ as the sift-down operation is especially sensitive to good pipelining.
154
367
 
155
- Because the heap is a complete binary tree, space usage is linear, regardless
156
- of d. However higher d values may provide better cache locality.
368
+ ## Comparison performance
157
369
 
158
- We can run comparisons much much faster for Numeric or String objects than for
159
- ruby objects which delegate comparison to internal Numeric or String objects.
160
- And it is often advantageous to use extrinsic scores for uncomparable items.
161
- For this, our internal array uses twice as many entries (one for score and one
162
- for value) as it would if it only supported intrinsic comparison or used an
163
- un-memoized "sort_by" proc.
370
+ It is often useful to use external scores for otherwise uncomparable values.
371
+ And casting an item or score (via `to_f`) can also be time consuming. So
372
+ `DHeap` evaluates and stores scores at the time of insertion, and they will be
373
+ compared directly without needing any further lookup.
164
374
 
165
- ### Timers
375
+ Numeric values can be compared _much_ faster than other ruby objects, even if
376
+ those objects simply delegate comparison to internal Numeric values.
377
+ Additionally, native C integers or floats can be compared _much_ faster than
378
+ ruby `Numeric` objects. So scores are converted to Float and stored as
379
+ `double`, which is 64 bits on an [LP64 64-bit system].
166
380
 
167
- Additionally, when used to sort timers, we can reasonably assume that:
168
- * New timers usually sort after most existing timers.
169
- * Most timers will be canceled before executing.
170
- * Canceled timers usually sort after most existing timers.
171
-
172
- So, if we are able to delete an item without searching for it, by keeping a map
173
- of positions within the heap, most timers can be inserted and deleted in O(1)
174
- time. Canceling a non-leaf timer can be further optimized by marking it as
175
- canceled without immediately removing it from the heap. If the timer is
176
- rescheduled before we garbage collect, adjusting its position will usually be
177
- faster than a delete and re-insert.
381
+ [LP64 64-bit system]: https://en.wikipedia.org/wiki/64-bit_computing#64-bit_data_models
178
382
 
179
383
  ## Alternative data structures
180
384
 
181
- Depending on what you're doing, maintaining a sorted `Array` using
182
- `#bsearch_index` and `#insert` might be faster! Although it is technically
183
- O(n) for insertions, the implementations for `memcpy` or `memmove` can be *very*
184
- fast on modern architectures. Also, it can be faster O(n) on average, if
185
- insertions are usually near the end of the array. You should run benchmarks
186
- with your expected scenarios to determine which is right.
385
+ As always, you should run benchmarks with your expected scenarios to determine
386
+ which is best for your application.
387
+
388
+ Depending on your use-case, using a sorted `Array` using `#bsearch_index`
389
+ and `#insert` might be just fine! It only takes a couple of lines of code and
390
+ is probably "Fast Enough".
391
+
392
+ More complex heap variants, e.g. the [Fibonacci heap], allow heaps to be split and
393
+ merged which gives some graph algorithms a lower amortized time complexity. But
394
+ in practice, _d_-ary heaps have much lower overhead and often run faster.
395
+
396
+ [Fibonacci heap]: https://en.wikipedia.org/wiki/Fibonacci_heap
187
397
 
188
398
  If it is important to be able to quickly enumerate the set or find the ranking
189
- of values in it, then you probably want to use a self-balancing binary search
190
- tree (e.g. a red-black tree) or a skip-list.
191
-
192
- A Hashed Timing Wheel or Heirarchical Timing Wheels (or some variant in that
193
- family of data structures) can be constructed to have effectively O(1) running
194
- time in most cases. However, the implementation for that data structure is more
195
- complex than a heap. If a 4-ary heap is good enough for go's timers, it should
196
- be suitable for many use cases.
399
+ of values in it, then you may want to use a self-balancing binary search tree
400
+ (e.g. a [red-black tree]) or a [skip-list].
401
+
402
+ [red-black tree]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree
403
+ [skip-list]: https://en.wikipedia.org/wiki/Skip_list
404
+
405
+ [Hashed and Hierarchical Timing Wheels][timing wheel] (or some variant in the
406
+ timing wheel family of data structures) can have effectively `O(1)` running time
407
+ in most cases. Although the implementation for that data structure is more
408
+ complex than a heap, it may be necessary for enormous values of N.
409
+
410
+ [timing wheel]: http://www.cs.columbia.edu/~nahum/w6998/papers/ton97-timing-wheels.pdf
411
+
412
+ ## Supported platforms
413
+
414
+ See the [CI workflow] for all supported platforms.
415
+
416
+ [CI workflow]: https://github.com/nevans/d_heap/actions?query=workflow%3ACI
417
+
418
+ `d_heap` may contain bugs on 32-bit systems. Currently, `d_heap` is only tested
419
+ on 64-bit x86 CRuby 2.4-3.0 under Linux and Mac OS.
420
+
421
+ ## Caveats and TODOs (PRs welcome!)
422
+
423
+ A `DHeap`'s internal array grows but never shrinks. At the very least, there
424
+ should be a `#compact` or `#shrink` method, which should also run during `#freeze`. It might make
425
+ sense to automatically shrink (to no more than 2x the current size) during GC's
426
+ compact phase.
427
+
428
+ Benchmark sift-down min-child comparisons using SSE, AVX2, and AVX512F. This
429
+ might lead to a different default `d` value (maybe 16 or 24?).
430
+
431
+ Shrink scores to 64-bits: either store a type flag with each entry (this could
432
+ be used to support non-numeric scores) or require users to choose between
433
+ `Integer` or `Float` at construction time. Reducing memory usage should also
434
+ improve speed for very large heaps.
435
+
436
+ Patches to support JRuby, rubinius, 32-bit systems, or any other platforms are
437
+ welcome! JRuby and Truffle Ruby ought to be able to use [Java's PriorityQueue]?
438
+ Other platforms could fallback on the (slower) pure ruby implementation used by
439
+ the benchmarks.
440
+
441
+ [Java's PriorityQueue]: https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/PriorityQueue.html
442
+
443
+ Allow a max-heap (or other configurations of the compare function). This can be
444
+ very easily implemented by just reversing the scores.
445
+
446
+ _Maybe_ allow non-numeric scores to be compared with `<=>`, _only_ if the
447
+ simplicity and speed of the basic numeric use case can be preserved.
448
+
449
+ Consider `DHeap::Monotonic`, which could rely on `#pop_below` for "current time"
450
+ and move all values below that time onto an Array.
451
+
452
+ Consider adding `DHeap::Lazy` or `DHeap.new(lazy: true)` which could contain
453
+ some features that are loosely inspired by go's timers. Go lazily sifts its
454
+ heap after deletion or adjustments, to achieve faster amortized runtime.
455
+ There's no need to actually remove a deleted item from the heap, if you re-add
456
+ it before it's next evaluated. A similar trick is to store "far away"
457
+ values in an internal `Hash`, assuming many will be deleted before they rise to
458
+ the top. This could naturally evolve into a [timing wheel] variant.
197
459
 
198
460
  ## Development
199
461