d_heap 0.3.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5a20fe814944fa8945bc2cc0ce87f810c8bf8d21102b8c454ae5d639f0548576
4
- data.tar.gz: 65a1af345ae84c7f6e5da8af89a00730ffc4cbdb6bb2cac59461c3728eb0ad28
3
+ metadata.gz: 5b51ed52baf74b585a7ab7799f92a446aef5852431ba10e146658b419657ffbe
4
+ data.tar.gz: cc7c6786eee78ec13214582b8701448d312f59fb723d12676fb673447ab409a7
5
5
  SHA512:
6
- metadata.gz: 27c240634013397033925ee258de08135587c31c7a046a902c028842680dadec20e2ccc0f76570e6f7db34d2292e5dae2ae7d17449436f890498697de4352c91
7
- data.tar.gz: 8eb2cba120747cc7788c42f264d2109ef1afee8a28caa7effb8bcb1ff36ed7c99a0556b48b7d9e9479f6e8be8945eea3bf1ddb7f608c13c63252684643154179
6
+ metadata.gz: 5de98f8c9084b30694fff5f8154a6e42e7e67d76518c25136ab4fb0c0afb047ad3c923f4544dcf613ded4c3b01417729aa796c973100faaa7ee93051fa630c7d
7
+ data.tar.gz: e5dbcc90da7adfba7ef45cd9a2da5fd1781a2bd489002a5ffc0a764915c035c178db30ae9b8431a8fc810cfa6f03a1b38ec0a50cbf23c2e1ba5dfc36549c0609
data/.clang-format ADDED
@@ -0,0 +1,21 @@
1
+ ---
2
+ BasedOnStyle: mozilla
3
+ IndentWidth: 4
4
+ PointerAlignment: Right
5
+ AlignAfterOpenBracket: Align
6
+ AlignConsecutiveAssignments: true
7
+ AlignConsecutiveDeclarations: true
8
+ AlignConsecutiveBitFields: true
9
+ AlignConsecutiveMacros: true
10
+ AlignEscapedNewlines: Right
11
+ AlignOperands: true
12
+
13
+ AllowAllConstructorInitializersOnNextLine: false
14
+ AllowShortIfStatementsOnASingleLine: WithoutElse
15
+
16
+ IndentCaseLabels: false
17
+ IndentPPDirectives: AfterHash
18
+
19
+ ForEachMacros:
20
+ - WHILE_PEEK_LT_P
21
+ ...
@@ -1,4 +1,4 @@
1
- name: Ruby
1
+ name: CI
2
2
 
3
3
  on: [push,pull_request]
4
4
 
@@ -7,7 +7,7 @@ jobs:
7
7
  strategy:
8
8
  fail-fast: false
9
9
  matrix:
10
- ruby: [2.5, 2.6, 2.7, 3.0]
10
+ ruby: [2.4, 2.5, 2.6, 2.7, 3.0]
11
11
  os: [ubuntu, macos]
12
12
  experimental: [false]
13
13
  runs-on: ${{ matrix.os }}-latest
@@ -23,4 +23,19 @@ jobs:
23
23
  run: |
24
24
  gem install bundler -v 2.2.3
25
25
  bundle install
26
- bundle exec rake
26
+ bundle exec rake ci
27
+
28
+ benchmarks:
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v2
32
+ - name: Set up Ruby
33
+ uses: ruby/setup-ruby@v1
34
+ with:
35
+ ruby-version: 2.7
36
+ bundler-cache: true
37
+ - name: Run the benchmarks
38
+ run: |
39
+ gem install bundler -v 2.2.3
40
+ bundle install
41
+ bundle exec rake ci:benchmarks
data/.gitignore CHANGED
@@ -10,6 +10,7 @@
10
10
  *.so
11
11
  *.o
12
12
  *.a
13
+ compile_commands.json
13
14
  mkmf.log
14
15
 
15
16
  # rspec failure tracking
data/.rubocop.yml CHANGED
@@ -3,9 +3,10 @@ inherit_mode:
3
3
  - Exclude
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.5
6
+ TargetRubyVersion: 2.4
7
7
  NewCops: disable
8
8
  Exclude:
9
+ - bin/benchmark-driver
9
10
  - bin/rake
10
11
  - bin/rspec
11
12
  - bin/rubocop
@@ -106,26 +107,50 @@ Naming/RescuedExceptionsVariableName: { Enabled: false }
106
107
  ###########################################################################
107
108
  # Matrics:
108
109
 
110
+ Metrics/CyclomaticComplexity:
111
+ Max: 10
112
+
109
113
  # Although it may be better to split specs into multiple files...?
110
114
  Metrics/BlockLength:
111
115
  Exclude:
112
116
  - "spec/**/*_spec.rb"
117
+ CountAsOne:
118
+ - array
119
+ - hash
120
+ - heredoc
121
+
122
+ Metrics/ClassLength:
123
+ Max: 200
124
+ CountAsOne:
125
+ - array
126
+ - hash
127
+ - heredoc
113
128
 
114
129
  ###########################################################################
115
130
  # Style...
116
131
 
117
132
  Style/AccessorGrouping: { Enabled: false }
118
133
  Style/AsciiComments: { Enabled: false } # 👮 can't stop our 🎉🥳🎊🥳!
134
+ Style/ClassAndModuleChildren: { Enabled: false }
119
135
  Style/EachWithObject: { Enabled: false }
120
136
  Style/FormatStringToken: { Enabled: false }
121
137
  Style/FloatDivision: { Enabled: false }
138
+ Style/GuardClause: { Enabled: false } # usually nice to do, but...
139
+ Style/IfUnlessModifier: { Enabled: false }
140
+ Style/IfWithSemicolon: { Enabled: false }
122
141
  Style/Lambda: { Enabled: false }
123
142
  Style/LineEndConcatenation: { Enabled: false }
124
143
  Style/MixinGrouping: { Enabled: false }
144
+ Style/MultilineBlockChain: { Enabled: false }
125
145
  Style/PerlBackrefs: { Enabled: false } # use occasionally/sparingly
126
146
  Style/RescueStandardError: { Enabled: false }
147
+ Style/Semicolon: { Enabled: false }
127
148
  Style/SingleLineMethods: { Enabled: false }
128
149
  Style/StabbyLambdaParentheses: { Enabled: false }
150
+ Style/WhenThen : { Enabled: false }
151
+
152
+ # I require trailing commas elsewhere, but these are optional
153
+ Style/TrailingCommaInArguments: { Enabled: false }
129
154
 
130
155
  # If rubocop had an option to only enforce this on constants and literals (e.g.
131
156
  # strings, regexp, range), I'd agree.
@@ -149,7 +174,9 @@ Style/BlockDelimiters:
149
174
  EnforcedStyle: semantic
150
175
  AllowBracesOnProceduralOneLiners: true
151
176
  IgnoredMethods:
152
- - expect
177
+ - expect # rspec
178
+ - profile # ruby-prof
179
+ - ips # benchmark-ips
153
180
 
154
181
 
155
182
  Style/FormatString:
@@ -168,3 +195,6 @@ Style/TrailingCommaInHashLiteral:
168
195
 
169
196
  Style/TrailingCommaInArrayLiteral:
170
197
  EnforcedStyleForMultiline: consistent_comma
198
+
199
+ Style/YodaCondition:
200
+ EnforcedStyle: forbid_for_equality_operators_only
data/.yardopts ADDED
@@ -0,0 +1,10 @@
1
+ -o doc
2
+ --embed-mixins
3
+ --hide-void-return
4
+ --no-private
5
+ --asset images:images
6
+ --exclude lib/benchmark_driver
7
+ --exclude lib/d_heap/benchmarks*
8
+ -
9
+ CHANGELOG.md
10
+ CODE_OF_CONDUCT.md
data/CHANGELOG.md ADDED
@@ -0,0 +1,93 @@
1
+ ## Current/Unreleased
2
+
3
+ ## Release v0.7.0 (2021-01-24)
4
+
5
+ * 💥⚡️ **BREAKING**: Uses `double` for _all_ scores.
6
+ * 💥 Integers larger than a double mantissa (53-bits) will lose some
7
+ precision.
8
+ * ⚡️ big speed up
9
+ * ⚡️ Much better memory usage
10
+ * ⚡️ Simplifies score conversion between ruby and C
11
+ * ✨ Added `DHeap::Map` for ensuring values can only be added once, by `#hash`.
12
+ * Adding again will update the score.
13
+ * Adds `DHeap::Map#[]` for quick lookup of existing scores
14
+ * Adds `DHeap::Map#[]=` for adjustments of existing scores
15
+ * TODO: `DHeap::Map#delete`
16
+ * 📝📈 SO MANY BENCHMARKS
17
+ * ⚡️ Set DEFAULT_D to 6, based on benchmarks.
18
+ * 🐛♻️ convert all `long` indexes to `size_t`
19
+
20
+ ## Release v0.6.1 (2021-01-24)
21
+
22
+ * 📝 Fix link to CHANGELOG.md in gemspec
23
+
24
+ ## Release v0.6.0 (2021-01-24)
25
+
26
+ * 🔥 **Breaking**: `#initialize` uses a keyword argument for `d`
27
+ * ✨ Added `#initialize(capacity: capa)` to set initial capacity.
28
+ * ✨ Added `peek_with_score` and `peek_score`
29
+ * ✨ Added `pop_with_score` and `each_pop(with_score: true)`
30
+ * ✨ Added `pop_all_below(max_score, array = [])`
31
+ * ✨ Added aliases for `shift` and `next`
32
+ * 📈 Added benchmark charts to README, and `bin/bench_charts` to generate them.
33
+ * requires `gruff` which requires `rmagick` which requires `imagemagick`
34
+ * 📝 Many documentation updates and fixes.
35
+
36
+ ## Release v0.5.0 (2021-01-17)
37
+
38
+ * 🔥 **Breaking**: reversed order of `#push` arguments to `value, score`.
39
+ * ✨ Added `#insert(score, value)` to replace earlier version of `#push`.
40
+ * ✨ Added `#each_pop` enumerator.
41
+ * ✨ Added aliases for `deq`, `enq`, `first`, `pop_below`, `length`, and
42
+ `count`, to mimic other classes in ruby's stdlib.
43
+ * ⚡️♻️ More performance improvements:
44
+ * Created an `ENTRY` struct and store both the score and the value pointer in
45
+ the same `ENTRY *entries` array.
46
+ * Reduced unnecessary allocations or copies in both sift loops. A similar
47
+ refactoring also sped up the pure ruby benchmark implementation.
48
+ * Compiling with `-O3`.
49
+ * 📝 Updated (and in some cases, fixed) yardoc
50
+ * ♻️ Moved aliases and less performance sensitive code into ruby.
51
+ * ♻️ DRY up push/insert methods
52
+
53
+ ## Release v0.4.0 (2021-01-12)
54
+
55
+ * 🔥 **Breaking**: Scores must be `Integer` or convertible to `Float`
56
+ * ⚠️ `Integer` scores must fit in `-ULONG_LONG_MAX` to `+ULONG_LONG_MAX`.
57
+ * ⚡️ Big performance improvements, by using C `long double *cscores` array
58
+ * ⚡️ many many (so many) updates to benchmarks
59
+ * ✨ Added `DHeap#clear`
60
+ * 🐛 Fixed `DHeap#initialize_copy` and `#freeze`
61
+ * ♻️ significant refactoring
62
+ * 📝 Updated docs (mostly adding benchmarks)
63
+
64
+ ## Release v0.3.0 (2020-12-29)
65
+
66
+ * 🔥 **Breaking**: Removed class methods that operated directly on an array.
67
+ They weren't compatible with the performance improvements.
68
+ * ⚡️ Big performance improvements, by converting to a `T_DATA` struct.
69
+ * ♻️ Major refactoring/rewriting of dheap.c
70
+ * ✅ Added benchmark specs
71
+
72
+ ## Release v0.2.2 (2020-12-27)
73
+
74
+ * 🐛 fix `optimized_cmp`, avoiding internal symbols
75
+ * 📝 Update documentation
76
+ * 💚 fix macos CI
77
+ * ➕ Add rubocop 👮🎨
78
+
79
+ ## Release v0.2.1 (2020-12-26)
80
+
81
+ * ⬆️ Upgraded rake (and bundler) to support ruby 3.0
82
+
83
+ ## Release v0.2.0 (2020-12-24)
84
+
85
+ * ✨ Add ability to push separate score and value
86
+ * ⚡️ Big performance gain, by storing scores separately and using ruby's
87
+ internal `OPTIMIZED_CMP` instead of always directly calling `<=>`
88
+
89
+ ## Release v0.1.0 (2020-12-22)
90
+
91
+ 🎉 initial release 🎉
92
+
93
+ * ✨ Add basic d-ary Heap implementation
data/D ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/sh
2
+ set -eu
3
+
4
+ export BENCH_D="$1"
5
+ shift
6
+
7
+ exec ruby "$@"
data/README.md CHANGED
@@ -1,199 +1,461 @@
1
- # DHeap
1
+ # DHeap - Fast d-ary heap for ruby
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/d_heap.svg)](https://badge.fury.io/rb/d_heap)
4
+ [![Build Status](https://github.com/nevans/d_heap/workflows/CI/badge.svg)](https://github.com/nevans/d_heap/actions?query=workflow%3ACI)
5
+ [![Maintainability](https://api.codeclimate.com/v1/badges/ff274acd0683c99c03e1/maintainability)](https://codeclimate.com/github/nevans/d_heap/maintainability)
6
+
7
+ A fast [_d_-ary heap][d-ary heap] [priority queue] implementation for ruby,
8
+ implemented as a C extension.
9
+
10
+ A regular queue has "FIFO" behavior: first in, first out. A stack is "LIFO":
11
+ last in first out. A priority queue pushes each element with a score and pops
12
+ out in order by score. Priority queues are often used in algorithms for e.g.
13
+ [scheduling] of timers or bandwidth management, for [Huffman coding], and for
14
+ various graph search algorithms such as [Dijkstra's algorithm], [A* search], or
15
+ [Prim's algorithm].
16
+
17
+ From [wikipedia](https://en.wikipedia.org/wiki/Heap_(data_structure)):
18
+ > A heap is a specialized tree-based data structure which is essentially an
19
+ > almost complete tree that satisfies the heap property: in a min heap, for any
20
+ > given node C, if P is a parent node of C, then the key (the value) of P is
21
+ > less than or equal to the key of C. The node at the "top" of the heap (with no
22
+ > parents) is called the root node.
23
+
24
+ ![tree representation of a min heap](images/wikipedia-min-heap.png)
25
+
26
+ The _d_-ary heap data structure is a generalization of a [binary heap] in which
27
+ each node has _d_ children instead of 2. This speeds up "push" or "decrease
28
+ priority" operations (`O(log n / log d)`) with the tradeoff of slower "pop" or
29
+ "increase priority" (`O(d log n / log d)`). Additionally, _d_-ary heaps can
30
+ have better memory cache behavior than binary heaps, letting them run more
31
+ quickly in practice.
32
+
33
+ Although the default _d_ value will usually perform best (see the time
34
+ complexity analysis below), it's always advisable to benchmark your specific
35
+ use-case. In particular, if you push items more than you pop, higher values for
36
+ _d_ can give a faster total runtime.
37
+
38
+ [d-ary heap]: https://en.wikipedia.org/wiki/D-ary_heap
39
+ [priority queue]: https://en.wikipedia.org/wiki/Priority_queue
40
+ [binary heap]: https://en.wikipedia.org/wiki/Binary_heap
41
+ [scheduling]: https://en.wikipedia.org/wiki/Scheduling_(computing)
42
+ [Huffman coding]: https://en.wikipedia.org/wiki/Huffman_coding#Compression
43
+ [Dijkstra's algorithm]: https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Using_a_priority_queue
44
+ [A* search]: https://en.wikipedia.org/wiki/A*_search_algorithm#Description
45
+ [Prim's algorithm]: https://en.wikipedia.org/wiki/Prim%27s_algorithm
2
46
 
3
- A fast _d_-ary heap implementation for ruby, useful in priority queues and graph
4
- algorithms.
47
+ ## Installation
5
48
 
6
- The _d_-ary heap data structure is a generalization of the binary heap, in which
7
- the nodes have _d_ children instead of 2. This allows for "decrease priority"
8
- operations to be performed more quickly with the tradeoff of slower delete
9
- minimum. Additionally, _d_-ary heaps can have better memory cache behavior than
10
- binary heaps, allowing them to run more quickly in practice despite slower
11
- worst-case time complexity. In the worst case, a _d_-ary heap requires only
12
- `O(log n / log d)` to push, with the tradeoff that pop is `O(d log n / log d)`.
49
+ Add this line to your application's Gemfile:
13
50
 
14
- Although you should probably just stick with the default _d_ value of `4`, it
15
- may be worthwhile to benchmark your specific scenario.
51
+ ```ruby
52
+ gem 'd_heap'
53
+ ```
54
+
55
+ And then execute:
56
+
57
+ $ bundle install
58
+
59
+ Or install it yourself as:
60
+
61
+ $ gem install d_heap
16
62
 
17
63
  ## Usage
18
64
 
19
- The simplest way to use it is simply with `#push` and `#pop`. Push takes a
20
- score and a value, and pop returns the value with the current minimum score.
65
+ The basic API is `#push(object, score)` and `#pop`. Please read the [full
66
+ documentation] for more details. The score must be convertible to a `Float` via
67
+ `Float(score)` (i.e. it should properly implement `#to_f`).
68
+
69
+ Quick reference for the most common methods:
70
+
71
+ * `heap << object` adds a value, using `Float(object)` as its intrinsic score.
72
+ * `heap.push(object, score)` adds a value with an extrinsic score.
73
+ * `heap.peek` to view the minimum value without popping it.
74
+ * `heap.pop` removes and returns the value with the minimum score.
75
+ * `heap.pop_below(max_score)` pops only if the next score is `<` the argument.
76
+ * `heap.clear` to remove all items from the heap.
77
+ * `heap.empty?` returns true if the heap is empty.
78
+ * `heap.size` returns the number of items in the heap.
79
+
80
+ ### Examples
21
81
 
22
82
  ```ruby
83
+ # create some example objects to place in our heap
84
+ Task = Struct.new(:id, :time) do
85
+ def to_f; time.to_f end
86
+ end
87
+ t1 = Task.new(1, Time.now + 5*60)
88
+ t2 = Task.new(2, Time.now + 50)
89
+ t3 = Task.new(3, Time.now + 60)
90
+ t4 = Task.new(4, Time.now + 5)
91
+
92
+ # create the heap
23
93
  require "d_heap"
94
+ heap = DHeap.new
24
95
 
25
- heap = DHeap.new # defaults to a 4-ary heap
96
+ # push with an explicit score (which might be extrinsic to the value)
97
+ heap.push t1, t1.to_f
26
98
 
27
- # storing [score, value] tuples
28
- heap.push Time.now + 5*60, Task.new(1)
29
- heap.push Time.now + 30, Task.new(2)
30
- heap.push Time.now + 60, Task.new(3)
31
- heap.push Time.now + 5, Task.new(4)
99
+ # the score will be implicitly cast with Float, so any object with #to_f
100
+ heap.push t2, t2
32
101
 
33
- # peeking and popping (using last to get the task and ignore the time)
34
- heap.pop.last # => Task[4]
35
- heap.pop.last # => Task[2]
36
- heap.peak.last # => Task[3], but don't pop it
37
- heap.pop.last # => Task[3]
38
- heap.pop.last # => Task[1]
39
- ```
102
+ # if the object has an intrinsic score via #to_f, "<<" is the simplest API
103
+ heap << t3 << t4
40
104
 
41
- Read the `rdoc` for more detailed documentation and examples.
105
+ # pop returns the lowest scored item, and removes it from the heap
106
+ heap.pop # => #<struct Task id=4, time=2021-01-17 17:02:22.5574 -0500>
107
+ heap.pop # => #<struct Task id=2, time=2021-01-17 17:03:07.5574 -0500>
42
108
 
43
- ## Installation
109
+ # peek returns the lowest scored item, without removing it from the heap
110
+ heap.peek # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
111
+ heap.pop # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
44
112
 
45
- Add this line to your application's Gemfile:
113
+ # pop_lte handles the common "h.pop if h.peek_score < max" pattern
114
+ heap.pop_lte(Time.now + 65) # => nil
46
115
 
47
- ```ruby
48
- gem 'd_heap'
116
+ # the heap size can be inspected with size and empty?
117
+ heap.empty? # => false
118
+ heap.size # => 1
119
+ heap.pop # => #<struct Task id=1, time=2021-01-17 17:07:17.5574 -0500>
120
+ heap.empty? # => true
121
+ heap.size # => 0
122
+
123
+ # popping from an empty heap returns nil
124
+ heap.pop # => nil
49
125
  ```
50
126
 
51
- And then execute:
127
+ Please see the [full documentation] for more methods and more examples.
52
128
 
53
- $ bundle install
129
+ [full documentation]: https://rubydoc.info/gems/d_heap/DHeap
54
130
 
55
- Or install it yourself as:
131
+ ### DHeap::Map
56
132
 
57
- $ gem install d_heap
133
+ `DHeap::Map` augments the heap with an internal `Hash`, mapping objects to their
134
+ index in the heap. For simple push/pop this is a bit slower than a normal `DHeap`
135
+ heap, but it can enable huge speed-ups for algorithms that need to adjust scores
136
+ after they've been added, e.g. [Dijkstra's algorithm]. It adds the following:
58
137
 
59
- ## Motivation
60
-
61
- Sometimes you just need a priority queue, right? With a regular queue, you
62
- expect "FIFO" behavior: first in, first out. With a priority queue, you push
63
- with a score (or your elements are comparable), and you want to be able to
64
- efficiently pop off the minimum (or maximum) element.
65
-
66
- One obvious approach is to simply maintain an array in sorted order. And
67
- ruby's Array class makes it simple to maintain a sorted array by combining
68
- `#bsearch_index` with `#insert`. With certain insert/remove workloads that can
69
- perform very well, but in the worst-case an insert or delete can result in O(n),
70
- since `#insert` may need to `memcpy` or `memmove` a significant portion of the
71
- array.
72
-
73
- But the standard way to efficiently and simply solve this problem is using a
74
- binary heap. Although it increases the time for `pop`, it converts the
75
- amortized time per push + pop from `O(n)` to `O(d log n / log d)`.
76
-
77
- I was surprised to find that, at least under certain benchmarks, my pure ruby
78
- heap implementation was usually slower than inserting into a fully sorted
79
- array. While this is a testament to ruby's fine-tuned Array implementationw, a
80
- heap implementated in C should easily peform faster than `Array#insert`.
81
-
82
- The biggest issue is that it just takes far too much time to call `<=>` from
83
- ruby code: A sorted array only requires `log n / log 2` comparisons to insert
84
- and no comparisons to pop. However a _d_-ary heap requires `log n / log d` to
85
- insert plus an additional `d log n / log d` to pop. If your queue contains only
86
- a few hundred items at once, the overhead of those extra calls to `<=>` is far
87
- more than occasionally calling `memcpy`.
88
-
89
- It's likely that MJIT will eventually make the C-extension completely
90
- unnecessary. This is definitely hotspot code, and the basic ruby implementation
91
- would work fine, if not for that `<=>` overhead. Until then... this gem gets
92
- the job done.
93
-
94
- ## TODOs...
95
-
96
- _TODO:_ In addition to a basic _d_-ary heap class (`DHeap`), this library
97
- ~~includes~~ _will include_ extensions to `Array`, allowing an Array to be
98
- directly handled as a priority queue. These extension methods are meant to be
99
- used similarly to how `#bsearch` and `#bsearch_index` might be used.
100
-
101
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Set`, which augments the
102
- basic heap with an internal `Hash`, which maps a set of values to scores.
103
- loosely inspired by go's timers. e.g: It lazily sifts its heap after deletion
104
- and adjustments, to achieve faster average runtime for *add* and *cancel*
105
- operations.
106
-
107
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Timers`, which contains some
108
- features that are loosely inspired by go's timers. e.g: It lazily sifts its
109
- heap after deletion and adjustments, to achieve faster average runtime for *add*
110
- and *cancel* operations.
111
-
112
- Additionally, I was inspired by reading go's "timer.go" implementation to
113
- experiment with a 4-ary heap instead of the traditional binary heap. In the
114
- case of timers, new timers are usually scheduled to run after most of the
115
- existing timers. And timers are usually canceled before they have a chance to
116
- run. While a binary heap holds 50% of its elements in its last layer, 75% of a
117
- 4-ary heap will have no children. That diminishes the extra comparison overhead
118
- during sift-down.
138
+ * a uniqueness constraint, by `#hash` value
139
+ * `#[obj] # => score` or `#score(obj)` in `O(1)`
140
+ * `#[obj] = new_score` or `#rescore(obj, score)` in `O(d log n / log d)`
141
+ * TODO:
142
+ * optionally unique by object identity
143
+ * `#delete(obj)` in `O(d log n / log d)` (TODO)
119
144
 
120
- ## Benchmarks
145
+ ## Scores
121
146
 
122
- _TODO: put benchmarks here._
147
+ If a score changes while the object is still in the heap, it will not be
148
+ re-evaluated again.
123
149
 
124
- ## Analysis
150
+ Constraining scores to `Float` gives enormous performance benefits. n.b.
151
+ very large `Integer` values will lose precision when converted to `Float`. This
152
+ is compiler and architecture dependent but with gcc on an IA-64 system, `Float`
153
+ is 64 bits with a 53-bit mantissa, which gives a range of -9,007,199,254,740,991
154
+ to +9,007,199,254,740,991, which is _not_ enough to store the precise POSIX
155
+ time since the epoch in nanoseconds. This can be worked around by adding a
156
+ bias, but probably it's good enough for most usage.
125
157
 
126
- ### Time complexity
158
+ _Comparing arbitrary objects via_ `a <=> b` _was the original design and may be
159
+ added back in a future version,_ if (and only if) _it can be done without
160
+ impacting the speed of numeric comparisons._
127
161
 
128
- Both sift operations can perform (log[d] n = log n / log d) swaps.
129
- Swap up performs only a single comparison per swap: O(1).
130
- Swap down performs as many as d comparions per swap: O(d).
162
+ ## Thread safety
131
163
 
132
- Inserting an item is O(log n / log d).
133
- Deleting the root is O(d log n / log d).
164
+ `DHeap` is _not_ thread-safe, so concurrent access from multiple threads needs to
165
+ take precautions such as locking access behind a mutex.
134
166
 
135
- Assuming every inserted item is eventually deleted from the root, d=4 requires
136
- the fewest comparisons for combined insert and delete:
137
- * (1 + 2) lg 2 = 4.328085
138
- * (1 + 3) lg 3 = 3.640957
139
- * (1 + 4) lg 4 = 3.606738
140
- * (1 + 5) lg 5 = 3.728010
141
- * (1 + 6) lg 6 = 3.906774
142
- * etc...
167
+ ## Benchmarks
143
168
 
144
- Leaf nodes require no comparisons to shift down, and higher values for d have
145
- higher percentage of leaf nodes:
146
- * d=2 has ~50% leaves,
147
- * d=3 has ~67% leaves,
148
- * d=4 has ~75% leaves,
149
- * and so on...
169
+ _See full benchmark output in subdirs of `benchmarks`. See there for updated
170
+ results. These benchmarks were measured with an Intel Core i7-1065G7 8x3.9GHz
171
+ with d_heap v0.5.0 and ruby 2.7.2 without MJIT enabled._
172
+
173
+ ### Implementations
174
+
175
+ * **findmin** -
176
+ A very fast `O(1)` push using `Array#push` onto an unsorted Array, but a
177
+ very slow `O(n)` pop using `Array#min`, `Array#rindex(min)` and
178
+ `Array#delete_at(min_index)`. Push + pop is still fast for `n < 100`, but
179
+ unusably slow for `n > 1000`.
180
+
181
+ * **bsearch** -
182
+ A simple implementation with a slow `O(n)` push using `Array#bsearch` +
183
+ `Array#insert` to maintain a sorted Array, but a very fast `O(1)` pop with
184
+ `Array#pop`. It is still relatively fast for `n < 10000`, but its linear
185
+ time complexity really destroys it after that.
186
+
187
+ * **rb_heap** -
188
+ A pure ruby binary min-heap that has been tuned for performance by making
189
+ few method calls and allocating and assigning as few variables as possible.
190
+ It runs in `O(log n)` for both push and pop, although pop is slower than
191
+ push by a constant factor. Its much higher constant factors make it lose
192
+ to `bsearch` push + pop for `n < 10000` but it holds steady with very little
193
+ slowdown even with `n > 10000000`.
194
+
195
+ * **c++ stl** -
196
+ A thin wrapper around the [priority_queue_cxx gem] which uses the [C++ STL
197
+ priority_queue]. The wrapper is simply to provide compatibility with the
198
+ other benchmarked implementations, but it should be possible to speed this
199
+ up a little bit by benchmarking the `priority_queue_cxx` API directly. It
200
+ has the same time complexity as rb_heap but its much lower constant
201
+ factors allow it to easily outperform `bsearch`.
202
+
203
+ * **c_dheap** -
204
+ A {DHeap} instance with the default `d` value of `4`. It has the same time
205
+ complexity as `rb_heap` and `c++ stl`, but is faster than both in every
206
+ benchmarked scenario.
207
+
208
+ [priority_queue_cxx gem]: https://rubygems.org/gems/priority_queue_cxx
209
+ [C++ STL priority_queue]: http://www.cplusplus.com/reference/queue/priority_queue/
210
+
211
+ ### Scenarios
212
+
213
+ Each benchmark increases N exponentially, either by √10 or by approximating it
214
+ (alternating between x3 and x3.333) in order to simplify keeping loop counts
215
+ evenly divisible by N.
216
+
217
+ #### push N items
218
+
219
+ This measures the _average time per insert_ to create a queue of size N
220
+ (clearing the queue once it reaches that size). Use cases which push (or
221
+ decrease) more values than they pop, e.g. [Dijkstra's algorithm] or [Prim's
222
+ algorithm] when the graph has more edges than vertices, may want to pay more
223
+ attention to this benchmark.
224
+
225
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n.png)
226
+
227
+ == push N (N=100) ==========================================================
228
+ push N (c_dheap): 10522662.6 i/s
229
+ push N (findmin): 9980622.3 i/s - 1.05x slower
230
+ push N (c++ stl): 7991608.3 i/s - 1.32x slower
231
+ push N (rb_heap): 4607849.4 i/s - 2.28x slower
232
+ push N (bsearch): 2769106.2 i/s - 3.80x slower
233
+ == push N (N=10,000) =======================================================
234
+ push N (c_dheap): 10444588.3 i/s
235
+ push N (findmin): 10191797.4 i/s - 1.02x slower
236
+ push N (c++ stl): 8210895.4 i/s - 1.27x slower
237
+ push N (rb_heap): 4369252.9 i/s - 2.39x slower
238
+ push N (bsearch): 1213580.4 i/s - 8.61x slower
239
+ == push N (N=1,000,000) ====================================================
240
+ push N (c_dheap): 10342183.7 i/s
241
+ push N (findmin): 9963898.8 i/s - 1.04x slower
242
+ push N (c++ stl): 7891924.8 i/s - 1.31x slower
243
+ push N (rb_heap): 4350116.0 i/s - 2.38x slower
244
+
245
+ All three heap implementations have little to no perceptible slowdown for `N >
246
+ 100`. But `DHeap` runs faster than `Array#push` to an unsorted array (findmin)!
247
+
248
+ #### push then pop N items
249
+
250
+ This measures the _average_ for a push **or** a pop, filling up a queue with N
251
+ items and then draining that queue until empty. It represents the amortized
252
+ cost of balanced pushes and pops to fill a heap and drain it.
253
+
254
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n_pop_n.png)
255
+
256
+ == push N then pop N (N=100) ===============================================
257
+ push N + pop N (c_dheap): 10954469.2 i/s
258
+ push N + pop N (c++ stl): 9317140.2 i/s - 1.18x slower
259
+ push N + pop N (bsearch): 4808770.2 i/s - 2.28x slower
260
+ push N + pop N (findmin): 4321411.9 i/s - 2.53x slower
261
+ push N + pop N (rb_heap): 2467417.0 i/s - 4.44x slower
262
+ == push N then pop N (N=10,000) ============================================
263
+ push N + pop N (c_dheap): 8083962.7 i/s
264
+ push N + pop N (c++ stl): 7365661.8 i/s - 1.10x slower
265
+ push N + pop N (bsearch): 2257047.9 i/s - 3.58x slower
266
+ push N + pop N (rb_heap): 1439204.3 i/s - 5.62x slower
267
+ == push N then pop N (N=1,000,000) =========================================
268
+ push N + pop N (c++ stl): 5274657.5 i/s
269
+ push N + pop N (c_dheap): 4731117.9 i/s - 1.11x slower
270
+ push N + pop N (rb_heap): 976688.6 i/s - 5.40x slower
271
+
272
+ At N=100 findmin still beats a pure-ruby heap. But above that it slows down too
273
+ much to be useful. At N=10k, bsearch still beats a pure ruby heap, but above
274
+ 30k it slows down too much to be useful. `DHeap` consistently runs 4.5-5.5x
275
+ faster than the pure ruby heap.
276
+
277
+ #### push & pop on N-item heap
278
+
279
+ This measures the combined time to push once and pop once, which is done
280
+ repeatedly while keeping a stable heap size of N. It's an approximation for
281
+ scenarios which reach a stable size and then plateau with balanced pushes and
282
+ pops. E.g. timers and timeouts will often reschedule themselves or replace
283
+ themselves with new timers or timeouts, maintaining a roughly stable total count
284
+ of timers.
285
+
286
+ ![bar graph for push_pop benchmarks](./images/push_pop.png)
287
+
288
+ push + pop (findmin)
289
+ N 10: 5480288.0 i/s
290
+ N 100: 2595178.8 i/s - 2.11x slower
291
+ N 1000: 224813.9 i/s - 24.38x slower
292
+ N 10000: 12630.7 i/s - 433.89x slower
293
+ N 100000: 1097.3 i/s - 4994.31x slower
294
+ N 1000000: 135.9 i/s - 40313.05x slower
295
+ N 10000000: 12.9 i/s - 425838.01x slower
296
+
297
+ push + pop (bsearch)
298
+ N 10: 3931408.4 i/s
299
+ N 100: 2904181.8 i/s - 1.35x slower
300
+ N 1000: 2203157.1 i/s - 1.78x slower
301
+ N 10000: 1209584.9 i/s - 3.25x slower
302
+ N 100000: 81121.4 i/s - 48.46x slower
303
+ N 1000000: 5356.0 i/s - 734.02x slower
304
+ N 10000000: 281.9 i/s - 13946.33x slower
305
+
306
+ push + pop (rb_heap)
307
+ N 10: 2325816.5 i/s
308
+ N 100: 1603540.3 i/s - 1.45x slower
309
+ N 1000: 1262515.2 i/s - 1.84x slower
310
+ N 10000: 950389.3 i/s - 2.45x slower
311
+ N 100000: 732548.8 i/s - 3.17x slower
312
+ N 1000000: 673577.8 i/s - 3.45x slower
313
+ N 10000000: 467512.3 i/s - 4.97x slower
314
+
315
+ push + pop (c++ stl)
316
+ N 10: 7706818.6 i/s - 1.01x slower
317
+ N 100: 7393127.3 i/s - 1.05x slower
318
+ N 1000: 6898781.3 i/s - 1.13x slower
319
+ N 10000: 5731130.5 i/s - 1.36x slower
320
+ N 100000: 4842393.2 i/s - 1.60x slower
321
+ N 1000000: 4170936.4 i/s - 1.86x slower
322
+ N 10000000: 2737146.6 i/s - 2.84x slower
323
+
324
+ push + pop (c_dheap)
325
+ N 10: 10196454.1 i/s
326
+ N 100: 9668679.8 i/s - 1.05x slower
327
+ N 1000: 9339557.0 i/s - 1.09x slower
328
+ N 10000: 8045103.0 i/s - 1.27x slower
329
+ N 100000: 7150276.7 i/s - 1.43x slower
330
+ N 1000000: 6490261.6 i/s - 1.57x slower
331
+ N 10000000: 3734856.5 i/s - 2.73x slower
332
+
333
+ ## Time complexity analysis
334
+
335
+ There are two fundamental heap operations: sift-up (used by push or decrease
336
+ score) and sift-down (used by pop or delete or increase score). Each sift
337
+ bubbles an item to its correct location in the tree.
338
+
339
+ * A _d_-ary heap has `log n / log d` layers, so either sift performs as many as
340
+ `log n / log d` writes, when a member sifts the entire length of the tree.
341
+ * Sift-up needs one comparison per layer: `O(log n / log d)`.
342
+ * Sift-down needs d comparisons per layer: `O(d log n / log d)`.
343
+
344
+ So, in the case of a balanced push then pop, as many as `(1 + d) log n / log d`
345
+ comparisons are made. Looking only at this worst case combo, `d=4` requires the
346
+ fewest comparisons for a combined push and pop:
347
+
348
+ * `(1 + 2) log n / log d ≈ 4.328085 log n`
349
+ * `(1 + 3) log n / log d ≈ 3.640957 log n`
350
+ * `(1 + 4) log n / log d ≈ 3.606738 log n`
351
+ * `(1 + 5) log n / log d ≈ 3.728010 log n`
352
+ * `(1 + 6) log n / log d ≈ 3.906774 log n`
353
+ * `(1 + 7) log n / log d ≈ 4.111187 log n`
354
+ * `(1 + 8) log n / log d ≈ 4.328085 log n`
355
+ * `(1 + 9) log n / log d ≈ 4.551196 log n`
356
+ * `(1 + 10) log n / log d ≈ 4.777239 log n`
357
+ * etc...
150
358
 
151
359
  See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
152
360
 
153
- ### Space complexity
361
+ However, what this simple count of comparisons misses is the extent to which
362
+ modern compilers can optimize code (e.g. by unrolling the comparison loop to
363
+ execute on registers) and more importantly how well modern processors are at
364
+ pipelined speculative execution using branch prediction, etc. Benchmarks should
365
+ be run on the _exact same_ hardware platform that production code will use,
366
+ as the sift-down operation is especially sensitive to good pipelining.
154
367
 
155
- Because the heap is a complete binary tree, space usage is linear, regardless
156
- of d. However higher d values may provide better cache locality.
368
+ ## Comparison performance
157
369
 
158
- We can run comparisons much much faster for Numeric or String objects than for
159
- ruby objects which delegate comparison to internal Numeric or String objects.
160
- And it is often advantageous to use extrinsic scores for uncomparable items.
161
- For this, our internal array uses twice as many entries (one for score and one
162
- for value) as it would if it only supported intrinsic comparison or used an
163
- un-memoized "sort_by" proc.
370
+ It is often useful to use external scores for otherwise uncomparable values.
371
+ And casting an item or score (via `to_f`) can also be time consuming. So
372
+ `DHeap` evaluates and stores scores at the time of insertion, and they will be
373
+ compared directly without needing any further lookup.
164
374
 
165
- ### Timers
375
+ Numeric values can be compared _much_ faster than other ruby objects, even if
376
+ those objects simply delegate comparison to internal Numeric values.
377
+ Additionally, native C integers or floats can be compared _much_ faster than
378
+ ruby `Numeric` objects. So scores are converted to Float and stored as
379
+ `double`, which is 64 bits on an [LP64 64-bit system].
166
380
 
167
- Additionally, when used to sort timers, we can reasonably assume that:
168
- * New timers usually sort after most existing timers.
169
- * Most timers will be canceled before executing.
170
- * Canceled timers usually sort after most existing timers.
171
-
172
- So, if we are able to delete an item without searching for it, by keeping a map
173
- of positions within the heap, most timers can be inserted and deleted in O(1)
174
- time. Canceling a non-leaf timer can be further optimized by marking it as
175
- canceled without immediately removing it from the heap. If the timer is
176
- rescheduled before we garbage collect, adjusting its position will usually be
177
- faster than a delete and re-insert.
381
+ [LP64 64-bit system]: https://en.wikipedia.org/wiki/64-bit_computing#64-bit_data_models
178
382
 
179
383
  ## Alternative data structures
180
384
 
181
- Depending on what you're doing, maintaining a sorted `Array` using
182
- `#bsearch_index` and `#insert` might be faster! Although it is technically
183
- O(n) for insertions, the implementations for `memcpy` or `memmove` can be *very*
184
- fast on modern architectures. Also, it can be faster O(n) on average, if
185
- insertions are usually near the end of the array. You should run benchmarks
186
- with your expected scenarios to determine which is right.
385
+ As always, you should run benchmarks with your expected scenarios to determine
386
+ which is best for your application.
387
+
388
+ Depending on your use-case, using a sorted `Array` using `#bsearch_index`
389
+ and `#insert` might be just fine! It only takes a couple of lines of code and
390
+ is probably "Fast Enough".
391
+
392
+ More complex heap variants, e.g. the [Fibonacci heap], allow heaps to be split and
393
+ merged which gives some graph algorithms a lower amortized time complexity. But
394
+ in practice, _d_-ary heaps have much lower overhead and often run faster.
395
+
396
+ [Fibonacci heap]: https://en.wikipedia.org/wiki/Fibonacci_heap
187
397
 
188
398
  If it is important to be able to quickly enumerate the set or find the ranking
189
- of values in it, then you probably want to use a self-balancing binary search
190
- tree (e.g. a red-black tree) or a skip-list.
191
-
192
- A Hashed Timing Wheel or Heirarchical Timing Wheels (or some variant in that
193
- family of data structures) can be constructed to have effectively O(1) running
194
- time in most cases. However, the implementation for that data structure is more
195
- complex than a heap. If a 4-ary heap is good enough for go's timers, it should
196
- be suitable for many use cases.
399
+ of values in it, then you may want to use a self-balancing binary search tree
400
+ (e.g. a [red-black tree]) or a [skip-list].
401
+
402
+ [red-black tree]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree
403
+ [skip-list]: https://en.wikipedia.org/wiki/Skip_list
404
+
405
+ [Hashed and Hierarchical Timing Wheels][timing wheel] (or some variant in the
406
+ timing wheel family of data structures) can have effectively `O(1)` running time
407
+ in most cases. Although the implementation for that data structure is more
408
+ complex than a heap, it may be necessary for enormous values of N.
409
+
410
+ [timing wheel]: http://www.cs.columbia.edu/~nahum/w6998/papers/ton97-timing-wheels.pdf
411
+
412
+ ## Supported platforms
413
+
414
+ See the [CI workflow] for all supported platforms.
415
+
416
+ [CI workflow]: https://github.com/nevans/d_heap/actions?query=workflow%3ACI
417
+
418
+ `d_heap` may contain bugs on 32-bit systems. Currently, `d_heap` is only tested
419
+ on 64-bit x86 CRuby 2.4-3.0 under Linux and Mac OS.
420
+
421
+ ## Caveats and TODOs (PRs welcome!)
422
+
423
+ A `DHeap`'s internal array grows but never shrinks. At the very least, there
424
+ should be a `#compact` or `#shrink` method, and it should shrink during `#freeze`. It might make
425
+ sense to automatically shrink (to no more than 2x the current size) during GC's
426
+ compact phase.
427
+
428
+ Benchmark sift-down min-child comparisons using SSE, AVX2, and AVX512F. This
429
+ might lead to a different default `d` value (maybe 16 or 24?).
430
+
431
+ Shrink scores to 64-bits: either store a type flag with each entry (this could
432
+ be used to support non-numeric scores) or require users to choose between
433
+ `Integer` or `Float` at construction time. Reducing memory usage should also
434
+ improve speed for very large heaps.
435
+
436
+ Patches to support JRuby, rubinius, 32-bit systems, or any other platforms are
437
+ welcome! JRuby and Truffle Ruby ought to be able to use [Java's PriorityQueue]?
438
+ Other platforms could fallback on the (slower) pure ruby implementation used by
439
+ the benchmarks.
440
+
441
+ [Java's PriorityQueue]: https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/PriorityQueue.html
442
+
443
+ Allow a max-heap (or other configurations of the compare function). This can be
444
+ very easily implemented by just reversing the scores.
445
+
446
+ _Maybe_ allow non-numeric scores to be compared with `<=>`, _only_ if the basic
447
+ numeric use case's simplicity and speed can be preserved.
448
+
449
+ Consider `DHeap::Monotonic`, which could rely on `#pop_below` for "current time"
450
+ and move all values below that time onto an Array.
451
+
452
+ Consider adding `DHeap::Lazy` or `DHeap.new(lazy: true)` which could contain
453
+ some features that are loosely inspired by go's timers. Go lazily sifts its
454
+ heap after deletion or adjustments, to achieve faster amortized runtime.
455
+ There's no need to actually remove a deleted item from the heap, if you re-add
456
+ it before it's next evaluated.
457
+ values in an internal `Hash`, assuming many will be deleted before they rise to
458
+ the top. This could naturally evolve into a [timing wheel] variant.
197
459
 
198
460
  ## Development
199
461