d_heap 0.3.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5a20fe814944fa8945bc2cc0ce87f810c8bf8d21102b8c454ae5d639f0548576
4
- data.tar.gz: 65a1af345ae84c7f6e5da8af89a00730ffc4cbdb6bb2cac59461c3728eb0ad28
3
+ metadata.gz: 5b51ed52baf74b585a7ab7799f92a446aef5852431ba10e146658b419657ffbe
4
+ data.tar.gz: cc7c6786eee78ec13214582b8701448d312f59fb723d12676fb673447ab409a7
5
5
  SHA512:
6
- metadata.gz: 27c240634013397033925ee258de08135587c31c7a046a902c028842680dadec20e2ccc0f76570e6f7db34d2292e5dae2ae7d17449436f890498697de4352c91
7
- data.tar.gz: 8eb2cba120747cc7788c42f264d2109ef1afee8a28caa7effb8bcb1ff36ed7c99a0556b48b7d9e9479f6e8be8945eea3bf1ddb7f608c13c63252684643154179
6
+ metadata.gz: 5de98f8c9084b30694fff5f8154a6e42e7e67d76518c25136ab4fb0c0afb047ad3c923f4544dcf613ded4c3b01417729aa796c973100faaa7ee93051fa630c7d
7
+ data.tar.gz: e5dbcc90da7adfba7ef45cd9a2da5fd1781a2bd489002a5ffc0a764915c035c178db30ae9b8431a8fc810cfa6f03a1b38ec0a50cbf23c2e1ba5dfc36549c0609
data/.clang-format ADDED
@@ -0,0 +1,21 @@
1
+ ---
2
+ BasedOnStyle: mozilla
3
+ IndentWidth: 4
4
+ PointerAlignment: Right
5
+ AlignAfterOpenBracket: Align
6
+ AlignConsecutiveAssignments: true
7
+ AlignConsecutiveDeclarations: true
8
+ AlignConsecutiveBitFields: true
9
+ AlignConsecutiveMacros: true
10
+ AlignEscapedNewlines: Right
11
+ AlignOperands: true
12
+
13
+ AllowAllConstructorInitializersOnNextLine: false
14
+ AllowShortIfStatementsOnASingleLine: WithoutElse
15
+
16
+ IndentCaseLabels: false
17
+ IndentPPDirectives: AfterHash
18
+
19
+ ForEachMacros:
20
+ - WHILE_PEEK_LT_P
21
+ ...
@@ -1,4 +1,4 @@
1
- name: Ruby
1
+ name: CI
2
2
 
3
3
  on: [push,pull_request]
4
4
 
@@ -7,7 +7,7 @@ jobs:
7
7
  strategy:
8
8
  fail-fast: false
9
9
  matrix:
10
- ruby: [2.5, 2.6, 2.7, 3.0]
10
+ ruby: [2.4, 2.5, 2.6, 2.7, 3.0]
11
11
  os: [ubuntu, macos]
12
12
  experimental: [false]
13
13
  runs-on: ${{ matrix.os }}-latest
@@ -23,4 +23,19 @@ jobs:
23
23
  run: |
24
24
  gem install bundler -v 2.2.3
25
25
  bundle install
26
- bundle exec rake
26
+ bundle exec rake ci
27
+
28
+ benchmarks:
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v2
32
+ - name: Set up Ruby
33
+ uses: ruby/setup-ruby@v1
34
+ with:
35
+ ruby-version: 2.7
36
+ bundler-cache: true
37
+ - name: Run the benchmarks
38
+ run: |
39
+ gem install bundler -v 2.2.3
40
+ bundle install
41
+ bundle exec rake ci:benchmarks
data/.gitignore CHANGED
@@ -10,6 +10,7 @@
10
10
  *.so
11
11
  *.o
12
12
  *.a
13
+ compile_commands.json
13
14
  mkmf.log
14
15
 
15
16
  # rspec failure tracking
data/.rubocop.yml CHANGED
@@ -3,9 +3,10 @@ inherit_mode:
3
3
  - Exclude
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.5
6
+ TargetRubyVersion: 2.4
7
7
  NewCops: disable
8
8
  Exclude:
9
+ - bin/benchmark-driver
9
10
  - bin/rake
10
11
  - bin/rspec
11
12
  - bin/rubocop
@@ -106,26 +107,50 @@ Naming/RescuedExceptionsVariableName: { Enabled: false }
106
107
  ###########################################################################
107
108
  # Matrics:
108
109
 
110
+ Metrics/CyclomaticComplexity:
111
+ Max: 10
112
+
109
113
  # Although it may be better to split specs into multiple files...?
110
114
  Metrics/BlockLength:
111
115
  Exclude:
112
116
  - "spec/**/*_spec.rb"
117
+ CountAsOne:
118
+ - array
119
+ - hash
120
+ - heredoc
121
+
122
+ Metrics/ClassLength:
123
+ Max: 200
124
+ CountAsOne:
125
+ - array
126
+ - hash
127
+ - heredoc
113
128
 
114
129
  ###########################################################################
115
130
  # Style...
116
131
 
117
132
  Style/AccessorGrouping: { Enabled: false }
118
133
  Style/AsciiComments: { Enabled: false } # 👮 can't stop our 🎉🥳🎊🥳!
134
+ Style/ClassAndModuleChildren: { Enabled: false }
119
135
  Style/EachWithObject: { Enabled: false }
120
136
  Style/FormatStringToken: { Enabled: false }
121
137
  Style/FloatDivision: { Enabled: false }
138
+ Style/GuardClause: { Enabled: false } # usually nice to do, but...
139
+ Style/IfUnlessModifier: { Enabled: false }
140
+ Style/IfWithSemicolon: { Enabled: false }
122
141
  Style/Lambda: { Enabled: false }
123
142
  Style/LineEndConcatenation: { Enabled: false }
124
143
  Style/MixinGrouping: { Enabled: false }
144
+ Style/MultilineBlockChain: { Enabled: false }
125
145
  Style/PerlBackrefs: { Enabled: false } # use occasionally/sparingly
126
146
  Style/RescueStandardError: { Enabled: false }
147
+ Style/Semicolon: { Enabled: false }
127
148
  Style/SingleLineMethods: { Enabled: false }
128
149
  Style/StabbyLambdaParentheses: { Enabled: false }
150
+ Style/WhenThen : { Enabled: false }
151
+
152
+ # I require trailing commas elsewhere, but these are optional
153
+ Style/TrailingCommaInArguments: { Enabled: false }
129
154
 
130
155
  # If rubocop had an option to only enforce this on constants and literals (e.g.
131
156
  # strings, regexp, range), I'd agree.
@@ -149,7 +174,9 @@ Style/BlockDelimiters:
149
174
  EnforcedStyle: semantic
150
175
  AllowBracesOnProceduralOneLiners: true
151
176
  IgnoredMethods:
152
- - expect
177
+ - expect # rspec
178
+ - profile # ruby-prof
179
+ - ips # benchmark-ips
153
180
 
154
181
 
155
182
  Style/FormatString:
@@ -168,3 +195,6 @@ Style/TrailingCommaInHashLiteral:
168
195
 
169
196
  Style/TrailingCommaInArrayLiteral:
170
197
  EnforcedStyleForMultiline: consistent_comma
198
+
199
+ Style/YodaCondition:
200
+ EnforcedStyle: forbid_for_equality_operators_only
data/.yardopts ADDED
@@ -0,0 +1,10 @@
1
+ -o doc
2
+ --embed-mixins
3
+ --hide-void-return
4
+ --no-private
5
+ --asset images:images
6
+ --exclude lib/benchmark_driver
7
+ --exclude lib/d_heap/benchmarks*
8
+ -
9
+ CHANGELOG.md
10
+ CODE_OF_CONDUCT.md
data/CHANGELOG.md ADDED
@@ -0,0 +1,93 @@
1
+ ## Current/Unreleased
2
+
3
+ ## Release v0.7.0 (2021-01-24)
4
+
5
+ * 💥⚡️ **BREAKING**: Uses `double` for _all_ scores.
6
+ * 💥 Integers larger than a double mantissa (53-bits) will lose some
7
+ precision.
8
+ * ⚡️ big speed up
9
+ * ⚡️ Much better memory usage
10
+ * ⚡️ Simplifies score conversion between ruby and C
11
+ * ✨ Added `DHeap::Map` for ensuring values can only be added once, by `#hash`.
12
+ * Adding again will update the score.
13
+ * Adds `DHeap::Map#[]` for quick lookup of existing scores
14
+ * Adds `DHeap::Map#[]=` for adjustments of existing scores
15
+ * TODO: `DHeap::Map#delete`
16
+ * 📝📈 SO MANY BENCHMARKS
17
+ * ⚡️ Set DEFAULT_D to 6, based on benchmarks.
18
+ * 🐛♻️ convert all `long` indexes to `size_t`
19
+
20
+ ## Release v0.6.1 (2021-01-24)
21
+
22
+ * 📝 Fix link to CHANGELOG.md in gemspec
23
+
24
+ ## Release v0.6.0 (2021-01-24)
25
+
26
+ * 🔥 **Breaking**: `#initialize` uses a keyword argument for `d`
27
+ * ✨ Added `#initialize(capacity: capa)` to set initial capacity.
28
+ * ✨ Added `peek_with_score` and `peek_score`
29
+ * ✨ Added `pop_with_score` and `each_pop(with_score: true)`
30
+ * ✨ Added `pop_all_below(max_score, array = [])`
31
+ * ✨ Added aliases for `shift` and `next`
32
+ * 📈 Added benchmark charts to README, and `bin/bench_charts` to generate them.
33
+ * requires `gruff` which requires `rmagick` which requires `imagemagick`
34
+ * 📝 Many documentation updates and fixes.
35
+
36
+ ## Release v0.5.0 (2021-01-17)
37
+
38
+ * 🔥 **Breaking**: reversed order of `#push` arguments to `value, score`.
39
+ * ✨ Added `#insert(score, value)` to replace earlier version of `#push`.
40
+ * ✨ Added `#each_pop` enumerator.
41
+ * ✨ Added aliases for `deq`, `enq`, `first`, `pop_below`, `length`, and
42
+ `count`, to mimic other classes in ruby's stdlib.
43
+ * ⚡️♻️ More performance improvements:
44
+ * Created an `ENTRY` struct and store both the score and the value pointer in
45
+ the same `ENTRY *entries` array.
46
+ * Reduced unnecessary allocations or copies in both sift loops. A similar
47
+ refactoring also sped up the pure ruby benchmark implementation.
48
+ * Compiling with `-O3`.
49
+ * 📝 Updated (and in some cases, fixed) yardoc
50
+ * ♻️ Moved aliases and less performance sensitive code into ruby.
51
+ * ♻️ DRY up push/insert methods
52
+
53
+ ## Release v0.4.0 (2021-01-12)
54
+
55
+ * 🔥 **Breaking**: Scores must be `Integer` or convertible to `Float`
56
+ * ⚠️ `Integer` scores must fit in `-ULONG_LONG_MAX` to `+ULONG_LONG_MAX`.
57
+ * ⚡️ Big performance improvements, by using C `long double *cscores` array
58
+ * ⚡️ many many (so many) updates to benchmarks
59
+ * ✨ Added `DHeap#clear`
60
+ * 🐛 Fixed `DHeap#initialize_copy` and `#freeze`
61
+ * ♻️ significant refactoring
62
+ * 📝 Updated docs (mostly adding benchmarks)
63
+
64
+ ## Release v0.3.0 (2020-12-29)
65
+
66
+ * 🔥 **Breaking**: Removed class methods that operated directly on an array.
67
+ They weren't compatible with the performance improvements.
68
+ * ⚡️ Big performance improvements, by converting to a `T_DATA` struct.
69
+ * ♻️ Major refactoring/rewriting of dheap.c
70
+ * ✅ Added benchmark specs
71
+
72
+ ## Release v0.2.2 (2020-12-27)
73
+
74
+ * 🐛 fix `optimized_cmp`, avoiding internal symbols
75
+ * 📝 Update documentation
76
+ * 💚 fix macos CI
77
+ * ➕ Add rubocop 👮🎨
78
+
79
+ ## Release v0.2.1 (2020-12-26)
80
+
81
+ * ⬆️ Upgraded rake (and bundler) to support ruby 3.0
82
+
83
+ ## Release v0.2.0 (2020-12-24)
84
+
85
+ * ✨ Add ability to push separate score and value
86
+ * ⚡️ Big performance gain, by storing scores separately and using ruby's
87
+ internal `OPTIMIZED_CMP` instead of always directly calling `<=>`
88
+
89
+ ## Release v0.1.0 (2020-12-22)
90
+
91
+ 🎉 initial release 🎉
92
+
93
+ * ✨ Add basic d-ary Heap implementation
data/D ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/sh
2
+ set -eu
3
+
4
+ export BENCH_D="$1"
5
+ shift
6
+
7
+ exec ruby "$@"
data/README.md CHANGED
@@ -1,199 +1,461 @@
1
- # DHeap
1
+ # DHeap - Fast d-ary heap for ruby
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/d_heap.svg)](https://badge.fury.io/rb/d_heap)
4
+ [![Build Status](https://github.com/nevans/d_heap/workflows/CI/badge.svg)](https://github.com/nevans/d_heap/actions?query=workflow%3ACI)
5
+ [![Maintainability](https://api.codeclimate.com/v1/badges/ff274acd0683c99c03e1/maintainability)](https://codeclimate.com/github/nevans/d_heap/maintainability)
6
+
7
+ A fast [_d_-ary heap][d-ary heap] [priority queue] implementation for ruby,
8
+ implemented as a C extension.
9
+
10
+ A regular queue has "FIFO" behavior: first in, first out. A stack is "LIFO":
11
+ last in first out. A priority queue pushes each element with a score and pops
12
+ out in order by score. Priority queues are often used in algorithms for e.g.
13
+ [scheduling] of timers or bandwidth management, for [Huffman coding], and for
14
+ various graph search algorithms such as [Dijkstra's algorithm], [A* search], or
15
+ [Prim's algorithm].
16
+
17
+ From [wikipedia](https://en.wikipedia.org/wiki/Heap_(data_structure)):
18
+ > A heap is a specialized tree-based data structure which is essentially an
19
+ > almost complete tree that satisfies the heap property: in a min heap, for any
20
+ > given node C, if P is a parent node of C, then the key (the value) of P is
21
+ > less than or equal to the key of C. The node at the "top" of the heap (with no
22
+ > parents) is called the root node.
23
+
24
+ ![tree representation of a min heap](images/wikipedia-min-heap.png)
25
+
26
+ The _d_-ary heap data structure is a generalization of a [binary heap] in which
27
+ each node has _d_ children instead of 2. This speeds up "push" or "decrease
28
+ priority" operations (`O(log n / log d)`) with the tradeoff of slower "pop" or
29
+ "increase priority" (`O(d log n / log d)`). Additionally, _d_-ary heaps can
30
+ have better memory cache behavior than binary heaps, letting them run more
31
+ quickly in practice.
32
+
33
+ Although the default _d_ value will usually perform best (see the time
34
+ complexity analysis below), it's always advisable to benchmark your specific
35
+ use-case. In particular, if you push items more than you pop, higher values for
36
+ _d_ can give a faster total runtime.
37
+
38
+ [d-ary heap]: https://en.wikipedia.org/wiki/D-ary_heap
39
+ [priority queue]: https://en.wikipedia.org/wiki/Priority_queue
40
+ [binary heap]: https://en.wikipedia.org/wiki/Binary_heap
41
+ [scheduling]: https://en.wikipedia.org/wiki/Scheduling_(computing)
42
+ [Huffman coding]: https://en.wikipedia.org/wiki/Huffman_coding#Compression
43
+ [Dijkstra's algorithm]: https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Using_a_priority_queue
44
+ [A* search]: https://en.wikipedia.org/wiki/A*_search_algorithm#Description
45
+ [Prim's algorithm]: https://en.wikipedia.org/wiki/Prim%27s_algorithm
2
46
 
3
- A fast _d_-ary heap implementation for ruby, useful in priority queues and graph
4
- algorithms.
47
+ ## Installation
5
48
 
6
- The _d_-ary heap data structure is a generalization of the binary heap, in which
7
- the nodes have _d_ children instead of 2. This allows for "decrease priority"
8
- operations to be performed more quickly with the tradeoff of slower delete
9
- minimum. Additionally, _d_-ary heaps can have better memory cache behavior than
10
- binary heaps, allowing them to run more quickly in practice despite slower
11
- worst-case time complexity. In the worst case, a _d_-ary heap requires only
12
- `O(log n / log d)` to push, with the tradeoff that pop is `O(d log n / log d)`.
49
+ Add this line to your application's Gemfile:
13
50
 
14
- Although you should probably just stick with the default _d_ value of `4`, it
15
- may be worthwhile to benchmark your specific scenario.
51
+ ```ruby
52
+ gem 'd_heap'
53
+ ```
54
+
55
+ And then execute:
56
+
57
+ $ bundle install
58
+
59
+ Or install it yourself as:
60
+
61
+ $ gem install d_heap
16
62
 
17
63
  ## Usage
18
64
 
19
- The simplest way to use it is simply with `#push` and `#pop`. Push takes a
20
- score and a value, and pop returns the value with the current minimum score.
65
+ The basic API is `#push(object, score)` and `#pop`. Please read the [full
66
+ documentation] for more details. The score must be convertible to a `Float` via
67
+ `Float(score)` (i.e. it should properly implement `#to_f`).
68
+
69
+ Quick reference for the most common methods:
70
+
71
+ * `heap << object` adds a value, using `Float(object)` as its intrinsic score.
72
+ * `heap.push(object, score)` adds a value with an extrinsic score.
73
+ * `heap.peek` to view the minimum value without popping it.
74
+ * `heap.pop` removes and returns the value with the minimum score.
75
+ * `heap.pop_below(max_score)` pops only if the next score is `<` the argument.
76
+ * `heap.clear` to remove all items from the heap.
77
+ * `heap.empty?` returns true if the heap is empty.
78
+ * `heap.size` returns the number of items in the heap.
79
+
80
+ ### Examples
21
81
 
22
82
  ```ruby
83
+ # create some example objects to place in our heap
84
+ Task = Struct.new(:id, :time) do
85
+ def to_f; time.to_f end
86
+ end
87
+ t1 = Task.new(1, Time.now + 5*60)
88
+ t2 = Task.new(2, Time.now + 50)
89
+ t3 = Task.new(3, Time.now + 60)
90
+ t4 = Task.new(4, Time.now + 5)
91
+
92
+ # create the heap
23
93
  require "d_heap"
94
+ heap = DHeap.new
24
95
 
25
- heap = DHeap.new # defaults to a 4-ary heap
96
+ # push with an explicit score (which might be extrinsic to the value)
97
+ heap.push t1, t1.to_f
26
98
 
27
- # storing [score, value] tuples
28
- heap.push Time.now + 5*60, Task.new(1)
29
- heap.push Time.now + 30, Task.new(2)
30
- heap.push Time.now + 60, Task.new(3)
31
- heap.push Time.now + 5, Task.new(4)
99
+ # the score will be implicitly cast with Float, so any object with #to_f
100
+ heap.push t2, t2
32
101
 
33
- # peeking and popping (using last to get the task and ignore the time)
34
- heap.pop.last # => Task[4]
35
- heap.pop.last # => Task[2]
36
- heap.peak.last # => Task[3], but don't pop it
37
- heap.pop.last # => Task[3]
38
- heap.pop.last # => Task[1]
39
- ```
102
+ # if the object has an intrinsic score via #to_f, "<<" is the simplest API
103
+ heap << t3 << t4
40
104
 
41
- Read the `rdoc` for more detailed documentation and examples.
105
+ # pop returns the lowest scored item, and removes it from the heap
106
+ heap.pop # => #<struct Task id=4, time=2021-01-17 17:02:22.5574 -0500>
107
+ heap.pop # => #<struct Task id=2, time=2021-01-17 17:03:07.5574 -0500>
42
108
 
43
- ## Installation
109
+ # peek returns the lowest scored item, without removing it from the heap
110
+ heap.peek # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
111
+ heap.pop # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
44
112
 
45
- Add this line to your application's Gemfile:
113
+ # pop_lte handles the common "h.pop if h.peek_score < max" pattern
114
+ heap.pop_lte(Time.now + 65) # => nil
46
115
 
47
- ```ruby
48
- gem 'd_heap'
116
+ # the heap size can be inspected with size and empty?
117
+ heap.empty? # => false
118
+ heap.size # => 1
119
+ heap.pop # => #<struct Task id=1, time=2021-01-17 17:07:17.5574 -0500>
120
+ heap.empty? # => true
121
+ heap.size # => 0
122
+
123
+ # popping from an empty heap returns nil
124
+ heap.pop # => nil
49
125
  ```
50
126
 
51
- And then execute:
127
+ Please see the [full documentation] for more methods and more examples.
52
128
 
53
- $ bundle install
129
+ [full documentation]: https://rubydoc.info/gems/d_heap/DHeap
54
130
 
55
- Or install it yourself as:
131
+ ### DHeap::Map
56
132
 
57
- $ gem install d_heap
133
+ `DHeap::Map` augments the heap with an internal `Hash`, mapping objects to their
134
+ index in the heap. For simple push/pop this is a bit slower than a normal `DHeap`
135
+ heap, but it can enable huge speed-ups for algorithms that need to adjust scores
136
+ after they've been added, e.g. [Dijkstra's algorithm]. It adds the following:
58
137
 
59
- ## Motivation
60
-
61
- Sometimes you just need a priority queue, right? With a regular queue, you
62
- expect "FIFO" behavior: first in, first out. With a priority queue, you push
63
- with a score (or your elements are comparable), and you want to be able to
64
- efficiently pop off the minimum (or maximum) element.
65
-
66
- One obvious approach is to simply maintain an array in sorted order. And
67
- ruby's Array class makes it simple to maintain a sorted array by combining
68
- `#bsearch_index` with `#insert`. With certain insert/remove workloads that can
69
- perform very well, but in the worst-case an insert or delete can result in O(n),
70
- since `#insert` may need to `memcpy` or `memmove` a significant portion of the
71
- array.
72
-
73
- But the standard way to efficiently and simply solve this problem is using a
74
- binary heap. Although it increases the time for `pop`, it converts the
75
- amortized time per push + pop from `O(n)` to `O(d log n / log d)`.
76
-
77
- I was surprised to find that, at least under certain benchmarks, my pure ruby
78
- heap implementation was usually slower than inserting into a fully sorted
79
- array. While this is a testament to ruby's fine-tuned Array implementation, a
80
- heap implemented in C should easily perform faster than `Array#insert`.
81
-
82
- The biggest issue is that it just takes far too much time to call `<=>` from
83
- ruby code: A sorted array only requires `log n / log 2` comparisons to insert
84
- and no comparisons to pop. However a _d_-ary heap requires `log n / log d` to
85
- insert plus an additional `d log n / log d` to pop. If your queue contains only
86
- a few hundred items at once, the overhead of those extra calls to `<=>` is far
87
- more than occasionally calling `memcpy`.
88
-
89
- It's likely that MJIT will eventually make the C-extension completely
90
- unnecessary. This is definitely hotspot code, and the basic ruby implementation
91
- would work fine, if not for that `<=>` overhead. Until then... this gem gets
92
- the job done.
93
-
94
- ## TODOs...
95
-
96
- _TODO:_ In addition to a basic _d_-ary heap class (`DHeap`), this library
97
- ~~includes~~ _will include_ extensions to `Array`, allowing an Array to be
98
- directly handled as a priority queue. These extension methods are meant to be
99
- used similarly to how `#bsearch` and `#bsearch_index` might be used.
100
-
101
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Set`, which augments the
102
- basic heap with an internal `Hash`, which maps a set of values to scores.
103
- loosely inspired by go's timers. e.g: It lazily sifts its heap after deletion
104
- and adjustments, to achieve faster average runtime for *add* and *cancel*
105
- operations.
106
-
107
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Timers`, which contains some
108
- features that are loosely inspired by go's timers. e.g: It lazily sifts its
109
- heap after deletion and adjustments, to achieve faster average runtime for *add*
110
- and *cancel* operations.
111
-
112
- Additionally, I was inspired by reading go's "timer.go" implementation to
113
- experiment with a 4-ary heap instead of the traditional binary heap. In the
114
- case of timers, new timers are usually scheduled to run after most of the
115
- existing timers. And timers are usually canceled before they have a chance to
116
- run. While a binary heap holds 50% of its elements in its last layer, 75% of a
117
- 4-ary heap will have no children. That diminishes the extra comparison overhead
118
- during sift-down.
138
+ * a uniqueness constraint, by `#hash` value
139
+ * `#[obj] # => score` or `#score(obj)` in `O(1)`
140
+ * `#[obj] = new_score` or `#rescore(obj, score)` in `O(d log n / log d)`
141
+ * TODO:
142
+ * optionally unique by object identity
143
+ * `#delete(obj)` in `O(d log n / log d)` (TODO)
119
144
 
120
- ## Benchmarks
145
+ ## Scores
121
146
 
122
- _TODO: put benchmarks here._
147
+ If a score changes while the object is still in the heap, it will not be
148
+ re-evaluated again.
123
149
 
124
- ## Analysis
150
+ Constraining scores to `Float` gives enormous performance benefits. n.b.
151
+ very large `Integer` values will lose precision when converted to `Float`. This
152
+ is compiler and architecture dependent but with gcc on an IA-64 system, `Float`
153
+ is 64 bits with a 53-bit mantissa, which gives a range of -9,007,199,254,740,991
154
+ to +9,007,199,254,740,991, which is _not_ enough to store the precise POSIX
155
+ time since the epoch in nanoseconds. This can be worked around by adding a
156
+ bias, but probably it's good enough for most usage.
125
157
 
126
- ### Time complexity
158
+ _Comparing arbitrary objects via_ `a <=> b` _was the original design and may be
159
+ added back in a future version,_ if (and only if) _it can be done without
160
+ impacting the speed of numeric comparisons._
127
161
 
128
- Both sift operations can perform (log[d] n = log n / log d) swaps.
129
- Swap up performs only a single comparison per swap: O(1).
130
- Swap down performs as many as d comparisons per swap: O(d).
162
+ ## Thread safety
131
163
 
132
- Inserting an item is O(log n / log d).
133
- Deleting the root is O(d log n / log d).
164
+ `DHeap` is _not_ thread-safe, so concurrent access from multiple threads needs to
165
+ take precautions such as locking access behind a mutex.
134
166
 
135
- Assuming every inserted item is eventually deleted from the root, d=4 requires
136
- the fewest comparisons for combined insert and delete:
137
- * (1 + 2) lg 2 = 4.328085
138
- * (1 + 3) lg 3 = 3.640957
139
- * (1 + 4) lg 4 = 3.606738
140
- * (1 + 5) lg 5 = 3.728010
141
- * (1 + 6) lg 6 = 3.906774
142
- * etc...
167
+ ## Benchmarks
143
168
 
144
- Leaf nodes require no comparisons to shift down, and higher values for d have
145
- higher percentage of leaf nodes:
146
- * d=2 has ~50% leaves,
147
- * d=3 has ~67% leaves,
148
- * d=4 has ~75% leaves,
149
- * and so on...
169
+ _See full benchmark output in subdirs of `benchmarks`. See also for updated
170
+ results. These benchmarks were measured with an Intel Core i7-1065G7 8x3.9GHz
171
+ with d_heap v0.5.0 and ruby 2.7.2 without MJIT enabled._
172
+
173
+ ### Implementations
174
+
175
+ * **findmin** -
176
+ A very fast `O(1)` push using `Array#push` onto an unsorted Array, but a
177
+ very slow `O(n)` pop using `Array#min`, `Array#rindex(min)` and
178
+ `Array#delete_at(min_index)`. Push + pop is still fast for `n < 100`, but
179
+ unusably slow for `n > 1000`.
180
+
181
+ * **bsearch** -
182
+ A simple implementation with a slow `O(n)` push using `Array#bsearch` +
183
+ `Array#insert` to maintain a sorted Array, but a very fast `O(1)` pop with
184
+ `Array#pop`. It is still relatively fast for `n < 10000`, but its linear
185
+ time complexity really destroys it after that.
186
+
187
+ * **rb_heap** -
188
+ A pure ruby binary min-heap that has been tuned for performance by making
189
+ few method calls and allocating and assigning as few variables as possible.
190
+ It runs in `O(log n)` for both push and pop, although pop is slower than
191
+ push by a constant factor. Its much higher constant factors makes it lose
192
+ to `bsearch` push + pop for `n < 10000` but it holds steady with very little
193
+ slowdown even with `n > 10000000`.
194
+
195
+ * **c++ stl** -
196
+ A thin wrapper around the [priority_queue_cxx gem] which uses the [C++ STL
197
+ priority_queue]. The wrapper is simply to provide compatibility with the
198
+ other benchmarked implementations, but it should be possible to speed this
199
+ up a little bit by benchmarking the `priority_queue_cxx` API directly. It
200
+ has the same time complexity as rb_heap but its much lower constant
201
+ factors allow it to easily outperform `bsearch`.
202
+
203
+ * **c_dheap** -
204
+ A {DHeap} instance with the default `d` value of `4`. It has the same time
205
+ complexity as `rb_heap` and `c++ stl`, but is faster than both in every
206
+ benchmarked scenario.
207
+
208
+ [priority_queue_cxx gem]: https://rubygems.org/gems/priority_queue_cxx
209
+ [C++ STL priority_queue]: http://www.cplusplus.com/reference/queue/priority_queue/
210
+
211
+ ### Scenarios
212
+
213
+ Each benchmark increases N exponentially, either by √1̅0̅ or approximating
214
+ (alternating between x3 and x3.333) in order to simplify keeping loop counts
215
+ evenly divisible by N.
216
+
217
+ #### push N items
218
+
219
+ This measures the _average time per insert_ to create a queue of size N
220
+ (clearing the queue once it reaches that size). Use cases which push (or
221
+ decrease) more values than they pop, e.g. [Dijkstra's algorithm] or [Prim's
222
+ algorithm] when the graph has more edges than verticies, may want to pay more
223
+ attention to this benchmark.
224
+
225
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n.png)
226
+
227
+ == push N (N=100) ==========================================================
228
+ push N (c_dheap): 10522662.6 i/s
229
+ push N (findmin): 9980622.3 i/s - 1.05x slower
230
+ push N (c++ stl): 7991608.3 i/s - 1.32x slower
231
+ push N (rb_heap): 4607849.4 i/s - 2.28x slower
232
+ push N (bsearch): 2769106.2 i/s - 3.80x slower
233
+ == push N (N=10,000) =======================================================
234
+ push N (c_dheap): 10444588.3 i/s
235
+ push N (findmin): 10191797.4 i/s - 1.02x slower
236
+ push N (c++ stl): 8210895.4 i/s - 1.27x slower
237
+ push N (rb_heap): 4369252.9 i/s - 2.39x slower
238
+ push N (bsearch): 1213580.4 i/s - 8.61x slower
239
+ == push N (N=1,000,000) ====================================================
240
+ push N (c_dheap): 10342183.7 i/s
241
+ push N (findmin): 9963898.8 i/s - 1.04x slower
242
+ push N (c++ stl): 7891924.8 i/s - 1.31x slower
243
+ push N (rb_heap): 4350116.0 i/s - 2.38x slower
244
+
245
+ All three heap implementations have little to no perceptible slowdown for `N >
246
+ 100`. But `DHeap` runs faster than `Array#push` to an unsorted array (findmin)!
247
+
248
+ #### push then pop N items
249
+
250
+ This measures the _average_ for a push **or** a pop, filling up a queue with N
251
+ items and then draining that queue until empty. It represents the amortized
252
+ cost of balanced pushes and pops to fill a heap and drain it.
253
+
254
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n_pop_n.png)
255
+
256
+ == push N then pop N (N=100) ===============================================
257
+ push N + pop N (c_dheap): 10954469.2 i/s
258
+ push N + pop N (c++ stl): 9317140.2 i/s - 1.18x slower
259
+ push N + pop N (bsearch): 4808770.2 i/s - 2.28x slower
260
+ push N + pop N (findmin): 4321411.9 i/s - 2.53x slower
261
+ push N + pop N (rb_heap): 2467417.0 i/s - 4.44x slower
262
+ == push N then pop N (N=10,000) ============================================
263
+ push N + pop N (c_dheap): 8083962.7 i/s
264
+ push N + pop N (c++ stl): 7365661.8 i/s - 1.10x slower
265
+ push N + pop N (bsearch): 2257047.9 i/s - 3.58x slower
266
+ push N + pop N (rb_heap): 1439204.3 i/s - 5.62x slower
267
+ == push N then pop N (N=1,000,000) =========================================
268
+ push N + pop N (c++ stl): 5274657.5 i/s
269
+ push N + pop N (c_dheap): 4731117.9 i/s - 1.11x slower
270
+ push N + pop N (rb_heap): 976688.6 i/s - 5.40x slower
271
+
272
+ At N=100 findmin still beats a pure-ruby heap. But above that it slows down too
273
+ much to be useful. At N=10k, bsearch still beats a pure ruby heap, but above
274
+ 30k it slows down too much to be useful. `DHeap` consistently runs 4.5-5.5x
275
+ faster than the pure ruby heap.
276
+
277
+ #### push & pop on N-item heap
278
+
279
+ This measures the combined time to push once and pop once, which is done
280
+ repeatedly while keeping a stable heap size of N. It's an approximation for
281
+ scenarios which reach a stable size and then plateau with balanced pushes and
282
+ pops. E.g. timers and timeouts will often reschedule themselves or replace
283
+ themselves with new timers or timeouts, maintaining a roughly stable total count
284
+ of timers.
285
+
286
+ ![bar graph for push_pop benchmarks](./images/push_pop.png)
287
+
288
+ push + pop (findmin)
289
+ N 10: 5480288.0 i/s
290
+ N 100: 2595178.8 i/s - 2.11x slower
291
+ N 1000: 224813.9 i/s - 24.38x slower
292
+ N 10000: 12630.7 i/s - 433.89x slower
293
+ N 100000: 1097.3 i/s - 4994.31x slower
294
+ N 1000000: 135.9 i/s - 40313.05x slower
295
+ N 10000000: 12.9 i/s - 425838.01x slower
296
+
297
+ push + pop (bsearch)
298
+ N 10: 3931408.4 i/s
299
+ N 100: 2904181.8 i/s - 1.35x slower
300
+ N 1000: 2203157.1 i/s - 1.78x slower
301
+ N 10000: 1209584.9 i/s - 3.25x slower
302
+ N 100000: 81121.4 i/s - 48.46x slower
303
+ N 1000000: 5356.0 i/s - 734.02x slower
304
+ N 10000000: 281.9 i/s - 13946.33x slower
305
+
306
+ push + pop (rb_heap)
307
+ N 10: 2325816.5 i/s
308
+ N 100: 1603540.3 i/s - 1.45x slower
309
+ N 1000: 1262515.2 i/s - 1.84x slower
310
+ N 10000: 950389.3 i/s - 2.45x slower
311
+ N 100000: 732548.8 i/s - 3.17x slower
312
+ N 1000000: 673577.8 i/s - 3.45x slower
313
+ N 10000000: 467512.3 i/s - 4.97x slower
314
+
315
+ push + pop (c++ stl)
316
+ N 10: 7706818.6 i/s - 1.01x slower
317
+ N 100: 7393127.3 i/s - 1.05x slower
318
+ N 1000: 6898781.3 i/s - 1.13x slower
319
+ N 10000: 5731130.5 i/s - 1.36x slower
320
+ N 100000: 4842393.2 i/s - 1.60x slower
321
+ N 1000000: 4170936.4 i/s - 1.86x slower
322
+ N 10000000: 2737146.6 i/s - 2.84x slower
323
+
324
+ push + pop (c_dheap)
325
+ N 10: 10196454.1 i/s
326
+ N 100: 9668679.8 i/s - 1.05x slower
327
+ N 1000: 9339557.0 i/s - 1.09x slower
328
+ N 10000: 8045103.0 i/s - 1.27x slower
329
+ N 100000: 7150276.7 i/s - 1.43x slower
330
+ N 1000000: 6490261.6 i/s - 1.57x slower
331
+ N 10000000: 3734856.5 i/s - 2.73x slower
332
+
333
+ ## Time complexity analysis
334
+
335
+ There are two fundamental heap operations: sift-up (used by push or decrease
336
+ score) and sift-down (used by pop or delete or increase score). Each sift
337
+ bubbles an item to its correct location in the tree.
338
+
339
+ * A _d_-ary heap has `log n / log d` layers, so either sift performs as many as
340
+ `log n / log d` writes, when a member sifts the entire length of the tree.
341
+ * Sift-up needs one comparison per layer: `O(log n / log d)`.
342
+ * Sift-down needs d comparisons per layer: `O(d log n / log d)`.
343
+
344
+ So, in the case of a balanced push then pop, as many as `(1 + d) log n / log d`
345
+ comparisons are made. Looking only at this worst case combo, `d=4` requires the
346
+ fewest comparisons for a combined push and pop:
347
+
348
+ * `(1 + 2) log n / log 2 ≈ 4.328085 log n`
349
+ * `(1 + 3) log n / log 3 ≈ 3.640957 log n`
350
+ * `(1 + 4) log n / log 4 ≈ 3.606738 log n`
351
+ * `(1 + 5) log n / log 5 ≈ 3.728010 log n`
352
+ * `(1 + 6) log n / log 6 ≈ 3.906774 log n`
353
+ * `(1 + 7) log n / log 7 ≈ 4.111187 log n`
354
+ * `(1 + 8) log n / log 8 ≈ 4.328085 log n`
355
+ * `(1 + 9) log n / log 9 ≈ 4.551196 log n`
356
+ * `(1 + 10) log n / log 10 ≈ 4.777239 log n`
357
+ * etc...
150
358
 
151
359
  See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
152
360
 
153
- ### Space complexity
361
+ However, what this simple count of comparisons misses is the extent to which
362
+ modern compilers can optimize code (e.g. by unrolling the comparison loop to
363
+ execute on registers) and more importantly how well modern processors are at
364
+ pipelined speculative execution using branch prediction, etc. Benchmarks should
365
+ be run on the _exact same_ hardware platform that production code will use,
366
+ as the sift-down operation is especially sensitive to good pipelining.
154
367
 
155
- Because the heap is a complete binary tree, space usage is linear, regardless
156
- of d. However higher d values may provide better cache locality.
368
+ ## Comparison performance
157
369
 
158
- We can run comparisons much much faster for Numeric or String objects than for
159
- ruby objects which delegate comparison to internal Numeric or String objects.
160
- And it is often advantageous to use extrinsic scores for uncomparable items.
161
- For this, our internal array uses twice as many entries (one for score and one
162
- for value) as it would if it only supported intrinsic comparison or used an
163
- un-memoized "sort_by" proc.
370
+ It is often useful to use external scores for otherwise uncomparable values.
371
+ And casting an item or score (via `to_f`) can also be time consuming. So
372
+ `DHeap` evaluates and stores scores at the time of insertion, and they will be
373
+ compared directly without needing any further lookup.
164
374
 
165
- ### Timers
375
+ Numeric values can be compared _much_ faster than other ruby objects, even if
376
+ those objects simply delegate comparison to internal Numeric values.
377
+ Additionally, native C integers or floats can be compared _much_ faster than
378
+ ruby `Numeric` objects. So scores are converted to Float and stored as
379
+ `double`, which is 64 bits on an [LP64 64-bit system].
166
380
 
167
- Additionally, when used to sort timers, we can reasonably assume that:
168
- * New timers usually sort after most existing timers.
169
- * Most timers will be canceled before executing.
170
- * Canceled timers usually sort after most existing timers.
171
-
172
- So, if we are able to delete an item without searching for it, by keeping a map
173
- of positions within the heap, most timers can be inserted and deleted in O(1)
174
- time. Canceling a non-leaf timer can be further optimized by marking it as
175
- canceled without immediately removing it from the heap. If the timer is
176
- rescheduled before we garbage collect, adjusting its position will usually be
177
- faster than a delete and re-insert.
381
+ [LP64 64-bit system]: https://en.wikipedia.org/wiki/64-bit_computing#64-bit_data_models
178
382
 
179
383
  ## Alternative data structures
180
384
 
181
- Depending on what you're doing, maintaining a sorted `Array` using
182
- `#bsearch_index` and `#insert` might be faster! Although it is technically
183
- O(n) for insertions, the implementations for `memcpy` or `memmove` can be *very*
184
- fast on modern architectures. Also, it can be faster O(n) on average, if
185
- insertions are usually near the end of the array. You should run benchmarks
186
- with your expected scenarios to determine which is right.
385
+ As always, you should run benchmarks with your expected scenarios to determine
386
+ which is best for your application.
387
+
388
+ Depending on your use-case, using a sorted `Array` using `#bsearch_index`
389
+ and `#insert` might be just fine! It only takes a couple of lines of code and
390
+ is probably "Fast Enough".
391
+
392
+ More complex heap variants, e.g. the [Fibonacci heap], allow heaps to be split and
393
+ merged which gives some graph algorithms a lower amortized time complexity. But
394
+ in practice, _d_-ary heaps have much lower overhead and often run faster.
395
+
396
+ [Fibonacci heap]: https://en.wikipedia.org/wiki/Fibonacci_heap
187
397
 
188
398
  If it is important to be able to quickly enumerate the set or find the ranking
189
- of values in it, then you probably want to use a self-balancing binary search
190
- tree (e.g. a red-black tree) or a skip-list.
191
-
192
- A Hashed Timing Wheel or Heirarchical Timing Wheels (or some variant in that
193
- family of data structures) can be constructed to have effectively O(1) running
194
- time in most cases. However, the implementation for that data structure is more
195
- complex than a heap. If a 4-ary heap is good enough for go's timers, it should
196
- be suitable for many use cases.
399
+ of values in it, then you may want to use a self-balancing binary search tree
400
+ (e.g. a [red-black tree]) or a [skip-list].
401
+
402
+ [red-black tree]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree
403
+ [skip-list]: https://en.wikipedia.org/wiki/Skip_list
404
+
405
+ [Hashed and Hierarchical Timing Wheels][timing wheel] (or some variant in the
406
+ timing wheel family of data structures) can have effectively `O(1)` running time
407
+ in most cases. Although the implementation for that data structure is more
408
+ complex than a heap, it may be necessary for enormous values of N.
409
+
410
+ [timing wheel]: http://www.cs.columbia.edu/~nahum/w6998/papers/ton97-timing-wheels.pdf
411
+
412
+ ## Supported platforms
413
+
414
+ See the [CI workflow] for all supported platforms.
415
+
416
+ [CI workflow]: https://github.com/nevans/d_heap/actions?query=workflow%3ACI
417
+
418
+ `d_heap` may contain bugs on 32-bit systems. Currently, `d_heap` is only tested
419
+ on 64-bit x86 CRuby 2.4-3.0 under Linux and Mac OS.
420
+
421
+ ## Caveats and TODOs (PRs welcome!)
422
+
423
+ A `DHeap`'s internal array grows but never shrinks. At the very least, there
424
+ should be a `#compact` or `#shrink` method, which should also run during `#freeze`. It might make
425
+ sense to automatically shrink (to no more than 2x the current size) during GC's
426
+ compact phase.
427
+
428
+ Benchmark sift-down min-child comparisons using SSE, AVX2, and AVX512F. This
429
+ might lead to a different default `d` value (maybe 16 or 24?).
430
+
431
+ Shrink scores to 64-bits: either store a type flag with each entry (this could
432
+ be used to support non-numeric scores) or require users to choose between
433
+ `Integer` or `Float` at construction time. Reducing memory usage should also
434
+ improve speed for very large heaps.
435
+
436
+ Patches to support JRuby, rubinius, 32-bit systems, or any other platforms are
437
+ welcome! JRuby and Truffle Ruby ought to be able to use [Java's PriorityQueue]?
438
+ Other platforms could fallback on the (slower) pure ruby implementation used by
439
+ the benchmarks.
440
+
441
+ [Java's PriorityQueue]: https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/PriorityQueue.html
442
+
443
+ Allow a max-heap (or other configurations of the compare function). This can be
444
+ very easily implemented by just reversing the scores.
445
+
446
+ _Maybe_ allow non-numeric scores to be compared with `<=>`, _only_ if the
447
+ simplicity and speed of the basic numeric use case can be preserved.
448
+
449
+ Consider `DHeap::Monotonic`, which could rely on `#pop_below` for "current time"
450
+ and move all values below that time onto an Array.
451
+
452
+ Consider adding `DHeap::Lazy` or `DHeap.new(lazy: true)` which could contain
453
+ some features that are loosely inspired by go's timers. Go lazily sifts its
454
+ heap after deletion or adjustments, to achieve faster amortized runtime.
455
+ There's no need to actually remove a deleted item from the heap, if you re-add
456
+ it before it's next evaluated. A similar trick is to store "far away"
457
+ values in an internal `Hash`, assuming many will be deleted before they rise to
458
+ the top. This could naturally evolve into a [timing wheel] variant.
197
459
 
198
460
  ## Development
199
461