d_heap 0.2.2 → 0.6.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f549e01dd83eb6b48c1190443495a628ed1dd64ead7eb94851e661aef2607e14
-  data.tar.gz: 492ce5c17ace9ecc9deaccf8fcd47883835da28d4e5f32328aef60989186b2ff
+  metadata.gz: 1ad095ff29343f83c8bbe6fd0bc7f4acd79fa9c298aa4f8d007acf02ebedba30
+  data.tar.gz: b2806a066a173a83d12259342c3f7d90900c83dc628063955d861f05acc98796
 SHA512:
-  metadata.gz: 85521dee7f2a9992980935756571e87afbe8ae13347b5b3fbad17b501b5709111972b98ad0c9e1fca6d318c4be20ce2983086dfd84f7c0e73636ac9e4f11f253
-  data.tar.gz: e1daac5b02fcc817b3c6c6a99395e3ca0b92f42bb14bd813fedbb3037eed698bda335c2898d5b0131b48fecd73e4ccf1615943a52287c36f73764b08bf8b1969
+  metadata.gz: 297aad8a8b4c7845fbea64808a2beaf4aa66b8431a23841c3d17952aaf85f41a3377c2dadc7651858e038adc69a35b2fe8e6ca484d45999f026efb41817e281b
+  data.tar.gz: 1e3f123c7f723c752b2e8326c70b4208188ad09c275574bd0cee3dc7a119c7e3f07173f4ad4ed32035d2103a10b1a979400dfa35bdc1dd55272b53bcc8eaa2b9
@@ -1,4 +1,4 @@
-name: Ruby
+name: CI

 on: [push,pull_request]

@@ -7,7 +7,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        ruby: [2.5, 2.6, 2.7, 3.0]
+        ruby: [2.4, 2.5, 2.6, 2.7, 3.0]
         os: [ubuntu, macos]
         experimental: [false]
     runs-on: ${{ matrix.os }}-latest
data/.gitignore CHANGED
@@ -10,6 +10,7 @@
 *.so
 *.o
 *.a
+compile_commands.json
 mkmf.log

 # rspec failure tracking
@@ -3,9 +3,10 @@ inherit_mode:
   - Exclude

 AllCops:
-  TargetRubyVersion: 2.5
+  TargetRubyVersion: 2.4
   NewCops: disable
   Exclude:
+    - bin/benchmark-driver
     - bin/rake
     - bin/rspec
     - bin/rubocop
@@ -44,6 +45,7 @@ Layout/EmptyLineBetweenDefs:
 Layout/EmptyLinesAroundAttributeAccessor:
   inherit_mode:
     merge:
+      - Exclude
       - AllowedMethods
   Enabled: true
   AllowedMethods:
@@ -105,26 +107,49 @@ Naming/RescuedExceptionsVariableName: { Enabled: false }
 ###########################################################################
 # Metrics:

+Metrics/CyclomaticComplexity:
+  Max: 10
+
 # Although it may be better to split specs into multiple files...?
 Metrics/BlockLength:
   Exclude:
     - "spec/**/*_spec.rb"
+  CountAsOne:
+    - array
+    - hash
+    - heredoc
+
+Metrics/ClassLength:
+  Max: 200
+  CountAsOne:
+    - array
+    - hash
+    - heredoc

 ###########################################################################
 # Style...

 Style/AccessorGrouping: { Enabled: false }
 Style/AsciiComments: { Enabled: false } # 👮 can't stop our 🎉🥳🎊🥳!
+Style/ClassAndModuleChildren: { Enabled: false }
 Style/EachWithObject: { Enabled: false }
 Style/FormatStringToken: { Enabled: false }
 Style/FloatDivision: { Enabled: false }
+Style/IfUnlessModifier: { Enabled: false }
+Style/IfWithSemicolon: { Enabled: false }
 Style/Lambda: { Enabled: false }
 Style/LineEndConcatenation: { Enabled: false }
 Style/MixinGrouping: { Enabled: false }
+Style/MultilineBlockChain: { Enabled: false }
 Style/PerlBackrefs: { Enabled: false } # use occasionally/sparingly
 Style/RescueStandardError: { Enabled: false }
+Style/Semicolon: { Enabled: false }
 Style/SingleLineMethods: { Enabled: false }
 Style/StabbyLambdaParentheses: { Enabled: false }
+Style/WhenThen: { Enabled: false }
+
+# I require trailing commas elsewhere, but these are optional
+Style/TrailingCommaInArguments: { Enabled: false }

 # If rubocop had an option to only enforce this on constants and literals (e.g.
 # strings, regexp, range), I'd agree.
@@ -139,8 +164,19 @@ Style/TernaryParentheses:
   Enabled: false

 Style/BlockDelimiters:
+  inherit_mode:
+    merge:
+      - Exclude
+      - ProceduralMethods
+      - IgnoredMethods
+      - FunctionalMethods
   EnforcedStyle: semantic
   AllowBracesOnProceduralOneLiners: true
+  IgnoredMethods:
+    - expect  # rspec
+    - profile # ruby-prof
+    - ips     # benchmark-ips
+

 Style/FormatString:
   EnforcedStyle: percent
@@ -158,3 +194,6 @@ Style/TrailingCommaInHashLiteral:

 Style/TrailingCommaInArrayLiteral:
   EnforcedStyleForMultiline: consistent_comma
+
+Style/YodaCondition:
+  EnforcedStyle: forbid_for_equality_operators_only
@@ -0,0 +1,10 @@
+-o doc
+--embed-mixins
+--hide-void-return
+--no-private
+--asset images:images
+--exclude lib/benchmark_driver
+--exclude lib/d_heap/benchmarks*
+-
+CHANGELOG.md
+CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+## Current/Unreleased
+
+## Release v0.6.1 (2021-01-24)
+
+* 📝 Fix link to CHANGELOG.md in gemspec
+
+## Release v0.6.0 (2021-01-24)
+
+* 🔥 **Breaking**: `#initialize` uses a keyword argument for `d`
+* ✨ Added `#initialize(capacity: capa)` to set initial capacity.
+* ✨ Added `peek_with_score` and `peek_score`
+* ✨ Added `pop_with_score` and `each_pop(with_score: true)`
+* ✨ Added `pop_all_below(max_score, array = [])`
+* ✨ Added aliases for `shift` and `next`
+* 📈 Added benchmark charts to README, and `bin/bench_charts` to generate them.
+  * requires `gruff` which requires `rmagick` which requires `imagemagick`
+* 📝 Many documentation updates and fixes.
+
+## Release v0.5.0 (2021-01-17)
+
+* 🔥 **Breaking**: reversed order of `#push` arguments to `value, score`.
+* ✨ Added `#insert(score, value)` to replace earlier version of `#push`.
+* ✨ Added `#each_pop` enumerator.
+* ✨ Added aliases for `deq`, `enq`, `first`, `pop_below`, `length`, and
+  `count`, to mimic other classes in ruby's stdlib.
+* ⚡️♻️ More performance improvements:
+  * Created an `ENTRY` struct and store both the score and the value pointer in
+    the same `ENTRY *entries` array.
+  * Reduced unnecessary allocations or copies in both sift loops. A similar
+    refactoring also sped up the pure ruby benchmark implementation.
+  * Compiling with `-O3`.
+* 📝 Updated (and in some cases, fixed) yardoc
+* ♻️ Moved aliases and less performance-sensitive code into ruby.
+* ♻️ DRY up push/insert methods
+
+## Release v0.4.0 (2021-01-12)
+
+* 🔥 **Breaking**: Scores must be `Integer` or convertible to `Float`
+  * ⚠️ `Integer` scores must fit in `-ULONG_LONG_MAX` to `+ULONG_LONG_MAX`.
+* ⚡️ Big performance improvements, by using C `long double *cscores` array
+* ⚡️ many many (so many) updates to benchmarks
+* ✨ Added `DHeap#clear`
+* 🐛 Fixed `DHeap#initialize_copy` and `#freeze`
+* ♻️ significant refactoring
+* 📝 Updated docs (mostly adding benchmarks)
+
+## Release v0.3.0 (2020-12-29)
+
+* 🔥 **Breaking**: Removed class methods that operated directly on an array.
+  They weren't compatible with the performance improvements.
+* ⚡️ Big performance improvements, by converting to a `T_DATA` struct.
+* ♻️ Major refactoring/rewriting of dheap.c
+* ✅ Added benchmark specs
+
+## Release v0.2.2 (2020-12-27)
+
+* 🐛 fix `optimized_cmp`, avoiding internal symbols
+* 📝 Update documentation
+* 💚 fix macos CI
+* ➕ Add rubocop 👮🎨
+
+## Release v0.2.1 (2020-12-26)
+
+* ⬆️ Upgraded rake (and bundler) to support ruby 3.0
+
+## Release v0.2.0 (2020-12-24)
+
+* ✨ Add ability to push separate score and value
+* ⚡️ Big performance gain, by storing scores separately and using ruby's
+  internal `OPTIMIZED_CMP` instead of always directly calling `<=>`
+
+## Release v0.1.0 (2020-12-22)
+
+🎉 initial release 🎉
+
+* ✨ Add basic d-ary Heap implementation
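The v0.5.0 entry above reverses `#push` to take `value, score` while the new `#insert` keeps the older `score, value` order. As a quick illustration of those documented semantics (this is a hypothetical pure-Ruby stand-in using a sorted array, not the gem's C implementation), the two argument orders can be modeled like this:

```ruby
# Minimal stand-in modeling the v0.5.0 argument orders described in the
# changelog. NOT the d_heap C extension; just a sorted-array sketch.
class TinyQueue
  def initialize
    @entries = [] # [score, value] pairs, kept sorted by score ascending
  end

  # v0.5.0+ order: value first, then its score
  def push(value, score)
    index = @entries.bsearch_index { |(s, _)| s > score } || @entries.size
    @entries.insert(index, [score, value])
    self
  end

  # #insert keeps the older score-first order
  def insert(score, value)
    push(value, score)
  end

  # removes and returns the value with the minimum score, or nil when empty
  def pop
    entry = @entries.shift
    entry && entry[1]
  end
end

q = TinyQueue.new
q.push("b", 2).push("a", 1).insert(3, "c")
q.pop # => "a"
```

Either entry point feeds the same sorted structure; only the argument order differs, which is exactly the migration hazard the 🔥 **Breaking** marker flags.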
data/Gemfile CHANGED
@@ -5,7 +5,16 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in d_heap.gemspec
 gemspec

+gem "pry"
 gem "rake", "~> 13.0"
 gem "rake-compiler"
 gem "rspec", "~> 3.10"
 gem "rubocop", "~> 1.0"
+
+install_if -> { RUBY_PLATFORM !~ /darwin/ } do
+  gem "benchmark_driver-output-gruff"
+end
+
+gem "perf"
+gem "priority_queue_cxx"
+gem "stackprof"
@@ -1,22 +1,38 @@
 PATH
   remote: .
   specs:
-    d_heap (0.2.2)
+    d_heap (0.6.1)

 GEM
   remote: https://rubygems.org/
   specs:
     ast (2.4.1)
+    benchmark_driver (0.15.16)
+    benchmark_driver-output-gruff (0.3.1)
+      benchmark_driver (>= 0.12.0)
+      gruff
+    coderay (1.1.3)
     diff-lcs (1.4.4)
+    gruff (0.12.1)
+      histogram
+      rmagick
+    histogram (0.2.4.1)
+    method_source (1.0.0)
     parallel (1.19.2)
     parser (2.7.2.0)
       ast (~> 2.4.1)
+    perf (0.1.2)
+    priority_queue_cxx (0.3.4)
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
     rainbow (3.0.0)
     rake (13.0.3)
     rake-compiler (1.1.1)
       rake
     regexp_parser (1.8.2)
     rexml (3.2.3)
+    rmagick (4.1.2)
     rspec (3.10.0)
       rspec-core (~> 3.10.0)
       rspec-expectations (~> 3.10.0)
@@ -41,18 +57,27 @@ GEM
       unicode-display_width (>= 1.4.0, < 2.0)
     rubocop-ast (1.1.1)
       parser (>= 2.7.1.5)
+    ruby-prof (1.4.2)
     ruby-progressbar (1.10.1)
+    stackprof (0.2.16)
     unicode-display_width (1.7.0)

 PLATFORMS
   ruby

 DEPENDENCIES
+  benchmark_driver
+  benchmark_driver-output-gruff
   d_heap!
+  perf
+  priority_queue_cxx
+  pry
   rake (~> 13.0)
   rake-compiler
   rspec (~> 3.10)
   rubocop (~> 1.0)
+  ruby-prof
+  stackprof

 BUNDLED WITH
   2.2.3
data/N ADDED
@@ -0,0 +1,7 @@
+#!/bin/sh
+set -eu
+
+export BENCH_N="$1"
+shift
+
+exec ruby "$@"
data/README.md CHANGED
@@ -1,53 +1,134 @@
-# DHeap
+# DHeap - Fast d-ary heap for ruby
+
+[![Gem Version](https://badge.fury.io/rb/d_heap.svg)](https://badge.fury.io/rb/d_heap)
+[![Build Status](https://github.com/nevans/d_heap/workflows/CI/badge.svg)](https://github.com/nevans/d_heap/actions?query=workflow%3ACI)
+[![Maintainability](https://api.codeclimate.com/v1/badges/ff274acd0683c99c03e1/maintainability)](https://codeclimate.com/github/nevans/d_heap/maintainability)
+
+A fast [_d_-ary heap][d-ary heap] [priority queue] implementation for ruby,
+implemented as a C extension.
+
+From [wikipedia](https://en.wikipedia.org/wiki/Heap_(data_structure)):
+> A heap is a specialized tree-based data structure which is essentially an
+> almost complete tree that satisfies the heap property: in a min heap, for any
+> given node C, if P is a parent node of C, then the key (the value) of P is
+> less than or equal to the key of C. The node at the "top" of the heap (with no
+> parents) is called the root node.
+
+![tree representation of a min heap](images/wikipedia-min-heap.png)
+
+With a regular queue, you expect "FIFO" behavior: first in, first out. With a
+stack you expect "LIFO": last in, first out. A priority queue has a score for
+each element, and elements are popped in order by score. Priority queues are
+often used in algorithms for e.g. [scheduling] of timers or bandwidth
+management, for [Huffman coding], and various graph search algorithms such as
+[Dijkstra's algorithm], [A* search], or [Prim's algorithm].
+
+The _d_-ary heap data structure is a generalization of the [binary heap], in
+which the nodes have _d_ children instead of 2. This allows for "insert" and
+"decrease priority" operations to be performed more quickly, with the tradeoff
+of slower "delete minimum" or "increase priority". Additionally, _d_-ary heaps
+can have better memory cache behavior than binary heaps, allowing them to run
+more quickly in practice despite slower worst-case time complexity. In the
+worst case, a _d_-ary heap requires only `O(log n / log d)` operations to push,
+with the tradeoff that pop requires `O(d log n / log d)`.
+
+Although you should probably just use the default _d_ value of `4` (see the
+analysis below), it's always advisable to benchmark your specific use-case. In
+particular, if you push items more than you pop, higher values for _d_ can give
+a faster total runtime.
+
+[d-ary heap]: https://en.wikipedia.org/wiki/D-ary_heap
+[priority queue]: https://en.wikipedia.org/wiki/Priority_queue
+[binary heap]: https://en.wikipedia.org/wiki/Binary_heap
+[scheduling]: https://en.wikipedia.org/wiki/Scheduling_(computing)
+[Huffman coding]: https://en.wikipedia.org/wiki/Huffman_coding#Compression
+[Dijkstra's algorithm]: https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Using_a_priority_queue
+[A* search]: https://en.wikipedia.org/wiki/A*_search_algorithm#Description
+[Prim's algorithm]: https://en.wikipedia.org/wiki/Prim%27s_algorithm

-A fast _d_-ary heap implementation for ruby, useful in priority queues and graph
-algorithms.
+## Usage

-The _d_-ary heap data structure is a generalization of the binary heap, in which
-the nodes have _d_ children instead of 2. This allows for "decrease priority"
-operations to be performed more quickly with the tradeoff of slower delete
-minimum. Additionally, _d_-ary heaps can have better memory cache behavior than
-binary heaps, allowing them to run more quickly in practice despite slower
-worst-case time complexity. In the worst case, a _d_-ary heap requires only
-`O(log n / log d)` to push, with the tradeoff that pop is `O(d log n / log d)`.
+The basic API is `#push(object, score)` and `#pop`. Please read the
+[gem documentation] for more details and other methods.

-Although you should probably just stick with the default _d_ value of `4`, it
-may be worthwhile to benchmark your specific scenario.
+Quick reference for some common methods:

-## Motivation
+* `heap << object` adds a value, with `Float(object)` as its score.
+* `heap.push(object, score)` adds a value with an extrinsic score.
+* `heap.pop` removes and returns the value with the minimum score.
+* `heap.pop_lte(max_score)` pops only if the next score is `<=` the argument.
+* `heap.peek` to view the minimum value without popping it.
+* `heap.clear` to remove all items from the heap.
+* `heap.empty?` returns true if the heap is empty.
+* `heap.size` returns the number of items in the heap.
+
+If the score changes while the object is still in the heap, it will not be
+re-evaluated again.
+
+The score must either be `Integer` or `Float` or convertible to a `Float` via
+`Float(score)` (i.e. it should implement `#to_f`). Constraining scores to
+numeric values gives more than 50% speedup under some benchmarks! _n.b._
+`Integer` _scores must have an absolute value that fits into_ `unsigned long
+long`. This is compiler and architecture dependent, but with gcc on an IA-64
+system it's 64 bits, which gives a range of -18,446,744,073,709,551,615 to
++18,446,744,073,709,551,615, which is more than enough to store e.g. POSIX time
+in nanoseconds.
+
+_Comparing arbitrary objects via_ `a <=> b` _was the original design and may be
+added back in a future version,_ if (and only if) _it can be done without
+impacting the speed of numeric comparisons. The speedup from this constraint is
+huge!_
+
+[gem documentation]: https://rubydoc.info/gems/d_heap/DHeap
+
+### Examples
+
+```ruby
+# create some example objects to place in our heap
+Task = Struct.new(:id, :time) do
+  def to_f; time.to_f end
+end
+t1 = Task.new(1, Time.now + 5*60)
+t2 = Task.new(2, Time.now + 50)
+t3 = Task.new(3, Time.now + 60)
+t4 = Task.new(4, Time.now + 5)
+
+# create the heap
+require "d_heap"
+heap = DHeap.new
+
+# push with an explicit score (which might be extrinsic to the value)
+heap.push t1, t1.to_f
+
+# the score will be implicitly cast with Float, so any object with #to_f
+heap.push t2, t2

-Sometimes you just need a priority queue, right? With a regular queue, you
-expect "FIFO" behavior: first in, first out. With a priority queue, you push
-with a score (or your elements are comparable), and you want to be able to
-efficiently pop off the minimum (or maximum) element.
-
-One obvious approach is to simply maintain an array in sorted order. And
-ruby's Array class makes it simple to maintain a sorted array by combining
-`#bsearch_index` with `#insert`. With certain insert/remove workloads that can
-perform very well, but in the worst-case an insert or delete can result in O(n),
-since `#insert` may need to `memcpy` or `memmove` a significant portion of the
-array.
-
-But the standard way to efficiently and simply solve this problem is using a
-binary heap. Although it increases the time for `pop`, it converts the
-amortized time per push + pop from `O(n)` to `O(d log n / log d)`.
-
-I was surprised to find that, at least under certain benchmarks, my pure ruby
-heap implementation was usually slower than inserting into a fully sorted
-array. While this is a testament to ruby's fine-tuned Array implementationw, a
-heap implementated in C should easily peform faster than `Array#insert`.
-
-The biggest issue is that it just takes far too much time to call `<=>` from
-ruby code: A sorted array only requires `log n / log 2` comparisons to insert
-and no comparisons to pop. However a _d_-ary heap requires `log n / log d` to
-insert plus an additional `d log n / log d` to pop. If your queue contains only
-a few hundred items at once, the overhead of those extra calls to `<=>` is far
-more than occasionally calling `memcpy`.
-
-It's likely that MJIT will eventually make the C-extension completely
-unnecessary. This is definitely hotspot code, and the basic ruby implementation
-would work fine, if not for that `<=>` overhead. Until then... this gem gets
-the job done.
+# if the object has an intrinsic score via #to_f, "<<" is the simplest API
+heap << t3 << t4
+
+# pop returns the lowest scored item, and removes it from the heap
+heap.pop # => #<struct Task id=4, time=2021-01-17 17:02:22.5574 -0500>
+heap.pop # => #<struct Task id=2, time=2021-01-17 17:03:07.5574 -0500>
+
+# peek returns the lowest scored item, without removing it from the heap
+heap.peek # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
+heap.pop  # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
+
+# pop_lte handles the common "h.pop if h.peek_score < max" pattern
+heap.pop_lte(Time.now + 65) # => nil
+
+# the heap size can be inspected with size and empty?
+heap.empty? # => false
+heap.size   # => 1
+heap.pop    # => #<struct Task id=1, time=2021-01-17 17:07:17.5574 -0500>
+heap.empty? # => true
+heap.size   # => 0
+
+# popping from an empty heap returns nil
+heap.pop # => nil
+```
+
+Please see the [gem documentation] for more methods and more examples.

 ## Installation

@@ -65,134 +146,264 @@ Or install it yourself as:
65
146
 
66
147
  $ gem install d_heap
67
148
 
68
- ## Usage
149
+ ## Motivation
69
150
 
70
- The simplest way to use it is simply with `#push` and `#pop`. Push will
151
+ One naive approach to a priority queue is to maintain an array in sorted order.
152
+ This can be very simply implemented in ruby with `Array#bseach_index` +
153
+ `Array#insert`. This can be very fast—`Array#pop` is `O(1)`—but the worst-case
154
+ for insert is `O(n)` because it may need to `memcpy` a significant portion of
155
+ the array.
71
156
 
72
- ```ruby
73
- require "d_heap"
157
+ The standard way to implement a priority queue is with a binary heap. Although
158
+ this increases the time complexity for `pop` alone, it reduces the combined time
159
+ compexity for the combined `push` + `pop`. Using a d-ary heap with d > 2
160
+ makes the tree shorter but broader, which reduces to `O(log n / log d)` while
161
+ increasing the comparisons needed by sift-down to `O(d log n/ log d)`.
74
162
 
75
- heap = DHeap.new # defaults to a 4-ary heap
163
+ However, I was disappointed when my best ruby heap implementation ran much more
164
+ slowly than the naive approach—even for heaps containing ten thousand items.
165
+ Although it _is_ `O(n)`, `memcpy` is _very_ fast, while calling `<=>` from ruby
166
+ has _much_ higher overhead. And a _d_-heap needs `d + 1` times more comparisons
167
+ for each push + pop than `bsearch` + `insert`.
76
168
 
77
- # storing [time, task] tuples
78
- heap << [Time.now + 5*60, Task.new(1)]
79
- heap << [Time.now + 30, Task.new(2)]
80
- heap << [Time.now + 60, Task.new(3)]
81
- heap << [Time.now + 5, Task.new(4)]
169
+ Additionally, when researching how other systems handle their scheduling, I was
170
+ inspired by reading go's "timer.go" implementation to experiment with a 4-ary
171
+ heap instead of the traditional binary heap.
82
172
 
83
- # peeking and popping (using last to get the task and ignore the time)
84
- heap.pop.last # => Task[4]
85
- heap.pop.last # => Task[2]
86
- heap.peak.last # => Task[3]
87
- heap.pop.last # => Task[3]
88
- heap.pop.last # => Task[1]
89
- ```
173
+ ## Benchmarks
90
174
 
91
- Read the `rdoc` for more detailed documentation and examples.
175
+ _See `bin/benchmarks` and `docs/benchmarks.txt`, as well as `bin/profile` and
176
+ `docs/profile.txt` for much more detail or updated results. These benchmarks
177
+ were measured with v0.5.0 and ruby 2.7.2 without MJIT enabled._
178
+
179
+ These benchmarks use very simple implementations for a pure-ruby heap and an
180
+ array that is kept sorted using `Array#bsearch_index` and `Array#insert`. For
181
+ comparison, I also compare to the [priority_queue_cxx gem] which uses the [C++
182
+ STL priority_queue], and another naive implementation that uses `Array#min` and
183
+ `Array#delete_at` with an unsorted array.
184
+
185
+ In these benchmarks, `DHeap` runs faster than all other implementations for
186
+ every scenario and every value of N, although the difference is usually more
187
+ noticable at higher values of N. The pure ruby heap implementation is
188
+ competitive for `push` alone at every value of N, but is significantly slower
189
+ than bsearch + insert for push + pop, until N is _very_ large (somewhere between
190
+ 10k and 100k)!
191
+
192
+ [priority_queue_cxx gem]: https://rubygems.org/gems/priority_queue_cxx
193
+ [C++ STL priority_queue]: http://www.cplusplus.com/reference/queue/priority_queue/
194
+
195
+ Three different scenarios are measured:
196
+
197
+ ### push N items onto an empty heap
198
+
199
+ ...but never pop (clearing between each set of pushes).
200
+
201
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n.png)
202
+
203
+ ### push N items onto an empty heap then pop all N
204
+
205
+ Although this could be used for heap sort, we're unlikely to choose heap sort
206
+ over Ruby's quick sort implementation. I'm using this scenario to represent
207
+ the amortized cost of creating a heap and (eventually) draining it.
208
+
209
+ ![bar graph for push_n_pop_n benchmarks](./images/push_n_pop_n.png)
210
+
211
+ ### push and pop on a heap with N values
212
+
213
+ Repeatedly push and pop while keeping a stable heap size. This is a _very
214
+ simplistic_ approximation for how most scheduler/timer heaps might be used.
215
+ Usually when a timer fires it will be quickly replaced by a new timer, and the
216
+ overall count of timers will remain roughly stable.
217
+
218
+ ![bar graph for push_pop benchmarks](./images/push_pop.png)
219
+
220
+ ### numbers
221
+
222
+ Even for very small N values the benchmark implementations, `DHeap` runs faster
223
+ than the other implementations for each scenario, although the difference is
224
+ still relatively small. The pure ruby binary heap is 2x or more slower than
225
+ bsearch + insert for common push/pop scenario.
226
+
227
+ == push N (N=5) ==========================================================
228
+ push N (c_dheap): 1969700.7 i/s
229
+ push N (c++ stl): 1049738.1 i/s - 1.88x slower
230
+ push N (rb_heap): 928435.2 i/s - 2.12x slower
231
+ push N (bsearch): 921060.0 i/s - 2.14x slower
232
+
233
+ == push N then pop N (N=5) ===============================================
234
+ push N + pop N (c_dheap): 1375805.0 i/s
235
+ push N + pop N (c++ stl): 1134997.5 i/s - 1.21x slower
236
+ push N + pop N (findmin): 862913.1 i/s - 1.59x slower
237
+ push N + pop N (bsearch): 762887.1 i/s - 1.80x slower
238
+ push N + pop N (rb_heap): 506890.4 i/s - 2.71x slower
239
+
240
+ == Push/pop with pre-filled queue of size=N (N=5) ========================
241
+ push + pop (c_dheap): 9044435.5 i/s
242
+ push + pop (c++ stl): 7534583.4 i/s - 1.20x slower
243
+ push + pop (findmin): 5026155.1 i/s - 1.80x slower
244
+ push + pop (bsearch): 4300260.0 i/s - 2.10x slower
245
+ push + pop (rb_heap): 2299499.7 i/s - 3.93x slower
246
+
247
+ By N=21, `DHeap` has pulled significantly ahead of bsearch + insert for all
248
+ scenarios, but the pure ruby heap is still slower than every other
249
+ implementation—even resorting the array after every `#push`—in any scenario that
250
+ uses `#pop`.
251
+
252
+ == push N (N=21) =========================================================
253
+ push N (c_dheap): 464231.4 i/s
254
+ push N (c++ stl): 305546.7 i/s - 1.52x slower
255
+ push N (rb_heap): 202803.7 i/s - 2.29x slower
256
+ push N (bsearch): 168678.7 i/s - 2.75x slower
257
+
258
+ == push N then pop N (N=21) ==============================================
259
+ push N + pop N (c_dheap): 298350.3 i/s
260
+ push N + pop N (c++ stl): 252227.1 i/s - 1.18x slower
261
+ push N + pop N (findmin): 161998.7 i/s - 1.84x slower
262
+ push N + pop N (bsearch): 143432.3 i/s - 2.08x slower
263
+ push N + pop N (rb_heap): 79622.1 i/s - 3.75x slower
264
+
265
+ == Push/pop with pre-filled queue of size=N (N=21) =======================
266
+ push + pop (c_dheap): 8855093.4 i/s
267
+ push + pop (c++ stl): 7223079.5 i/s - 1.23x slower
268
+ push + pop (findmin): 4542913.7 i/s - 1.95x slower
269
+ push + pop (bsearch): 3461802.4 i/s - 2.56x slower
270
+ push + pop (rb_heap): 1845488.7 i/s - 4.80x slower
271
+
272
+ At higher values of N, a heaps logarithmic growth leads to only a little
273
+ slowdown of `#push`, while insert's linear growth causes it to run noticably
274
+ slower and slower. But because `#pop` is `O(1)` for a sorted array and `O(d log
275
+ n / log d)` for a heap, scenarios involving both `#push` and `#pop` remain
276
+ relatively close, and bsearch + insert still runs faster than a pure ruby heap,
277
+ even up to queues with 10k items. But as queue size increases beyond than that,
278
+ the linear time compexity to keep a sorted array dominates.
279
+
280
+ == push + pop (rb_heap)
281
+ queue size = 10000: 736618.2 i/s
282
+ queue size = 25000: 670186.8 i/s - 1.10x slower
283
+ queue size = 50000: 618156.7 i/s - 1.19x slower
284
+ queue size = 100000: 579250.7 i/s - 1.27x slower
285
+ queue size = 250000: 572795.0 i/s - 1.29x slower
286
+ queue size = 500000: 543648.3 i/s - 1.35x slower
287
+ queue size = 1000000: 513523.4 i/s - 1.43x slower
288
+ queue size = 2500000: 460848.9 i/s - 1.60x slower
289
+ queue size = 5000000: 445234.5 i/s - 1.65x slower
290
+ queue size = 10000000: 423119.0 i/s - 1.74x slower
291
+
292
+ == push + pop (bsearch)
293
+ queue size = 10000: 786334.2 i/s
294
+ queue size = 25000: 364963.8 i/s - 2.15x slower
295
+ queue size = 50000: 200520.6 i/s - 3.92x slower
296
+ queue size = 100000: 88607.0 i/s - 8.87x slower
297
+ queue size = 250000: 34530.5 i/s - 22.77x slower
298
+ queue size = 500000: 17965.4 i/s - 43.77x slower
299
+ queue size = 1000000: 5638.7 i/s - 139.45x slower
300
+ queue size = 2500000: 1302.0 i/s - 603.93x slower
301
+ queue size = 5000000: 592.0 i/s - 1328.25x slower
302
+ queue size = 10000000: 288.8 i/s - 2722.66x slower
303
+
304
+ == push + pop (c_dheap)
305
+ queue size = 10000: 7311366.6 i/s
306
+ queue size = 50000: 6737824.5 i/s - 1.09x slower
307
+ queue size = 25000: 6407340.6 i/s - 1.14x slower
308
+ queue size = 100000: 6254396.3 i/s - 1.17x slower
309
+ queue size = 250000: 5917684.5 i/s - 1.24x slower
310
+ queue size = 500000: 5126307.6 i/s - 1.43x slower
311
+ queue size = 1000000: 4403494.1 i/s - 1.66x slower
312
+ queue size = 2500000: 3304088.2 i/s - 2.21x slower
313
+ queue size = 5000000: 2664897.7 i/s - 2.74x slower
314
+ queue size = 10000000: 2137927.6 i/s - 3.42x slower
92
315
 
93
- ## TODOs...
316
+ ## Analysis
94
317
 
95
- _TODO:_ In addition to a basic _d_-ary heap class (`DHeap`), this library
96
- ~~includes~~ _will include_ extensions to `Array`, allowing an Array to be
97
- directly handled as a priority queue. These extension methods are meant to be
98
- used similarly to how `#bsearch` and `#bsearch_index` might be used.
318
+ ### Time complexity
99
319
 
100
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Set`, which augments the
101
- basic heap with an internal `Hash`, which maps a set of values to scores.
102
- loosely inspired by go's timers. e.g: It lazily sifts its heap after deletion
103
- and adjustments, to achieve faster average runtime for *add* and *cancel*
104
- operations.
320
+ There are two fundamental heap operations: sift-up (used by push) and sift-down
321
+ (used by pop).
322
+
323
+ * A _d_-ary heap will have `log n / log d` layers, so both sift operations can
324
+ perform as many as `log n / log d` writes, when a member sifts the entire
325
+ length of the tree.
326
+ * Sift-up makes one comparison per layer, so push runs in `O(log n / log d)`.
327
+ * Sift-down makes d comparions per layer, so pop runs in `O(d log n / log d)`.
328
+
329
+ So, in the simplest case of running balanced push/pop while maintaining the same
330
+ heap size, `(1 + d) log n / log d` comparisons are made. In the worst case,
331
+ when every sift traverses every layer of the tree, `d=4` requires the fewest
332
+ comparisons for combined insert and delete:
333
+
334
+ * (1 + 2) lg n / lg d ≈ 4.328085 lg n
335
+ * (1 + 3) lg n / lg d ≈ 3.640957 lg n
336
+ * (1 + 4) lg n / lg d ≈ 3.606738 lg n
337
+ * (1 + 5) lg n / lg d ≈ 3.728010 lg n
338
+ * (1 + 6) lg n / lg d ≈ 3.906774 lg n
339
+ * (1 + 7) lg n / lg d ≈ 4.111187 lg n
340
+ * (1 + 8) lg n / lg d ≈ 4.328085 lg n
341
+ * (1 + 9) lg n / lg d ≈ 4.551196 lg n
342
+ * (1 + 10) lg n / lg d ≈ 4.777239 lg n
343
+ * etc...
105
344
 
106
- _TODO:_ Also ~~included is~~ _will include_ `DHeap::Timers`, which contains some
107
- features that are loosely inspired by go's timers. e.g: It lazily sifts its
108
- heap after deletion and adjustments, to achieve faster average runtime for *add*
109
- and *cancel* operations.
345
+ See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
110
346
 
111
- Additionally, I was inspired by reading go's "timer.go" implementation to
112
- experiment with a 4-ary heap instead of the traditional binary heap. In the
113
- case of timers, new timers are usually scheduled to run after most of the
114
- existing timers. And timers are usually canceled before they have a chance to
115
- run. While a binary heap holds 50% of its elements in its last layer, 75% of a
116
- 4-ary heap will have no children. That diminishes the extra comparison overhead
117
- during sift-down.
347
+ ### Space complexity
118
348
 
119
- ## Benchmarks
349
+ Space usage is linear, regardless of d. However, higher d values may
350
+ provide better cache locality. Because the heap is a complete d-ary tree, its
351
+ elements can be stored in an array, without the need for tree or list pointers.
120
352
 
121
- _TODO: put benchmarks here._
353
+ Ruby can compare Numeric values _much_ faster than other Ruby objects, even if
354
+ those objects simply delegate comparison to internal Numeric values. And it is
355
+ often useful to use external scores for otherwise incomparable values. So
356
+ `DHeap` uses twice as many entries (one for score and one for value)
357
+ as an array which only stores values.
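The flat score/value layout can be pictured with a plain-Ruby sketch (illustration only, not DHeap's actual C internals):

```ruby
# Scores and values interleaved in one flat array: entry i occupies
# indexes 2*i (score) and 2*i + 1 (value). No per-entry objects needed.
entries = []

push  = ->(score, value) { entries.push(score, value) }
score = ->(i) { entries[2 * i] }
value = ->(i) { entries[2 * i + 1] }

push.call(3.5, :later)
push.call(1.0, :sooner)
# entries == [3.5, :later, 1.0, :sooner]
```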
122
358
 
123
- ## Analysis
359
+ ## Thread safety
124
360
 
125
- ### Time complexity
361
+ `DHeap` is _not_ thread-safe, so code that accesses a heap from multiple
362
+ threads needs to take precautions, such as locking access behind a mutex.
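For example, one simple precaution is to wrap every operation in a `Mutex` (a sketch; the wrapped object is any non-thread-safe queue, with a plain `Array` standing in for a heap here):

```ruby
# Guard a non-thread-safe queue-like object behind a single Mutex.
class SynchronizedQueue
  def initialize(queue)
    @queue = queue
    @lock = Mutex.new
  end

  def push(item)
    @lock.synchronize { @queue.push(item) }
  end

  def pop
    @lock.synchronize { @queue.pop }
  end
end

# Usage: an Array stands in for the heap.
queue = SynchronizedQueue.new([])
4.times.map { Thread.new { 100.times { |i| queue.push(i) } } }.each(&:join)
```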
126
363
 
127
- Both sift operations can perform (log[d] n = log n / log d) swaps.
128
- Swap up performs only a single comparison per swap: O(1).
129
- Swap down performs as many as d comparions per swap: O(d).
130
-
131
- Inserting an item is O(log n / log d).
132
- Deleting the root is O(d log n / log d).
133
-
134
- Assuming every inserted item is eventually deleted from the root, d=4 requires
135
- the fewest comparisons for combined insert and delete:
136
- * (1 + 2) lg 2 = 4.328085
137
- * (1 + 3) lg 3 = 3.640957
138
- * (1 + 4) lg 4 = 3.606738
139
- * (1 + 5) lg 5 = 3.728010
140
- * (1 + 6) lg 6 = 3.906774
141
- * etc...
142
-
143
- Leaf nodes require no comparisons to shift down, and higher values for d have
144
- higher percentage of leaf nodes:
145
- * d=2 has ~50% leaves,
146
- * d=3 has ~67% leaves,
147
- * d=4 has ~75% leaves,
148
- * and so on...
364
+ ## Alternative data structures
149
365
 
150
- See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
366
+ As always, you should run benchmarks with your expected scenarios to determine
367
+ which is best for your application.
151
368
 
152
- ### Space complexity
369
+ Depending on your use-case, maintaining a sorted `Array` using `#bsearch_index`
370
+ and `#insert` might be just fine! Even `min` plus `delete` with an unsorted
371
+ array can be very fast on small queues. Although insertions run in `O(n)`,
372
+ `memcpy` is so fast on modern hardware that your dataset might not be large
373
+ enough for it to matter.
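For instance, a sorted-`Array` priority queue takes only a few lines (a sketch of the approach described above):

```ruby
# Keep an Array sorted on insert; the minimum is always at index 0.
sorted = []

add = lambda do |x|
  # bsearch_index (find-minimum mode) returns the first index whose
  # element is >= x, or nil when x sorts after everything.
  i = sorted.bsearch_index { |y| y >= x } || sorted.size
  sorted.insert(i, x) # O(n) memmove, but fast in practice
end

[5, 1, 4, 2].each(&add)
sorted             # => [1, 2, 4, 5]
min = sorted.shift # pop-min
```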
153
374
 
154
- Because the heap is a complete binary tree, space usage is linear, regardless
155
- of d. However higher d values may provide better cache locality.
375
+ More complex heap variants, e.g. the [Fibonacci heap], allow heaps to be split
376
+ and merged, which gives some graph algorithms a lower amortized time complexity.
377
+ But in practice, _d_-ary heaps have much lower overhead and often run faster.
156
378
 
157
- We can run comparisons much much faster for Numeric or String objects than for
158
- ruby objects which delegate comparison to internal Numeric or String objects.
159
- And it is often advantageous to use extrinsic scores for uncomparable items.
160
- For this, our internal array uses twice as many entries (one for score and one
161
- for value) as it would if it only supported intrinsic comparison or used an
162
- un-memoized "sort_by" proc.
379
+ [Fibonacci heap]: https://en.wikipedia.org/wiki/Fibonacci_heap
163
380
 
164
- ### Timers
381
+ If it is important to be able to quickly enumerate the set or find the ranking
382
+ of values in it, then you may want to use a self-balancing binary search tree
383
+ (e.g. a [red-black tree]) or a [skip-list].
165
384
 
166
- Additionally, when used to sort timers, we can reasonably assume that:
167
- * New timers usually sort after most existing timers.
168
- * Most timers will be canceled before executing.
169
- * Canceled timers usually sort after most existing timers.
385
+ [red-black tree]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree
386
+ [skip-list]: https://en.wikipedia.org/wiki/Skip_list
170
387
 
171
- So, if we are able to delete an item without searching for it, by keeping a map
172
- of positions within the heap, most timers can be inserted and deleted in O(1)
173
- time. Canceling a non-leaf timer can be further optimized by marking it as
174
- canceled without immediately removing it from the heap. If the timer is
175
- rescheduled before we garbage collect, adjusting its position will usually be
176
- faster than a delete and re-insert.
388
+ [Hashed and Hierarchical Timing Wheels][timing wheels] (or some variant in that
389
+ family of data structures) can be constructed to have effectively `O(1)` running
390
+ time in most cases. Although the implementation for that data structure is more
391
+ complex than a heap, it may be necessary for enormous values of N.
177
392
 
178
- ## Alternative data structures
393
+ [timing wheels]: http://www.cs.columbia.edu/~nahum/w6998/papers/ton97-timing-wheels.pdf
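A single-level hashed wheel can be sketched in a few lines (a toy illustration of the idea, not the paper's full hierarchical design):

```ruby
# Toy hashed timing wheel: timers hash into slots by expiry tick, and
# each advance drains the timers that are due in the current slot.
class TimingWheel
  def initialize(slot_count)
    @slots = Array.new(slot_count) { [] }
    @tick = 0
  end

  def schedule(delay_ticks, &callback)
    at = @tick + delay_ticks
    @slots[at % @slots.size] << [at, callback]
  end

  # Advance the clock one tick, firing any timers that are now due.
  def advance
    @tick += 1
    slot = @slots[@tick % @slots.size]
    due, pending = slot.partition { |at, _| at <= @tick }
    @slots[@tick % @slots.size] = pending
    due.each { |_, callback| callback.call }
  end
end
```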
179
394
 
180
- Depending on what you're doing, maintaining a sorted `Array` using
181
- `#bsearch_index` and `#insert` might be faster! Although it is technically
182
- O(n) for insertions, the implementations for `memcpy` or `memmove` can be *very*
183
- fast on modern architectures. Also, it can be faster O(n) on average, if
184
- insertions are usually near the end of the array. You should run benchmarks
185
- with your expected scenarios to determine which is right.
395
+ ## TODOs...
186
396
 
187
- If it is important to be able to quickly enumerate the set or find the ranking
188
- of values in it, then you probably want to use a self-balancing binary search
189
- tree (e.g. a red-black tree) or a skip-list.
190
-
191
- A Hashed Timing Wheel or Heirarchical Timing Wheels (or some variant in that
192
- family of data structures) can be constructed to have effectively O(1) running
193
- time in most cases. However, the implementation for that data structure is more
194
- complex than a heap. If a 4-ary heap is good enough for go's timers, it should
195
- be suitable for many use cases.
397
+ _TODO:_ Also ~~included is~~ _will include_ `DHeap::Map`, which augments the
398
+ basic heap with an internal `Hash`, which maps objects to their position in the
399
+ heap. This enforces a uniqueness constraint on the heap's members, and also
400
+ allows items to be more efficiently deleted or adjusted. However, maintaining
401
+ the hash does lead to a small drop in normal `#push` and `#pop` performance.
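The position-map idea can be sketched with a simplified binary min-heap (a hypothetical illustration; `DHeap::Map`'s real interface may differ):

```ruby
# Binary min-heap of unique items plus a Hash from item to its array
# index, kept in sync on every swap, so delete-by-value is O(log n).
class IndexedHeap
  def initialize
    @items = []
    @index = {} # item => position in @items
  end

  def push(item)
    raise ArgumentError, "duplicate item" if @index.key?(item)
    @items << item
    @index[item] = @items.size - 1
    sift_up(@items.size - 1)
    self
  end

  def pop
    delete(@items.first)
  end

  # O(log n): the index hash replaces an O(n) scan for the item.
  def delete(item)
    i = @index[item]
    return nil unless i
    swap(i, @items.size - 1)
    @items.pop
    @index.delete(item)
    if i < @items.size
      sift_up(i)
      sift_down(i)
    end
    item
  end

  private

  def swap(i, j)
    @items[i], @items[j] = @items[j], @items[i]
    @index[@items[i]] = i
    @index[@items[j]] = j
  end

  def sift_up(i)
    while i > 0 && @items[i] < @items[(i - 1) / 2]
      swap(i, (i - 1) / 2)
      i = (i - 1) / 2
    end
  end

  def sift_down(i)
    loop do
      smallest = i
      [2 * i + 1, 2 * i + 2].each do |c|
        smallest = c if c < @items.size && @items[c] < @items[smallest]
      end
      break if smallest == i
      swap(smallest, i)
      i = smallest
    end
  end
end
```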
402
+
403
+ _TODO:_ Also ~~included is~~ _will include_ `DHeap::Lazy`, which contains some
404
+ features that are loosely inspired by go's timers, e.g. it lazily sifts its
405
+ heap after deletion and adjustments, to achieve faster average runtime for *add*
406
+ and *cancel* operations.
196
407
 
197
408
  ## Development
198
409