d_heap 0.2.2 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +2 -2
- data/.gitignore +1 -0
- data/.rubocop.yml +40 -1
- data/.yardopts +10 -0
- data/CHANGELOG.md +76 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +26 -1
- data/N +7 -0
- data/README.md +358 -147
- data/benchmarks/perf.rb +29 -0
- data/benchmarks/push_n.yml +35 -0
- data/benchmarks/push_n_pop_n.yml +52 -0
- data/benchmarks/push_pop.yml +32 -0
- data/benchmarks/stackprof.rb +31 -0
- data/bin/bench_charts +13 -0
- data/bin/bench_n +7 -0
- data/bin/benchmark-driver +29 -0
- data/bin/benchmarks +10 -0
- data/bin/profile +10 -0
- data/d_heap.gemspec +5 -2
- data/docs/benchmarks-2.txt +75 -0
- data/docs/benchmarks-mem.txt +39 -0
- data/docs/benchmarks.txt +515 -0
- data/docs/profile.txt +392 -0
- data/ext/d_heap/d_heap.c +824 -246
- data/ext/d_heap/extconf.rb +16 -3
- data/images/push_n.png +0 -0
- data/images/push_n_pop_n.png +0 -0
- data/images/push_pop.png +0 -0
- data/images/wikipedia-min-heap.png +0 -0
- data/lib/benchmark_driver/runner/ips_zero_fail.rb +158 -0
- data/lib/d_heap.rb +92 -3
- data/lib/d_heap/benchmarks.rb +112 -0
- data/lib/d_heap/benchmarks/benchmarker.rb +116 -0
- data/lib/d_heap/benchmarks/implementations.rb +224 -0
- data/lib/d_heap/benchmarks/profiler.rb +71 -0
- data/lib/d_heap/benchmarks/rspec_matchers.rb +352 -0
- data/lib/d_heap/version.rb +1 -1
- metadata +60 -6
- data/ext/d_heap/d_heap.h +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1ad095ff29343f83c8bbe6fd0bc7f4acd79fa9c298aa4f8d007acf02ebedba30
|
4
|
+
data.tar.gz: b2806a066a173a83d12259342c3f7d90900c83dc628063955d861f05acc98796
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 297aad8a8b4c7845fbea64808a2beaf4aa66b8431a23841c3d17952aaf85f41a3377c2dadc7651858e038adc69a35b2fe8e6ca484d45999f026efb41817e281b
|
7
|
+
data.tar.gz: 1e3f123c7f723c752b2e8326c70b4208188ad09c275574bd0cee3dc7a119c7e3f07173f4ad4ed32035d2103a10b1a979400dfa35bdc1dd55272b53bcc8eaa2b9
|
data/.github/workflows/main.yml
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
name:
|
1
|
+
name: CI
|
2
2
|
|
3
3
|
on: [push,pull_request]
|
4
4
|
|
@@ -7,7 +7,7 @@ jobs:
|
|
7
7
|
strategy:
|
8
8
|
fail-fast: false
|
9
9
|
matrix:
|
10
|
-
ruby: [2.5, 2.6, 2.7, 3.0]
|
10
|
+
ruby: [2.4, 2.5, 2.6, 2.7, 3.0]
|
11
11
|
os: [ubuntu, macos]
|
12
12
|
experimental: [false]
|
13
13
|
runs-on: ${{ matrix.os }}-latest
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -3,9 +3,10 @@ inherit_mode:
|
|
3
3
|
- Exclude
|
4
4
|
|
5
5
|
AllCops:
|
6
|
-
TargetRubyVersion: 2.
|
6
|
+
TargetRubyVersion: 2.4
|
7
7
|
NewCops: disable
|
8
8
|
Exclude:
|
9
|
+
- bin/benchmark-driver
|
9
10
|
- bin/rake
|
10
11
|
- bin/rspec
|
11
12
|
- bin/rubocop
|
@@ -44,6 +45,7 @@ Layout/EmptyLineBetweenDefs:
|
|
44
45
|
Layout/EmptyLinesAroundAttributeAccessor:
|
45
46
|
inherit_mode:
|
46
47
|
merge:
|
48
|
+
- Exclude
|
47
49
|
- AllowedMethods
|
48
50
|
Enabled: true
|
49
51
|
AllowedMethods:
|
@@ -105,26 +107,49 @@ Naming/RescuedExceptionsVariableName: { Enabled: false }
|
|
105
107
|
###########################################################################
|
106
108
|
# Matrics:
|
107
109
|
|
110
|
+
Metrics/CyclomaticComplexity:
|
111
|
+
Max: 10
|
112
|
+
|
108
113
|
# Although it may be better to split specs into multiple files...?
|
109
114
|
Metrics/BlockLength:
|
110
115
|
Exclude:
|
111
116
|
- "spec/**/*_spec.rb"
|
117
|
+
CountAsOne:
|
118
|
+
- array
|
119
|
+
- hash
|
120
|
+
- heredoc
|
121
|
+
|
122
|
+
Metrics/ClassLength:
|
123
|
+
Max: 200
|
124
|
+
CountAsOne:
|
125
|
+
- array
|
126
|
+
- hash
|
127
|
+
- heredoc
|
112
128
|
|
113
129
|
###########################################################################
|
114
130
|
# Style...
|
115
131
|
|
116
132
|
Style/AccessorGrouping: { Enabled: false }
|
117
133
|
Style/AsciiComments: { Enabled: false } # 👮 can't stop our 🎉🥳🎊🥳!
|
134
|
+
Style/ClassAndModuleChildren: { Enabled: false }
|
118
135
|
Style/EachWithObject: { Enabled: false }
|
119
136
|
Style/FormatStringToken: { Enabled: false }
|
120
137
|
Style/FloatDivision: { Enabled: false }
|
138
|
+
Style/IfUnlessModifier: { Enabled: false }
|
139
|
+
Style/IfWithSemicolon: { Enabled: false }
|
121
140
|
Style/Lambda: { Enabled: false }
|
122
141
|
Style/LineEndConcatenation: { Enabled: false }
|
123
142
|
Style/MixinGrouping: { Enabled: false }
|
143
|
+
Style/MultilineBlockChain: { Enabled: false }
|
124
144
|
Style/PerlBackrefs: { Enabled: false } # use occasionally/sparingly
|
125
145
|
Style/RescueStandardError: { Enabled: false }
|
146
|
+
Style/Semicolon: { Enabled: false }
|
126
147
|
Style/SingleLineMethods: { Enabled: false }
|
127
148
|
Style/StabbyLambdaParentheses: { Enabled: false }
|
149
|
+
Style/WhenThen : { Enabled: false }
|
150
|
+
|
151
|
+
# I require trailing commas elsewhere, but these are optional
|
152
|
+
Style/TrailingCommaInArguments: { Enabled: false }
|
128
153
|
|
129
154
|
# If rubocop had an option to only enforce this on constants and literals (e.g.
|
130
155
|
# strings, regexp, range), I'd agree.
|
@@ -139,8 +164,19 @@ Style/TernaryParentheses:
|
|
139
164
|
Enabled: false
|
140
165
|
|
141
166
|
Style/BlockDelimiters:
|
167
|
+
inherit_mode:
|
168
|
+
merge:
|
169
|
+
- Exclude
|
170
|
+
- ProceduralMethods
|
171
|
+
- IgnoredMethods
|
172
|
+
- FunctionalMethods
|
142
173
|
EnforcedStyle: semantic
|
143
174
|
AllowBracesOnProceduralOneLiners: true
|
175
|
+
IgnoredMethods:
|
176
|
+
- expect # rspec
|
177
|
+
- profile # ruby-prof
|
178
|
+
- ips # benchmark-ips
|
179
|
+
|
144
180
|
|
145
181
|
Style/FormatString:
|
146
182
|
EnforcedStyle: percent
|
@@ -158,3 +194,6 @@ Style/TrailingCommaInHashLiteral:
|
|
158
194
|
|
159
195
|
Style/TrailingCommaInArrayLiteral:
|
160
196
|
EnforcedStyleForMultiline: consistent_comma
|
197
|
+
|
198
|
+
Style/YodaCondition:
|
199
|
+
EnforcedStyle: forbid_for_equality_operators_only
|
data/.yardopts
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
## Current/Unreleased
|
2
|
+
|
3
|
+
## Release v0.6.1 (2021-01-24)
|
4
|
+
|
5
|
+
* 📝 Fix link to CHANGELOG.md in gemspec
|
6
|
+
|
7
|
+
## Release v0.6.0 (2021-01-24)
|
8
|
+
|
9
|
+
* 🔥 **Breaking**: `#initialize` uses a keyword argument for `d`
|
10
|
+
* ✨ Added `#initialize(capacity: capa)` to set initial capacity.
|
11
|
+
* ✨ Added `peek_with_score` and `peek_score`
|
12
|
+
* ✨ Added `pop_with_score` and `each_pop(with_score: true)`
|
13
|
+
* ✨ Added `pop_all_below(max_score, array = [])`
|
14
|
+
* ✨ Added aliases for `shift` and `next`
|
15
|
+
* 📈 Added benchmark charts to README, and `bin/bench_charts` to generate them.
|
16
|
+
* requires `gruff` which requires `rmagick` which requires `imagemagick`
|
17
|
+
* 📝 Many documentation updates and fixes.
|
18
|
+
|
19
|
+
## Release v0.5.0 (2021-01-17)
|
20
|
+
|
21
|
+
* 🔥 **Breaking**: reversed order of `#push` arguments to `value, score`.
|
22
|
+
* ✨ Added `#insert(score, value)` to replace earlier version of `#push`.
|
23
|
+
* ✨ Added `#each_pop` enumerator.
|
24
|
+
* ✨ Added aliases for `deq`, `enq`, `first`, `pop_below`, `length`, and
|
25
|
+
`count`, to mimic other classes in ruby's stdlib.
|
26
|
+
* ⚡️♻️ More performance improvements:
|
27
|
+
* Created an `ENTRY` struct and store both the score and the value pointer in
|
28
|
+
the same `ENTRY *entries` array.
|
29
|
+
* Reduced unnecessary allocations or copies in both sift loops. A similar
|
30
|
+
refactoring also sped up the pure ruby benchmark implementation.
|
31
|
+
* Compiling with `-O3`.
|
32
|
+
* 📝 Updated (and in some cases, fixed) yardoc
|
33
|
+
* ♻️ Moved aliases and less performance sensitive code into ruby.
|
34
|
+
* ♻️ DRY up push/insert methods
|
35
|
+
|
36
|
+
## Release v0.4.0 (2021-01-12)
|
37
|
+
|
38
|
+
* 🔥 **Breaking**: Scores must be `Integer` or convertable to `Float`
|
39
|
+
* ⚠️ `Integer` scores must fit in `-ULONG_LONG_MAX` to `+ULONG_LONG_MAX`.
|
40
|
+
* ⚡️ Big performance improvements, by using C `long double *cscores` array
|
41
|
+
* ⚡️ many many (so many) updates to benchmarks
|
42
|
+
* ✨ Added `DHeap#clear`
|
43
|
+
* 🐛 Fixed `DHeap#initialize_copy` and `#freeze`
|
44
|
+
* ♻️ significant refactoring
|
45
|
+
* 📝 Updated docs (mostly adding benchmarks)
|
46
|
+
|
47
|
+
## Release v0.3.0 (2020-12-29)
|
48
|
+
|
49
|
+
* 🔥 **Breaking**: Removed class methods that operated directly on an array.
|
50
|
+
They weren't compatible with the performance improvements.
|
51
|
+
* ⚡️ Big performance improvements, by converting to a `T_DATA` struct.
|
52
|
+
* ♻️ Major refactoring/rewriting of dheap.c
|
53
|
+
* ✅ Added benchmark specs
|
54
|
+
|
55
|
+
## Release v0.2.2 (2020-12-27)
|
56
|
+
|
57
|
+
* 🐛 fix `optimized_cmp`, avoiding internal symbols
|
58
|
+
* 📝 Update documentation
|
59
|
+
* 💚 fix macos CI
|
60
|
+
* ➕ Add rubocop 👮🎨
|
61
|
+
|
62
|
+
## Release v0.2.1 (2020-12-26)
|
63
|
+
|
64
|
+
* ⬆️ Upgraded rake (and bundler) to support ruby 3.0
|
65
|
+
|
66
|
+
## Release v0.2.0 (2020-12-24)
|
67
|
+
|
68
|
+
* ✨ Add ability to push separate score and value
|
69
|
+
* ⚡️ Big performance gain, by storing scores separately and using ruby's
|
70
|
+
internal `OPTIMIZED_CMP` instead of always directly calling `<=>`
|
71
|
+
|
72
|
+
## Release v0.1.0 (2020-12-22)
|
73
|
+
|
74
|
+
🎉 initial release 🎉
|
75
|
+
|
76
|
+
* ✨ Add basic d-ary Heap implementation
|
data/Gemfile
CHANGED
@@ -5,7 +5,16 @@ source "https://rubygems.org"
|
|
5
5
|
# Specify your gem's dependencies in d_heap.gemspec
|
6
6
|
gemspec
|
7
7
|
|
8
|
+
gem "pry"
|
8
9
|
gem "rake", "~> 13.0"
|
9
10
|
gem "rake-compiler"
|
10
11
|
gem "rspec", "~> 3.10"
|
11
12
|
gem "rubocop", "~> 1.0"
|
13
|
+
|
14
|
+
install_if -> { RUBY_PLATFORM !~ /darwin/ } do
|
15
|
+
gem "benchmark_driver-output-gruff"
|
16
|
+
end
|
17
|
+
|
18
|
+
gem "perf"
|
19
|
+
gem "priority_queue_cxx"
|
20
|
+
gem "stackprof"
|
data/Gemfile.lock
CHANGED
@@ -1,22 +1,38 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
d_heap (0.
|
4
|
+
d_heap (0.6.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.1)
|
10
|
+
benchmark_driver (0.15.16)
|
11
|
+
benchmark_driver-output-gruff (0.3.1)
|
12
|
+
benchmark_driver (>= 0.12.0)
|
13
|
+
gruff
|
14
|
+
coderay (1.1.3)
|
10
15
|
diff-lcs (1.4.4)
|
16
|
+
gruff (0.12.1)
|
17
|
+
histogram
|
18
|
+
rmagick
|
19
|
+
histogram (0.2.4.1)
|
20
|
+
method_source (1.0.0)
|
11
21
|
parallel (1.19.2)
|
12
22
|
parser (2.7.2.0)
|
13
23
|
ast (~> 2.4.1)
|
24
|
+
perf (0.1.2)
|
25
|
+
priority_queue_cxx (0.3.4)
|
26
|
+
pry (0.13.1)
|
27
|
+
coderay (~> 1.1)
|
28
|
+
method_source (~> 1.0)
|
14
29
|
rainbow (3.0.0)
|
15
30
|
rake (13.0.3)
|
16
31
|
rake-compiler (1.1.1)
|
17
32
|
rake
|
18
33
|
regexp_parser (1.8.2)
|
19
34
|
rexml (3.2.3)
|
35
|
+
rmagick (4.1.2)
|
20
36
|
rspec (3.10.0)
|
21
37
|
rspec-core (~> 3.10.0)
|
22
38
|
rspec-expectations (~> 3.10.0)
|
@@ -41,18 +57,27 @@ GEM
|
|
41
57
|
unicode-display_width (>= 1.4.0, < 2.0)
|
42
58
|
rubocop-ast (1.1.1)
|
43
59
|
parser (>= 2.7.1.5)
|
60
|
+
ruby-prof (1.4.2)
|
44
61
|
ruby-progressbar (1.10.1)
|
62
|
+
stackprof (0.2.16)
|
45
63
|
unicode-display_width (1.7.0)
|
46
64
|
|
47
65
|
PLATFORMS
|
48
66
|
ruby
|
49
67
|
|
50
68
|
DEPENDENCIES
|
69
|
+
benchmark_driver
|
70
|
+
benchmark_driver-output-gruff
|
51
71
|
d_heap!
|
72
|
+
perf
|
73
|
+
priority_queue_cxx
|
74
|
+
pry
|
52
75
|
rake (~> 13.0)
|
53
76
|
rake-compiler
|
54
77
|
rspec (~> 3.10)
|
55
78
|
rubocop (~> 1.0)
|
79
|
+
ruby-prof
|
80
|
+
stackprof
|
56
81
|
|
57
82
|
BUNDLED WITH
|
58
83
|
2.2.3
|
data/N
ADDED
data/README.md
CHANGED
@@ -1,53 +1,134 @@
|
|
1
|
-
# DHeap
|
1
|
+
# DHeap - Fast d-ary heap for ruby
|
2
|
+
|
3
|
+
[](https://badge.fury.io/rb/d_heap)
|
4
|
+
[](https://github.com/nevans/d_heap/actions?query=workflow%3ACI)
|
5
|
+
[](https://codeclimate.com/github/nevans/d_heap/maintainability)
|
6
|
+
|
7
|
+
A fast [_d_-ary heap][d-ary heap] [priority queue] implementation for ruby,
|
8
|
+
implemented as a C extension.
|
9
|
+
|
10
|
+
From [wikipedia](https://en.wikipedia.org/wiki/Heap_(data_structure)):
|
11
|
+
> A heap is a specialized tree-based data structure which is essentially an
|
12
|
+
> almost complete tree that satisfies the heap property: in a min heap, for any
|
13
|
+
> given node C, if P is a parent node of C, then the key (the value) of P is
|
14
|
+
> less than or equal to the key of C. The node at the "top" of the heap (with no
|
15
|
+
> parents) is called the root node.
|
16
|
+
|
17
|
+

|
18
|
+
|
19
|
+
With a regular queue, you expect "FIFO" behavior: first in, first out. With a
|
20
|
+
stack you expect "LIFO": last in first out. A priority queue has a score for
|
21
|
+
each element and elements are popped in order by score. Priority queues are
|
22
|
+
often used in algorithms for e.g. [scheduling] of timers or bandwidth
|
23
|
+
management, for [Huffman coding], and various graph search algorithms such as
|
24
|
+
[Dijkstra's algorithm], [A* search], or [Prim's algorithm].
|
25
|
+
|
26
|
+
The _d_-ary heap data structure is a generalization of the [binary heap], in
|
27
|
+
which the nodes have _d_ children instead of 2. This allows for "insert" and
|
28
|
+
"decrease priority" operations to be performed more quickly with the tradeoff of
|
29
|
+
slower delete minimum or "increase priority". Additionally, _d_-ary heaps can
|
30
|
+
have better memory cache behavior than binary heaps, allowing them to run more
|
31
|
+
quickly in practice despite slower worst-case time complexity. In the worst
|
32
|
+
case, a _d_-ary heap requires only `O(log n / log d)` operations to push, with
|
33
|
+
the tradeoff that pop requires `O(d log n / log d)`.
|
34
|
+
|
35
|
+
Although you should probably just use the default _d_ value of `4` (see the
|
36
|
+
analysis below), it's always advisable to benchmark your specific use-case. In
|
37
|
+
particular, if you push items more than you pop, higher values for _d_ can give
|
38
|
+
a faster total runtime.
|
39
|
+
|
40
|
+
[d-ary heap]: https://en.wikipedia.org/wiki/D-ary_heap
|
41
|
+
[priority queue]: https://en.wikipedia.org/wiki/Priority_queue
|
42
|
+
[binary heap]: https://en.wikipedia.org/wiki/Binary_heap
|
43
|
+
[scheduling]: https://en.wikipedia.org/wiki/Scheduling_(computing)
|
44
|
+
[Huffman coding]: https://en.wikipedia.org/wiki/Huffman_coding#Compression
|
45
|
+
[Dijkstra's algorithm]: https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Using_a_priority_queue
|
46
|
+
[A* search]: https://en.wikipedia.org/wiki/A*_search_algorithm#Description
|
47
|
+
[Prim's algorithm]: https://en.wikipedia.org/wiki/Prim%27s_algorithm
|
2
48
|
|
3
|
-
|
4
|
-
algorithms.
|
49
|
+
## Usage
|
5
50
|
|
6
|
-
The
|
7
|
-
|
8
|
-
operations to be performed more quickly with the tradeoff of slower delete
|
9
|
-
minimum. Additionally, _d_-ary heaps can have better memory cache behavior than
|
10
|
-
binary heaps, allowing them to run more quickly in practice despite slower
|
11
|
-
worst-case time complexity. In the worst case, a _d_-ary heap requires only
|
12
|
-
`O(log n / log d)` to push, with the tradeoff that pop is `O(d log n / log d)`.
|
51
|
+
The basic API is `#push(object, score)` and `#pop`. Please read the
|
52
|
+
[gem documentation] for more details and other methods.
|
13
53
|
|
14
|
-
|
15
|
-
may be worthwhile to benchmark your specific scenario.
|
54
|
+
Quick reference for some common methods:
|
16
55
|
|
17
|
-
|
56
|
+
* `heap << object` adds a value, with `Float(object)` as its score.
|
57
|
+
* `heap.push(object, score)` adds a value with an extrinsic score.
|
58
|
+
* `heap.pop` removes and returns the value with the minimum score.
|
59
|
+
* `heap.pop_lte(max_score)` pops only if the next score is `<=` the argument.
|
60
|
+
* `heap.peek` to view the minimum value without popping it.
|
61
|
+
* `heap.clear` to remove all items from the heap.
|
62
|
+
* `heap.empty?` returns true if the heap is empty.
|
63
|
+
* `heap.size` returns the number of items in the heap.
|
64
|
+
|
65
|
+
If the score changes while the object is still in the heap, it will not be
|
66
|
+
re-evaluated again.
|
67
|
+
|
68
|
+
The score must either be `Integer` or `Float` or convertable to a `Float` via
|
69
|
+
`Float(score)` (i.e. it should implement `#to_f`). Constraining scores to
|
70
|
+
numeric values gives more than 50% speedup under some benchmarks! _n.b._
|
71
|
+
`Integer` _scores must have an absolute value that fits into_ `unsigned long
|
72
|
+
long`. This is compiler and architecture dependant but with gcc on an IA-64
|
73
|
+
system it's 64 bits, which gives a range of -18,446,744,073,709,551,615 to
|
74
|
+
+18,446,744,073,709,551,615, which is more than enough to store e.g. POSIX time
|
75
|
+
in nanoseconds.
|
76
|
+
|
77
|
+
_Comparing arbitary objects via_ `a <=> b` _was the original design and may be
|
78
|
+
added back in a future version,_ if (and only if) _it can be done without
|
79
|
+
impacting the speed of numeric comparisons. The speedup from this constraint is
|
80
|
+
huge!_
|
81
|
+
|
82
|
+
[gem documentation]: https://rubydoc.info/gems/d_heap/DHeap
|
83
|
+
|
84
|
+
### Examples
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
# create some example objects to place in our heap
|
88
|
+
Task = Struct.new(:id, :time) do
|
89
|
+
def to_f; time.to_f end
|
90
|
+
end
|
91
|
+
t1 = Task.new(1, Time.now + 5*60)
|
92
|
+
t2 = Task.new(2, Time.now + 50)
|
93
|
+
t3 = Task.new(3, Time.now + 60)
|
94
|
+
t4 = Task.new(4, Time.now + 5)
|
95
|
+
|
96
|
+
# create the heap
|
97
|
+
require "d_heap"
|
98
|
+
heap = DHeap.new
|
99
|
+
|
100
|
+
# push with an explicit score (which might be extrinsic to the value)
|
101
|
+
heap.push t1, t1.to_f
|
102
|
+
|
103
|
+
# the score will be implicitly cast with Float, so any object with #to_f
|
104
|
+
heap.push t2, t2
|
18
105
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
heap
|
37
|
-
|
38
|
-
heap
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
more than occasionally calling `memcpy`.
|
46
|
-
|
47
|
-
It's likely that MJIT will eventually make the C-extension completely
|
48
|
-
unnecessary. This is definitely hotspot code, and the basic ruby implementation
|
49
|
-
would work fine, if not for that `<=>` overhead. Until then... this gem gets
|
50
|
-
the job done.
|
106
|
+
# if the object has an intrinsic score via #to_f, "<<" is the simplest API
|
107
|
+
heap << t3 << t4
|
108
|
+
|
109
|
+
# pop returns the lowest scored item, and removes it from the heap
|
110
|
+
heap.pop # => #<struct Task id=4, time=2021-01-17 17:02:22.5574 -0500>
|
111
|
+
heap.pop # => #<struct Task id=2, time=2021-01-17 17:03:07.5574 -0500>
|
112
|
+
|
113
|
+
# peek returns the lowest scored item, without removing it from the heap
|
114
|
+
heap.peek # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
|
115
|
+
heap.pop # => #<struct Task id=3, time=2021-01-17 17:03:17.5574 -0500>
|
116
|
+
|
117
|
+
# pop_lte handles the common "h.pop if h.peek_score < max" pattern
|
118
|
+
heap.pop_lte(Time.now + 65) # => nil
|
119
|
+
|
120
|
+
# the heap size can be inspected with size and empty?
|
121
|
+
heap.empty? # => false
|
122
|
+
heap.size # => 1
|
123
|
+
heap.pop # => #<struct Task id=1, time=2021-01-17 17:07:17.5574 -0500>
|
124
|
+
heap.empty? # => true
|
125
|
+
heap.size # => 0
|
126
|
+
|
127
|
+
# popping from an empty heap returns nil
|
128
|
+
heap.pop # => nil
|
129
|
+
```
|
130
|
+
|
131
|
+
Please see the [gem documentation] for more methods and more examples.
|
51
132
|
|
52
133
|
## Installation
|
53
134
|
|
@@ -65,134 +146,264 @@ Or install it yourself as:
|
|
65
146
|
|
66
147
|
$ gem install d_heap
|
67
148
|
|
68
|
-
##
|
149
|
+
## Motivation
|
69
150
|
|
70
|
-
|
151
|
+
One naive approach to a priority queue is to maintain an array in sorted order.
|
152
|
+
This can be very simply implemented in ruby with `Array#bseach_index` +
|
153
|
+
`Array#insert`. This can be very fast—`Array#pop` is `O(1)`—but the worst-case
|
154
|
+
for insert is `O(n)` because it may need to `memcpy` a significant portion of
|
155
|
+
the array.
|
71
156
|
|
72
|
-
|
73
|
-
|
157
|
+
The standard way to implement a priority queue is with a binary heap. Although
|
158
|
+
this increases the time complexity for `pop` alone, it reduces the combined time
|
159
|
+
compexity for the combined `push` + `pop`. Using a d-ary heap with d > 2
|
160
|
+
makes the tree shorter but broader, which reduces to `O(log n / log d)` while
|
161
|
+
increasing the comparisons needed by sift-down to `O(d log n/ log d)`.
|
74
162
|
|
75
|
-
|
163
|
+
However, I was disappointed when my best ruby heap implementation ran much more
|
164
|
+
slowly than the naive approach—even for heaps containing ten thousand items.
|
165
|
+
Although it _is_ `O(n)`, `memcpy` is _very_ fast, while calling `<=>` from ruby
|
166
|
+
has _much_ higher overhead. And a _d_-heap needs `d + 1` times more comparisons
|
167
|
+
for each push + pop than `bsearch` + `insert`.
|
76
168
|
|
77
|
-
|
78
|
-
|
79
|
-
heap
|
80
|
-
heap << [Time.now + 60, Task.new(3)]
|
81
|
-
heap << [Time.now + 5, Task.new(4)]
|
169
|
+
Additionally, when researching how other systems handle their scheduling, I was
|
170
|
+
inspired by reading go's "timer.go" implementation to experiment with a 4-ary
|
171
|
+
heap instead of the traditional binary heap.
|
82
172
|
|
83
|
-
|
84
|
-
heap.pop.last # => Task[4]
|
85
|
-
heap.pop.last # => Task[2]
|
86
|
-
heap.peak.last # => Task[3]
|
87
|
-
heap.pop.last # => Task[3]
|
88
|
-
heap.pop.last # => Task[1]
|
89
|
-
```
|
173
|
+
## Benchmarks
|
90
174
|
|
91
|
-
|
175
|
+
_See `bin/benchmarks` and `docs/benchmarks.txt`, as well as `bin/profile` and
|
176
|
+
`docs/profile.txt` for much more detail or updated results. These benchmarks
|
177
|
+
were measured with v0.5.0 and ruby 2.7.2 without MJIT enabled._
|
178
|
+
|
179
|
+
These benchmarks use very simple implementations for a pure-ruby heap and an
|
180
|
+
array that is kept sorted using `Array#bsearch_index` and `Array#insert`. For
|
181
|
+
comparison, I also compare to the [priority_queue_cxx gem] which uses the [C++
|
182
|
+
STL priority_queue], and another naive implementation that uses `Array#min` and
|
183
|
+
`Array#delete_at` with an unsorted array.
|
184
|
+
|
185
|
+
In these benchmarks, `DHeap` runs faster than all other implementations for
|
186
|
+
every scenario and every value of N, although the difference is usually more
|
187
|
+
noticable at higher values of N. The pure ruby heap implementation is
|
188
|
+
competitive for `push` alone at every value of N, but is significantly slower
|
189
|
+
than bsearch + insert for push + pop, until N is _very_ large (somewhere between
|
190
|
+
10k and 100k)!
|
191
|
+
|
192
|
+
[priority_queue_cxx gem]: https://rubygems.org/gems/priority_queue_cxx
|
193
|
+
[C++ STL priority_queue]: http://www.cplusplus.com/reference/queue/priority_queue/
|
194
|
+
|
195
|
+
Three different scenarios are measured:
|
196
|
+
|
197
|
+
### push N items onto an empty heap
|
198
|
+
|
199
|
+
...but never pop (clearing between each set of pushes).
|
200
|
+
|
201
|
+

|
202
|
+
|
203
|
+
### push N items onto an empty heap then pop all N
|
204
|
+
|
205
|
+
Although this could be used for heap sort, we're unlikely to choose heap sort
|
206
|
+
over Ruby's quick sort implementation. I'm using this scenario to represent
|
207
|
+
the amortized cost of creating a heap and (eventually) draining it.
|
208
|
+
|
209
|
+

|
210
|
+
|
211
|
+
### push and pop on a heap with N values
|
212
|
+
|
213
|
+
Repeatedly push and pop while keeping a stable heap size. This is a _very
|
214
|
+
simplistic_ approximation for how most scheduler/timer heaps might be used.
|
215
|
+
Usually when a timer fires it will be quickly replaced by a new timer, and the
|
216
|
+
overall count of timers will remain roughly stable.
|
217
|
+
|
218
|
+

|
219
|
+
|
220
|
+
### numbers
|
221
|
+
|
222
|
+
Even for very small N values the benchmark implementations, `DHeap` runs faster
|
223
|
+
than the other implementations for each scenario, although the difference is
|
224
|
+
still relatively small. The pure ruby binary heap is 2x or more slower than
|
225
|
+
bsearch + insert for common push/pop scenario.
|
226
|
+
|
227
|
+
== push N (N=5) ==========================================================
|
228
|
+
push N (c_dheap): 1969700.7 i/s
|
229
|
+
push N (c++ stl): 1049738.1 i/s - 1.88x slower
|
230
|
+
push N (rb_heap): 928435.2 i/s - 2.12x slower
|
231
|
+
push N (bsearch): 921060.0 i/s - 2.14x slower
|
232
|
+
|
233
|
+
== push N then pop N (N=5) ===============================================
|
234
|
+
push N + pop N (c_dheap): 1375805.0 i/s
|
235
|
+
push N + pop N (c++ stl): 1134997.5 i/s - 1.21x slower
|
236
|
+
push N + pop N (findmin): 862913.1 i/s - 1.59x slower
|
237
|
+
push N + pop N (bsearch): 762887.1 i/s - 1.80x slower
|
238
|
+
push N + pop N (rb_heap): 506890.4 i/s - 2.71x slower
|
239
|
+
|
240
|
+
== Push/pop with pre-filled queue of size=N (N=5) ========================
|
241
|
+
push + pop (c_dheap): 9044435.5 i/s
|
242
|
+
push + pop (c++ stl): 7534583.4 i/s - 1.20x slower
|
243
|
+
push + pop (findmin): 5026155.1 i/s - 1.80x slower
|
244
|
+
push + pop (bsearch): 4300260.0 i/s - 2.10x slower
|
245
|
+
push + pop (rb_heap): 2299499.7 i/s - 3.93x slower
|
246
|
+
|
247
|
+
By N=21, `DHeap` has pulled significantly ahead of bsearch + insert for all
|
248
|
+
scenarios, but the pure ruby heap is still slower than every other
|
249
|
+
implementation—even resorting the array after every `#push`—in any scenario that
|
250
|
+
uses `#pop`.
|
251
|
+
|
252
|
+
== push N (N=21) =========================================================
|
253
|
+
push N (c_dheap): 464231.4 i/s
|
254
|
+
push N (c++ stl): 305546.7 i/s - 1.52x slower
|
255
|
+
push N (rb_heap): 202803.7 i/s - 2.29x slower
|
256
|
+
push N (bsearch): 168678.7 i/s - 2.75x slower
|
257
|
+
|
258
|
+
== push N then pop N (N=21) ==============================================
|
259
|
+
push N + pop N (c_dheap): 298350.3 i/s
|
260
|
+
push N + pop N (c++ stl): 252227.1 i/s - 1.18x slower
|
261
|
+
push N + pop N (findmin): 161998.7 i/s - 1.84x slower
|
262
|
+
push N + pop N (bsearch): 143432.3 i/s - 2.08x slower
|
263
|
+
push N + pop N (rb_heap): 79622.1 i/s - 3.75x slower
|
264
|
+
|
265
|
+
== Push/pop with pre-filled queue of size=N (N=21) =======================
|
266
|
+
push + pop (c_dheap): 8855093.4 i/s
|
267
|
+
push + pop (c++ stl): 7223079.5 i/s - 1.23x slower
|
268
|
+
push + pop (findmin): 4542913.7 i/s - 1.95x slower
|
269
|
+
push + pop (bsearch): 3461802.4 i/s - 2.56x slower
|
270
|
+
push + pop (rb_heap): 1845488.7 i/s - 4.80x slower
|
271
|
+
|
272
|
+
At higher values of N, a heaps logarithmic growth leads to only a little
|
273
|
+
slowdown of `#push`, while insert's linear growth causes it to run noticably
|
274
|
+
slower and slower. But because `#pop` is `O(1)` for a sorted array and `O(d log
|
275
|
+
n / log d)` for a heap, scenarios involving both `#push` and `#pop` remain
|
276
|
+
relatively close, and bsearch + insert still runs faster than a pure ruby heap,
|
277
|
+
even up to queues with 10k items. But as queue size increases beyond than that,
|
278
|
+
the linear time compexity to keep a sorted array dominates.
|
279
|
+
|
280
|
+
== push + pop (rb_heap)
|
281
|
+
queue size = 10000: 736618.2 i/s
|
282
|
+
queue size = 25000: 670186.8 i/s - 1.10x slower
|
283
|
+
queue size = 50000: 618156.7 i/s - 1.19x slower
|
284
|
+
queue size = 100000: 579250.7 i/s - 1.27x slower
|
285
|
+
queue size = 250000: 572795.0 i/s - 1.29x slower
|
286
|
+
queue size = 500000: 543648.3 i/s - 1.35x slower
|
287
|
+
queue size = 1000000: 513523.4 i/s - 1.43x slower
|
288
|
+
queue size = 2500000: 460848.9 i/s - 1.60x slower
|
289
|
+
queue size = 5000000: 445234.5 i/s - 1.65x slower
|
290
|
+
queue size = 10000000: 423119.0 i/s - 1.74x slower
|
291
|
+
|
292
|
+
== push + pop (bsearch)
|
293
|
+
queue size = 10000: 786334.2 i/s
|
294
|
+
queue size = 25000: 364963.8 i/s - 2.15x slower
|
295
|
+
queue size = 50000: 200520.6 i/s - 3.92x slower
|
296
|
+
queue size = 100000: 88607.0 i/s - 8.87x slower
|
297
|
+
queue size = 250000: 34530.5 i/s - 22.77x slower
|
298
|
+
queue size = 500000: 17965.4 i/s - 43.77x slower
|
299
|
+
queue size = 1000000: 5638.7 i/s - 139.45x slower
|
300
|
+
queue size = 2500000: 1302.0 i/s - 603.93x slower
|
301
|
+
queue size = 5000000: 592.0 i/s - 1328.25x slower
|
302
|
+
queue size = 10000000: 288.8 i/s - 2722.66x slower
|
303
|
+
|
304
|
+
== push + pop (c_dheap)
|
305
|
+
queue size = 10000: 7311366.6 i/s
|
306
|
+
queue size = 50000: 6737824.5 i/s - 1.09x slower
|
307
|
+
queue size = 25000: 6407340.6 i/s - 1.14x slower
|
308
|
+
queue size = 100000: 6254396.3 i/s - 1.17x slower
|
309
|
+
queue size = 250000: 5917684.5 i/s - 1.24x slower
|
310
|
+
queue size = 500000: 5126307.6 i/s - 1.43x slower
|
311
|
+
queue size = 1000000: 4403494.1 i/s - 1.66x slower
|
312
|
+
queue size = 2500000: 3304088.2 i/s - 2.21x slower
|
313
|
+
queue size = 5000000: 2664897.7 i/s - 2.74x slower
|
314
|
+
queue size = 10000000: 2137927.6 i/s - 3.42x slower
|
92
315
|
|
93
|
-
##
|
316
|
+
## Analysis
|
94
317
|
|
95
|
-
|
96
|
-
~~includes~~ _will include_ extensions to `Array`, allowing an Array to be
|
97
|
-
directly handled as a priority queue. These extension methods are meant to be
|
98
|
-
used similarly to how `#bsearch` and `#bsearch_index` might be used.
|
318
|
+
### Time complexity
|
99
319
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
320
|
+
There are two fundamental heap operations: sift-up (used by push) and sift-down
|
321
|
+
(used by pop).
|
322
|
+
|
323
|
+
* A _d_-ary heap will have `log n / log d` layers, so both sift operations can
|
324
|
+
perform as many as `log n / log d` writes, when a member sifts the entire
|
325
|
+
length of the tree.
|
326
|
+
* Sift-up makes one comparison per layer, so push runs in `O(log n / log d)`.
|
327
|
+
* Sift-down makes d comparions per layer, so pop runs in `O(d log n / log d)`.
|
328
|
+
|
329
|
+
So, in the simplest case of running balanced push/pop while maintaining the same
|
330
|
+
heap size, `(1 + d) log n / log d` comparisons are made. In the worst case,
|
331
|
+
when every sift traverses every layer of the tree, `d=4` requires the fewest
|
332
|
+
comparisons for combined insert and delete:
|
333
|
+
|
334
|
+
* (1 + 2) lg n / lg d ≈ 4.328085 lg n
|
335
|
+
* (1 + 3) lg n / lg d ≈ 3.640957 lg n
|
336
|
+
* (1 + 4) lg n / lg d ≈ 3.606738 lg n
|
337
|
+
* (1 + 5) lg n / lg d ≈ 3.728010 lg n
|
338
|
+
* (1 + 6) lg n / lg d ≈ 3.906774 lg n
|
339
|
+
* (1 + 7) lg n / lg d ≈ 4.111187 lg n
|
340
|
+
* (1 + 8) lg n / lg d ≈ 4.328085 lg n
|
341
|
+
* (1 + 9) lg n / lg d ≈ 4.551196 lg n
|
342
|
+
* (1 + 10) lg n / lg d ≈ 4.777239 lg n
|
343
|
+
* etc...
|
105
344
|
|
106
|
-
|
107
|
-
features that are loosely inspired by go's timers. e.g: It lazily sifts its
|
108
|
-
heap after deletion and adjustments, to achieve faster average runtime for *add*
|
109
|
-
and *cancel* operations.
|
345
|
+
See https://en.wikipedia.org/wiki/D-ary_heap#Analysis for deeper analysis.
|
110
346
|
|
111
|
-
|
112
|
-
experiment with a 4-ary heap instead of the traditional binary heap. In the
|
113
|
-
case of timers, new timers are usually scheduled to run after most of the
|
114
|
-
existing timers. And timers are usually canceled before they have a chance to
|
115
|
-
run. While a binary heap holds 50% of its elements in its last layer, 75% of a
|
116
|
-
4-ary heap will have no children. That diminishes the extra comparison overhead
|
117
|
-
during sift-down.
|
347
|
+
### Space complexity
|
118
348
|
|
119
|
-
|
349
|
+
Space usage is linear, regardless of d. However higher d values may
|
350
|
+
provide better cache locality. Because the heap is a complete d-ary tree, the
|
351
|
+
elements can be stored in an array, without the need for tree or list pointers.
|
120
352
|
|
121
|
-
|
353
|
+
Ruby can compare Numeric values _much_ faster than other ruby objects, even if
|
354
|
+
those objects simply delegate comparison to internal Numeric values. And it is
|
355
|
+
often useful to use external scores for otherwise incomparable values. So
|
356
|
+
`DHeap` uses twice as many entries (one for score and one for value)
|
357
|
+
as an array which only stores values.
|
122
358
|
|
123
|
-
##
|
359
|
+
## Thread safety
|
124
360
|
|
125
|
-
|
361
|
+
`DHeap` is _not_ thread-safe, so concurrent access from multiple threads needs to
|
362
|
+
take precautions such as locking access behind a mutex.
|
126
363
|
|
127
|
-
|
128
|
-
Swap up performs only a single comparison per swap: O(1).
|
129
|
-
Swap down performs as many as d comparions per swap: O(d).
|
130
|
-
|
131
|
-
Inserting an item is O(log n / log d).
|
132
|
-
Deleting the root is O(d log n / log d).
|
133
|
-
|
134
|
-
Assuming every inserted item is eventually deleted from the root, d=4 requires
|
135
|
-
the fewest comparisons for combined insert and delete:
|
136
|
-
* (1 + 2) lg 2 = 4.328085
|
137
|
-
* (1 + 3) lg 3 = 3.640957
|
138
|
-
* (1 + 4) lg 4 = 3.606738
|
139
|
-
* (1 + 5) lg 5 = 3.728010
|
140
|
-
* (1 + 6) lg 6 = 3.906774
|
141
|
-
* etc...
|
142
|
-
|
143
|
-
Leaf nodes require no comparisons to shift down, and higher values for d have
|
144
|
-
higher percentage of leaf nodes:
|
145
|
-
* d=2 has ~50% leaves,
|
146
|
-
* d=3 has ~67% leaves,
|
147
|
-
* d=4 has ~75% leaves,
|
148
|
-
* and so on...
|
364
|
+
## Alternative data structures
|
149
365
|
|
150
|
-
|
366
|
+
As always, you should run benchmarks with your expected scenarios to determine
|
367
|
+
which is best for your application.
|
151
368
|
|
152
|
-
|
369
|
+
Depending on your use-case, maintaining a sorted `Array` using `#bsearch_index`
|
370
|
+
and `#insert` might be just fine! Even `min` plus `delete` with an unsorted
|
371
|
+
array can be very fast on small queues. Although insertions run in `O(n)`,
|
372
|
+
`memcpy` is so fast on modern hardware that your dataset might not be large
|
373
|
+
enough for it to matter.
|
153
374
|
|
154
|
-
|
155
|
-
|
375
|
+
More complex heap variants, e.g. [Fibonacci heap], allow heaps to be split and
|
376
|
+
merged which gives some graph algorithms a lower amortized time complexity. But
|
377
|
+
in practice, _d_-ary heaps have much lower overhead and often run faster.
|
156
378
|
|
157
|
-
|
158
|
-
ruby objects which delegate comparison to internal Numeric or String objects.
|
159
|
-
And it is often advantageous to use extrinsic scores for uncomparable items.
|
160
|
-
For this, our internal array uses twice as many entries (one for score and one
|
161
|
-
for value) as it would if it only supported intrinsic comparison or used an
|
162
|
-
un-memoized "sort_by" proc.
|
379
|
+
[Fibonacci heap]: https://en.wikipedia.org/wiki/Fibonacci_heap
|
163
380
|
|
164
|
-
|
381
|
+
If it is important to be able to quickly enumerate the set or find the ranking
|
382
|
+
of values in it, then you may want to use a self-balancing binary search tree
|
383
|
+
(e.g. a [red-black tree]) or a [skip-list].
|
165
384
|
|
166
|
-
|
167
|
-
|
168
|
-
* Most timers will be canceled before executing.
|
169
|
-
* Canceled timers usually sort after most existing timers.
|
385
|
+
[red-black tree]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree
|
386
|
+
[skip-list]: https://en.wikipedia.org/wiki/Skip_list
|
170
387
|
|
171
|
-
|
172
|
-
of
|
173
|
-
time
|
174
|
-
|
175
|
-
rescheduled before we garbage collect, adjusting its position will usually be
|
176
|
-
faster than a delete and re-insert.
|
388
|
+
[Hashed and Hierarchical Timing Wheels][timing wheels] (or some variant in that
|
389
|
+
family of data structures) can be constructed to have effectively `O(1)` running
|
390
|
+
time in most cases. Although the implementation for that data structure is more
|
391
|
+
complex than a heap, it may be necessary for enormous values of N.
|
177
392
|
|
178
|
-
|
393
|
+
[timing wheels]: http://www.cs.columbia.edu/~nahum/w6998/papers/ton97-timing-wheels.pdf
|
179
394
|
|
180
|
-
|
181
|
-
`#bsearch_index` and `#insert` might be faster! Although it is technically
|
182
|
-
O(n) for insertions, the implementations for `memcpy` or `memmove` can be *very*
|
183
|
-
fast on modern architectures. Also, it can be faster O(n) on average, if
|
184
|
-
insertions are usually near the end of the array. You should run benchmarks
|
185
|
-
with your expected scenarios to determine which is right.
|
395
|
+
## TODOs...
|
186
396
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
397
|
+
_TODO:_ Also ~~included is~~ _will include_ `DHeap::Map`, which augments the
|
398
|
+
basic heap with an internal `Hash`, which maps objects to their position in the
|
399
|
+
heap. This enforces a uniqueness constraint on items on the heap, and also
|
400
|
+
allows items to be more efficiently deleted or adjusted. However maintaining
|
401
|
+
the hash does lead to a small drop in normal `#push` and `#pop` performance.
|
402
|
+
|
403
|
+
_TODO:_ Also ~~included is~~ _will include_ `DHeap::Lazy`, which contains some
|
404
|
+
features that are loosely inspired by go's timers. e.g: It lazily sifts its
|
405
|
+
heap after deletion and adjustments, to achieve faster average runtime for *add*
|
406
|
+
and *cancel* operations.
|
196
407
|
|
197
408
|
## Development
|
198
409
|
|