RubyGems - data_structures_rmolinari - Versions diffs - 0.2.2 → 0.4.0 - Mend

data_structures_rmolinari 0.2.2 → 0.4.0

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +33 -0
data/lib/data_structures_rmolinari/disjoint_union.rb +30 -14
data/lib/data_structures_rmolinari/generic_segment_tree.rb +8 -8
data/lib/data_structures_rmolinari/heap.rb +64 -42
data/lib/data_structures_rmolinari/max_priority_search_tree.rb +70 -119
data/lib/data_structures_rmolinari/shared.rb +9 -1
data/lib/data_structures_rmolinari.rb +17 -14
metadata +3 -3
data/lib/data_structures_rmolinari/minmax_priority_search_tree.rb +0 -670

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3765b8df91fcc62eb885e32ff5ad4b0b4678bba6f322cb5c8282657052aed8c6
-  data.tar.gz: 845bea3649dc51dab697927132c0fc2f62dcacf5c25e1c717b99e57819b52286
+  metadata.gz: 87a44faaaa62f555546867230df704981671491f040f6be29eeed7db7eb22a0a
+  data.tar.gz: 0a0f1f6cf22bdde5d0510a818af9d8a6dbdbf11a6e69ce2e178bf6f336bb3d92
 SHA512:
-  metadata.gz: 23687561ec6ddb12369ca5e75db33ffd710295097cd0c91b72fb278fea3b11b23152867bb4aea6a4fd17b2f95184fb5433c9ff009db92a5c4bab78686ae472de
-  data.tar.gz: d930a674f85aa0a57030ed59f2b39979c7dbb8d8f74e0a9718d44112c1efe05657e0d55773b623c3724f040054e2fedf453de30b72d5631404642081455406f9
+  metadata.gz: 990fc38cbc64c20290317bf2858ff6f2813f832d0046f249faea32c7f88f389e8c8c2db892f8288a0747aa9446181864a3e62435e4846a230411b6afa4b75faf
+  data.tar.gz: f1e641b03d30c4726268c1c8da6d6364f635251152230f89aba2b551f0355d37ce843dba8e631c2fbd4a20e87ae94c78cf30b46dc3d472f1a1b55add258de32a

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,33 @@
+# Changelog
+## [Unreleased]
+### Changed
+- MaxPrioritySearchTree
+  - Duplicate y values are now allowed. Ties are broken with a preference for smaller values of x.
+  - Method names have changed
+    - Instead of "highest", "leftmost", "rightmost" we use "largest_y", "smallest_x", "largest_x"
+    - For example, +highest_ne+ is now +largest_y_in_nw+
+- DisjointUnion
+  - the size argument to initializer is optional. The default value is 0.
+  - elements can be added to the "universe" of known values with +make_set+
+### Removed
+- MinmaxPrioritySearchTree is no longer available
+  - it was only a partial implementation anyway
+## [0.3.0] 2023-01-06
+### Added
+- Start this file
+- `Heap` can be constructed as "non-addressable"
+  - `update` is not possible but duplicates can be inserted and overall performance is a little better.
+### Changed
+- `LogicError` gets a subclassed `InternalLogicError` for issues inside the library.
+- `Shared::Pair` becomes `Shared::Point`
+  - this doesn't change the API of `MaxPrioritySearchTree` because of duck typing. But client code (of which there is none) might be
+    using the `Pair` name.

data/lib/data_structures_rmolinari/disjoint_union.rb CHANGED Viewed

@@ -4,40 +4,54 @@
 # The data structure provides efficient actions to merge two disjoint subsets, i.e., replace them by their union, and determine if
 # two elements are in the same subset.
 #
-# The elements of the set must be 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
+# The elements of the set are 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
 # representatives.
 #
 # See https://en.wikipedia.org/wiki/Disjoint-set_data_structure for a good introduction.
 #
 # The code uses several ideas from Tarjan and van Leeuwen for efficiency. We use "union by rank" in +unite+ and path-halving in
-# +find+. Together, these make the amortized cost for each of n such operations effectively constant.
+# +find+. Together, these make the amortized cost of each operation effectively constant.
 #
-# - Tarjan, Robert E., van Leeuwen, Jan (1984). "Worst-case analysis of set union algorithms". Journal of the ACM. 31 (2): 245–281.
+# - Tarjan, Robert E., van Leeuwen, Jan (1984). _Worst-case analysis of set union algorithms_. Journal of the ACM. 31 (2): 245–281.
 #
 # @todo
 #   - allow caller to expand the size of the universe. This operation is called "make set".
 #     - All we need to do is increase the size of @d, set the parent pointers, define the new ranks (zero), and update @size.
 class DataStructuresRMolinari::DisjointUnion
+  include Shared
   # The number of subsets in the partition.
   attr_reader :subset_count
-  # @param size the size of the universe, which must be known at the time of construction. The elements 0, 1, ..., size - 1 start
-  #   out in disjoint singleton subsets.
-  def initialize(size)
-    @size = size
+  # @param initial_size the initial size of the universe. The elements 0, 1, ..., initial_size - 1 start out in disjoint singleton
+  # subsets.
+  def initialize(initial_size = 0)
     # Initialize to
-    @d = (0...size).to_a
-    @rank = [0] * size
+    @d = (0...initial_size).to_a
+    @rank = [0] * initial_size
+    @subset_count = initial_size
+  end
+  # Add a new subset to the universe containing the element +new_v+
+  # @param new_v the new element, starting in its own singleton subset
+  #   - it must be a non-negative integer, not already part of the universe of elements.
+  def make_set(new_v)
+    raise DataError, "Element #{new_v} must be a non-negative integer" unless new_v.is_a?(Integer) && !new_v.negative?
+    raise DataError, "Element #{new_v} is already present" if @d[new_v]
-    @subset_count = size
+    @d[new_v] = new_v
+    @rank[new_v] = 0
+    @subset_count += 1
   end
   # Declare that e and f are equivalent, i.e., in the same subset. If they are already in the same subset this is a no-op.
   #
-  # Each argument must be one of 0, 1, ..., size-1.
+  # Each argument must be in the universe of elements
   def unite(e, f)
     check_value(e)
     check_value(f)
     raise 'Uniting an element with itself is meaningless' if e == f
     e_root = find(e)
@@ -50,9 +64,11 @@ class DataStructuresRMolinari::DisjointUnion
   # The canonical representative of the subset containing e. Two elements d and e are in the same subset exactly when find(d) ==
   # find(e).
-  # @param e must be one of 0, 1, ..., size-1.
-  # @return (Integer) one of 0, 1, ..., size-1.
+  # @param e must be in the universe of elements
+  # @return (Integer) one of the universe of elements
   def find(e)
+    check_value(e)
     # We implement find with "halving" to shrink the length of paths to the root. See Tarjan and van Leeuwen p 252.
     x = e
     x = @d[x] = @d[@d[x]] while @d[@d[x]] != @d[x]
@@ -60,7 +76,7 @@ class DataStructuresRMolinari::DisjointUnion
   end
   private def check_value(v)
-    raise "Value must be given and be in (0..#{@size - 1})" unless v && v.between?(0, @size - 1)
+    raise Shared::DataError, "Value #{v} is not part of the univserse." unless @d[v]
   end
   private def link(e, f)

data/lib/data_structures_rmolinari/generic_segment_tree.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require_relative 'shared'
-# A Segment Tree, which can be used for various interval-related purposes, like efficiently finding the sum (or min or max) on a
-# arbitrary subarray of a given array.
+# A template for Segment Trees, which can be used for various interval-related purposes, like efficiently finding the sum (or min or
+# max) on an arbitrary subarray of a given array.
 #
 # There is an excellent description of the data structure at https://cp-algorithms.com/data_structures/segment_tree.html. The
 # Wikipedia article (https://en.wikipedia.org/wiki/Segment_tree) appears to describe a different data structure which is sometimes
@@ -13,10 +13,10 @@ require_relative 'shared'
 # Ruby.
 #
 # This is a generic implementation, intended to allow easy configuration for concrete instances. See the parameters to the
-# initializer and the defintiaons concrete realisations like MaxValSegmentTree.
+# initializer and the definitions of concrete realisations like MaxValSegmentTree.
 #
 # We do O(n) work to build the internal data structure at initialization. Then we answer queries in O(log n) time.
-class DataStructuresRMolinari::GenericSegmentTree
+class DataStructuresRMolinari::SegmentTreeTemplate
   include Shared::BinaryTreeArithmetic
   # Construct a concrete instance of a Segment Tree. See details at the links above for the underlying concepts here.
@@ -24,14 +24,14 @@ class DataStructuresRMolinari::GenericSegmentTree
   #   - For example, if we are calculating sums over subintervals, combine.call(a, b) = a + b, while if we are doing maxima we will
   #     return max(a, b).
   #   - Things get more complicated when we are calculating, say, the _index_ of the maximal value in a subinterval. Now it is not
-  #     enough simple to store that index at each tree node, because to combine the indices from two child nodes we need to know
+  #     enough simply to store that index at each tree node, because to combine the indices from two child nodes we need to know
   #     both the index of the maximal element in each child node's interval, but also the maximal values themselves, so we know
   #     which one "wins" for the parent node. This affects the sort of work we need to do when combining and the value provided by
   #     the +single_cell_array_val+ lambda.
   # @param single_cell_array_val a lambda that takes an index i and returns the value we need to store in the #build
   #     operation for the subinterval i..i.
-  #     - This is often simply be the value data[i], but in some cases it will be something else. For example, when we are
-  #       calculating the index of the maximal value on each subinterval we will retern the pair [i, data[i]] here.
+  #     - This will often simply be the value data[i], but in some cases it will be something else. For example, when we are
+  #       calculating the index of the maximal value on each subinterval we need [i, data[i]] here.
   #     - If +update_at+ is called later, this lambda must close over the underlying data in a way that captures the updated value.
   # @param size the size of the underlying data array, used in certain internal arithmetic.
   # @param identity the value to return when we are querying on an empty interval
@@ -96,7 +96,7 @@ class DataStructuresRMolinari::GenericSegmentTree
   private def update_val_at(idx, tree_idx, tree_l, tree_r)
     if tree_l == tree_r
       # We have found the spot!
-      raise LogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
+      raise InternalLogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
       @tree[tree_idx] = @single_cell_array_val.call(tree_l)
     else

data/lib/data_structures_rmolinari/heap.rb CHANGED Viewed

@@ -13,8 +13,8 @@ require_relative 'shared'
 # - +empty?+
 #   - is the heap empty?
 #   - O(1)
-# - +insert+
-#   - add a new element to the heap with an associated priority
+# - +insert(item, priority)+
+#   - add a new item to the heap with an associated priority
 #   - O(log N)
 # - +top+
 #   - return the lowest-priority element, which is the element at the root of the tree. In a max-heap this is the highest-priority
@@ -23,12 +23,18 @@ require_relative 'shared'
 # - +pop+
 #   - removes and returns the item that would be returned by +top+
 #   - O(log N)
-# - +update+
+# - +update(item, priority)+
 #   - tell the heap that the priority of a particular item has changed
 #   - O(log N)
 #
 # Here N is the number of elements in the heap.
 #
+# The internal requirements needed to implement +update+ have several consequences.
+# - Items added to the heap must be distinct. Otherwise we would not know which occurrence to update
+# - There is some bookkeeping overhead.
+# If client code doesn't need to call +update+ then we can create a "non-addressable" heap that allows for the insertion of
+# duplicate items and has slightly faster runtime overall. See the arguments to the initializer.
+#
 # References:
 #
 # - https://en.wikipedia.org/wiki/Binary_heap
@@ -36,31 +42,31 @@ require_relative 'shared'
 #   DOI 10.1007/s00224-017-9760-2
 #
 # @todo
-#   - allow for priorities comparable only via +<=>+, like arrays
-#     - this requires different handling for max-heaps, as we can't just negate the priorities and use min-heap logic
-#   - relax the requirement that priorities must be comparable vai +<+ and respond to negation. Instead, allow comparison via +<=>+
-#     and handle max-heaps differently.
-#     - this will allow priorities to be arrays for tie-breakers and similar.
-#   - offer a non-addressable version that doesn't support +update+
-#     - configure through the initializer
-#     - other operations will be a little quicker, and we can add the same item more than once. The paper by Chen et al. referenced
-#       in the Wikipedia article for Pairing Heaps suggests that using such a priority queue for Dijkstra's algorithm and inserting
-#       multiple copies of a key rather than updating its priority is faster in practice than other approaches that have better
-#       theoretical performance.
+#   - let caller see the priority of the top element. Maybe this is useful sometimes.
 class DataStructuresRMolinari::Heap
+  include Shared
   include Shared::BinaryTreeArithmetic
+  # The number of items currently in the heap
   attr_reader :size
-  Pair = Struct.new(:priority, :item)
+  # An (item, priority) pair
+  InternalPair = Struct.new(:item, :priority)
+  private_constant :InternalPair
   # @param max_heap when truthy, make a max-heap rather than a min-heap
-  # @param debug when truthy, verify the heap property after each update than might violate it. This makes operations much slower.
-  def initialize(max_heap: false, debug: false)
+  # @param addressable when truthy, the heap is _addressable_. This means that
+  #   - item priorities are updatable with +update(item, p)+, and
+  #   - items added to the heap must be distinct.
+  #   When falsy, priorities are not updatable but items may be inserted multiple times. Operations are slightly faster because
+  #   there is less internal bookkeeping.
+  # @param debug when truthy, verify the heap property after each change that might violate it. This makes operations much slower.
+  def initialize(max_heap: false, addressable: true, debug: false)
     @data = []
     @size = 0
     @max_heap = max_heap
-    @index_of = {}
+    @addressable = addressable
+    @index_of = {} # used in addressable heaps
     @debug = debug
   end
@@ -70,26 +76,24 @@ class DataStructuresRMolinari::Heap
   end
   # Insert a new element into the heap with the given priority.
-  # @param value the item to be inserted. It is an error to insert an item that is already present in the heap, though we don't
-  #   check for this.
-  # @param priority the priority to use for new item. The values used as priorities ust be totally ordered via +<+ and, if +self+ is
-  #   a max-heap, must respond to negation +@-+ in the natural order-respecting way.
-  # @todo
-  #   - check for duplicate
+  # @param value the item to be inserted.
+  #   - If the heap is addressable (the default) it is an error to insert an item that is already present in the heap.
+  # @param priority the priority to use for new item. The values used as priorities must be totally ordered via +<=>+.
   def insert(value, priority)
-    priority *= -1 if @max_heap
+    raise DataError, "Heap already contains #{value}" if @addressable && contains?(value)
     @size += 1
-    d = Pair.new(priority, value)
+    d = InternalPair.new(value, priority)
     assign(d, @size)
     sift_up(@size)
   end
   # Return the top of the heap without removing it
-  # @return the value with minimal (maximal for max-heaps) priority. Strictly speaking, it returns the item at the root of the
-  #   binary tree; this element has minimal priority, but there may be other elements with the same priority.
+  # @return a value with minimal priority (maximal for max-heaps). Strictly speaking, it returns the item at the root of the
+  #   binary tree; this element has minimal priority, but there may be other elements with the same priority and they do not appear
+  #   at the top of the heap in any guaranteed order.
   def top
     raise 'Heap is empty!' unless @size.positive?
@@ -100,12 +104,11 @@ class DataStructuresRMolinari::Heap
   # @return (see #top)
   def pop
     result = top
-    @index_of.delete(result)
     assign(@data[@size], root)
     @data[@size] = nil
     @size -= 1
+    @index_of.delete(result) if @addressable
     sift_down(root) if @size.positive?
@@ -113,21 +116,20 @@ class DataStructuresRMolinari::Heap
   end
   # Update the priority of the given element and maintain the necessary heap properties.
+  #
   # @param element the item whose priority we are updating. It is an error to update the priority of an element not already in the
   #   heap
   # @param priority the new priority
-  #
-  # @todo
-  #   - check that the element is in the heap
   def update(element, priority)
-    priority *= -1 if @max_heap
+    raise LogicError, 'Cannot update priorities in a non-addressable heap' unless @addressable
+    raise DataError, "Cannot update priority for value #{element} not already in the heap" unless contains?(element)
     idx = @index_of[element]
     old = @data[idx].priority
     @data[idx].priority = priority
-    if priority > old
+    if less_than_priority?(old, priority)
       sift_down(idx)
-    elsif priority < old
+    elsif less_than_priority?(priority, old)
       sift_up(idx)
     end
@@ -141,7 +143,7 @@ class DataStructuresRMolinari::Heap
     x = @data[idx]
     while idx != root
       i = parent(idx)
-      break unless x.priority < @data[i].priority
+      break unless less_than?(x, @data[i])
       assign(@data[i], idx)
       idx = i
@@ -156,9 +158,9 @@ class DataStructuresRMolinari::Heap
     x = @data[idx]
     while (j = left(idx)) <= @size
-      j += 1 if j + 1 <= @size && @data[j + 1].priority < @data[j].priority
+      j += 1 if j + 1 <= @size && less_than?(@data[j + 1], @data[j])
-      break unless @data[j].priority < x.priority
+      break unless less_than?(@data[j], x)
       assign(@data[j], idx)
       idx = j
@@ -171,7 +173,27 @@ class DataStructuresRMolinari::Heap
   # Put the pair in the given heap location
   private def assign(pair, idx)
     @data[idx] = pair
-    @index_of[pair.item] = idx
+    @index_of[pair.item] = idx if @addressable
+  end
+  # Compare the priorities of two items with <=> and return truthy exactly when the result is -1.
+  #
+  # If this is a max-heap return truthy exactly when the result of <=> is 1.
+  #
+  # The arguments must be heap entries that respond to +priority+; use +less_than_priority?+ to compare raw priorities directly.
+  private def less_than?(p1, p2)
+    less_than_priority?(p1.priority, p2.priority)
+  end
+  # Direct comparison of priorities
+  private def less_than_priority?(priority1, priority2)
+    return (priority1 <=> priority2) == 1 if @max_heap
+    (priority1 <=> priority2) == -1
+  end
+  private def contains?(item)
+    !!@index_of[item]
   end
   # For debugging
@@ -180,8 +202,8 @@ class DataStructuresRMolinari::Heap
       left = left(idx)
       right = right(idx)
-      raise "Heap property violated by left child of index #{idx}" if left <= @size && @data[idx].priority >= @data[left].priority
-      raise "Heap property violated by right child of index #{idx}" if right <= @size && @data[idx].priority >= @data[right].priority
+      raise InternalLogicError, "Heap property violated by left child of index #{idx}" if left <= @size && less_than?(@data[left], @data[idx])
+      raise InternalLogicError, "Heap property violated by right child of index #{idx}" if right <= @size && less_than?(@data[right], @data[idx])
     end
   end
 end