RubyGems - data_structures_rmolinari - Versions diffs - 0.2.2 → 0.4.0 - Mend

data_structures_rmolinari 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +33 -0
data/lib/data_structures_rmolinari/disjoint_union.rb +30 -14
data/lib/data_structures_rmolinari/generic_segment_tree.rb +8 -8
data/lib/data_structures_rmolinari/heap.rb +64 -42
data/lib/data_structures_rmolinari/max_priority_search_tree.rb +70 -119
data/lib/data_structures_rmolinari/shared.rb +9 -1
data/lib/data_structures_rmolinari.rb +17 -14
metadata +3 -3
data/lib/data_structures_rmolinari/minmax_priority_search_tree.rb +0 -670

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3765b8df91fcc62eb885e32ff5ad4b0b4678bba6f322cb5c8282657052aed8c6
-  data.tar.gz: 845bea3649dc51dab697927132c0fc2f62dcacf5c25e1c717b99e57819b52286
+  metadata.gz: 87a44faaaa62f555546867230df704981671491f040f6be29eeed7db7eb22a0a
+  data.tar.gz: 0a0f1f6cf22bdde5d0510a818af9d8a6dbdbf11a6e69ce2e178bf6f336bb3d92
 SHA512:
-  metadata.gz: 23687561ec6ddb12369ca5e75db33ffd710295097cd0c91b72fb278fea3b11b23152867bb4aea6a4fd17b2f95184fb5433c9ff009db92a5c4bab78686ae472de
-  data.tar.gz: d930a674f85aa0a57030ed59f2b39979c7dbb8d8f74e0a9718d44112c1efe05657e0d55773b623c3724f040054e2fedf453de30b72d5631404642081455406f9
+  metadata.gz: 990fc38cbc64c20290317bf2858ff6f2813f832d0046f249faea32c7f88f389e8c8c2db892f8288a0747aa9446181864a3e62435e4846a230411b6afa4b75faf
+  data.tar.gz: f1e641b03d30c4726268c1c8da6d6364f635251152230f89aba2b551f0355d37ce843dba8e631c2fbd4a20e87ae94c78cf30b46dc3d472f1a1b55add258de32a

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,33 @@
+# Changelog
+## [Unreleased]
+### Changed
+- MaxPrioritySearchTree
+  - Duplicate y values are now allowed. Ties are broken with a preference for smaller values of x.
+  - Method names have changed
+    - Instead of "highest", "leftmost", "rightmost" we use "largest_y", "smallest_x", "largest_x"
+    - For example, +highest_ne+ is now +largest_y_in_ne+
+- DisjointUnion
+  - the size argument to the initializer is now optional. The default value is 0.
+  - elements can be added to the "universe" of known values with +make_set+
+### Removed
+- MinmaxPrioritySearchTree is no longer available
+  - it was only a partial implementation anyway
+## [0.3.0] 2023-01-06
+### Added
+- Start this file
+- `Heap` can be constructed as "non-addressable"
+  - `update` is not possible but duplicates can be inserted and overall performance is a little better.
+### Changed
+- `LogicError` gets a subclassed `InternalLogicError` for issues inside the library.
+- `Shared::Pair` becomes `Shared::Point`
+  - this doesn't change the API of `MaxPrioritySearchTree` because of ducktyping. But client code (of which there is none) might be
+    using the `Pair` name.

data/lib/data_structures_rmolinari/disjoint_union.rb CHANGED Viewed

@@ -4,40 +4,54 @@
 # The data structure provides efficient actions to merge two disjoint subsets, i.e., replace them by their union, and determine if
 # two elements are in the same subset.
 #
-# The elements of the set must be 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
+# The elements of the set are 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
 # representatives.
 #
 # See https://en.wikipedia.org/wiki/Disjoint-set_data_structure for a good introduction.
 #
 # The code uses several ideas from Tarjan and van Leeuwen for efficiency. We use "union by rank" in +unite+ and path-halving in
-# +find+. Together, these make the amortized cost for each of n such operations effectively constant.
+# +find+. Together, these make the amortized cost of each operation effectively constant.
 #
-# - Tarjan, Robert E., van Leeuwen, Jan (1984). "Worst-case analysis of set union algorithms". Journal of the ACM. 31 (2): 245–281.
+# - Tarjan, Robert E., van Leeuwen, Jan (1984). _Worst-case analysis of set union algorithms_. Journal of the ACM. 31 (2): 245–281.
 #
 # @todo
 #   - allow caller to expand the size of the universe. This operation is called "make set".
 #     - All we need to do is increase the size of @d, set the parent pointers, define the new ranks (zero), and update @size.
 class DataStructuresRMolinari::DisjointUnion
+  include Shared
   # The number of subsets in the partition.
   attr_reader :subset_count
-  # @param size the size of the universe, which must be known at the time of construction. The elements 0, 1, ..., size - 1 start
-  #   out in disjoint singleton subsets.
-  def initialize(size)
-    @size = size
+  # @param initial_size the initial size of the universe. The elements 0, 1, ..., initial_size - 1 start out in disjoint singleton
+  # subsets.
+  def initialize(initial_size = 0)
     # Initialize to
-    @d = (0...size).to_a
-    @rank = [0] * size
+    @d = (0...initial_size).to_a
+    @rank = [0] * initial_size
+    @subset_count = initial_size
+  end
+  # Add a new subset to the universe containing the element +new_v+
+  # @param new_v the new element, starting in its own singleton subset
+  #   - it must be a non-negative integer, not already part of the universe of elements.
+  def make_set(new_v)
+    raise DataError, "Element #{new_v} must be a non-negative integer" unless new_v.is_a?(Integer) && !new_v.negative?
+    raise DataError, "Element #{new_v} is already present" if @d[new_v]
-    @subset_count = size
+    @d[new_v] = new_v
+    @rank[new_v] = 0
+    @subset_count += 1
   end
   # Declare that e and f are equivalent, i.e., in the same subset. If they are already in the same subset this is a no-op.
   #
-  # Each argument must be one of 0, 1, ..., size-1.
+  # Each argument must be in the universe of elements
   def unite(e, f)
     check_value(e)
     check_value(f)
     raise 'Uniting an element with itself is meaningless' if e == f
     e_root = find(e)
@@ -50,9 +64,11 @@ class DataStructuresRMolinari::DisjointUnion
   # The canonical representative of the subset containing e. Two elements d and e are in the same subset exactly when find(d) ==
   # find(e).
-  # @param e must be one of 0, 1, ..., size-1.
-  # @return (Integer) one of 0, 1, ..., size-1.
+  # @param e must be in the universe of elements
+  # @return (Integer) an element of the universe
   def find(e)
+    check_value(e)
     # We implement find with "halving" to shrink the length of paths to the root. See Tarjan and van Leeuwen p 252.
     x = e
     x = @d[x] = @d[@d[x]] while @d[@d[x]] != @d[x]
@@ -60,7 +76,7 @@ class DataStructuresRMolinari::DisjointUnion
   end
   private def check_value(v)
-    raise "Value must be given and be in (0..#{@size - 1})" unless v && v.between?(0, @size - 1)
+    raise Shared::DataError, "Value #{v} is not part of the univserse." unless @d[v]
   end
   private def link(e, f)

data/lib/data_structures_rmolinari/generic_segment_tree.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require_relative 'shared'
-# A Segment Tree, which can be used for various interval-related purposes, like efficiently finding the sum (or min or max) on a
-# arbitrary subarray of a given array.
+# A template for Segment Trees, which can be used for various interval-related purposes, like efficiently finding the sum (or min
+# or max) on an arbitrary subarray of a given array.
 #
 # There is an excellent description of the data structure at https://cp-algorithms.com/data_structures/segment_tree.html. The
 # Wikipedia article (https://en.wikipedia.org/wiki/Segment_tree) appears to describe a different data structure which is sometimes
@@ -13,10 +13,10 @@ require_relative 'shared'
 # Ruby.
 #
 # This is a generic implementation, intended to allow easy configuration for concrete instances. See the parameters to the
-# initializer and the defintiaons concrete realisations like MaxValSegmentTree.
+# initializer and the definitions of concrete realisations like MaxValSegmentTree.
 #
 # We do O(n) work to build the internal data structure at initialization. Then we answer queries in O(log n) time.
-class DataStructuresRMolinari::GenericSegmentTree
+class DataStructuresRMolinari::SegmentTreeTemplate
   include Shared::BinaryTreeArithmetic
   # Construct a concrete instance of a Segment Tree. See details at the links above for the underlying concepts here.
@@ -24,14 +24,14 @@ class DataStructuresRMolinari::GenericSegmentTree
   #   - For example, if we are calculating sums over subintervals, combine.call(a, b) = a + b, while if we are doing maxima we will
   #     return max(a, b).
   #   - Things get more complicated when we are calculating, say, the _index_ of the maximal value in a subinterval. Now it is not
-  #     enough simple to store that index at each tree node, because to combine the indices from two child nodes we need to know
+  #     enough simply to store that index at each tree node, because to combine the indices from two child nodes we need to know
   #     both the index of the maximal element in each child node's interval, but also the maximal values themselves, so we know
   #     which one "wins" for the parent node. This affects the sort of work we need to do when combining and the value provided by
   #     the +single_cell_array_val+ lambda.
   # @param single_cell_array_val a lambda that takes an index i and returns the value we need to store in the #build
   #     operation for the subinterval i..i.
-  #     - This is often simply be the value data[i], but in some cases it will be something else. For example, when we are
-  #       calculating the index of the maximal value on each subinterval we will retern the pair [i, data[i]] here.
+  #     - This will often simply be the value data[i], but in some cases it will be something else. For example, when we are
+  #       calculating the index of the maximal value on each subinterval we need [i, data[i]] here.
   #     - If +update_at+ is called later, this lambda must close over the underlying data in a way that captures the updated value.
   # @param size the size of the underlying data array, used in certain internal arithmetic.
   # @param identity the value to return when we are querying on an empty interval
@@ -96,7 +96,7 @@ class DataStructuresRMolinari::GenericSegmentTree
   private def update_val_at(idx, tree_idx, tree_l, tree_r)
     if tree_l == tree_r
       # We have found the spot!
-      raise LogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
+      raise InternalLogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
       @tree[tree_idx] = @single_cell_array_val.call(tree_l)
     else

data/lib/data_structures_rmolinari/heap.rb CHANGED Viewed

@@ -13,8 +13,8 @@ require_relative 'shared'
 # - +empty?+
 #   - is the heap empty?
 #   - O(1)
-# - +insert+
-#   - add a new element to the heap with an associated priority
+# - +insert(item, priority)+
+#   - add a new item to the heap with an associated priority
 #   - O(log N)
 # - +top+
 #   - return the lowest-priority element, which is the element at the root of the tree. In a max-heap this is the highest-priority
@@ -23,12 +23,18 @@ require_relative 'shared'
 # - +pop+
 #   - removes and returns the item that would be returned by +top+
 #   - O(log N)
-# - +update+
+# - +update(item, priority)+
 #   - tell the heap that the priority of a particular item has changed
 #   - O(log N)
 #
 # Here N is the number of elements in the heap.
 #
+# The internal requirements needed to implement +update+ have several consequences.
+# - Items added to the heap must be distinct. Otherwise we would not know which occurrence to update
+# - There is some bookkeeping overhead.
+# If client code doesn't need to call +update+ then we can create a "non-addressable" heap that allows for the insertion of
+# duplicate items and has slightly faster runtime overall. See the arguments to the initializer.
+#
 # References:
 #
 # - https://en.wikipedia.org/wiki/Binary_heap
@@ -36,31 +42,31 @@ require_relative 'shared'
 #   DOI 10.1007/s00224-017-9760-2
 #
 # @todo
-#   - allow for priorities comparable only via +<=>+, like arrays
-#     - this requires different handling for max-heaps, as we can't just negate the priorities and use min-heap logic
-#   - relax the requirement that priorities must be comparable vai +<+ and respond to negation. Instead, allow comparison via +<=>+
-#     and handle max-heaps differently.
-#     - this will allow priorities to be arrays for tie-breakers and similar.
-#   - offer a non-addressable version that doesn't support +update+
-#     - configure through the initializer
-#     - other operations will be a little quicker, and we can add the same item more than once. The paper by Chen et al. referenced
-#       in the Wikipedia article for Pairing Heaps suggests that using such a priority queue for Dijkstra's algorithm and inserting
-#       multiple copies of a key rather than updating its priority is faster in practice than other approaches that have better
-#       theoretical performance.
+#   - let caller see the priority of the top element. Maybe this is useful sometimes.
 class DataStructuresRMolinari::Heap
+  include Shared
   include Shared::BinaryTreeArithmetic
+  # The number of items currently in the heap
   attr_reader :size
-  Pair = Struct.new(:priority, :item)
+  # An (item, priority) pair
+  InternalPair = Struct.new(:item, :priority)
+  private_constant :InternalPair
   # @param max_heap when truthy, make a max-heap rather than a min-heap
-  # @param debug when truthy, verify the heap property after each update than might violate it. This makes operations much slower.
-  def initialize(max_heap: false, debug: false)
+  # @param addressable when truthy, the heap is _addressable_. This means that
+  #   - item priorities are updatable with +update(item, p)+, and
+  #   - items added to the heap must be distinct.
+  #   When falsy, priorities are not updatable but items may be inserted multiple times. Operations are slightly faster because
+  #   there is less internal bookkeeping.
+  # @param debug when truthy, verify the heap property after each change that might violate it. This makes operations much slower.
+  def initialize(max_heap: false, addressable: true, debug: false)
     @data = []
     @size = 0
     @max_heap = max_heap
-    @index_of = {}
+    @addressable = addressable
+    @index_of = {} # used in addressable heaps
     @debug = debug
   end
@@ -70,26 +76,24 @@ class DataStructuresRMolinari::Heap
   end
   # Insert a new element into the heap with the given priority.
-  # @param value the item to be inserted. It is an error to insert an item that is already present in the heap, though we don't
-  #   check for this.
-  # @param priority the priority to use for new item. The values used as priorities ust be totally ordered via +<+ and, if +self+ is
-  #   a max-heap, must respond to negation +@-+ in the natural order-respecting way.
-  # @todo
-  #   - check for duplicate
+  # @param value the item to be inserted.
+  #   - If the heap is addressable (the default) it is an error to insert an item that is already present in the heap.
+  # @param priority the priority to use for the new item. The values used as priorities must be totally ordered via +<=>+.
   def insert(value, priority)
-    priority *= -1 if @max_heap
+    raise DataError, "Heap already contains #{value}" if @addressable && contains?(value)
     @size += 1
-    d = Pair.new(priority, value)
+    d = InternalPair.new(value, priority)
     assign(d, @size)
     sift_up(@size)
   end
   # Return the top of the heap without removing it
-  # @return the value with minimal (maximal for max-heaps) priority. Strictly speaking, it returns the item at the root of the
-  #   binary tree; this element has minimal priority, but there may be other elements with the same priority.
+  # @return a value with minimal priority (maximal for max-heaps). Strictly speaking, it returns the item at the root of the
+  #   binary tree; this element has minimal priority, but there may be other elements with the same priority and they do not appear
+  #   at the top of the heap in any guaranteed order.
   def top
     raise 'Heap is empty!' unless @size.positive?
@@ -100,12 +104,11 @@ class DataStructuresRMolinari::Heap
   # @return (see #top)
   def pop
     result = top
-    @index_of.delete(result)
     assign(@data[@size], root)
     @data[@size] = nil
     @size -= 1
+    @index_of.delete(result) if @addressable
     sift_down(root) if @size.positive?
@@ -113,21 +116,20 @@ class DataStructuresRMolinari::Heap
   end
   # Update the priority of the given element and maintain the necessary heap properties.
+  #
   # @param element the item whose priority we are updating. It is an error to update the priority of an element not already in the
   #   heap
   # @param priority the new priority
-  #
-  # @todo
-  #   - check that the element is in the heap
   def update(element, priority)
-    priority *= -1 if @max_heap
+    raise LogicError, 'Cannot update priorities in a non-addressable heap' unless @addressable
+    raise DataError, "Cannot update priority for value #{element} not already in the heap" unless contains?(element)
     idx = @index_of[element]
     old = @data[idx].priority
     @data[idx].priority = priority
-    if priority > old
+    if less_than_priority?(old, priority)
       sift_down(idx)
-    elsif priority < old
+    elsif less_than_priority?(priority, old)
       sift_up(idx)
     end
@@ -141,7 +143,7 @@ class DataStructuresRMolinari::Heap
     x = @data[idx]
     while idx != root
       i = parent(idx)
-      break unless x.priority < @data[i].priority
+      break unless less_than?(x, @data[i])
       assign(@data[i], idx)
       idx = i
@@ -156,9 +158,9 @@ class DataStructuresRMolinari::Heap
     x = @data[idx]
     while (j = left(idx)) <= @size
-      j += 1 if j + 1 <= @size && @data[j + 1].priority < @data[j].priority
+      j += 1 if j + 1 <= @size && less_than?(@data[j + 1], @data[j])
-      break unless @data[j].priority < x.priority
+      break unless less_than?(@data[j], x)
       assign(@data[j], idx)
       idx = j
@@ -171,7 +173,27 @@ class DataStructuresRMolinari::Heap
   # Put the pair in the given heap location
   private def assign(pair, idx)
     @data[idx] = pair
-    @index_of[pair.item] = idx
+    @index_of[pair.item] = idx if @addressable
+  end
+  # Compare the priorities of two internal (item, priority) pairs with <=> and return truthy exactly when the result is -1.
+  #
+  # If this is a max-heap return truthy exactly when the result of <=> is 1.
+  #
+  # The arguments must respond to +priority+. To compare raw priority values directly, use +less_than_priority?+.
+  private def less_than?(p1, p2)
+    less_than_priority?(p1.priority, p2.priority)
+  end
+  # Direct comparison of priorities
+  private def less_than_priority?(priority1, priority2)
+    return (priority1 <=> priority2) == 1 if @max_heap
+    (priority1 <=> priority2) == -1
+  end
+  private def contains?(item)
+    !!@index_of[item]
   end
   # For debugging
@@ -180,8 +202,8 @@ class DataStructuresRMolinari::Heap
       left = left(idx)
       right = right(idx)
-      raise "Heap property violated by left child of index #{idx}" if left <= @size && @data[idx].priority >= @data[left].priority
-      raise "Heap property violated by right child of index #{idx}" if right <= @size && @data[idx].priority >= @data[right].priority
+      raise InternalLogicError, "Heap property violated by left child of index #{idx}" if left <= @size && less_than?(@data[left], @data[idx])
+      raise InternalLogicError, "Heap property violated by right child of index #{idx}" if right <= @size && less_than?(@data[right], @data[idx])
     end
   end
 end