data_structures_rmolinari 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3765b8df91fcc62eb885e32ff5ad4b0b4678bba6f322cb5c8282657052aed8c6
4
- data.tar.gz: 845bea3649dc51dab697927132c0fc2f62dcacf5c25e1c717b99e57819b52286
3
+ metadata.gz: 87a44faaaa62f555546867230df704981671491f040f6be29eeed7db7eb22a0a
4
+ data.tar.gz: 0a0f1f6cf22bdde5d0510a818af9d8a6dbdbf11a6e69ce2e178bf6f336bb3d92
5
5
  SHA512:
6
- metadata.gz: 23687561ec6ddb12369ca5e75db33ffd710295097cd0c91b72fb278fea3b11b23152867bb4aea6a4fd17b2f95184fb5433c9ff009db92a5c4bab78686ae472de
7
- data.tar.gz: d930a674f85aa0a57030ed59f2b39979c7dbb8d8f74e0a9718d44112c1efe05657e0d55773b623c3724f040054e2fedf453de30b72d5631404642081455406f9
6
+ metadata.gz: 990fc38cbc64c20290317bf2858ff6f2813f832d0046f249faea32c7f88f389e8c8c2db892f8288a0747aa9446181864a3e62435e4846a230411b6afa4b75faf
7
+ data.tar.gz: f1e641b03d30c4726268c1c8da6d6364f635251152230f89aba2b551f0355d37ce843dba8e631c2fbd4a20e87ae94c78cf30b46dc3d472f1a1b55add258de32a
data/CHANGELOG.md ADDED
@@ -0,0 +1,33 @@
1
+ # Changelog
2
+
3
+ ## [Unreleased]
4
+
5
+ ### Changed
6
+
7
+ - MaxPrioritySearchTree
8
+ - Duplicate y values are now allowed. Ties are broken with a preference for smaller values of x.
9
+ - Method names have changed
10
+ - Instead of "highest", "leftmost", "rightmost" we use "largest_y", "smallest_x", "largest_x"
11
+ - For example, +highest_ne+ is now +largest_y_in_ne+
12
+ - DisjointUnion
13
+ - the size argument to initializer is optional. The default value is 0.
14
+ - elements can be added to the "universe" of known values with +make_set+
15
+
16
+ ### Removed
17
+ - MinmaxPrioritySearchTree is no longer available
18
+ - it was only a partial implementation anyway
19
+
20
+ ## [0.3.0] 2023-01-06
21
+
22
+ ### Added
23
+
24
+ - Start this file
25
+ - `Heap` can be constructed as "non-addressable"
26
+ - `update` is not possible but duplicates can be inserted and overall performance is a little better.
27
+
28
+ ### Changed
29
+
30
+ - `LogicError` gets a subclassed `InternalLogicError` for issues inside the library.
31
+ - `Shared::Pair` becomes `Shared::Point`
32
+ - this doesn't change the API of `MaxPrioritySearchTree` because of ducktyping. But client code (of which there is none) might be
33
+ using the `Pair` name.
@@ -4,40 +4,54 @@
4
4
  # The data structure provides efficient actions to merge two disjoint subsets, i.e., replace them by their union, and determine if
5
5
  # two elements are in the same subset.
6
6
  #
7
- # The elements of the set must be 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
7
+ # The elements of the set are 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
8
8
  # representatives.
9
9
  #
10
10
  # See https://en.wikipedia.org/wiki/Disjoint-set_data_structure for a good introduction.
11
11
  #
12
12
  # The code uses several ideas from Tarjan and van Leeuwen for efficiency. We use "union by rank" in +unite+ and path-halving in
13
- # +find+. Together, these make the amortized cost for each of n such operations effectively constant.
13
+ # +find+. Together, these make the amortized cost of each operation effectively constant.
14
14
  #
15
- # - Tarjan, Robert E., van Leeuwen, Jan (1984). "Worst-case analysis of set union algorithms". Journal of the ACM. 31 (2): 245–281.
15
+ # - Tarjan, Robert E., van Leeuwen, Jan (1984). _Worst-case analysis of set union algorithms_. Journal of the ACM. 31 (2): 245–281.
16
16
  #
17
17
  # @todo
18
18
  # - allow caller to expand the size of the universe. This operation is called "make set".
19
19
  # - All we need to do is increase the size of @d, set the parent pointers, define the new ranks (zero), and update @size.
20
20
  class DataStructuresRMolinari::DisjointUnion
21
+ include Shared
22
+
21
23
  # The number of subsets in the partition.
22
24
  attr_reader :subset_count
23
25
 
24
- # @param size the size of the universe, which must be known at the time of construction. The elements 0, 1, ..., size - 1 start
25
- # out in disjoint singleton subsets.
26
- def initialize(size)
27
- @size = size
26
+ # @param initial_size the initial size of the universe. The elements 0, 1, ..., initial_size - 1 start out in disjoint singleton
27
+ # subsets.
28
+ def initialize(initial_size = 0)
28
29
  # Initialize to
29
- @d = (0...size).to_a
30
- @rank = [0] * size
30
+ @d = (0...initial_size).to_a
31
+ @rank = [0] * initial_size
32
+
33
+ @subset_count = initial_size
34
+ end
35
+
36
+ # Add a new subset to the universe containing the element +new_v+
37
+ # @param new_v the new element, starting in its own singleton subset
38
+ # - it must be a non-negative integer, not already part of the universe of elements.
39
+ def make_set(new_v)
40
+ raise DataError, "Element #{new_v} must be a non-negative integer" unless new_v.is_a?(Integer) && !new_v.negative?
41
+ raise DataError, "Element #{new_v} is already present" if @d[new_v]
31
42
 
32
- @subset_count = size
43
+ @d[new_v] = new_v
44
+ @rank[new_v] = 0
45
+ @subset_count += 1
33
46
  end
34
47
 
35
48
  # Declare that e and f are equivalent, i.e., in the same subset. If they are already in the same subset this is a no-op.
36
49
  #
37
- # Each argument must be one of 0, 1, ..., size-1.
50
+ # Each argument must be in the universe of elements
38
51
  def unite(e, f)
39
52
  check_value(e)
40
53
  check_value(f)
54
+
41
55
  raise 'Uniting an element with itself is meaningless' if e == f
42
56
 
43
57
  e_root = find(e)
@@ -50,9 +64,11 @@ class DataStructuresRMolinari::DisjointUnion
50
64
 
51
65
  # The canonical representative of the subset containing e. Two elements d and e are in the same subset exactly when find(d) ==
52
66
  # find(e).
53
- # @param e must be one of 0, 1, ..., size-1.
54
- # @return (Integer) one of 0, 1, ..., size-1.
67
+ # @param e must be in the universe of elements
68
+ # @return (Integer) one of the universe of elements
55
69
  def find(e)
70
+ check_value(e)
71
+
56
72
  # We implement find with "halving" to shrink the length of paths to the root. See Tarjan and van Leeuwen p 252.
57
73
  x = e
58
74
  x = @d[x] = @d[@d[x]] while @d[@d[x]] != @d[x]
@@ -60,7 +76,7 @@ class DataStructuresRMolinari::DisjointUnion
60
76
  end
61
77
 
62
78
  private def check_value(v)
63
- raise "Value must be given and be in (0..#{@size - 1})" unless v && v.between?(0, @size - 1)
79
+ raise Shared::DataError, "Value #{v} is not part of the univserse." unless @d[v]
64
80
  end
65
81
 
66
82
  private def link(e, f)
@@ -1,7 +1,7 @@
1
1
  require_relative 'shared'
2
2
 
3
- # A Segment Tree, which can be used for various interval-related purposes, like efficiently finding the sum (or min or max) on a
4
- # arbitrary subarray of a given array.
3
+ # The template of Segment Tree, which can be used for various interval-related purposes, like efficiently finding the sum (or min or
4
+ # max) on an arbitrary subarray of a given array.
5
5
  #
6
6
  # There is an excellent description of the data structure at https://cp-algorithms.com/data_structures/segment_tree.html. The
7
7
  # Wikipedia article (https://en.wikipedia.org/wiki/Segment_tree) appears to describe a different data structure which is sometimes
@@ -13,10 +13,10 @@ require_relative 'shared'
13
13
  # Ruby.
14
14
  #
15
15
  # This is a generic implementation, intended to allow easy configuration for concrete instances. See the parameters to the
16
- # initializer and the defintiaons concrete realisations like MaxValSegmentTree.
16
+ # initializer and the definitions of concrete realisations like MaxValSegmentTree.
17
17
  #
18
18
  # We do O(n) work to build the internal data structure at initialization. Then we answer queries in O(log n) time.
19
- class DataStructuresRMolinari::GenericSegmentTree
19
+ class DataStructuresRMolinari::SegmentTreeTemplate
20
20
  include Shared::BinaryTreeArithmetic
21
21
 
22
22
  # Construct a concrete instance of a Segment Tree. See details at the links above for the underlying concepts here.
@@ -24,14 +24,14 @@ class DataStructuresRMolinari::GenericSegmentTree
24
24
  # - For example, if we are calculating sums over subintervals, combine.call(a, b) = a + b, while if we are doing maxima we will
25
25
  # return max(a, b).
26
26
  # - Things get more complicated when we are calculating, say, the _index_ of the maximal value in a subinterval. Now it is not
27
- # enough simple to store that index at each tree node, because to combine the indices from two child nodes we need to know
27
+ # enough simply to store that index at each tree node, because to combine the indices from two child nodes we need to know
28
28
  # both the index of the maximal element in each child node's interval, but also the maximal values themselves, so we know
29
29
  # which one "wins" for the parent node. This affects the sort of work we need to do when combining and the value provided by
30
30
  # the +single_cell_array_val+ lambda.
31
31
  # @param single_cell_array_val a lambda that takes an index i and returns the value we need to store in the #build
32
32
  # operation for the subinterval i..i.
33
- # - This is often simply be the value data[i], but in some cases it will be something else. For example, when we are
34
- # calculating the index of the maximal value on each subinterval we will retern the pair [i, data[i]] here.
33
+ # - This will often simply be the value data[i], but in some cases it will be something else. For example, when we are
34
+ # calculating the index of the maximal value on each subinterval we need [i, data[i]] here.
35
35
  # - If +update_at+ is called later, this lambda must close over the underlying data in a way that captures the updated value.
36
36
  # @param size the size of the underlying data array, used in certain internal arithmetic.
37
37
  # @param identity the value to return when we are querying on an empty interval
@@ -96,7 +96,7 @@ class DataStructuresRMolinari::GenericSegmentTree
96
96
  private def update_val_at(idx, tree_idx, tree_l, tree_r)
97
97
  if tree_l == tree_r
98
98
  # We have found the spot!
99
- raise LogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
99
+ raise InternalLogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
100
100
 
101
101
  @tree[tree_idx] = @single_cell_array_val.call(tree_l)
102
102
  else
@@ -13,8 +13,8 @@ require_relative 'shared'
13
13
  # - +empty?+
14
14
  # - is the heap empty?
15
15
  # - O(1)
16
- # - +insert+
17
- # - add a new element to the heap with an associated priority
16
+ # - +insert(item, priority)+
17
+ # - add a new item to the heap with an associated priority
18
18
  # - O(log N)
19
19
  # - +top+
20
20
  # - return the lowest-priority element, which is the element at the root of the tree. In a max-heap this is the highest-priority
@@ -23,12 +23,18 @@ require_relative 'shared'
23
23
  # - +pop+
24
24
  # - removes and returns the item that would be returned by +top+
25
25
  # - O(log N)
26
- # - +update+
26
+ # - +update(item, priority)+
27
27
  # - tell the heap that the priority of a particular item has changed
28
28
  # - O(log N)
29
29
  #
30
30
  # Here N is the number of elements in the heap.
31
31
  #
32
+ # The internal requirements needed to implement +update+ have several consequences.
33
+ # - Items added to the heap must be distinct. Otherwise we would not know which occurrence to update
34
+ # - There is some bookkeeping overhead.
35
+ # If client code doesn't need to call +update+ then we can create a "non-addressable" heap that allows for the insertion of
36
+ # duplicate items and has slightly faster runtime overall. See the arguments to the initializer.
37
+ #
32
38
  # References:
33
39
  #
34
40
  # - https://en.wikipedia.org/wiki/Binary_heap
@@ -36,31 +42,31 @@ require_relative 'shared'
36
42
  # DOI 10.1007/s00224-017-9760-2
37
43
  #
38
44
  # @todo
39
- # - allow for priorities comparable only via +<=>+, like arrays
40
- # - this requires different handling for max-heaps, as we can't just negate the priorities and use min-heap logic
41
- # - relax the requirement that priorities must be comparable vai +<+ and respond to negation. Instead, allow comparison via +<=>+
42
- # and handle max-heaps differently.
43
- # - this will allow priorities to be arrays for tie-breakers and similar.
44
- # - offer a non-addressable version that doesn't support +update+
45
- # - configure through the initializer
46
- # - other operations will be a little quicker, and we can add the same item more than once. The paper by Chen et al. referenced
47
- # in the Wikipedia article for Pairing Heaps suggests that using such a priority queue for Dijkstra's algorithm and inserting
48
- # multiple copies of a key rather than updating its priority is faster in practice than other approaches that have better
49
- # theoretical performance.
45
+ # - let caller see the priority of the top element. Maybe this is useful sometimes.
50
46
  class DataStructuresRMolinari::Heap
47
+ include Shared
51
48
  include Shared::BinaryTreeArithmetic
52
49
 
50
+ # The number of items currently in the heap
53
51
  attr_reader :size
54
52
 
55
- Pair = Struct.new(:priority, :item)
53
+ # An (item, priority) pair
54
+ InternalPair = Struct.new(:item, :priority)
55
+ private_constant :InternalPair
56
56
 
57
57
  # @param max_heap when truthy, make a max-heap rather than a min-heap
58
- # @param debug when truthy, verify the heap property after each update than might violate it. This makes operations much slower.
59
- def initialize(max_heap: false, debug: false)
58
+ # @param addressable when truthy, the heap is _addressable_. This means that
59
+ # - item priorities are updatable with +update(item, p)+, and
60
+ # - items added to the heap must be distinct.
61
+ # When falsy, priorities are not updatable but items may be inserted multiple times. Operations are slightly faster because
62
+ # there is less internal bookkeeping.
63
+ # @param debug when truthy, verify the heap property after each change that might violate it. This makes operations much slower.
64
+ def initialize(max_heap: false, addressable: true, debug: false)
60
65
  @data = []
61
66
  @size = 0
62
67
  @max_heap = max_heap
63
- @index_of = {}
68
+ @addressable = addressable
69
+ @index_of = {} # used in addressable heaps
64
70
  @debug = debug
65
71
  end
66
72
 
@@ -70,26 +76,24 @@ class DataStructuresRMolinari::Heap
70
76
  end
71
77
 
72
78
  # Insert a new element into the heap with the given priority.
73
- # @param value the item to be inserted. It is an error to insert an item that is already present in the heap, though we don't
74
- # check for this.
75
- # @param priority the priority to use for new item. The values used as priorities ust be totally ordered via +<+ and, if +self+ is
76
- # a max-heap, must respond to negation +@-+ in the natural order-respecting way.
77
- # @todo
78
- # - check for duplicate
79
+ # @param value the item to be inserted.
80
+ # - If the heap is addressable (the default) it is an error to insert an item that is already present in the heap.
81
+ # @param priority the priority to use for new item. The values used as priorities must be totally ordered via +<=>+.
79
82
  def insert(value, priority)
80
- priority *= -1 if @max_heap
83
+ raise DataError, "Heap already contains #{value}" if @addressable && contains?(value)
81
84
 
82
85
  @size += 1
83
86
 
84
- d = Pair.new(priority, value)
87
+ d = InternalPair.new(value, priority)
85
88
  assign(d, @size)
86
89
 
87
90
  sift_up(@size)
88
91
  end
89
92
 
90
93
  # Return the top of the heap without removing it
91
- # @return the value with minimal (maximal for max-heaps) priority. Strictly speaking, it returns the item at the root of the
92
- # binary tree; this element has minimal priority, but there may be other elements with the same priority.
94
+ # @return a value with minimal priority (maximal for max-heaps). Strictly speaking, it returns the item at the root of the
95
+ # binary tree; this element has minimal priority, but there may be other elements with the same priority and they do not appear
96
+ # at the top of the heap in any guaranteed order.
93
97
  def top
94
98
  raise 'Heap is empty!' unless @size.positive?
95
99
 
@@ -100,12 +104,11 @@ class DataStructuresRMolinari::Heap
100
104
  # @return (see #top)
101
105
  def pop
102
106
  result = top
103
- @index_of.delete(result)
104
-
105
107
  assign(@data[@size], root)
106
108
 
107
109
  @data[@size] = nil
108
110
  @size -= 1
111
+ @index_of.delete(result) if @addressable
109
112
 
110
113
  sift_down(root) if @size.positive?
111
114
 
@@ -113,21 +116,20 @@ class DataStructuresRMolinari::Heap
113
116
  end
114
117
 
115
118
  # Update the priority of the given element and maintain the necessary heap properties.
119
+ #
116
120
  # @param element the item whose priority we are updating. It is an error to update the priority of an element not already in the
117
121
  # heap
118
122
  # @param priority the new priority
119
- #
120
- # @todo
121
- # - check that the element is in the heap
122
123
  def update(element, priority)
123
- priority *= -1 if @max_heap
124
+ raise LogicError, 'Cannot update priorities in a non-addressable heap' unless @addressable
125
+ raise DataError, "Cannot update priority for value #{element} not already in the heap" unless contains?(element)
124
126
 
125
127
  idx = @index_of[element]
126
128
  old = @data[idx].priority
127
129
  @data[idx].priority = priority
128
- if priority > old
130
+ if less_than_priority?(old, priority)
129
131
  sift_down(idx)
130
- elsif priority < old
132
+ elsif less_than_priority?(priority, old)
131
133
  sift_up(idx)
132
134
  end
133
135
 
@@ -141,7 +143,7 @@ class DataStructuresRMolinari::Heap
141
143
  x = @data[idx]
142
144
  while idx != root
143
145
  i = parent(idx)
144
- break unless x.priority < @data[i].priority
146
+ break unless less_than?(x, @data[i])
145
147
 
146
148
  assign(@data[i], idx)
147
149
  idx = i
@@ -156,9 +158,9 @@ class DataStructuresRMolinari::Heap
156
158
  x = @data[idx]
157
159
 
158
160
  while (j = left(idx)) <= @size
159
- j += 1 if j + 1 <= @size && @data[j + 1].priority < @data[j].priority
161
+ j += 1 if j + 1 <= @size && less_than?(@data[j + 1], @data[j])
160
162
 
161
- break unless @data[j].priority < x.priority
163
+ break unless less_than?(@data[j], x)
162
164
 
163
165
  assign(@data[j], idx)
164
166
  idx = j
@@ -171,7 +173,27 @@ class DataStructuresRMolinari::Heap
171
173
  # Put the pair in the given heap location
172
174
  private def assign(pair, idx)
173
175
  @data[idx] = pair
174
- @index_of[pair.item] = idx
176
+ @index_of[pair.item] = idx if @addressable
177
+ end
178
+
179
+ # Compare the priorities of two items with <=> and return truthy exactly when the result is -1.
180
+ #
181
+ # If this is a max-heap return truthy exactly when the result of <=> is 1.
182
+ #
183
+ # The arguments must respond to +priority+; to compare the priorities themselves use +less_than_priority?+.
184
+ private def less_than?(p1, p2)
185
+ less_than_priority?(p1.priority, p2.priority)
186
+ end
187
+
188
+ # Direct comparison of priorities
189
+ private def less_than_priority?(priority1, priority2)
190
+ return (priority1 <=> priority2) == 1 if @max_heap
191
+
192
+ (priority1 <=> priority2) == -1
193
+ end
194
+
195
+ private def contains?(item)
196
+ !!@index_of[item]
175
197
  end
176
198
 
177
199
  # For debugging
@@ -180,8 +202,8 @@ class DataStructuresRMolinari::Heap
180
202
  left = left(idx)
181
203
  right = right(idx)
182
204
 
183
- raise "Heap property violated by left child of index #{idx}" if left <= @size && @data[idx].priority >= @data[left].priority
184
- raise "Heap property violated by right child of index #{idx}" if right <= @size && @data[idx].priority >= @data[right].priority
205
+ raise InternalLogicError, "Heap property violated by left child of index #{idx}" if left <= @size && less_than?(@data[left], @data[idx])
206
+ raise InternalLogicError, "Heap property violated by right child of index #{idx}" if right <= @size && less_than?(@data[right], @data[idx])
185
207
  end
186
208
  end
187
209
  end