data_structures_rmolinari 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -0
- data/lib/data_structures_rmolinari/disjoint_union.rb +30 -14
- data/lib/data_structures_rmolinari/generic_segment_tree.rb +8 -8
- data/lib/data_structures_rmolinari/heap.rb +64 -42
- data/lib/data_structures_rmolinari/max_priority_search_tree.rb +70 -119
- data/lib/data_structures_rmolinari/shared.rb +9 -1
- data/lib/data_structures_rmolinari.rb +17 -14
- metadata +3 -3
- data/lib/data_structures_rmolinari/minmax_priority_search_tree.rb +0 -670
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87a44faaaa62f555546867230df704981671491f040f6be29eeed7db7eb22a0a
|
4
|
+
data.tar.gz: 0a0f1f6cf22bdde5d0510a818af9d8a6dbdbf11a6e69ce2e178bf6f336bb3d92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 990fc38cbc64c20290317bf2858ff6f2813f832d0046f249faea32c7f88f389e8c8c2db892f8288a0747aa9446181864a3e62435e4846a230411b6afa4b75faf
|
7
|
+
data.tar.gz: f1e641b03d30c4726268c1c8da6d6364f635251152230f89aba2b551f0355d37ce843dba8e631c2fbd4a20e87ae94c78cf30b46dc3d472f1a1b55add258de32a
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
## [Unreleased]
|
4
|
+
|
5
|
+
### Changed
|
6
|
+
|
7
|
+
- MaxPrioritySearchTree
|
8
|
+
- Duplicate y values are now allowed. Ties are broken with a preference for smaller values of x.
|
9
|
+
- Method names have changed
|
10
|
+
- Instead of "highest", "leftmost", "rightmost" we use "largest_y", "smallest_x", "largest_x"
|
11
|
+
- For example, +highest_ne+ is now +largest_y_in_ne+
|
12
|
+
- DisjointUnion
|
13
|
+
- the size argument to initializer is optional. The default value is 0.
|
14
|
+
- elements can be added to the "universe" of known values with +make_set+
|
15
|
+
|
16
|
+
### Removed
|
17
|
+
- MinmaxPrioritySearchTree is no longer available
|
18
|
+
- it was only a partial implementation anyway
|
19
|
+
|
20
|
+
## [0.3.0] 2023-01-06
|
21
|
+
|
22
|
+
### Added
|
23
|
+
|
24
|
+
- Start this file
|
25
|
+
- `Heap` can be constructed as "non-addressable"
|
26
|
+
- `update` is not possible but duplicates can be inserted and overall performance is a little better.
|
27
|
+
|
28
|
+
### Changed
|
29
|
+
|
30
|
+
- `LogicError` gets a subclassed `InternalLogicError` for issues inside the library.
|
31
|
+
- `Shared::Pair` becomes `Shared::Point`
|
32
|
+
- this doesn't change the API of `MaxPrioritySearchTree` because of ducktyping. But client code (of which there is none) might be
|
33
|
+
using the `Pair` name.
|
@@ -4,40 +4,54 @@
|
|
4
4
|
# The data structure provides efficient actions to merge two disjoint subsets, i.e., replace them by their union, and determine if
|
5
5
|
# two elements are in the same subset.
|
6
6
|
#
|
7
|
-
# The elements of the set
|
7
|
+
# The elements of the set are 0, 1, ..., n-1, where n is the size of the universe. Client code can map its data to these
|
8
8
|
# representatives.
|
9
9
|
#
|
10
10
|
# See https://en.wikipedia.org/wiki/Disjoint-set_data_structure for a good introduction.
|
11
11
|
#
|
12
12
|
# The code uses several ideas from Tarjan and van Leeuwen for efficiency. We use "union by rank" in +unite+ and path-halving in
|
13
|
-
# +find+. Together, these make the amortized cost
|
13
|
+
# +find+. Together, these make the amortized cost of each operation effectively constant.
|
14
14
|
#
|
15
|
-
# - Tarjan, Robert E., van Leeuwen, Jan (1984).
|
15
|
+
# - Tarjan, Robert E., van Leeuwen, Jan (1984). _Worst-case analysis of set union algorithms_. Journal of the ACM. 31 (2): 245–281.
|
16
16
|
#
|
17
17
|
# @todo
|
18
18
|
# - allow caller to expand the size of the universe. This operation is called "make set".
|
19
19
|
# - All we need to do is increase the size of @d, set the parent pointers, define the new ranks (zero), and update @size.
|
20
20
|
class DataStructuresRMolinari::DisjointUnion
|
21
|
+
include Shared
|
22
|
+
|
21
23
|
# The number of subsets in the partition.
|
22
24
|
attr_reader :subset_count
|
23
25
|
|
24
|
-
# @param
|
25
|
-
#
|
26
|
-
def initialize(
|
27
|
-
@size = size
|
26
|
+
# @param initial_size the initial size of the universe. The elements 0, 1, ..., initial_size - 1 start out in disjoint singleton
|
27
|
+
# subsets.
|
28
|
+
def initialize(initial_size = 0)
|
28
29
|
# Initialize to
|
29
|
-
@d = (0...
|
30
|
-
@rank = [0] *
|
30
|
+
@d = (0...initial_size).to_a
|
31
|
+
@rank = [0] * initial_size
|
32
|
+
|
33
|
+
@subset_count = initial_size
|
34
|
+
end
|
35
|
+
|
36
|
+
# Add a new subset to the universe containing the element +new_v+
|
37
|
+
# @param new_v the new element, starting in its own singleton subset
|
38
|
+
# - it must be a non-negative integer, not already part of the universe of elements.
|
39
|
+
def make_set(new_v)
|
40
|
+
raise DataError, "Element #{new_v} must be a non-negative integer" unless new_v.is_a?(Integer) && !new_v.negative?
|
41
|
+
raise DataError, "Element #{new_v} is already present" if @d[new_v]
|
31
42
|
|
32
|
-
@
|
43
|
+
@d[new_v] = new_v
|
44
|
+
@rank[new_v] = 0
|
45
|
+
@subset_count += 1
|
33
46
|
end
|
34
47
|
|
35
48
|
# Declare that e and f are equivalent, i.e., in the same subset. If they are already in the same subset this is a no-op.
|
36
49
|
#
|
37
|
-
# Each argument must be
|
50
|
+
# Each argument must be in the universe of elements
|
38
51
|
def unite(e, f)
|
39
52
|
check_value(e)
|
40
53
|
check_value(f)
|
54
|
+
|
41
55
|
raise 'Uniting an element with itself is meaningless' if e == f
|
42
56
|
|
43
57
|
e_root = find(e)
|
@@ -50,9 +64,11 @@ class DataStructuresRMolinari::DisjointUnion
|
|
50
64
|
|
51
65
|
# The canonical representative of the subset containing e. Two elements d and e are in the same subset exactly when find(d) ==
|
52
66
|
# find(e).
|
53
|
-
# @param e must be
|
54
|
-
# @return (Integer) one of
|
67
|
+
# @param e must be in the universe of elements
|
68
|
+
# @return (Integer) one of the universe of elements
|
55
69
|
def find(e)
|
70
|
+
check_value(e)
|
71
|
+
|
56
72
|
# We implement find with "halving" to shrink the length of paths to the root. See Tarjan and van Leeuwen p 252.
|
57
73
|
x = e
|
58
74
|
x = @d[x] = @d[@d[x]] while @d[@d[x]] != @d[x]
|
@@ -60,7 +76,7 @@ class DataStructuresRMolinari::DisjointUnion
|
|
60
76
|
end
|
61
77
|
|
62
78
|
private def check_value(v)
|
63
|
-
raise "Value
|
79
|
+
raise Shared::DataError, "Value #{v} is not part of the univserse." unless @d[v]
|
64
80
|
end
|
65
81
|
|
66
82
|
private def link(e, f)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require_relative 'shared'
|
2
2
|
|
3
|
-
#
|
4
|
-
# arbitrary subarray of a given array.
|
3
|
+
# The template of Segment Tree, which can be used for various interval-related purposes, like efficiently finding the sum (or min or
|
4
|
+
# max) on an arbitrary subarray of a given array.
|
5
5
|
#
|
6
6
|
# There is an excellent description of the data structure at https://cp-algorithms.com/data_structures/segment_tree.html. The
|
7
7
|
# Wikipedia article (https://en.wikipedia.org/wiki/Segment_tree) appears to describe a different data structure which is sometimes
|
@@ -13,10 +13,10 @@ require_relative 'shared'
|
|
13
13
|
# Ruby.
|
14
14
|
#
|
15
15
|
# This is a generic implementation, intended to allow easy configuration for concrete instances. See the parameters to the
|
16
|
-
# initializer and the
|
16
|
+
# initializer and the definitions of concrete realisations like MaxValSegmentTree.
|
17
17
|
#
|
18
18
|
# We do O(n) work to build the internal data structure at initialization. Then we answer queries in O(log n) time.
|
19
|
-
class DataStructuresRMolinari::
|
19
|
+
class DataStructuresRMolinari::SegmentTreeTemplate
|
20
20
|
include Shared::BinaryTreeArithmetic
|
21
21
|
|
22
22
|
# Construct a concrete instance of a Segment Tree. See details at the links above for the underlying concepts here.
|
@@ -24,14 +24,14 @@ class DataStructuresRMolinari::GenericSegmentTree
|
|
24
24
|
# - For example, if we are calculating sums over subintervals, combine.call(a, b) = a + b, while if we are doing maxima we will
|
25
25
|
# return max(a, b).
|
26
26
|
# - Things get more complicated when we are calculating, say, the _index_ of the maximal value in a subinterval. Now it is not
|
27
|
-
# enough
|
27
|
+
# enough simply to store that index at each tree node, because to combine the indices from two child nodes we need to know
|
28
28
|
# both the index of the maximal element in each child node's interval, but also the maximal values themselves, so we know
|
29
29
|
# which one "wins" for the parent node. This affects the sort of work we need to do when combining and the value provided by
|
30
30
|
# the +single_cell_array_val+ lambda.
|
31
31
|
# @param single_cell_array_val a lambda that takes an index i and returns the value we need to store in the #build
|
32
32
|
# operation for the subinterval i..i.
|
33
|
-
# - This
|
34
|
-
# calculating the index of the maximal value on each subinterval we
|
33
|
+
# - This will often simply be the value data[i], but in some cases it will be something else. For example, when we are
|
34
|
+
# calculating the index of the maximal value on each subinterval we need [i, data[i]] here.
|
35
35
|
# - If +update_at+ is called later, this lambda must close over the underlying data in a way that captures the updated value.
|
36
36
|
# @param size the size of the underlying data array, used in certain internal arithmetic.
|
37
37
|
# @param identity the value to return when we are querying on an empty interval
|
@@ -96,7 +96,7 @@ class DataStructuresRMolinari::GenericSegmentTree
|
|
96
96
|
private def update_val_at(idx, tree_idx, tree_l, tree_r)
|
97
97
|
if tree_l == tree_r
|
98
98
|
# We have found the spot!
|
99
|
-
raise
|
99
|
+
raise InternalLogicError, 'tree_l == tree_r, but they do not agree with the idx holding the updated value' unless tree_l == idx
|
100
100
|
|
101
101
|
@tree[tree_idx] = @single_cell_array_val.call(tree_l)
|
102
102
|
else
|
@@ -13,8 +13,8 @@ require_relative 'shared'
|
|
13
13
|
# - +empty?+
|
14
14
|
# - is the heap empty?
|
15
15
|
# - O(1)
|
16
|
-
# - +insert+
|
17
|
-
# - add a new
|
16
|
+
# - +insert(item, priority)+
|
17
|
+
# - add a new item to the heap with an associated priority
|
18
18
|
# - O(log N)
|
19
19
|
# - +top+
|
20
20
|
# - return the lowest-priority element, which is the element at the root of the tree. In a max-heap this is the highest-priority
|
@@ -23,12 +23,18 @@ require_relative 'shared'
|
|
23
23
|
# - +pop+
|
24
24
|
# - removes and returns the item that would be returned by +top+
|
25
25
|
# - O(log N)
|
26
|
-
# - +update+
|
26
|
+
# - +update(item, priority)+
|
27
27
|
# - tell the heap that the priority of a particular item has changed
|
28
28
|
# - O(log N)
|
29
29
|
#
|
30
30
|
# Here N is the number of elements in the heap.
|
31
31
|
#
|
32
|
+
# The internal requirements needed to implement +update+ have several consequences.
|
33
|
+
# - Items added to the heap must be distinct. Otherwise we would not know which occurrence to update
|
34
|
+
# - There is some bookkeeping overhead.
|
35
|
+
# If client code doesn't need to call +update+ then we can create a "non-addressable" heap that allows for the insertion of
|
36
|
+
# duplicate items and has slightly faster runtime overall. See the arguments to the initializer.
|
37
|
+
#
|
32
38
|
# References:
|
33
39
|
#
|
34
40
|
# - https://en.wikipedia.org/wiki/Binary_heap
|
@@ -36,31 +42,31 @@ require_relative 'shared'
|
|
36
42
|
# DOI 10.1007/s00224-017-9760-2
|
37
43
|
#
|
38
44
|
# @todo
|
39
|
-
# -
|
40
|
-
# - this requires different handling for max-heaps, as we can't just negate the priorities and use min-heap logic
|
41
|
-
# - relax the requirement that priorities must be comparable via +<+ and respond to negation. Instead, allow comparison via +<=>+
|
42
|
-
# and handle max-heaps differently.
|
43
|
-
# - this will allow priorities to be arrays for tie-breakers and similar.
|
44
|
-
# - offer a non-addressable version that doesn't support +update+
|
45
|
-
# - configure through the initializer
|
46
|
-
# - other operations will be a little quicker, and we can add the same item more than once. The paper by Chen et al. referenced
|
47
|
-
# in the Wikipedia article for Pairing Heaps suggests that using such a priority queue for Dijkstra's algorithm and inserting
|
48
|
-
# multiple copies of a key rather than updating its priority is faster in practice than other approaches that have better
|
49
|
-
# theoretical performance.
|
45
|
+
# - let caller see the priority of the top element. Maybe this is useful sometimes.
|
50
46
|
class DataStructuresRMolinari::Heap
|
47
|
+
include Shared
|
51
48
|
include Shared::BinaryTreeArithmetic
|
52
49
|
|
50
|
+
# The number of items currently in the heap
|
53
51
|
attr_reader :size
|
54
52
|
|
55
|
-
|
53
|
+
# An (item, priority) pair
|
54
|
+
InternalPair = Struct.new(:item, :priority)
|
55
|
+
private_constant :InternalPair
|
56
56
|
|
57
57
|
# @param max_heap when truthy, make a max-heap rather than a min-heap
|
58
|
-
# @param
|
59
|
-
|
58
|
+
# @param addressable when truthy, the heap is _addressable_. This means that
|
59
|
+
# - item priorities are updatable with +update(item, p)+, and
|
60
|
+
# - items added to the heap must be distinct.
|
61
|
+
# When falsy, priorities are not updatable but items may be inserted multiple times. Operations are slightly faster because
|
62
|
+
# there is less internal bookkeeping.
|
63
|
+
# @param debug when truthy, verify the heap property after each change that might violate it. This makes operations much slower.
|
64
|
+
def initialize(max_heap: false, addressable: true, debug: false)
|
60
65
|
@data = []
|
61
66
|
@size = 0
|
62
67
|
@max_heap = max_heap
|
63
|
-
@
|
68
|
+
@addressable = addressable
|
69
|
+
@index_of = {} # used in addressable heaps
|
64
70
|
@debug = debug
|
65
71
|
end
|
66
72
|
|
@@ -70,26 +76,24 @@ class DataStructuresRMolinari::Heap
|
|
70
76
|
end
|
71
77
|
|
72
78
|
# Insert a new element into the heap with the given priority.
|
73
|
-
# @param value the item to be inserted.
|
74
|
-
#
|
75
|
-
# @param priority the priority to use for new item. The values used as priorities
|
76
|
-
# a max-heap, must respond to negation +@-+ in the natural order-respecting way.
|
77
|
-
# @todo
|
78
|
-
# - check for duplicate
|
79
|
+
# @param value the item to be inserted.
|
80
|
+
# - If the heap is addressable (the default) it is an error to insert an item that is already present in the heap.
|
81
|
+
# @param priority the priority to use for new item. The values used as priorities must be totally ordered via +<=>+.
|
79
82
|
def insert(value, priority)
|
80
|
-
|
83
|
+
raise DataError, "Heap already contains #{value}" if @addressable && contains?(value)
|
81
84
|
|
82
85
|
@size += 1
|
83
86
|
|
84
|
-
d =
|
87
|
+
d = InternalPair.new(value, priority)
|
85
88
|
assign(d, @size)
|
86
89
|
|
87
90
|
sift_up(@size)
|
88
91
|
end
|
89
92
|
|
90
93
|
# Return the top of the heap without removing it
|
91
|
-
# @return
|
92
|
-
# binary tree; this element has minimal priority, but there may be other elements with the same priority
|
94
|
+
# @return a value with minimal priority (maximal for max-heaps). Strictly speaking, it returns the item at the root of the
|
95
|
+
# binary tree; this element has minimal priority, but there may be other elements with the same priority and they do not appear
|
96
|
+
# at the top of the heap in any guaranteed order.
|
93
97
|
def top
|
94
98
|
raise 'Heap is empty!' unless @size.positive?
|
95
99
|
|
@@ -100,12 +104,11 @@ class DataStructuresRMolinari::Heap
|
|
100
104
|
# @return (see #top)
|
101
105
|
def pop
|
102
106
|
result = top
|
103
|
-
@index_of.delete(result)
|
104
|
-
|
105
107
|
assign(@data[@size], root)
|
106
108
|
|
107
109
|
@data[@size] = nil
|
108
110
|
@size -= 1
|
111
|
+
@index_of.delete(result) if @addressable
|
109
112
|
|
110
113
|
sift_down(root) if @size.positive?
|
111
114
|
|
@@ -113,21 +116,20 @@ class DataStructuresRMolinari::Heap
|
|
113
116
|
end
|
114
117
|
|
115
118
|
# Update the priority of the given element and maintain the necessary heap properties.
|
119
|
+
#
|
116
120
|
# @param element the item whose priority we are updating. It is an error to update the priority of an element not already in the
|
117
121
|
# heap
|
118
122
|
# @param priority the new priority
|
119
|
-
#
|
120
|
-
# @todo
|
121
|
-
# - check that the element is in the heap
|
122
123
|
def update(element, priority)
|
123
|
-
|
124
|
+
raise LogicError, 'Cannot update priorities in a non-addressable heap' unless @addressable
|
125
|
+
raise DataError, "Cannot update priority for value #{element} not already in the heap" unless contains?(element)
|
124
126
|
|
125
127
|
idx = @index_of[element]
|
126
128
|
old = @data[idx].priority
|
127
129
|
@data[idx].priority = priority
|
128
|
-
if priority
|
130
|
+
if less_than_priority?(old, priority)
|
129
131
|
sift_down(idx)
|
130
|
-
elsif priority
|
132
|
+
elsif less_than_priority?(priority, old)
|
131
133
|
sift_up(idx)
|
132
134
|
end
|
133
135
|
|
@@ -141,7 +143,7 @@ class DataStructuresRMolinari::Heap
|
|
141
143
|
x = @data[idx]
|
142
144
|
while idx != root
|
143
145
|
i = parent(idx)
|
144
|
-
break unless x
|
146
|
+
break unless less_than?(x, @data[i])
|
145
147
|
|
146
148
|
assign(@data[i], idx)
|
147
149
|
idx = i
|
@@ -156,9 +158,9 @@ class DataStructuresRMolinari::Heap
|
|
156
158
|
x = @data[idx]
|
157
159
|
|
158
160
|
while (j = left(idx)) <= @size
|
159
|
-
j += 1 if j + 1 <= @size && @data[j + 1]
|
161
|
+
j += 1 if j + 1 <= @size && less_than?(@data[j + 1], @data[j])
|
160
162
|
|
161
|
-
break unless @data[j]
|
163
|
+
break unless less_than?(@data[j], x)
|
162
164
|
|
163
165
|
assign(@data[j], idx)
|
164
166
|
idx = j
|
@@ -171,7 +173,27 @@ class DataStructuresRMolinari::Heap
|
|
171
173
|
# Put the pair in the given heap location
|
172
174
|
private def assign(pair, idx)
|
173
175
|
@data[idx] = pair
|
174
|
-
@index_of[pair.item] = idx
|
176
|
+
@index_of[pair.item] = idx if @addressable
|
177
|
+
end
|
178
|
+
|
179
|
+
# Compare the priorities of two items with <=> and return truthy exactly when the result is -1.
|
180
|
+
#
|
181
|
+
# If this is a max-heap return truthy exactly when the result of <=> is 1.
|
182
|
+
#
|
183
|
+
# The arguments must be (item, priority) pairs; use +less_than_priority?+ to compare priorities directly.
|
184
|
+
private def less_than?(p1, p2)
|
185
|
+
less_than_priority?(p1.priority, p2.priority)
|
186
|
+
end
|
187
|
+
|
188
|
+
# Direct comparison of priorities
|
189
|
+
private def less_than_priority?(priority1, priority2)
|
190
|
+
return (priority1 <=> priority2) == 1 if @max_heap
|
191
|
+
|
192
|
+
(priority1 <=> priority2) == -1
|
193
|
+
end
|
194
|
+
|
195
|
+
private def contains?(item)
|
196
|
+
!!@index_of[item]
|
175
197
|
end
|
176
198
|
|
177
199
|
# For debugging
|
@@ -180,8 +202,8 @@ class DataStructuresRMolinari::Heap
|
|
180
202
|
left = left(idx)
|
181
203
|
right = right(idx)
|
182
204
|
|
183
|
-
raise "Heap property violated by left child of index #{idx}" if left <= @size && @data[
|
184
|
-
raise "Heap property violated by right child of index #{idx}" if right <= @size && @data[
|
205
|
+
raise InternalLogicError, "Heap property violated by left child of index #{idx}" if left <= @size && less_than?(@data[left], @data[idx])
|
206
|
+
raise InternalLogicError, "Heap property violated by right child of index #{idx}" if right <= @size && less_than?(@data[right], @data[idx])
|
185
207
|
end
|
186
208
|
end
|
187
209
|
end
|