perobs 3.0.1 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +19 -18
- data/lib/perobs.rb +2 -0
- data/lib/perobs/Array.rb +68 -21
- data/lib/perobs/BTree.rb +110 -54
- data/lib/perobs/BTreeBlob.rb +14 -13
- data/lib/perobs/BTreeDB.rb +11 -10
- data/lib/perobs/BTreeNode.rb +551 -197
- data/lib/perobs/BTreeNodeCache.rb +10 -8
- data/lib/perobs/BTreeNodeLink.rb +11 -1
- data/lib/perobs/BigArray.rb +285 -0
- data/lib/perobs/BigArrayNode.rb +1002 -0
- data/lib/perobs/BigHash.rb +246 -0
- data/lib/perobs/BigTree.rb +197 -0
- data/lib/perobs/BigTreeNode.rb +873 -0
- data/lib/perobs/Cache.rb +47 -22
- data/lib/perobs/ClassMap.rb +2 -2
- data/lib/perobs/ConsoleProgressMeter.rb +61 -0
- data/lib/perobs/DataBase.rb +4 -3
- data/lib/perobs/DynamoDB.rb +62 -20
- data/lib/perobs/EquiBlobsFile.rb +174 -59
- data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
- data/lib/perobs/FlatFile.rb +536 -242
- data/lib/perobs/FlatFileBlobHeader.rb +120 -84
- data/lib/perobs/FlatFileDB.rb +58 -27
- data/lib/perobs/FuzzyStringMatcher.rb +175 -0
- data/lib/perobs/Hash.rb +129 -35
- data/lib/perobs/IDList.rb +144 -0
- data/lib/perobs/IDListPage.rb +107 -0
- data/lib/perobs/IDListPageFile.rb +180 -0
- data/lib/perobs/IDListPageRecord.rb +142 -0
- data/lib/perobs/LockFile.rb +3 -0
- data/lib/perobs/Object.rb +28 -20
- data/lib/perobs/ObjectBase.rb +53 -10
- data/lib/perobs/PersistentObjectCache.rb +142 -0
- data/lib/perobs/PersistentObjectCacheLine.rb +99 -0
- data/lib/perobs/ProgressMeter.rb +97 -0
- data/lib/perobs/SpaceManager.rb +273 -0
- data/lib/perobs/SpaceTree.rb +63 -47
- data/lib/perobs/SpaceTreeNode.rb +134 -115
- data/lib/perobs/SpaceTreeNodeLink.rb +1 -1
- data/lib/perobs/StackFile.rb +1 -1
- data/lib/perobs/Store.rb +180 -70
- data/lib/perobs/version.rb +1 -1
- data/perobs.gemspec +4 -4
- data/test/Array_spec.rb +48 -39
- data/test/BTreeDB_spec.rb +2 -2
- data/test/BTree_spec.rb +50 -1
- data/test/BigArray_spec.rb +261 -0
- data/test/BigHash_spec.rb +152 -0
- data/test/BigTreeNode_spec.rb +153 -0
- data/test/BigTree_spec.rb +259 -0
- data/test/EquiBlobsFile_spec.rb +105 -5
- data/test/FNV_Hash_1a_64_spec.rb +59 -0
- data/test/FlatFileDB_spec.rb +199 -15
- data/test/FuzzyStringMatcher_spec.rb +261 -0
- data/test/Hash_spec.rb +27 -16
- data/test/IDList_spec.rb +77 -0
- data/test/LegacyDBs/LegacyDB.rb +155 -0
- data/test/LegacyDBs/version_3/class_map.json +1 -0
- data/test/LegacyDBs/version_3/config.json +1 -0
- data/test/LegacyDBs/version_3/database.blobs +0 -0
- data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
- data/test/LegacyDBs/version_3/index.blobs +0 -0
- data/test/LegacyDBs/version_3/version +1 -0
- data/test/LockFile_spec.rb +9 -6
- data/test/Object_spec.rb +5 -5
- data/test/SpaceManager_spec.rb +176 -0
- data/test/SpaceTree_spec.rb +27 -9
- data/test/Store_spec.rb +353 -206
- data/test/perobs_spec.rb +7 -3
- data/test/spec_helper.rb +9 -4
- metadata +59 -16
- data/lib/perobs/SpaceTreeNodeCache.rb +0 -76
- data/lib/perobs/TreeDB.rb +0 -277
@@ -0,0 +1,175 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/Log'
|
29
|
+
require 'perobs/Object'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
|
+
# against a known set of strings. The dictionary of known strings does not
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modify already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
43
|
+
|
44
|
+
# Create a new FuzzyStringMatcher.
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
46
|
+
# @param case_sensitive [Boolean] True if case matters for matching
|
47
|
+
# @param n [Integer] Determines what kind of n-gram is used to store the
|
48
|
+
# references in the dictionary. It also determines the minimum word
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
  super(p)
  # Only n-gram sizes from 2 to 10 are accepted.
  raise ArgumentError, 'n must be between 2 and 10' if n < 2 || n > 10

  self.case_sensitive = case_sensitive
  self.n = n

  # Create an empty dictionary unless one is already present.
  clear unless @dict
end
|
61
|
+
|
62
|
+
# Wipe the dictionary.
|
63
|
+
def clear
  # Replace the current dictionary with a newly allocated, empty BigHash.
  empty_dict = @store.new(BigHash)
  self.dict = empty_dict
end
|
66
|
+
|
67
|
+
# Add a string with its reference to the dictionary.
|
68
|
+
# @param string [String] The string to store
|
69
|
+
# @param reference [Object] Any object that is associated with the string
|
70
|
+
def learn(string, reference = string)
  # An explicit nil reference falls back to the string itself.
  reference = string if reference.nil?

  normalized = @case_sensitive ? string : string.downcase
  # Enclose string in 'start of text' and 'end of text' ASCII values.
  framed = "\002" + normalized + "\003"

  each_n_gramm(framed) do |gram|
    bucket = @dict[gram]
    unless bucket
      # First time this n-gram is seen: create its reference list.
      bucket = @store.new(Hash)
      @dict[gram] = bucket
    end

    # The list is used as a Set. The stored value doesn't matter.
    bucket[reference] = true unless bucket.include?(reference)
  end

  nil
end
|
90
|
+
|
91
|
+
# Find the references whose string best matches the given string.
|
92
|
+
# @param string [String] string to search for
|
93
|
+
# @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
|
94
|
+
# the matching should be done. The larger the value the closer
|
95
|
+
# the given string needs to be.
|
96
|
+
# @param max_count [Integer] The maximum number of matches that should be
|
97
|
+
# returned.
|
98
|
+
# @return [Array] The result is an Array of Arrays. The nested Arrays only
|
99
|
+
# have 2 entries. The reference and a Float value between 0 and
|
100
|
+
# 1.0 that describes how good the match is. The matches are sorted
|
101
|
+
# in descending order by the match score.
|
102
|
+
def best_matches(string, min_score = 0.5, max_count = 100)
  string = string.downcase unless @case_sensitive
  # Enclose string in 'start of text' and 'end of text' ASCII values.
  string = "\002" + string + "\003"

  # Count for every stored reference how many n-grams it shares with the
  # search string. ::Hash is spelled out because a bare Hash inside the
  # PEROBS module would resolve to PEROBS::Hash.
  matches = ::Hash.new(0)
  each_n_gramm(string) do |n_gramm|
    next unless (ng_list = @dict[n_gramm])

    ng_list.each { |reference, _| matches[reference] += 1 }
  end

  return [] if matches.empty?

  # Turn occurrence counters into scores relative to the best possible
  # score, which is the number of n-grams in the search string.
  best_possible_score = string.length - @n + 1
  match_list = matches.map do |reference, hits|
    [ reference, hits.to_f / best_possible_score ]
  end

  # Delete all matches that don't have the required minimum match score.
  match_list.delete_if { |_, score| score < min_score }

  # Sort the list best to worst match.
  match_list.sort! { |a, b| b[1] <=> a[1] }

  # Return the top max_count matches. A range slice like [0..max_count - 1]
  # would turn into [0..-1] for max_count == 0 and return everything, so a
  # non-positive limit is answered with an empty list instead.
  max_count > 0 ? match_list.first(max_count) : []
end
|
143
|
+
|
144
|
+
# Returns some internal stats about the dictionary.
|
145
|
+
def stats
  entry_count = @dict.size
  largest = 0
  sum = 0

  # Walk all n-gram lists once to find the longest list and the total
  # number of stored references.
  @dict.each do |_n_gramm, ng_list|
    len = ng_list.length
    largest = len if len > largest
    sum += len
  end

  # An empty dictionary reports an average of 0.
  average = sum > 0 ? sum.to_f / entry_count : 0

  {
    'dictionary_size' => entry_count,
    'max_list_size' => largest,
    'avg_list_size' => average
  }
end
|
159
|
+
|
160
|
+
private
|
161
|
+
|
162
|
+
def each_n_gramm(string, &block)
  # Offset of the last complete n-gram; negative when the string is too
  # short to contain even one.
  last_offset = string.length - @n
  return if last_offset < 0

  # Slide a window of @n characters over the string.
  0.upto(last_offset) { |offset| yield(string[offset, @n]) }
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|
data/lib/perobs/Hash.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# = Hash.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
|
5
|
+
# Copyright (c) 2015, 2016, 2017 by Chris Schlaeger <chris@taskjuggler.org>
|
6
6
|
#
|
7
7
|
# MIT License
|
8
8
|
#
|
@@ -37,20 +37,36 @@ module PEROBS
|
|
37
37
|
# The implementation is largely a proxy around the standard Hash class. But
|
38
38
|
# all mutating methods must be re-implemented to convert PEROBS::Objects to
|
39
39
|
# POXReference objects and to register the object as modified with the
|
40
|
-
# cache.
|
40
|
+
# cache. However, it is not designed for large data sets as it always reads
|
41
|
+
# and writes the full data set for every access (unless it is cached). For
|
42
|
+
# data sets that could have more than a few hundred entries BigHash is the
|
43
|
+
# recommended alternative.
|
41
44
|
#
|
42
45
|
# We explicitly don't support Hash::store() as it conflicts with
|
43
46
|
# ObjectBase::store() method to access the store.
|
44
47
|
class Hash < ObjectBase
|
45
48
|
|
49
|
+
# These methods do not mutate the Hash. They only perform read
|
50
|
+
# operations and return a new PEROBS::Hash object.
|
51
|
+
# Only the four methods that produce a Hash are wrapped here. Appending
# Enumerable.instance_methods to this list would also wrap methods such as
# map or count, whose results are not Hashes, into a new PEROBS::Hash.
# NOTE(review): the read-only proxy loop that follows in this file also
# covers Enumerable.instance_methods, so :reject and :select are presumably
# redefined there -- confirm which definition is intended.
[
  :invert, :merge, :reject, :select
].each do |method_sym|
  # Create a wrapper method that passes the call to @data.
  define_method(method_sym) do |*args, &block|
    # Register the read operation with the cache.
    @store.cache.cache_read(self)
    @store.new(PEROBS::Hash, @data.send(method_sym, *args, &block))
  end
end
|
61
|
+
|
46
62
|
# These methods do not mutate the Hash. They only perform read
|
47
63
|
# operations.
|
48
64
|
([
|
49
65
|
:==, :[], :assoc, :compare_by_identity, :compare_by_identity?, :default,
|
50
66
|
:default_proc, :each, :each_key, :each_pair, :each_value, :empty?,
|
51
67
|
:eql?, :fetch, :flatten, :has_key?, :has_value?, :hash, :include?,
|
52
|
-
:
|
53
|
-
:pretty_print, :pretty_print_cycle, :rassoc, :
|
68
|
+
:key, :key?, :keys, :length, :member?,
|
69
|
+
:pretty_print, :pretty_print_cycle, :rassoc, :size,
|
54
70
|
:to_a, :to_h, :to_hash, :to_s, :value?, :values, :values_at
|
55
71
|
] + Enumerable.instance_methods).uniq.each do |method_sym|
|
56
72
|
# Create a wrapper method that passes the call to @data.
|
@@ -61,11 +77,22 @@ module PEROBS
|
|
61
77
|
end
|
62
78
|
end
|
63
79
|
|
64
|
-
# These methods mutate the Hash
|
80
|
+
# These methods mutate the Hash and return self
|
81
|
+
%i[clear keep_if merge! rehash reject! replace select! update].each do |mutator|
  # Generate a proxy that forwards the call to the wrapped @data Hash.
  define_method(mutator) do |*args, &blk|
    # Register the write operation with the cache before mutating.
    @store.cache.cache_write(self)
    @data.send(mutator, *args, &blk)
    # The result of the forwarded call is discarded; myself is returned.
    myself
  end
end
|
92
|
+
|
93
|
+
# These methods mutate the Hash and return basic Ruby type objects.
|
65
94
|
[
|
66
|
-
:
|
67
|
-
:initialize_copy, :keep_if, :merge!, :rehash, :reject!, :replace,
|
68
|
-
:select!, :shift, :update
|
95
|
+
:delete, :delete_if, :shift
|
69
96
|
].each do |method_sym|
|
70
97
|
# Create a wrapper method that passes the call to @data.
|
71
98
|
define_method(method_sym) do |*args, &block|
|
@@ -79,33 +106,70 @@ module PEROBS
|
|
79
106
|
# PEROBS users should never call this method or equivalents of derived
|
80
107
|
# methods directly.
|
81
108
|
# @param p [PEROBS::Handle] PEROBS handle
|
82
|
-
# @param default [
|
83
|
-
# stored for a specific key.
|
84
|
-
|
109
|
+
# @param default [Object] The default value that is returned when no value
|
110
|
+
# is stored for a specific key. The default must be of the
|
111
|
+
# supported type.
|
112
|
+
def initialize(p, default = nil, &block)
  super(p)
  _check_assignment_value(default)
  # A given block takes precedence over the default value.
  @data = block_given? ? ::Hash.new(&block) : ::Hash.new(default)

  # Ensure that the newly created object will be pushed into the database.
  @store.cache.cache_write(self)
end
|
92
124
|
|
125
|
+
# Proxy for assignment method.
|
126
|
+
def []=(key, value)
  # Only Strings and PEROBS references are accepted as keys.
  if !key.is_a?(String) && !key.respond_to?(:is_poxreference?)
    raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
      "a PEROBS object but is a #{key.class}"
  end

  _check_assignment_value(value)
  # Register the write operation with the cache before storing the value.
  @store.cache.cache_write(self)
  @data[key] = value
end
|
135
|
+
|
136
|
+
# Proxy for default= method.
|
137
|
+
def default=(value)
  # The default is checked like any other assigned value.
  _check_assignment_value(value)
  @data.default = value
end
|
141
|
+
|
93
142
|
# Return a list of all object IDs of all persistent objects that this Hash
|
94
143
|
# is referencing.
|
95
|
-
# @return [Array of
|
144
|
+
# @return [Array of Integer] IDs of referenced objects
|
96
145
|
def _referenced_object_ids
  @data.each_with_object([]) do |(k, v), ids|
    # Keys and values can both be references; the key is inspected first.
    [ k, v ].each do |candidate|
      ids << candidate.id if candidate && candidate.respond_to?(:is_poxreference?)
    end
  end
end
|
100
158
|
|
101
159
|
# This method should only be used during store repair operations. It will
|
102
160
|
# delete all references to the given object ID.
|
103
|
-
# @param id [
|
161
|
+
# @param id [Integer] targeted object ID
|
104
162
|
def _delete_reference_to_id(id)
  points_to_id = lambda do |obj|
    obj && obj.respond_to?(:is_poxreference?) && obj.id == id
  end

  # reject! returns nil when no entry was removed, so the cache is only
  # notified when the Hash actually changed.
  removed = @data.reject! { |k, v| points_to_id.call(k) || points_to_id.call(v) }
  @store.cache.cache_write(self) if removed
end
|
110
174
|
|
111
175
|
# Restore the persistent data from a single data structure.
|
@@ -114,8 +178,18 @@ module PEROBS
|
|
114
178
|
# @private
|
115
179
|
def _deserialize(data)
  @data = {}

  data.each do |k, v|
    # References to other PEROBS Objects are marshalled with our own
    # format. If we detect such a marshalled String we convert it into a
    # POXReference object. The pattern is anchored with \A and \z so that
    # only a key that consists entirely of the marshalled form is converted;
    # ^ and $ would also match a single line inside a multi-line String key.
    if (match = /\A#<PEROBS::POReference id=([0-9]+)>\z/.match(k))
      k = POXReference.new(@store, match[1].to_i)
    end
    # Stored POReference values become POXReference objects again.
    dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
    @data[k] = dv
  end

  @data
end
|
121
195
|
|
@@ -136,26 +210,46 @@ module PEROBS
|
|
136
210
|
data = {}
|
137
211
|
|
138
212
|
@data.each do |k, v|
|
139
|
-
if
|
140
|
-
|
141
|
-
|
142
|
-
#
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
v.inspect
|
151
|
-
end
|
152
|
-
data[k] = v
|
213
|
+
if k.respond_to?(:is_poxreference?)
|
214
|
+
# JSON only supports Strings as hash keys. Since JSON is the default
|
215
|
+
# internal storage format in the database, we have to marshall
|
216
|
+
# PEROBS::Object references ourselves.
|
217
|
+
k = "#<PEROBS::POReference id=#{k.id}>"
|
218
|
+
elsif k[0..24] == '#<PEROBS::POReference id='
|
219
|
+
# This could obviously result in conflicts with 'normal' String hash
|
220
|
+
# keys. This is extremely unlikely, but we better catch this case
|
221
|
+
# before it causes hard to debug trouble.
|
222
|
+
raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
|
223
|
+
"internal representation of marshalled hash keys!"
|
153
224
|
end
|
225
|
+
data[k] = serialize_helper(v)
|
154
226
|
end
|
155
227
|
|
156
228
|
data
|
157
229
|
end
|
158
230
|
|
231
|
+
def serialize_helper(v)
  # References to other PEROBS objects (POXReference) are stored as
  # POReference in the database.
  return POReference.new(v.id) if v.respond_to?(:is_poxreference?)

  # Outside of the PEROBS library all PEROBS::ObjectBase derived objects
  # should not be used directly. The library only exposes them via
  # POXReference proxy objects.
  if v.is_a?(ObjectBase)
    PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
      "It is stored in a PEROBS::Hash. " +
      'Have you used self() instead of myself() to ' +
      "get the reference of this PEROBS object?\n" +
      v.inspect
  end

  # All other objects are serialized by their native methods.
  v
end
|
252
|
+
|
159
253
|
end
|
160
254
|
|
161
255
|
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = IDList.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2018 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/IDListPageFile'
|
29
|
+
require 'perobs/IDListPageRecord'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# This class stores a list of 64 bit values. Values can be added to the list
|
34
|
+
# and the presence of a certain value can be checked. It can hold up to 2^64
|
35
|
+
# values. It tries to keep values in memory but can store them in a file if
|
36
|
+
# needed. A threshold for the in-memory values can be set in the
|
37
|
+
# constructor. The stored values are grouped in pages. Each page can hold up
|
38
|
+
# to page_size entries.
|
39
|
+
class IDList
|
40
|
+
|
41
|
+
# Create a new IDList object. The data that can't be kept in memory will
|
42
|
+
# be stored in the specified directory under the given name.
|
43
|
+
# @param dir [String] Path of the directory
|
44
|
+
# @param name [String] Name of the file
|
45
|
+
# @param max_in_memory [Integer] Specifies the maximum number of values
|
46
|
+
# that will be kept in memory. If the list is larger, values will
|
47
|
+
# be cached in the specified file.
|
48
|
+
# @param page_size [Integer] The number of values per page. The default
|
49
|
+
# value is 32 which was found the best performing config in tests.
|
50
|
+
def initialize(dir, name, max_in_memory, page_size = 32)
  # The page_file manages the pages that store the values.
  @page_file = IDListPageFile.new(self, dir, name, max_in_memory, page_size)
  # Set up the initial page record list.
  clear
end
|
56
|
+
|
57
|
+
# Insert a new value into the list.
|
58
|
+
# @param id [Integer] The value to add
|
59
|
+
def insert(id)
  # Locate the first page record whose upper bound is not below the ID.
  slot = @page_records.bsearch_index { |record| record.max_id >= id }
  target = @page_records[slot]

  # A full page must be split before the ID can be stored. There is no
  # guarantee that a split will yield a page with space, as pages are split
  # by ID range and not by distributing the values evenly, so this repeats
  # until the page responsible for the ID has room.
  while target.is_full?
    upper_page = target.split
    next_slot = slot + 1
    # Store the newly created page into the page_records list.
    @page_records.insert(next_slot, upper_page)

    # The ID stays with the current page unless it falls into the range of
    # the newly created page.
    next if id < upper_page.min_id

    slot = next_slot
    target = upper_page
  end

  # Insert the ID into the page.
  target.insert(id)
end
|
84
|
+
|
85
|
+
# Check if a given value is already stored in the list.
|
86
|
+
# @param id [Integer] The value to check for
|
87
|
+
def include?(id)
  # Find the page record whose ID range could hold the value.
  page = @page_records.bsearch { |pr| pr.max_id >= id }
  # bsearch returns nil when the ID is larger than the max_id of the last
  # page record. Such a value cannot be stored, so report it as absent
  # instead of failing with a NoMethodError on nil.
  page ? page.include?(id) : false
end
|
90
|
+
|
91
|
+
# Clear the list and empty the filesystem cache file.
|
92
|
+
def clear
  @page_file.clear
  # Start over with a single page record spanning the range 0 to 2 ** 64.
  whole_range = IDListPageRecord.new(@page_file, 0, 2 ** 64)
  @page_records = [ whole_range ]
end
|
96
|
+
|
97
|
+
# Erase the list including the filesystem cache file. The IDList is no
|
98
|
+
# longer usable after this call but the cache file is removed from the
|
99
|
+
# filesystem.
|
100
|
+
def erase
  # Erase the backing page file first.
  @page_file.erase
  # Drop the page records; the list must not be used afterwards.
  @page_records = nil
end
|
104
|
+
|
105
|
+
# Perform some consistency checks on the internal data structures. Raises
|
106
|
+
# a RuntimeError in case a problem is found.
|
107
|
+
def check
  last_max = -1
  # The first page record must start at ID 0.
  unless (min_id = @page_records.first.min_id) == 0
    raise RuntimeError, "min_id of first record (#{min_id}) " +
      "must be 0."
  end

  # The ID ranges of consecutive page records must be adjacent without gaps
  # or overlaps. Each record also runs its own check.
  @page_records.each do |pr|
    unless pr.min_id == last_max + 1
      raise RuntimeError, "max_id of previous record (#{last_max}) " +
        "must be exactly 1 smaller than current record (#{pr.min_id})."
    end
    last_max = pr.max_id
    pr.check
  end

  # The last page record must end at the upper bound of the ID range.
  unless last_max == 2 ** 64
    raise RuntimeError, "max_id of last record " +
      "(#{@page_records.last.max_id}) must be #{2 ** 64}."
  end
end
|
128
|
+
|
129
|
+
def to_a
  # Concatenate the values of all page records in list order. flat_map
  # builds the result in a single pass instead of allocating a new,
  # ever-growing Array for every page record.
  @page_records.flat_map { |pr| pr.values }
end
|
134
|
+
|
135
|
+
# Print a human readable form of the tree that stores the list. This is
|
136
|
+
# only meant for debugging purposes and does not scale for larger trees.
|
137
|
+
def to_s
  # This class keeps its data in @page_records; no @root is ever assigned,
  # so rendering @root would always produce an empty listing. List one line
  # per page record with its ID range and values instead. After erase()
  # there are no page records left and only the leading newline remains.
  records = @page_records || []
  "\n" + records.map do |pr|
    "[#{pr.min_id}..#{pr.max_id}] #{pr.values.inspect}"
  end.join("\n")
end
|
140
|
+
|
141
|
+
end
|
142
|
+
|
143
|
+
end
|
144
|
+
|