perobs 3.0.1 → 4.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +19 -18
- data/lib/perobs.rb +2 -0
- data/lib/perobs/Array.rb +68 -21
- data/lib/perobs/BTree.rb +110 -54
- data/lib/perobs/BTreeBlob.rb +14 -13
- data/lib/perobs/BTreeDB.rb +11 -10
- data/lib/perobs/BTreeNode.rb +551 -197
- data/lib/perobs/BTreeNodeCache.rb +10 -8
- data/lib/perobs/BTreeNodeLink.rb +11 -1
- data/lib/perobs/BigArray.rb +285 -0
- data/lib/perobs/BigArrayNode.rb +1002 -0
- data/lib/perobs/BigHash.rb +246 -0
- data/lib/perobs/BigTree.rb +197 -0
- data/lib/perobs/BigTreeNode.rb +873 -0
- data/lib/perobs/Cache.rb +47 -22
- data/lib/perobs/ClassMap.rb +2 -2
- data/lib/perobs/ConsoleProgressMeter.rb +61 -0
- data/lib/perobs/DataBase.rb +4 -3
- data/lib/perobs/DynamoDB.rb +62 -20
- data/lib/perobs/EquiBlobsFile.rb +174 -59
- data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
- data/lib/perobs/FlatFile.rb +536 -242
- data/lib/perobs/FlatFileBlobHeader.rb +120 -84
- data/lib/perobs/FlatFileDB.rb +58 -27
- data/lib/perobs/FuzzyStringMatcher.rb +175 -0
- data/lib/perobs/Hash.rb +129 -35
- data/lib/perobs/IDList.rb +144 -0
- data/lib/perobs/IDListPage.rb +107 -0
- data/lib/perobs/IDListPageFile.rb +180 -0
- data/lib/perobs/IDListPageRecord.rb +142 -0
- data/lib/perobs/LockFile.rb +3 -0
- data/lib/perobs/Object.rb +28 -20
- data/lib/perobs/ObjectBase.rb +53 -10
- data/lib/perobs/PersistentObjectCache.rb +142 -0
- data/lib/perobs/PersistentObjectCacheLine.rb +99 -0
- data/lib/perobs/ProgressMeter.rb +97 -0
- data/lib/perobs/SpaceManager.rb +273 -0
- data/lib/perobs/SpaceTree.rb +63 -47
- data/lib/perobs/SpaceTreeNode.rb +134 -115
- data/lib/perobs/SpaceTreeNodeLink.rb +1 -1
- data/lib/perobs/StackFile.rb +1 -1
- data/lib/perobs/Store.rb +180 -70
- data/lib/perobs/version.rb +1 -1
- data/perobs.gemspec +4 -4
- data/test/Array_spec.rb +48 -39
- data/test/BTreeDB_spec.rb +2 -2
- data/test/BTree_spec.rb +50 -1
- data/test/BigArray_spec.rb +261 -0
- data/test/BigHash_spec.rb +152 -0
- data/test/BigTreeNode_spec.rb +153 -0
- data/test/BigTree_spec.rb +259 -0
- data/test/EquiBlobsFile_spec.rb +105 -5
- data/test/FNV_Hash_1a_64_spec.rb +59 -0
- data/test/FlatFileDB_spec.rb +199 -15
- data/test/FuzzyStringMatcher_spec.rb +261 -0
- data/test/Hash_spec.rb +27 -16
- data/test/IDList_spec.rb +77 -0
- data/test/LegacyDBs/LegacyDB.rb +155 -0
- data/test/LegacyDBs/version_3/class_map.json +1 -0
- data/test/LegacyDBs/version_3/config.json +1 -0
- data/test/LegacyDBs/version_3/database.blobs +0 -0
- data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
- data/test/LegacyDBs/version_3/index.blobs +0 -0
- data/test/LegacyDBs/version_3/version +1 -0
- data/test/LockFile_spec.rb +9 -6
- data/test/Object_spec.rb +5 -5
- data/test/SpaceManager_spec.rb +176 -0
- data/test/SpaceTree_spec.rb +27 -9
- data/test/Store_spec.rb +353 -206
- data/test/perobs_spec.rb +7 -3
- data/test/spec_helper.rb +9 -4
- metadata +59 -16
- data/lib/perobs/SpaceTreeNodeCache.rb +0 -76
- data/lib/perobs/TreeDB.rb +0 -277
@@ -0,0 +1,175 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/Log'
|
29
|
+
require 'perobs/Object'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# The fuzzy string matcher can be used to perform a fuzzy string search
|
34
|
+
# against a known set of strings. The dictionary of known strings does not
|
35
|
+
# store the actual strings but references to String or PEROBS objects.
|
36
|
+
# Once the dictionary has been established, fuzzy matches can be done. Since
|
37
|
+
# the actual input strings are not directly stored, you cannot remove or
|
38
|
+
# modify already stored strings. To remove strings, you have to clear the
|
39
|
+
# matcher and add the strings again that you want to keep.
|
40
|
+
class FuzzyStringMatcher < PEROBS::Object
|
41
|
+
|
42
|
+
attr_persist :case_sensitive, :n, :dict
|
43
|
+
|
44
|
+
# Create a new FuzzyStringMatcher.
|
45
|
+
# @param p [PEROBS::Store] place to store the dictionary
|
46
|
+
# @param case_sensitive [Boolean] True if case matters for matching
|
47
|
+
# @param n [Integer] Determines what kind of n-gramm is used to store the
|
48
|
+
# references in the dictionary. It also determines the minimum word
|
49
|
+
# length that can be used for fuzzy matches. Values between 2 and
|
50
|
+
# 10 are supported. The default is 4.
|
51
|
+
def initialize(p, case_sensitive = false, n = 4)
|
52
|
+
super(p)
|
53
|
+
if n < 2 || n > 10
|
54
|
+
raise ArgumentError, 'n must be between 2 and 10'
|
55
|
+
end
|
56
|
+
self.case_sensitive = case_sensitive
|
57
|
+
self.n = n
|
58
|
+
|
59
|
+
clear unless @dict
|
60
|
+
end
|
61
|
+
|
62
|
+
# Wipe the dictionary.
|
63
|
+
def clear
|
64
|
+
self.dict = @store.new(BigHash)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Add a string with its reference to the dictionary.
|
68
|
+
# @param string [String] The string to store
|
69
|
+
# @param reference [Object] Any object that is associated with the string
|
70
|
+
def learn(string, reference = string)
|
71
|
+
reference = string if reference.nil?
|
72
|
+
|
73
|
+
unless @case_sensitive
|
74
|
+
string = string.downcase
|
75
|
+
end
|
76
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
77
|
+
string = "\002" + string + "\003"
|
78
|
+
|
79
|
+
each_n_gramm(string) do |n_gramm|
|
80
|
+
unless (ng_list = @dict[n_gramm])
|
81
|
+
@dict[n_gramm] = ng_list = @store.new(Hash)
|
82
|
+
end
|
83
|
+
|
84
|
+
# We use the Hash as a Set. The value doesn't matter.
|
85
|
+
ng_list[reference] = true unless ng_list.include?(reference)
|
86
|
+
end
|
87
|
+
|
88
|
+
nil
|
89
|
+
end
|
90
|
+
|
91
|
+
# Find the references whose string best matches the given string.
|
92
|
+
# @param string [String] string to search for
|
93
|
+
# @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
|
94
|
+
# the matching should be done. The larger the value, the closer
|
95
|
+
# the given string needs to be.
|
96
|
+
# @param max_count [Integer] The maximum number of matches that should be
|
97
|
+
# returned.
|
98
|
+
# @return [Array] The result is an Array of Arrays. The nested Arrays only
|
99
|
+
# have 2 entries. The reference and a Float value between 0 and
|
100
|
+
# 1.0 that describes how good the match is. The matches are sorted
|
101
|
+
# in descending order by the match score.
|
102
|
+
def best_matches(string, min_score = 0.5, max_count = 100)
|
103
|
+
unless @case_sensitive
|
104
|
+
string = string.downcase
|
105
|
+
end
|
106
|
+
# Enclose string in 'start of text' and 'end of text' ASCII values.
|
107
|
+
string = "\002" + string + "\003"
|
108
|
+
|
109
|
+
matches = {}
|
110
|
+
|
111
|
+
each_n_gramm(string) do |n_gramm|
|
112
|
+
if (ng_list = @dict[n_gramm])
|
113
|
+
ng_list.each do |reference, dummy|
|
114
|
+
if matches.include?(reference)
|
115
|
+
matches[reference] += 1
|
116
|
+
else
|
117
|
+
matches[reference] = 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
return [] if matches.empty?
|
124
|
+
|
125
|
+
match_list = matches.to_a
|
126
|
+
|
127
|
+
# Set occurrence counters to scores relative to the best possible score.
|
128
|
+
# This will be the best possible score for a perfect match.
|
129
|
+
best_possible_score = string.length - @n + 1
|
130
|
+
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
|
131
|
+
|
132
|
+
# Delete all matches that don't have the required minimum match score.
|
133
|
+
match_list.delete_if { |a| a[1] < min_score }
|
134
|
+
|
135
|
+
# Sort the list best to worst match
|
136
|
+
match_list.sort! do |a, b|
|
137
|
+
b[1] <=> a[1]
|
138
|
+
end
|
139
|
+
|
140
|
+
# Return the top max_count matches.
|
141
|
+
match_list[0..max_count - 1]
|
142
|
+
end
|
143
|
+
|
144
|
+
# Returns some internal stats about the dictionary.
|
145
|
+
def stats
|
146
|
+
s = {}
|
147
|
+
s['dictionary_size'] = @dict.size
|
148
|
+
max = total = 0
|
149
|
+
@dict.each do |n_gramm, ng_list|
|
150
|
+
size = ng_list.length
|
151
|
+
max = size if size > max
|
152
|
+
total += size
|
153
|
+
end
|
154
|
+
s['max_list_size'] = max
|
155
|
+
s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
|
156
|
+
|
157
|
+
s
|
158
|
+
end
|
159
|
+
|
160
|
+
private
|
161
|
+
|
162
|
+
def each_n_gramm(string, &block)
|
163
|
+
return if string.length < @n
|
164
|
+
|
165
|
+
0.upto(string.length - @n) do |i|
|
166
|
+
n_gramm = string[i, @n]
|
167
|
+
|
168
|
+
yield(n_gramm)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|
data/lib/perobs/Hash.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# = Hash.rb -- Persistent Ruby Object Store
|
4
4
|
#
|
5
|
-
# Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
|
5
|
+
# Copyright (c) 2015, 2016, 2017 by Chris Schlaeger <chris@taskjuggler.org>
|
6
6
|
#
|
7
7
|
# MIT License
|
8
8
|
#
|
@@ -37,20 +37,36 @@ module PEROBS
|
|
37
37
|
# The implementation is largely a proxy around the standard Hash class. But
|
38
38
|
# all mutating methods must be re-implemented to convert PEROBS::Objects to
|
39
39
|
# POXReference objects and to register the object as modified with the
|
40
|
-
# cache.
|
40
|
+
# cache. However, it is not designed for large data sets as it always reads
|
41
|
+
# and writes the full data set for every access (unless it is cached). For
|
42
|
+
# data sets that could have more than a few hundred entries BigHash is the
|
43
|
+
# recommended alternative.
|
41
44
|
#
|
42
45
|
# We explicitly don't support Hash::store() as it conflicts with
|
43
46
|
# ObjectBase::store() method to access the store.
|
44
47
|
class Hash < ObjectBase
|
45
48
|
|
49
|
+
# These methods do not mutate the Hash. They only perform read
|
50
|
+
# operations and return a new PEROBS::Hash object.
|
51
|
+
([
|
52
|
+
:invert, :merge, :reject, :select
|
53
|
+
] + Enumerable.instance_methods).uniq.each do |method_sym|
|
54
|
+
# Create a wrapper method that passes the call to @data.
|
55
|
+
define_method(method_sym) do |*args, &block|
|
56
|
+
# Register the read operation with the cache.
|
57
|
+
@store.cache.cache_read(self)
|
58
|
+
@store.new(PEROBS::Hash, @data.send(method_sym, *args, &block))
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
46
62
|
# These methods do not mutate the Hash. They only perform read
|
47
63
|
# operations.
|
48
64
|
([
|
49
65
|
:==, :[], :assoc, :compare_by_identity, :compare_by_identity?, :default,
|
50
66
|
:default_proc, :each, :each_key, :each_pair, :each_value, :empty?,
|
51
67
|
:eql?, :fetch, :flatten, :has_key?, :has_value?, :hash, :include?,
|
52
|
-
:
|
53
|
-
:pretty_print, :pretty_print_cycle, :rassoc, :
|
68
|
+
:key, :key?, :keys, :length, :member?,
|
69
|
+
:pretty_print, :pretty_print_cycle, :rassoc, :size,
|
54
70
|
:to_a, :to_h, :to_hash, :to_s, :value?, :values, :values_at
|
55
71
|
] + Enumerable.instance_methods).uniq.each do |method_sym|
|
56
72
|
# Create a wrapper method that passes the call to @data.
|
@@ -61,11 +77,22 @@ module PEROBS
|
|
61
77
|
end
|
62
78
|
end
|
63
79
|
|
64
|
-
# These methods mutate the Hash
|
80
|
+
# These methods mutate the Hash and return self
|
81
|
+
[
|
82
|
+
:clear, :keep_if, :merge!, :rehash, :reject!, :replace, :select!, :update
|
83
|
+
].each do |method_sym|
|
84
|
+
# Create a wrapper method that passes the call to @data.
|
85
|
+
define_method(method_sym) do |*args, &block|
|
86
|
+
# Register the write operation with the cache.
|
87
|
+
@store.cache.cache_write(self)
|
88
|
+
@data.send(method_sym, *args, &block)
|
89
|
+
myself
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# These methods mutate the Hash and return basic Ruby type objects.
|
65
94
|
[
|
66
|
-
:
|
67
|
-
:initialize_copy, :keep_if, :merge!, :rehash, :reject!, :replace,
|
68
|
-
:select!, :shift, :update
|
95
|
+
:delete, :delete_if, :shift
|
69
96
|
].each do |method_sym|
|
70
97
|
# Create a wrapper method that passes the call to @data.
|
71
98
|
define_method(method_sym) do |*args, &block|
|
@@ -79,33 +106,70 @@ module PEROBS
|
|
79
106
|
# PEROBS users should never call this method or equivalents of derived
|
80
107
|
# methods directly.
|
81
108
|
# @param p [PEROBS::Handle] PEROBS handle
|
82
|
-
# @param default [
|
83
|
-
# stored for a specific key.
|
84
|
-
|
109
|
+
# @param default [Object] The default value that is returned when no value
|
110
|
+
# is stored for a specific key. The default must be of the
|
111
|
+
# supported type.
|
112
|
+
def initialize(p, default = nil, &block)
|
85
113
|
super(p)
|
86
|
-
|
87
|
-
|
114
|
+
_check_assignment_value(default)
|
115
|
+
if block_given?
|
116
|
+
@data = ::Hash.new(&block)
|
117
|
+
else
|
118
|
+
@data = ::Hash.new(default)
|
119
|
+
end
|
88
120
|
|
89
121
|
# Ensure that the newly created object will be pushed into the database.
|
90
122
|
@store.cache.cache_write(self)
|
91
123
|
end
|
92
124
|
|
125
|
+
# Proxy for assignment method.
|
126
|
+
def []=(key, value)
|
127
|
+
unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
|
128
|
+
raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
|
129
|
+
"a PEROBS object but is a #{key.class}"
|
130
|
+
end
|
131
|
+
_check_assignment_value(value)
|
132
|
+
@store.cache.cache_write(self)
|
133
|
+
@data[key] = value
|
134
|
+
end
|
135
|
+
|
136
|
+
# Proxy for default= method.
|
137
|
+
def default=(value)
|
138
|
+
_check_assignment_value(value)
|
139
|
+
@data.default=(value)
|
140
|
+
end
|
141
|
+
|
93
142
|
# Return a list of all object IDs of all persistent objects that this Hash
|
94
143
|
# is referencing.
|
95
|
-
# @return [Array of
|
144
|
+
# @return [Array of Integer] IDs of referenced objects
|
96
145
|
def _referenced_object_ids
|
97
|
-
|
98
|
-
|
146
|
+
ids = []
|
147
|
+
@data.each do |k, v|
|
148
|
+
if k && k.respond_to?(:is_poxreference?)
|
149
|
+
ids << k.id
|
150
|
+
end
|
151
|
+
if v && v.respond_to?(:is_poxreference?)
|
152
|
+
ids << v.id
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
ids
|
99
157
|
end
|
100
158
|
|
101
159
|
# This method should only be used during store repair operations. It will
|
102
160
|
# delete all references to the given object ID.
|
103
|
-
# @param id [
|
161
|
+
# @param id [Integer] targeted object ID
|
104
162
|
def _delete_reference_to_id(id)
|
163
|
+
original_length = @data.length
|
164
|
+
|
105
165
|
@data.delete_if do |k, v|
|
106
|
-
|
166
|
+
(k && k.respond_to?(:is_poxreference?) && k.id == id) ||
|
167
|
+
(v && v.respond_to?(:is_poxreference?) && v.id == id)
|
168
|
+
end
|
169
|
+
|
170
|
+
if @data.length != original_length
|
171
|
+
@store.cache.cache_write(self)
|
107
172
|
end
|
108
|
-
@store.cache.cache_write(self)
|
109
173
|
end
|
110
174
|
|
111
175
|
# Restore the persistent data from a single data structure.
|
@@ -114,8 +178,18 @@ module PEROBS
|
|
114
178
|
# @private
|
115
179
|
def _deserialize(data)
|
116
180
|
@data = {}
|
117
|
-
|
118
|
-
|
181
|
+
|
182
|
+
data.each do |k, v|
|
183
|
+
# References to other PEROBS Objects are marshalled with our own
|
184
|
+
# format. If we detect such a marshalled String we convert it into a
|
185
|
+
# POXReference object.
|
186
|
+
if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
|
187
|
+
k = POXReference.new(@store, match[1].to_i)
|
188
|
+
end
|
189
|
+
dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
|
190
|
+
@data[k] = dv
|
191
|
+
end
|
192
|
+
|
119
193
|
@data
|
120
194
|
end
|
121
195
|
|
@@ -136,26 +210,46 @@ module PEROBS
|
|
136
210
|
data = {}
|
137
211
|
|
138
212
|
@data.each do |k, v|
|
139
|
-
if
|
140
|
-
|
141
|
-
|
142
|
-
#
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
v.inspect
|
151
|
-
end
|
152
|
-
data[k] = v
|
213
|
+
if k.respond_to?(:is_poxreference?)
|
214
|
+
# JSON only supports Strings as hash keys. Since JSON is the default
|
215
|
+
# internal storage format in the database, we have to marshall
|
216
|
+
# PEROBS::Object references ourselves.
|
217
|
+
k = "#<PEROBS::POReference id=#{k.id}>"
|
218
|
+
elsif k[0..24] == '#<PEROBS::POReference id='
|
219
|
+
# This could obviously result in conflicts with 'normal' String hash
|
220
|
+
# keys. This is extremely unlikely, but we better catch this case
|
221
|
+
# before it causes hard to debug trouble.
|
222
|
+
raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
|
223
|
+
"internal representation of marshalled hash keys!"
|
153
224
|
end
|
225
|
+
data[k] = serialize_helper(v)
|
154
226
|
end
|
155
227
|
|
156
228
|
data
|
157
229
|
end
|
158
230
|
|
231
|
+
def serialize_helper(v)
|
232
|
+
if v.respond_to?(:is_poxreference?)
|
233
|
+
# References to other PEROBS objects (POXReference) are stored as
|
234
|
+
# POReference in the database.
|
235
|
+
return POReference.new(v.id)
|
236
|
+
else
|
237
|
+
# Outside of the PEROBS library all PEROBS::ObjectBase derived
|
238
|
+
# objects should not be used directly. The library only exposes them
|
239
|
+
# via POXReference proxy objects.
|
240
|
+
if v.is_a?(ObjectBase)
|
241
|
+
PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
|
242
|
+
"It is stored in a PEROBS::Hash. " +
|
243
|
+
'Have you used self() instead of myself() to ' +
|
244
|
+
"get the reference of this PEROBS object?\n" +
|
245
|
+
v.inspect
|
246
|
+
end
|
247
|
+
|
248
|
+
# All other objects are serialized by their native methods.
|
249
|
+
return v
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
159
253
|
end
|
160
254
|
|
161
255
|
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# = IDList.rb -- Persistent Ruby Object Store
|
4
|
+
#
|
5
|
+
# Copyright (c) 2018 by Chris Schlaeger <chris@taskjuggler.org>
|
6
|
+
#
|
7
|
+
# MIT License
|
8
|
+
#
|
9
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
10
|
+
# a copy of this software and associated documentation files (the
|
11
|
+
# "Software"), to deal in the Software without restriction, including
|
12
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
13
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
14
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
15
|
+
# the following conditions:
|
16
|
+
#
|
17
|
+
# The above copyright notice and this permission notice shall be
|
18
|
+
# included in all copies or substantial portions of the Software.
|
19
|
+
#
|
20
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
22
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
24
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
25
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
26
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
27
|
+
|
28
|
+
require 'perobs/IDListPageFile'
|
29
|
+
require 'perobs/IDListPageRecord'
|
30
|
+
|
31
|
+
module PEROBS
|
32
|
+
|
33
|
+
# This class stores a list of 64 bit values. Values can be added to the list
|
34
|
+
# and the presence of a certain value can be checked. It can hold up to 2^64
|
35
|
+
# values. It tries to keep values in memory but can store them in a file if
|
36
|
+
# needed. A threshold for the in-memory values can be set in the
|
37
|
+
# constructor. The stored values are grouped in pages. Each page can hold up
|
38
|
+
# to page_size entries.
|
39
|
+
class IDList
|
40
|
+
|
41
|
+
# Create a new IDList object. The data that can't be kept in memory will
|
42
|
+
# be stored in the specified directory under the given name.
|
43
|
+
# @param dir [String] Path of the directory
|
44
|
+
# @param name [String] Name of the file
|
45
|
+
# @param max_in_memory [Integer] Specifies the maximum number of values
|
46
|
+
# that will be kept in memory. If the list is larger, values will
|
47
|
+
# be cached in the specified file.
|
48
|
+
# @param page_size [Integer] The number of values per page. The default
|
49
|
+
# value is 32 which was found to be the best performing config in tests.
|
50
|
+
def initialize(dir, name, max_in_memory, page_size = 32)
|
51
|
+
# The page_file manages the pages that store the values.
|
52
|
+
@page_file = IDListPageFile.new(self, dir, name,
|
53
|
+
max_in_memory, page_size)
|
54
|
+
clear
|
55
|
+
end
|
56
|
+
|
57
|
+
# Insert a new value into the list.
|
58
|
+
# @param id [Integer] The value to add
|
59
|
+
def insert(id)
|
60
|
+
# Find the index of the page that should hold ID.
|
61
|
+
index = @page_records.bsearch_index { |pr| pr.max_id >= id }
|
62
|
+
# Get the corresponding IDListPageRecord object.
|
63
|
+
page = @page_records[index]
|
64
|
+
|
65
|
+
# In case the page is already full we'll have to create a new page.
|
66
|
+
# There is no guarantee that a split will yield a page with space as we
|
67
|
+
# split by ID range, not by distributing the values evenly across the
|
68
|
+
# two pages.
|
69
|
+
while page.is_full?
|
70
|
+
new_page = page.split
|
71
|
+
# Store the newly created page into the page_records list.
|
72
|
+
@page_records.insert(index + 1, new_page)
|
73
|
+
if id >= new_page.min_id
|
74
|
+
# We need to insert the ID into the newly created page. Adjust index
|
75
|
+
# and page reference accordingly.
|
76
|
+
index += 1
|
77
|
+
page = new_page
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Insert the ID into the page.
|
82
|
+
page.insert(id)
|
83
|
+
end
|
84
|
+
|
85
|
+
# Check if a given value is already stored in the list.
|
86
|
+
# @param id [Integer] The value to check for
|
87
|
+
def include?(id)
|
88
|
+
@page_records.bsearch { |pr| pr.max_id >= id }.include?(id)
|
89
|
+
end
|
90
|
+
|
91
|
+
# Clear the list and empty the filesystem cache file.
|
92
|
+
def clear
|
93
|
+
@page_file.clear
|
94
|
+
@page_records = [ IDListPageRecord.new(@page_file, 0, 2 ** 64) ]
|
95
|
+
end
|
96
|
+
|
97
|
+
# Erase the list including the filesystem cache file. The IDList is no
|
98
|
+
# longer usable after this call but the cache file is removed from the
|
99
|
+
# filesystem.
|
100
|
+
def erase
|
101
|
+
@page_file.erase
|
102
|
+
@page_records = nil
|
103
|
+
end
|
104
|
+
|
105
|
+
# Perform some consistency checks on the internal data structures. Raises
|
106
|
+
# a RuntimeError in case a problem is found.
|
107
|
+
def check
|
108
|
+
last_max = -1
|
109
|
+
unless (min_id = @page_records.first.min_id) == 0
|
110
|
+
raise RuntimeError, "min_id of first record (#{min_id}) " +
|
111
|
+
"must be 0."
|
112
|
+
end
|
113
|
+
|
114
|
+
@page_records.each do |pr|
|
115
|
+
unless pr.min_id == last_max + 1
|
116
|
+
raise RuntimeError, "max_id of previous record (#{last_max}) " +
|
117
|
+
"must be exactly 1 smaller than current record (#{pr.min_id})."
|
118
|
+
end
|
119
|
+
last_max = pr.max_id
|
120
|
+
pr.check
|
121
|
+
end
|
122
|
+
|
123
|
+
unless last_max == 2 ** 64
|
124
|
+
raise RuntimeError, "max_id of last records " +
|
125
|
+
"(#{@page_records.last.max_id}) must be #{2 ** 64})."
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def to_a
|
130
|
+
a = []
|
131
|
+
@page_records.each { |pr| a += pr.values }
|
132
|
+
a
|
133
|
+
end
|
134
|
+
|
135
|
+
# Print a human readable form of the tree that stores the list. This is
|
136
|
+
# only meant for debugging purposes and does not scale for larger trees.
|
137
|
+
def to_s
|
138
|
+
"\n" + @root.to_s
|
139
|
+
end
|
140
|
+
|
141
|
+
end
|
142
|
+
|
143
|
+
end
|
144
|
+
|