perobs 3.0.1 → 4.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +19 -18
  3. data/lib/perobs.rb +2 -0
  4. data/lib/perobs/Array.rb +68 -21
  5. data/lib/perobs/BTree.rb +110 -54
  6. data/lib/perobs/BTreeBlob.rb +14 -13
  7. data/lib/perobs/BTreeDB.rb +11 -10
  8. data/lib/perobs/BTreeNode.rb +551 -197
  9. data/lib/perobs/BTreeNodeCache.rb +10 -8
  10. data/lib/perobs/BTreeNodeLink.rb +11 -1
  11. data/lib/perobs/BigArray.rb +285 -0
  12. data/lib/perobs/BigArrayNode.rb +1002 -0
  13. data/lib/perobs/BigHash.rb +246 -0
  14. data/lib/perobs/BigTree.rb +197 -0
  15. data/lib/perobs/BigTreeNode.rb +873 -0
  16. data/lib/perobs/Cache.rb +47 -22
  17. data/lib/perobs/ClassMap.rb +2 -2
  18. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  19. data/lib/perobs/DataBase.rb +4 -3
  20. data/lib/perobs/DynamoDB.rb +62 -20
  21. data/lib/perobs/EquiBlobsFile.rb +174 -59
  22. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  23. data/lib/perobs/FlatFile.rb +536 -242
  24. data/lib/perobs/FlatFileBlobHeader.rb +120 -84
  25. data/lib/perobs/FlatFileDB.rb +58 -27
  26. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  27. data/lib/perobs/Hash.rb +129 -35
  28. data/lib/perobs/IDList.rb +144 -0
  29. data/lib/perobs/IDListPage.rb +107 -0
  30. data/lib/perobs/IDListPageFile.rb +180 -0
  31. data/lib/perobs/IDListPageRecord.rb +142 -0
  32. data/lib/perobs/LockFile.rb +3 -0
  33. data/lib/perobs/Object.rb +28 -20
  34. data/lib/perobs/ObjectBase.rb +53 -10
  35. data/lib/perobs/PersistentObjectCache.rb +142 -0
  36. data/lib/perobs/PersistentObjectCacheLine.rb +99 -0
  37. data/lib/perobs/ProgressMeter.rb +97 -0
  38. data/lib/perobs/SpaceManager.rb +273 -0
  39. data/lib/perobs/SpaceTree.rb +63 -47
  40. data/lib/perobs/SpaceTreeNode.rb +134 -115
  41. data/lib/perobs/SpaceTreeNodeLink.rb +1 -1
  42. data/lib/perobs/StackFile.rb +1 -1
  43. data/lib/perobs/Store.rb +180 -70
  44. data/lib/perobs/version.rb +1 -1
  45. data/perobs.gemspec +4 -4
  46. data/test/Array_spec.rb +48 -39
  47. data/test/BTreeDB_spec.rb +2 -2
  48. data/test/BTree_spec.rb +50 -1
  49. data/test/BigArray_spec.rb +261 -0
  50. data/test/BigHash_spec.rb +152 -0
  51. data/test/BigTreeNode_spec.rb +153 -0
  52. data/test/BigTree_spec.rb +259 -0
  53. data/test/EquiBlobsFile_spec.rb +105 -5
  54. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  55. data/test/FlatFileDB_spec.rb +199 -15
  56. data/test/FuzzyStringMatcher_spec.rb +261 -0
  57. data/test/Hash_spec.rb +27 -16
  58. data/test/IDList_spec.rb +77 -0
  59. data/test/LegacyDBs/LegacyDB.rb +155 -0
  60. data/test/LegacyDBs/version_3/class_map.json +1 -0
  61. data/test/LegacyDBs/version_3/config.json +1 -0
  62. data/test/LegacyDBs/version_3/database.blobs +0 -0
  63. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  64. data/test/LegacyDBs/version_3/index.blobs +0 -0
  65. data/test/LegacyDBs/version_3/version +1 -0
  66. data/test/LockFile_spec.rb +9 -6
  67. data/test/Object_spec.rb +5 -5
  68. data/test/SpaceManager_spec.rb +176 -0
  69. data/test/SpaceTree_spec.rb +27 -9
  70. data/test/Store_spec.rb +353 -206
  71. data/test/perobs_spec.rb +7 -3
  72. data/test/spec_helper.rb +9 -4
  73. metadata +59 -16
  74. data/lib/perobs/SpaceTreeNodeCache.rb +0 -76
  75. data/lib/perobs/TreeDB.rb +0 -277
@@ -0,0 +1,175 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/Object'
30
+
31
+ module PEROBS
32
+
33
+ # The fuzzy string matcher can be used to perform a fuzzy string search
34
+ # against a known set of strings. The dictionary of known strings does not
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
43
+
44
+ # Create a new FuzzyStringMatcher.
45
+ # @param p [PEROBS::Store] place to store the dictionary
46
+ # @param case_sensitive [Boolean] True if case matters for matching
47
+ # @param n [Integer] Determines what kind of n-gram is used to store the
48
+ # references in the dictionary. It also determines the minimum word
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
53
+ if n < 2 || n > 10
54
+ raise ArgumentError, 'n must be between 2 and 10'
55
+ end
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
58
+
59
+ clear unless @dict
60
+ end
61
+
62
+ # Wipe the dictionary.
63
+ def clear
64
+ self.dict = @store.new(BigHash)
65
+ end
66
+
67
+ # Add a string with its reference to the dictionary.
68
+ # @param string [String] The string to store
69
+ # @param reference [Object] Any object that is associated with the string
70
+ def learn(string, reference = string)
71
+ reference = string if reference.nil?
72
+
73
+ unless @case_sensitive
74
+ string = string.downcase
75
+ end
76
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
77
+ string = "\002" + string + "\003"
78
+
79
+ each_n_gramm(string) do |n_gramm|
80
+ unless (ng_list = @dict[n_gramm])
81
+ @dict[n_gramm] = ng_list = @store.new(Hash)
82
+ end
83
+
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
86
+ end
87
+
88
+ nil
89
+ end
90
+
91
+ # Find the references whose string best matches the given string.
92
+ # @param string [String] string to search for
93
+ # @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
94
+ # the matching should be done. The larger the value the closer
95
+ # the given string needs to be.
96
+ # @param max_count [Integer] The maximum number of matches that should be
97
+ # returned.
98
+ # @return [Array] The result is an Array of Arrays. The nested Arrays only
99
+ # have 2 entries. The reference and a Float value between 0 and
100
+ # 1.0 that describes how good the match is. The matches are sorted
101
+ # in descending order by the match score.
102
+ def best_matches(string, min_score = 0.5, max_count = 100)
103
+ unless @case_sensitive
104
+ string = string.downcase
105
+ end
106
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
107
+ string = "\002" + string + "\003"
108
+
109
+ matches = {}
110
+
111
+ each_n_gramm(string) do |n_gramm|
112
+ if (ng_list = @dict[n_gramm])
113
+ ng_list.each do |reference, dummy|
114
+ if matches.include?(reference)
115
+ matches[reference] += 1
116
+ else
117
+ matches[reference] = 1
118
+ end
119
+ end
120
+ end
121
+ end
122
+
123
+ return [] if matches.empty?
124
+
125
+ match_list = matches.to_a
126
+
127
+ # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
130
+ match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
131
+
132
+ # Delete all matches that don't have the required minimum match score.
133
+ match_list.delete_if { |a| a[1] < min_score }
134
+
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
142
+ end
143
+
144
+ # Returns some internal stats about the dictionary.
145
+ def stats
146
+ s = {}
147
+ s['dictionary_size'] = @dict.size
148
+ max = total = 0
149
+ @dict.each do |n_gramm, ng_list|
150
+ size = ng_list.length
151
+ max = size if size > max
152
+ total += size
153
+ end
154
+ s['max_list_size'] = max
155
+ s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
156
+
157
+ s
158
+ end
159
+
160
+ private
161
+
162
+ def each_n_gramm(string, &block)
163
+ return if string.length < @n
164
+
165
+ 0.upto(string.length - @n) do |i|
166
+ n_gramm = string[i, @n]
167
+
168
+ yield(n_gramm)
169
+ end
170
+ end
171
+
172
+ end
173
+
174
+ end
175
+
data/lib/perobs/Hash.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # = Hash.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
5
+ # Copyright (c) 2015, 2016, 2017 by Chris Schlaeger <chris@taskjuggler.org>
6
6
  #
7
7
  # MIT License
8
8
  #
@@ -37,20 +37,36 @@ module PEROBS
37
37
  # The implementation is largely a proxy around the standard Hash class. But
38
38
  # all mutating methods must be re-implemented to convert PEROBS::Objects to
39
39
  # POXReference objects and to register the object as modified with the
40
- # cache.
40
+ # cache. However, it is not designed for large data sets as it always reads
41
+ # and writes the full data set for every access (unless it is cached). For
42
+ # data sets that could have more than a few hundred entries BigHash is the
43
+ # recommended alternative.
41
44
  #
42
45
  # We explicitly don't support Hash::store() as it conflicts with
43
46
  # ObjectBase::store() method to access the store.
44
47
  class Hash < ObjectBase
45
48
 
49
+ # These methods do not mutate the Hash. They only perform read
50
+ # operations and return a new PEROBS::Hash object.
51
+ ([
52
+ :invert, :merge, :reject, :select
53
+ ] + Enumerable.instance_methods).uniq.each do |method_sym|
54
+ # Create a wrapper method that passes the call to @data.
55
+ define_method(method_sym) do |*args, &block|
56
+ # Register the read operation with the cache.
57
+ @store.cache.cache_read(self)
58
+ @store.new(PEROBS::Hash, @data.send(method_sym, *args, &block))
59
+ end
60
+ end
61
+
46
62
  # These methods do not mutate the Hash. They only perform read
47
63
  # operations.
48
64
  ([
49
65
  :==, :[], :assoc, :compare_by_identity, :compare_by_identity?, :default,
50
66
  :default_proc, :each, :each_key, :each_pair, :each_value, :empty?,
51
67
  :eql?, :fetch, :flatten, :has_key?, :has_value?, :hash, :include?,
52
- :invert, :key, :key?, :keys, :length, :member?, :merge,
53
- :pretty_print, :pretty_print_cycle, :rassoc, :reject, :select, :size,
68
+ :key, :key?, :keys, :length, :member?,
69
+ :pretty_print, :pretty_print_cycle, :rassoc, :size,
54
70
  :to_a, :to_h, :to_hash, :to_s, :value?, :values, :values_at
55
71
  ] + Enumerable.instance_methods).uniq.each do |method_sym|
56
72
  # Create a wrapper method that passes the call to @data.
@@ -61,11 +77,22 @@ module PEROBS
61
77
  end
62
78
  end
63
79
 
64
- # These methods mutate the Hash.
80
+ # These methods mutate the Hash and return self
81
+ [
82
+ :clear, :keep_if, :merge!, :rehash, :reject!, :replace, :select!, :update
83
+ ].each do |method_sym|
84
+ # Create a wrapper method that passes the call to @data.
85
+ define_method(method_sym) do |*args, &block|
86
+ # Register the write operation with the cache.
87
+ @store.cache.cache_write(self)
88
+ @data.send(method_sym, *args, &block)
89
+ myself
90
+ end
91
+ end
92
+
93
+ # These methods mutate the Hash and return basic Ruby type objects.
65
94
  [
66
- :[]=, :clear, :default=, :default_proc=, :delete, :delete_if,
67
- :initialize_copy, :keep_if, :merge!, :rehash, :reject!, :replace,
68
- :select!, :shift, :update
95
+ :delete, :delete_if, :shift
69
96
  ].each do |method_sym|
70
97
  # Create a wrapper method that passes the call to @data.
71
98
  define_method(method_sym) do |*args, &block|
@@ -79,33 +106,70 @@ module PEROBS
79
106
  # PEROBS users should never call this method or equivalents of derived
80
107
  # methods directly.
81
108
  # @param p [PEROBS::Handle] PEROBS handle
82
- # @param default [Any] The default value that is returned when no value is
83
- # stored for a specific key.
84
- def initialize(p, default = nil)
109
+ # @param default [Object] The default value that is returned when no value
110
+ # is stored for a specific key. The default must be of the
111
+ # supported type.
112
+ def initialize(p, default = nil, &block)
85
113
  super(p)
86
- @default = nil
87
- @data = {}
114
+ _check_assignment_value(default)
115
+ if block_given?
116
+ @data = ::Hash.new(&block)
117
+ else
118
+ @data = ::Hash.new(default)
119
+ end
88
120
 
89
121
  # Ensure that the newly created object will be pushed into the database.
90
122
  @store.cache.cache_write(self)
91
123
  end
92
124
 
125
+ # Proxy for assignment method.
126
+ def []=(key, value)
127
+ unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
129
+ "a PEROBS object but is a #{key.class}"
130
+ end
131
+ _check_assignment_value(value)
132
+ @store.cache.cache_write(self)
133
+ @data[key] = value
134
+ end
135
+
136
+ # Proxy for default= method.
137
+ def default=(value)
138
+ _check_assignment_value(value)
139
+ @data.default=(value)
140
+ end
141
+
93
142
  # Return a list of all object IDs of all persistent objects that this Hash
94
143
  # is referencing.
95
- # @return [Array of Fixnum or Bignum] IDs of referenced objects
144
+ # @return [Array of Integer] IDs of referenced objects
96
145
  def _referenced_object_ids
97
- @data.each_value.select { |v| v && v.respond_to?(:is_poxreference?) }.
98
- map { |o| o.id }
146
+ ids = []
147
+ @data.each do |k, v|
148
+ if k && k.respond_to?(:is_poxreference?)
149
+ ids << k.id
150
+ end
151
+ if v && v.respond_to?(:is_poxreference?)
152
+ ids << v.id
153
+ end
154
+ end
155
+
156
+ ids
99
157
  end
100
158
 
101
159
  # This method should only be used during store repair operations. It will
102
160
  # delete all references to the given object ID.
103
- # @param id [Fixnum/Bignum] targeted object ID
161
+ # @param id [Integer] targeted object ID
104
162
  def _delete_reference_to_id(id)
163
+ original_length = @data.length
164
+
105
165
  @data.delete_if do |k, v|
106
- v && v.respond_to?(:is_poxreference?) && v.id == id
166
+ (k && k.respond_to?(:is_poxreference?) && k.id == id) ||
167
+ (v && v.respond_to?(:is_poxreference?) && v.id == id)
168
+ end
169
+
170
+ if @data.length != original_length
171
+ @store.cache.cache_write(self)
107
172
  end
108
- @store.cache.cache_write(self)
109
173
  end
110
174
 
111
175
  # Restore the persistent data from a single data structure.
@@ -114,8 +178,18 @@ module PEROBS
114
178
  # @private
115
179
  def _deserialize(data)
116
180
  @data = {}
117
- data.each { |k, v| @data[k] = v.is_a?(POReference) ?
118
- POXReference.new(@store, v.id) : v }
181
+
182
+ data.each do |k, v|
183
+ # References to other PEROBS Objects are marshalled with our own
184
+ # format. If we detect such a marshalled String we convert it into a
185
+ # POXReference object.
186
+ if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
187
+ k = POXReference.new(@store, match[1].to_i)
188
+ end
189
+ dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
190
+ @data[k] = dv
191
+ end
192
+
119
193
  @data
120
194
  end
121
195
 
@@ -136,26 +210,46 @@ module PEROBS
136
210
  data = {}
137
211
 
138
212
  @data.each do |k, v|
139
- if v.respond_to?(:is_poxreference?)
140
- data[k] = POReference.new(v.id)
141
- else
142
- # Outside of the PEROBS library all PEROBS::ObjectBase derived
143
- # objects should not be used directly. The library only exposes them
144
- # via POXReference proxy objects.
145
- if v.is_a?(ObjectBase)
146
- PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
147
- "It is stored in a PEROBS::Hash with key #{k.inspect}. " +
148
- 'Have you used self() instead of myself() to ' +
149
- "get the reference of this PEROBS object?\n" +
150
- v.inspect
151
- end
152
- data[k] = v
213
+ if k.respond_to?(:is_poxreference?)
214
+ # JSON only supports Strings as hash keys. Since JSON is the default
215
+ # internal storage format in the database, we have to marshall
216
+ # PEROBS::Object references ourselves.
217
+ k = "#<PEROBS::POReference id=#{k.id}>"
218
+ elsif k[0..24] == '#<PEROBS::POReference id='
219
+ # This could obviously result in conflicts with 'normal' String hash
220
+ # keys. This is extremely unlikely, but we better catch this case
221
+ # before it causes hard to debug trouble.
222
+ raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
223
+ "internal representation of marshalled hash keys!"
153
224
  end
225
+ data[k] = serialize_helper(v)
154
226
  end
155
227
 
156
228
  data
157
229
  end
158
230
 
231
+ def serialize_helper(v)
232
+ if v.respond_to?(:is_poxreference?)
233
+ # References to other PEROBS objects (POXReference) are stored as
234
+ # POReference in the database.
235
+ return POReference.new(v.id)
236
+ else
237
+ # Outside of the PEROBS library all PEROBS::ObjectBase derived
238
+ # objects should not be used directly. The library only exposes them
239
+ # via POXReference proxy objects.
240
+ if v.is_a?(ObjectBase)
241
+ PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
242
+ "It is stored in a PEROBS::Hash. " +
243
+ 'Have you used self() instead of myself() to ' +
244
+ "get the reference of this PEROBS object?\n" +
245
+ v.inspect
246
+ end
247
+
248
+ # All other objects are serialized by their native methods.
249
+ return v
250
+ end
251
+ end
252
+
159
253
  end
160
254
 
161
255
  end
@@ -0,0 +1,144 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = IDList.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2018 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/IDListPageFile'
29
+ require 'perobs/IDListPageRecord'
30
+
31
+ module PEROBS
32
+
33
+ # This class stores a list of 64 bit values. Values can be added to the list
34
+ # and the presence of a certain value can be checked. It can hold up to 2^64
35
+ # values. It tries to keep values in memory but can store them in a file if
36
+ # needed. A threshold for the in-memory values can be set in the
37
+ # constructor. The stored values are grouped in pages. Each page can hold up
38
+ # to page_size entries.
39
+ class IDList
40
+
41
+ # Create a new IDList object. The data that can't be kept in memory will
42
+ # be stored in the specified directory under the given name.
43
+ # @param dir [String] Path of the directory
44
+ # @param name [String] Name of the file
45
+ # @param max_in_memory [Integer] Specifies the maximum number of values
46
+ # that will be kept in memory. If the list is larger, values will
47
+ # be cached in the specified file.
48
+ # @param page_size [Integer] The number of values per page. The default
49
+ # value is 32 which was found to be the best performing config in tests.
50
+ def initialize(dir, name, max_in_memory, page_size = 32)
51
+ # The page_file manages the pages that store the values.
52
+ @page_file = IDListPageFile.new(self, dir, name,
53
+ max_in_memory, page_size)
54
+ clear
55
+ end
56
+
57
+ # Insert a new value into the list.
58
+ # @param id [Integer] The value to add
59
+ def insert(id)
60
+ # Find the index of the page that should hold ID.
61
+ index = @page_records.bsearch_index { |pr| pr.max_id >= id }
62
+ # Get the corresponding IDListPageRecord object.
63
+ page = @page_records[index]
64
+
65
+ # In case the page is already full we'll have to create a new page.
66
+ # There is no guarantee that a split will yield a page with space as we
67
+ # split by ID range, not by distributing the values evenly across the
68
+ # two pages.
69
+ while page.is_full?
70
+ new_page = page.split
71
+ # Store the newly created page into the page_records list.
72
+ @page_records.insert(index + 1, new_page)
73
+ if id >= new_page.min_id
74
+ # We need to insert the ID into the newly created page. Adjust index
75
+ # and page reference accordingly.
76
+ index += 1
77
+ page = new_page
78
+ end
79
+ end
80
+
81
+ # Insert the ID into the page.
82
+ page.insert(id)
83
+ end
84
+
85
+ # Check if a given value is already stored in the list.
86
+ # @param id [Integer] The value to check for
87
+ def include?(id)
88
+ @page_records.bsearch { |pr| pr.max_id >= id }.include?(id)
89
+ end
90
+
91
+ # Clear the list and empty the filesystem cache file.
92
+ def clear
93
+ @page_file.clear
94
+ @page_records = [ IDListPageRecord.new(@page_file, 0, 2 ** 64) ]
95
+ end
96
+
97
+ # Erase the list including the filesystem cache file. The IDList is no
98
+ # longer usable after this call but the cache file is removed from the
99
+ # filesystem.
100
+ def erase
101
+ @page_file.erase
102
+ @page_records = nil
103
+ end
104
+
105
+ # Perform some consistency checks on the internal data structures. Raises
106
+ # a RuntimeError in case a problem is found.
107
+ def check
108
+ last_max = -1
109
+ unless (min_id = @page_records.first.min_id) == 0
110
+ raise RuntimeError, "min_id of first record (#{min_id}) " +
111
+ "must be 0."
112
+ end
113
+
114
+ @page_records.each do |pr|
115
+ unless pr.min_id == last_max + 1
116
+ raise RuntimeError, "max_id of previous record (#{last_max}) " +
117
+ "must be exactly 1 smaller than current record (#{pr.min_id})."
118
+ end
119
+ last_max = pr.max_id
120
+ pr.check
121
+ end
122
+
123
+ unless last_max == 2 ** 64
124
+ raise RuntimeError, "max_id of last records " +
125
+ "(#{@page_records.last.max_id}) must be #{2 ** 64})."
126
+ end
127
+ end
128
+
129
+ def to_a
130
+ a = []
131
+ @page_records.each { |pr| a += pr.values }
132
+ a
133
+ end
134
+
135
+ # Print a human readable form of the tree that stores the list. This is
136
+ # only meant for debugging purposes and does not scale for larger trees.
137
+ def to_s
138
+ "\n" + @root.to_s
139
+ end
140
+
141
+ end
142
+
143
+ end
144
+