perobs 3.0.1 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +19 -18
  3. data/lib/perobs.rb +2 -0
  4. data/lib/perobs/Array.rb +68 -21
  5. data/lib/perobs/BTree.rb +110 -54
  6. data/lib/perobs/BTreeBlob.rb +14 -13
  7. data/lib/perobs/BTreeDB.rb +11 -10
  8. data/lib/perobs/BTreeNode.rb +551 -197
  9. data/lib/perobs/BTreeNodeCache.rb +10 -8
  10. data/lib/perobs/BTreeNodeLink.rb +11 -1
  11. data/lib/perobs/BigArray.rb +285 -0
  12. data/lib/perobs/BigArrayNode.rb +1002 -0
  13. data/lib/perobs/BigHash.rb +246 -0
  14. data/lib/perobs/BigTree.rb +197 -0
  15. data/lib/perobs/BigTreeNode.rb +873 -0
  16. data/lib/perobs/Cache.rb +47 -22
  17. data/lib/perobs/ClassMap.rb +2 -2
  18. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  19. data/lib/perobs/DataBase.rb +4 -3
  20. data/lib/perobs/DynamoDB.rb +62 -20
  21. data/lib/perobs/EquiBlobsFile.rb +174 -59
  22. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  23. data/lib/perobs/FlatFile.rb +536 -242
  24. data/lib/perobs/FlatFileBlobHeader.rb +120 -84
  25. data/lib/perobs/FlatFileDB.rb +58 -27
  26. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  27. data/lib/perobs/Hash.rb +129 -35
  28. data/lib/perobs/IDList.rb +144 -0
  29. data/lib/perobs/IDListPage.rb +107 -0
  30. data/lib/perobs/IDListPageFile.rb +180 -0
  31. data/lib/perobs/IDListPageRecord.rb +142 -0
  32. data/lib/perobs/LockFile.rb +3 -0
  33. data/lib/perobs/Object.rb +28 -20
  34. data/lib/perobs/ObjectBase.rb +53 -10
  35. data/lib/perobs/PersistentObjectCache.rb +142 -0
  36. data/lib/perobs/PersistentObjectCacheLine.rb +99 -0
  37. data/lib/perobs/ProgressMeter.rb +97 -0
  38. data/lib/perobs/SpaceManager.rb +273 -0
  39. data/lib/perobs/SpaceTree.rb +63 -47
  40. data/lib/perobs/SpaceTreeNode.rb +134 -115
  41. data/lib/perobs/SpaceTreeNodeLink.rb +1 -1
  42. data/lib/perobs/StackFile.rb +1 -1
  43. data/lib/perobs/Store.rb +180 -70
  44. data/lib/perobs/version.rb +1 -1
  45. data/perobs.gemspec +4 -4
  46. data/test/Array_spec.rb +48 -39
  47. data/test/BTreeDB_spec.rb +2 -2
  48. data/test/BTree_spec.rb +50 -1
  49. data/test/BigArray_spec.rb +261 -0
  50. data/test/BigHash_spec.rb +152 -0
  51. data/test/BigTreeNode_spec.rb +153 -0
  52. data/test/BigTree_spec.rb +259 -0
  53. data/test/EquiBlobsFile_spec.rb +105 -5
  54. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  55. data/test/FlatFileDB_spec.rb +199 -15
  56. data/test/FuzzyStringMatcher_spec.rb +261 -0
  57. data/test/Hash_spec.rb +27 -16
  58. data/test/IDList_spec.rb +77 -0
  59. data/test/LegacyDBs/LegacyDB.rb +155 -0
  60. data/test/LegacyDBs/version_3/class_map.json +1 -0
  61. data/test/LegacyDBs/version_3/config.json +1 -0
  62. data/test/LegacyDBs/version_3/database.blobs +0 -0
  63. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  64. data/test/LegacyDBs/version_3/index.blobs +0 -0
  65. data/test/LegacyDBs/version_3/version +1 -0
  66. data/test/LockFile_spec.rb +9 -6
  67. data/test/Object_spec.rb +5 -5
  68. data/test/SpaceManager_spec.rb +176 -0
  69. data/test/SpaceTree_spec.rb +27 -9
  70. data/test/Store_spec.rb +353 -206
  71. data/test/perobs_spec.rb +7 -3
  72. data/test/spec_helper.rb +9 -4
  73. metadata +59 -16
  74. data/lib/perobs/SpaceTreeNodeCache.rb +0 -76
  75. data/lib/perobs/TreeDB.rb +0 -277
@@ -0,0 +1,175 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/Object'
30
+
31
+ module PEROBS
32
+
33
+ # The fuzzy string matcher can be used to perform a fuzzy string search
34
+ # against a known set of strings. The dictionary of known strings does not
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
43
+
44
+ # Create a new FuzzyStringMatcher.
45
+ # @param p [PEROBS::Store] place to store the dictionary
46
+ # @param case_sensitive [Boolean] True if case matters for matching
47
+ # @param n [Integer] Determines what kind of n-gramm is used to store the
48
+ # references in the dictionary. It also determines the minimum word
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
53
+ if n < 2 || n > 10
54
+ raise ArgumentError, 'n must be between 2 and 10'
55
+ end
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
58
+
59
+ clear unless @dict
60
+ end
61
+
62
+ # Wipe the dictionary.
63
+ def clear
64
+ self.dict = @store.new(BigHash)
65
+ end
66
+
67
+ # Add a string with its reference to the dictionary.
68
+ # @param string [String] The string to store
69
+ # @param reference [Object] Any object that is associated with the string
70
+ def learn(string, reference = string)
71
+ reference = string if reference.nil?
72
+
73
+ unless @case_sensitive
74
+ string = string.downcase
75
+ end
76
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
77
+ string = "\002" + string + "\003"
78
+
79
+ each_n_gramm(string) do |n_gramm|
80
+ unless (ng_list = @dict[n_gramm])
81
+ @dict[n_gramm] = ng_list = @store.new(Hash)
82
+ end
83
+
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
86
+ end
87
+
88
+ nil
89
+ end
90
+
91
+ # Find the references whose string best matches the given string.
92
+ # @param string [String] string to search for
93
+ # @param min_score [Float] Value between 0.01 and 1.0 that specifies how strict
94
+ # the matching should be done. The larger the value, the closer
95
+ # the given string needs to be.
96
+ # @param max_count [Integer] The maximum number of matches that should be
97
+ # returned.
98
+ # @return [Array] The result is an Array of Arrays. The nested Arrays only
99
+ # have 2 entries. The reference and a Float value between 0 and
100
+ # 1.0 that describes how good the match is. The matches are sorted
101
+ # in descending order by the match score.
102
+ def best_matches(string, min_score = 0.5, max_count = 100)
103
+ unless @case_sensitive
104
+ string = string.downcase
105
+ end
106
+ # Enclose string in 'start of text' and 'end of text' ASCII values.
107
+ string = "\002" + string + "\003"
108
+
109
+ matches = {}
110
+
111
+ each_n_gramm(string) do |n_gramm|
112
+ if (ng_list = @dict[n_gramm])
113
+ ng_list.each do |reference, dummy|
114
+ if matches.include?(reference)
115
+ matches[reference] += 1
116
+ else
117
+ matches[reference] = 1
118
+ end
119
+ end
120
+ end
121
+ end
122
+
123
+ return [] if matches.empty?
124
+
125
+ match_list = matches.to_a
126
+
127
+ # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
130
+ match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
131
+
132
+ # Delete all matches that don't have the required minimum match score.
133
+ match_list.delete_if { |a| a[1] < min_score }
134
+
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
142
+ end
143
+
144
+ # Returns some internal stats about the dictionary.
145
+ def stats
146
+ s = {}
147
+ s['dictionary_size'] = @dict.size
148
+ max = total = 0
149
+ @dict.each do |n_gramm, ng_list|
150
+ size = ng_list.length
151
+ max = size if size > max
152
+ total += size
153
+ end
154
+ s['max_list_size'] = max
155
+ s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
156
+
157
+ s
158
+ end
159
+
160
+ private
161
+
162
+ def each_n_gramm(string, &block)
163
+ return if string.length < @n
164
+
165
+ 0.upto(string.length - @n) do |i|
166
+ n_gramm = string[i, @n]
167
+
168
+ yield(n_gramm)
169
+ end
170
+ end
171
+
172
+ end
173
+
174
+ end
175
+
data/lib/perobs/Hash.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # = Hash.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
5
+ # Copyright (c) 2015, 2016, 2017 by Chris Schlaeger <chris@taskjuggler.org>
6
6
  #
7
7
  # MIT License
8
8
  #
@@ -37,20 +37,36 @@ module PEROBS
37
37
  # The implementation is largely a proxy around the standard Hash class. But
38
38
  # all mutating methods must be re-implemented to convert PEROBS::Objects to
39
39
  # POXReference objects and to register the object as modified with the
40
- # cache.
40
+ # cache. However, it is not designed for large data sets as it always reads
41
+ # and writes the full data set for every access (unless it is cached). For
42
+ # data sets that could have more than a few hundred entries BigHash is the
43
+ # recommended alternative.
41
44
  #
42
45
  # We explicitly don't support Hash::store() as it conflicts with
43
46
  # ObjectBase::store() method to access the store.
44
47
  class Hash < ObjectBase
45
48
 
49
+ # These methods do not mutate the Hash. They only perform read
50
+ # operations and return a new PEROBS::Hash object.
51
+ ([
52
+ :invert, :merge, :reject, :select
53
+ ] + Enumerable.instance_methods).uniq.each do |method_sym|
54
+ # Create a wrapper method that passes the call to @data.
55
+ define_method(method_sym) do |*args, &block|
56
+ # Register the read operation with the cache.
57
+ @store.cache.cache_read(self)
58
+ @store.new(PEROBS::Hash, @data.send(method_sym, *args, &block))
59
+ end
60
+ end
61
+
46
62
  # These methods do not mutate the Hash. They only perform read
47
63
  # operations.
48
64
  ([
49
65
  :==, :[], :assoc, :compare_by_identity, :compare_by_identity?, :default,
50
66
  :default_proc, :each, :each_key, :each_pair, :each_value, :empty?,
51
67
  :eql?, :fetch, :flatten, :has_key?, :has_value?, :hash, :include?,
52
- :invert, :key, :key?, :keys, :length, :member?, :merge,
53
- :pretty_print, :pretty_print_cycle, :rassoc, :reject, :select, :size,
68
+ :key, :key?, :keys, :length, :member?,
69
+ :pretty_print, :pretty_print_cycle, :rassoc, :size,
54
70
  :to_a, :to_h, :to_hash, :to_s, :value?, :values, :values_at
55
71
  ] + Enumerable.instance_methods).uniq.each do |method_sym|
56
72
  # Create a wrapper method that passes the call to @data.
@@ -61,11 +77,22 @@ module PEROBS
61
77
  end
62
78
  end
63
79
 
64
- # These methods mutate the Hash.
80
+ # These methods mutate the Hash and return self
81
+ [
82
+ :clear, :keep_if, :merge!, :rehash, :reject!, :replace, :select!, :update
83
+ ].each do |method_sym|
84
+ # Create a wrapper method that passes the call to @data.
85
+ define_method(method_sym) do |*args, &block|
86
+ # Register the write operation with the cache.
87
+ @store.cache.cache_write(self)
88
+ @data.send(method_sym, *args, &block)
89
+ myself
90
+ end
91
+ end
92
+
93
+ # These methods mutate the Hash and return basic Ruby type objects.
65
94
  [
66
- :[]=, :clear, :default=, :default_proc=, :delete, :delete_if,
67
- :initialize_copy, :keep_if, :merge!, :rehash, :reject!, :replace,
68
- :select!, :shift, :update
95
+ :delete, :delete_if, :shift
69
96
  ].each do |method_sym|
70
97
  # Create a wrapper method that passes the call to @data.
71
98
  define_method(method_sym) do |*args, &block|
@@ -79,33 +106,70 @@ module PEROBS
79
106
  # PEROBS users should never call this method or equivalents of derived
80
107
  # methods directly.
81
108
  # @param p [PEROBS::Handle] PEROBS handle
82
- # @param default [Any] The default value that is returned when no value is
83
- # stored for a specific key.
84
- def initialize(p, default = nil)
109
+ # @param default [Object] The default value that is returned when no value
110
+ # is stored for a specific key. The default must be of the
111
+ # supported type.
112
+ def initialize(p, default = nil, &block)
85
113
  super(p)
86
- @default = nil
87
- @data = {}
114
+ _check_assignment_value(default)
115
+ if block_given?
116
+ @data = ::Hash.new(&block)
117
+ else
118
+ @data = ::Hash.new(default)
119
+ end
88
120
 
89
121
  # Ensure that the newly created object will be pushed into the database.
90
122
  @store.cache.cache_write(self)
91
123
  end
92
124
 
125
+ # Proxy for assignment method.
126
+ def []=(key, value)
127
+ unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
129
+ "a PEROBS object but is a #{key.class}"
130
+ end
131
+ _check_assignment_value(value)
132
+ @store.cache.cache_write(self)
133
+ @data[key] = value
134
+ end
135
+
136
+ # Proxy for default= method.
137
+ def default=(value)
138
+ _check_assignment_value(value)
139
+ @data.default=(value)
140
+ end
141
+
93
142
  # Return a list of all object IDs of all persistent objects that this Hash
94
143
  # is referencing.
95
- # @return [Array of Fixnum or Bignum] IDs of referenced objects
144
+ # @return [Array of Integer] IDs of referenced objects
96
145
  def _referenced_object_ids
97
- @data.each_value.select { |v| v && v.respond_to?(:is_poxreference?) }.
98
- map { |o| o.id }
146
+ ids = []
147
+ @data.each do |k, v|
148
+ if k && k.respond_to?(:is_poxreference?)
149
+ ids << k.id
150
+ end
151
+ if v && v.respond_to?(:is_poxreference?)
152
+ ids << v.id
153
+ end
154
+ end
155
+
156
+ ids
99
157
  end
100
158
 
101
159
  # This method should only be used during store repair operations. It will
102
160
  # delete all references to the given object ID.
103
- # @param id [Fixnum/Bignum] targeted object ID
161
+ # @param id [Integer] targeted object ID
104
162
  def _delete_reference_to_id(id)
163
+ original_length = @data.length
164
+
105
165
  @data.delete_if do |k, v|
106
- v && v.respond_to?(:is_poxreference?) && v.id == id
166
+ (k && k.respond_to?(:is_poxreference?) && k.id == id) ||
167
+ (v && v.respond_to?(:is_poxreference?) && v.id == id)
168
+ end
169
+
170
+ if @data.length != original_length
171
+ @store.cache.cache_write(self)
107
172
  end
108
- @store.cache.cache_write(self)
109
173
  end
110
174
 
111
175
  # Restore the persistent data from a single data structure.
@@ -114,8 +178,18 @@ module PEROBS
114
178
  # @private
115
179
  def _deserialize(data)
116
180
  @data = {}
117
- data.each { |k, v| @data[k] = v.is_a?(POReference) ?
118
- POXReference.new(@store, v.id) : v }
181
+
182
+ data.each do |k, v|
183
+ # References to other PEROBS Objects are marshalled with our own
184
+ # format. If we detect such a marshalled String we convert it into a
185
+ # POXReference object.
186
+ if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
187
+ k = POXReference.new(@store, match[1].to_i)
188
+ end
189
+ dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
190
+ @data[k] = dv
191
+ end
192
+
119
193
  @data
120
194
  end
121
195
 
@@ -136,26 +210,46 @@ module PEROBS
136
210
  data = {}
137
211
 
138
212
  @data.each do |k, v|
139
- if v.respond_to?(:is_poxreference?)
140
- data[k] = POReference.new(v.id)
141
- else
142
- # Outside of the PEROBS library all PEROBS::ObjectBase derived
143
- # objects should not be used directly. The library only exposes them
144
- # via POXReference proxy objects.
145
- if v.is_a?(ObjectBase)
146
- PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
147
- "It is stored in a PEROBS::Hash with key #{k.inspect}. " +
148
- 'Have you used self() instead of myself() to ' +
149
- "get the reference of this PEROBS object?\n" +
150
- v.inspect
151
- end
152
- data[k] = v
213
+ if k.respond_to?(:is_poxreference?)
214
+ # JSON only supports Strings as hash keys. Since JSON is the default
215
+ # internal storage format in the database, we have to marshall
216
+ # PEROBS::Object references ourselves.
217
+ k = "#<PEROBS::POReference id=#{k.id}>"
218
+ elsif k[0..24] == '#<PEROBS::POReference id='
219
+ # This could obviously result in conflicts with 'normal' String hash
220
+ # keys. This is extremely unlikely, but we better catch this case
221
+ # before it causes hard to debug trouble.
222
+ raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
223
+ "internal representation of marshalled hash keys!"
153
224
  end
225
+ data[k] = serialize_helper(v)
154
226
  end
155
227
 
156
228
  data
157
229
  end
158
230
 
231
+ def serialize_helper(v)
232
+ if v.respond_to?(:is_poxreference?)
233
+ # References to other PEROBS objects (POXReference) are stored as
234
+ # POReference in the database.
235
+ return POReference.new(v.id)
236
+ else
237
+ # Outside of the PEROBS library all PEROBS::ObjectBase derived
238
+ # objects should not be used directly. The library only exposes them
239
+ # via POXReference proxy objects.
240
+ if v.is_a?(ObjectBase)
241
+ PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
242
+ "It is stored in a PEROBS::Hash. " +
243
+ 'Have you used self() instead of myself() to ' +
244
+ "get the reference of this PEROBS object?\n" +
245
+ v.inspect
246
+ end
247
+
248
+ # All other objects are serialized by their native methods.
249
+ return v
250
+ end
251
+ end
252
+
159
253
  end
160
254
 
161
255
  end
@@ -0,0 +1,144 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = IDList.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2018 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/IDListPageFile'
29
+ require 'perobs/IDListPageRecord'
30
+
31
+ module PEROBS
32
+
33
+ # This class stores a list of 64 bit values. Values can be added to the list
34
+ # and the presence of a certain value can be checked. It can hold up to 2^64
35
+ # values. It tries to keep values in memory but can store them in a file if
36
+ # needed. A threshold for the in-memory values can be set in the
37
+ # constructor. The stored values are grouped in pages. Each page can hold up
38
+ # to page_size entries.
39
+ class IDList
40
+
41
+ # Create a new IDList object. The data that can't be kept in memory will
42
+ # be stored in the specified directory under the given name.
43
+ # @param dir [String] Path of the directory
44
+ # @param name [String] Name of the file
45
+ # @param max_in_memory [Integer] Specifies the maximum number of values
46
+ # that will be kept in memory. If the list is larger, values will
47
+ # be cached in the specified file.
48
+ # @param page_size [Integer] The number of values per page. The default
49
+ # value is 32, which was found to be the best-performing config in tests.
50
+ def initialize(dir, name, max_in_memory, page_size = 32)
51
+ # The page_file manages the pages that store the values.
52
+ @page_file = IDListPageFile.new(self, dir, name,
53
+ max_in_memory, page_size)
54
+ clear
55
+ end
56
+
57
+ # Insert a new value into the list.
58
+ # @param id [Integer] The value to add
59
+ def insert(id)
60
+ # Find the index of the page that should hold ID.
61
+ index = @page_records.bsearch_index { |pr| pr.max_id >= id }
62
+ # Get the corresponding IDListPageRecord object.
63
+ page = @page_records[index]
64
+
65
+ # In case the page is already full we'll have to create a new page.
66
+ # There is no guarantee that a split will yield a page with space as we
67
+ # split by ID range, not by distributing the values evenly across the
68
+ # two pages.
69
+ while page.is_full?
70
+ new_page = page.split
71
+ # Store the newly created page into the page_records list.
72
+ @page_records.insert(index + 1, new_page)
73
+ if id >= new_page.min_id
74
+ # We need to insert the ID into the newly created page. Adjust index
75
+ # and page reference accordingly.
76
+ index += 1
77
+ page = new_page
78
+ end
79
+ end
80
+
81
+ # Insert the ID into the page.
82
+ page.insert(id)
83
+ end
84
+
85
+ # Check if a given value is already stored in the list.
86
+ # @param id [Integer] The value to check for
87
+ def include?(id)
88
+ @page_records.bsearch { |pr| pr.max_id >= id }.include?(id)
89
+ end
90
+
91
+ # Clear the list and empty the filesystem cache file.
92
+ def clear
93
+ @page_file.clear
94
+ @page_records = [ IDListPageRecord.new(@page_file, 0, 2 ** 64) ]
95
+ end
96
+
97
+ # Erase the list including the filesystem cache file. The IDList is no
98
+ # longer usable after this call but the cache file is removed from the
99
+ # filesystem.
100
+ def erase
101
+ @page_file.erase
102
+ @page_records = nil
103
+ end
104
+
105
+ # Perform some consistency checks on the internal data structures. Raises
106
+ # a RuntimeError in case a problem is found.
107
+ def check
108
+ last_max = -1
109
+ unless (min_id = @page_records.first.min_id) == 0
110
+ raise RuntimeError, "min_id of first record (#{min_id}) " +
111
+ "must be 0."
112
+ end
113
+
114
+ @page_records.each do |pr|
115
+ unless pr.min_id == last_max + 1
116
+ raise RuntimeError, "max_id of previous record (#{last_max}) " +
117
+ "must be exactly 1 smaller than current record (#{pr.min_id})."
118
+ end
119
+ last_max = pr.max_id
120
+ pr.check
121
+ end
122
+
123
+ unless last_max == 2 ** 64
124
+ raise RuntimeError, "max_id of last records " +
125
+ "(#{@page_records.last.max_id}) must be #{2 ** 64})."
126
+ end
127
+ end
128
+
129
+ def to_a
130
+ a = []
131
+ @page_records.each { |pr| a += pr.values }
132
+ a
133
+ end
134
+
135
+ # Print a human readable form of the tree that stores the list. This is
136
+ # only meant for debugging purposes and does not scale for larger trees.
137
+ def to_s
138
+ "\n" + @root.to_s
139
+ end
140
+
141
+ end
142
+
143
+ end
144
+