perobs 4.1.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -115,6 +115,10 @@ module PEROBS
115
115
  PEROBS.log.fatal "Cannot read blob header " +
116
116
  "#{id ? "for ID #{id} " : ''}at address #{addr}"
117
117
  else
118
+ if corruption_start
119
+ PEROBS.log.error "Corruption found at end of blob file at " +
120
+ "address #{addr}"
121
+ end
118
122
  # We have reached the end of the file.
119
123
  return nil
120
124
  end
@@ -122,10 +126,15 @@ module PEROBS
122
126
 
123
127
  # Did we get the full header?
124
128
  if buf_with_crc.length != LENGTH
125
- PEROBS.log.error "Incomplete FlatFileBlobHeader: Only " +
129
+ msg = "Incomplete FlatFileBlobHeader: Only " +
126
130
  "#{buf_with_crc.length} " +
127
131
  "bytes of #{LENGTH} could be read "
128
132
  "#{id ? "for ID #{id} " : ''}at address #{addr}"
133
+ if errors_are_fatal
134
+ PEROBS.log.fatal msg
135
+ else
136
+ PEROBS.log.error msg
137
+ end
129
138
  return nil
130
139
  end
131
140
 
@@ -148,10 +157,16 @@ module PEROBS
148
157
  "#{'%08x' % crc}."
149
158
  else
150
159
  if corruption_start.nil?
151
- PEROBS.log.error "FlatFile corruption found. The FlatFile " +
152
- "Header CRC mismatch at address #{addr}. Header CRC is " +
153
- "#{'%08x' % read_crc} but should be #{'%08x' % crc}. Trying " +
154
- "to find the next header."
160
+ if errors_are_fatal
161
+ PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
162
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
163
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}."
164
+ else
165
+ PEROBS.log.error "FlatFile corruption found. The FlatFile " +
166
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
167
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
168
+ "Trying to find the next header."
169
+ end
155
170
  corruption_start = addr
156
171
  end
157
172
  # The blob file is corrupted. There is no valid header at the
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # = FlatFileDB.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016, 2017, 2018
5
+ # Copyright (c) 2015, 2016, 2017, 2018, 2019
6
6
  # by Chris Schlaeger <chris@taskjuggler.org>
7
7
  #
8
8
  # MIT License
@@ -161,8 +161,8 @@ module PEROBS
161
161
  # Permanently delete all objects that have not been marked. Those are
162
162
  # orphaned and are no longer referenced by any actively used object.
163
163
  # @return [Integer] Number of the removed objects from the DB.
164
- def delete_unmarked_objects
165
- @flat_file.delete_unmarked_objects
164
+ def delete_unmarked_objects(&block)
165
+ @flat_file.delete_unmarked_objects(&block)
166
166
  end
167
167
 
168
168
  # Mark an object.
@@ -184,7 +184,11 @@ module PEROBS
184
184
  # repaired.
185
185
  # @return number of errors found
186
186
  def check_db(repair = false)
187
- @flat_file.check(repair)
187
+ if repair
188
+ @flat_file.repair
189
+ else
190
+ @flat_file.check
191
+ end
188
192
  end
189
193
 
190
194
  # Check if the stored object is syntactically correct.
@@ -0,0 +1,192 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/ObjectBase'
30
+
31
module PEROBS

  # The fuzzy string matcher can be used to perform a fuzzy string search
  # against a known set of strings. The dictionary of known strings does not
  # store the actual strings but references to arbitrary objects. These could
  # be the strings themselves, but can be something else related to the
  # learned strings. To use this class a list of strings with their
  # references must be learned first. Once the dictionary has been
  # established, fuzzy matches can be done.
  #
  # Internally every string is split into overlapping n-grams. The dictionary
  # maps each n-gram to a hash of the references that contain it.
  class FuzzyStringMatcher

    # Create a new FuzzyStringMatcher. An already existing dictionary stored
    # under the same name is re-used, otherwise an empty one is created.
    # @param store [PEROBS::Store] place to store the dictionary
    # @param name [String] Unique name of the string matcher
    # @param case_sensitive [Boolean] True if case matters for matching
    # @param n [Integer] Determines what kind of n-gram is used to store the
    #        references in the dictionary. It also determines the minimum
    #        word length that can be used for fuzzy matches.
    # @raise [ArgumentError] if n is smaller than 2 or larger than 10
    def initialize(store, name, case_sensitive = false, n = 4)
      @store = store
      @dict_name = "FuzzyStringMatcher::#{name}"
      if n < 2 || n > 10
        raise ArgumentError, 'n must be between 2 and 10'
      end
      @case_sensitive = case_sensitive
      @n = n

      clear unless (@dict = @store[@dict_name])
    end

    # Wipe the dictionary by replacing it with a new, empty one in the store.
    def clear
      @store[@dict_name] = @dict = @store.new(BigHash)
    end

    # Add a string with its reference to the dictionary.
    # @param string [String] The string to store
    # @param reference [Object] Any object that is associated with the
    #        string. When nil, the (unmodified) string itself is used.
    # @return [nil]
    #
    # NOTE(review): the per-n-gram lists are created with @store.new(Hash).
    # If that resolves to PEROBS::Hash and that class only accepts String
    # keys, references have to be Strings -- confirm against PEROBS::Hash.
    def learn(string, reference = string)
      reference = string if reference.nil?

      each_n_gramm(normalize(string)) do |n_gramm|
        unless (ng_list = @dict[n_gramm])
          @dict[n_gramm] = ng_list = @store.new(Hash)
        end

        # The stored counter starts at 0 for the first occurrence and is
        # incremented for every further occurrence of the same n-gram.
        if ng_list.include?(reference)
          ng_list[reference] += 1
        else
          ng_list[reference] = 0
        end
      end

      nil
    end

    # Find the references whose strings best match the given string.
    # @param string [String] string to search for
    # @param min_score [Float] Value between 0.01 and 1.0 that specifies how
    #        strict the matching should be done. The larger the value the
    #        closer the given string needs to be.
    # @param max_count [Integer] The maximum number of matches that should be
    #        returned.
    # @return [Array] The result is an Array of Arrays. The nested Arrays only
    #         have 2 entries. The reference and a Float value between 0 and
    #         1.0 that describes how good the match is. The matches are
    #         sorted in descending order by the match score.
    def best_matches(string, min_score = 0.5, max_count = 100)
      matches = {}

      # This will be the best possible score for a perfect match. It is the
      # number of n-grams of the search string.
      best_possible_score = 0
      each_n_gramm(normalize(string)) do |n_gramm|
        best_possible_score += 1
        next unless (ng_list = @dict[n_gramm])

        ng_list.each do |reference, _count|
          if matches.include?(reference)
            matches[reference] += 1
          else
            # We use internally a 10 times larger list so that we don't
            # throw away good matches too early. If the max_count value is
            # chosen too small there is a risk of not finding the best
            # matches!
            if matches.size > 10 * max_count
              matches = discard_worst_match(matches)
            end
            matches[reference] = 1
          end
        end
      end

      # No n-gram matched. This also covers search strings that are too
      # short to produce any n-gram, so the division below is always safe.
      return [] if matches.empty?

      # Sort in the order of occurrence count downwards.
      match_list = matches.to_a.sort { |a, b| b[1] <=> a[1] }

      # Set occurrence counters to scores relative to the best possible
      # score.
      match_list.map! { |ref, count| [ ref, count.to_f / best_possible_score ] }

      # Delete all matches that don't reach the required minimum score.
      match_list.delete_if { |match| match[1] < min_score }

      # Return at most max_count matches. A range of 0..max_count would
      # return one entry too many.
      max_count > 0 ? match_list.first(max_count) : []
    end

    # Returns some internal stats about the dictionary.
    # @return [Hash] with the String keys 'dictionary_size' (number of
    #         n-grams), 'max_list_size' (largest number of references stored
    #         for a single n-gram) and 'avg_list_size' (average number of
    #         references per n-gram).
    def stats
      s = {}
      s['dictionary_size'] = @dict.size
      max = total = 0
      @dict.each do |_n_gramm, ng_list|
        size = ng_list.length
        max = size if size > max
        total += size
      end
      s['max_list_size'] = max
      s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0

      s
    end

    private

    # Bring a string into the form that is used for the dictionary: lower
    # case unless matching is case sensitive, and enclosed in the 'start of
    # text' and 'end of text' ASCII values.
    def normalize(string)
      string = string.downcase unless @case_sensitive
      "\002" + string + "\003"
    end

    # Yield all overlapping substrings of length @n of the given string.
    # Nothing is yielded when the string is shorter than @n.
    def each_n_gramm(string, &block)
      return if string.length < @n

      0.upto(string.length - @n) do |i|
        yield(string[i, @n])
      end
    end

    # Return a new Hash that only holds the better scoring half (plus one)
    # of the given matches.
    def discard_worst_match(matches)
      # Sort in the order of occurrence count downwards.
      match_list = matches.to_a.sort { |a, b| b[1] <=> a[1] }
      # Discard the lowest half of the matches
      match_list[0..match_list.length / 2].to_h
    end

  end

end
192
+
@@ -124,6 +124,10 @@ module PEROBS
124
124
 
125
125
  # Proxy for assignment method.
126
126
  def []=(key, value)
127
+ unless key.is_a?(String)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
129
+ "#{key.class}"
130
+ end
127
131
  _check_assignment_value(value)
128
132
  @store.cache.cache_write(self)
129
133
  @data[key] = value
@@ -54,8 +54,7 @@ module PEROBS
54
54
  @file_name = File.join(dir, name + '.cache')
55
55
  @page_size = page_size
56
56
  open
57
- @pages = PersistentObjectCache.new(max_in_memory, max_in_memory / 2,
58
- IDListPage, self)
57
+ @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
59
58
  @page_counter = 0
60
59
  end
61
60
 
@@ -250,7 +250,7 @@ module PEROBS
250
250
  def _restore(level)
251
251
  # Find the most recently stored state of this object. This could be on
252
252
  # any previous stash level or in the regular object DB. If the object
253
- # was created during the transaction, there is not previous state to
253
+ # was created during the transaction, there is no previous state to
254
254
  # restore to.
255
255
  data = nil
256
256
  if @_stash_map
@@ -44,7 +44,8 @@ module PEROBS
44
44
  # cache objects.
45
45
  # @param size [Integer] Minimum number of objects to be cached at a time
46
46
  # @param flush_delay [Integer] Determines how often non-forced flushes are
47
- # ignored in a row before the flush is really done.
47
+ # ignored in a row before the flush is really done. If flush_delay
48
+ # is smaller than 0 non-forced flushes will always be ignored.
48
49
  # @param klass [Class] The class of the objects to be cached. Objects must
49
50
  # provide a uid() method that returns a unique ID for every object.
50
51
  # @param collection [] The object collection the objects belong to. It
@@ -71,8 +72,7 @@ module PEROBS
71
72
  if modified
72
73
  @modified_entries[object.uid] = object
73
74
  else
74
- index = object.uid % @size
75
- @unmodified_entries[index] = object
75
+ @unmodified_entries[object.uid % @size] = object
76
76
  end
77
77
 
78
78
  nil
@@ -111,9 +111,12 @@ module PEROBS
111
111
  # all modified objects will be written.
112
112
  # @param now [Boolean]
113
113
  def flush(now = false)
114
- if now || (@flush_counter -= 1) <= 0
114
+ if now || (@flush_delay >= 0 && (@flush_counter -= 1) <= 0)
115
115
  @modified_entries.each do |id, object|
116
116
  object.save
117
+ # Add the object to the unmodified object cache. We might still need
118
+ # it again soon.
119
+ @unmodified_entries[object.uid % @size] = object
117
120
  end
118
121
  @modified_entries = ::Hash.new
119
122
  @flush_counter = @flush_delay
@@ -0,0 +1,273 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = SpaceManager.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/BTree'
29
+ require 'perobs/EquiBlobsFile'
30
+ require 'perobs/FlatFile'
31
+ require 'perobs/FlatFileBlobHeader'
32
+
33
module PEROBS

  # The SpaceManager is used to keep a list of all the empty spaces in a
  # FlatFileDB file. An empty space is described by its starting address and
  # its length in bytes. The SpaceManager keeps a list of all the spaces and
  # can find the best fit space when a new blob needs to be added to the
  # FlatFileDB.
  #
  # The SpaceManager uses two files to store the list. The first is a file
  # with the actual addresses. This is a set of linked address lists. Each
  # list holds the addresses for spaces that have exactly the same size. The
  # second file is a BTree file that serves as the index. It is used to map
  # the length of a space to the address of the linked list for that
  # particular length. The linked list consists of elements that only hold 2
  # items. The actual address in the FlatFileDB and the address of the next
  # entry in the linked list in the list file.
  class SpaceManager

    attr_reader :added_spaces, :recycled_spaces, :failed_requests

    # Create a new SpaceManager. The backing files are not opened yet.
    # @param db_dir [String] directory that is passed on to the index and
    #        list files
    # @param progressmeter progress meter object that is passed on to the
    #        index and list files
    # @param btree_order [Integer] order of the index BTree
    def initialize(db_dir, progressmeter, btree_order = 65)
      @db_dir = db_dir
      @progressmeter = progressmeter

      @index = BTree.new(@db_dir, 'space_index', btree_order, @progressmeter)
      # The space list contains blobs that have each 2 entries. The address of
      # the space in the FlatFile and the address of the next blob in the
      # space list file that is an entry for the same space size. An address
      # of 0 marks the end of the list.
      @list = EquiBlobsFile.new(@db_dir, 'space_list', @progressmeter, 2 * 8, 1)
    end

    # Open the index and list files and reset the statistics counters.
    def open
      @index.open
      @list.open
      reset_stats
    end

    # Log the usage statistics and close both files. Does nothing if the
    # index is not open.
    def close
      return unless @index.is_open?

      PEROBS.log.info "SpaceManager has currently #{@list.total_entries} " +
        "used blobs and #{@list.total_spaces} unused blobs in list " +
        "EquiBlobsFile"
      PEROBS.log.info "#{@added_spaces} were added, #{@recycled_spaces} " +
        "spaces were recycled and #{@failed_requests} requests failed"

      @list.close
      @index.close
    end

    # @return true if the index file is open
    def is_open?
      @index.is_open?
    end

    # Sync the list and the index file.
    def sync
      @list.sync
      @index.sync
    end

    # Register a new empty space.
    # @param address [Integer] address of the space
    # @param length [Integer] size of the space in bytes
    def add_space(address, length)
      # If there is already at least one entry for this length the new entry
      # becomes the new head of that list. Otherwise it starts a new list
      # that is terminated by the address 0.
      old_head_addr = @index.get(length) || 0
      new_list_entry_addr = insert_space_in_list(address, old_head_addr)
      @index.insert(length, new_list_entry_addr)
      @added_spaces += 1
    end

    # Check if a space with the given address and length is registered.
    # @param address [Integer] address of the space
    # @param length [Integer] size of the space in bytes
    # @return [Boolean]
    def has_space?(address, length)
      list_entry_addr = @index.get(length)
      return false unless list_entry_addr

      while list_entry_addr > 0
        blob = @list.retrieve_blob(list_entry_addr)
        space_address, next_entry_addr = blob.unpack('QQ')
        return true if space_address == address
        list_entry_addr = next_entry_addr
      end

      false
    end

    # Take a space of exactly the requested length out of the list.
    # @param length [Integer] requested size in bytes
    # @return [Array, nil] Array with the address and the length of the
    #         space, or nil if no space of that length is registered
    def get_space(length)
      # We use a simple exact fit strategy. All attempts to use a more
      # elaborate scheme were actually less efficient. Non-exact matches
      # generate new spaces for the remainder and fragment the blob file with
      # lots of unusable small spaces. Most applications seem to have
      # clustered their blob sizes around a number of popular sizes. So exact
      # match is very efficient to implement and results in the highest
      # probability that a space will be reused soon.
      list_entry_addr = @index.get(length)

      unless list_entry_addr
        @failed_requests += 1
        return nil
      end

      blob = @list.retrieve_blob(list_entry_addr)
      space_address, next_entry_addr = blob.unpack('QQ')
      @list.delete_blob(list_entry_addr)

      if next_entry_addr > 0
        # Update the index entry for the length to point to the
        # following space list entry.
        @index.insert(length, next_entry_addr)
      else
        # The space list for this length is empty. Remove the entry
        # from the index.
        @index.remove(length)
      end
      @recycled_spaces += 1

      # We return the length to remain compatible with the old SpaceTree
      # API.
      [ space_address, length ]
    end

    # Remove all entries from the list and the index and reset the
    # statistics counters.
    def clear
      @list.clear
      @index.clear
      reset_stats
    end

    # Erase the list and the index file.
    def erase
      @list.erase
      @index.erase
    end

    # Check the consistency of the index and of all space lists and log some
    # statistics about the registered spaces.
    # @param flat_file [FlatFile, nil] If given, every registered space is
    #        additionally verified with flat_file.has_space?.
    # @return [Boolean] true if no problem was found, false otherwise
    def check(flat_file = nil)
      sync
      return false unless @index.check
      return false unless @list.check

      smallest_space = nil
      largest_space = nil
      total_space_bytes = 0
      space_distribution = ::Hash.new(0)

      @index.each do |length, list_entry_addr|
        if list_entry_addr <= 0
          PEROBS.log.error "list_entry_addr (#{list_entry_addr}) " +
            "must be positive"
          return false
        end

        # Detect smallest and largest space
        if smallest_space.nil? || length < smallest_space
          smallest_space = length
        end
        if largest_space.nil? || length > largest_space
          largest_space = length
        end

        entries = check_space_list(length, list_entry_addr, flat_file)
        return false unless entries

        total_space_bytes += length * entries
        space_distribution[msb(length)] += entries
      end

      # Join the buckets into plain text. Interpolating the Array directly
      # would log its inspect form.
      distribution = space_distribution.map do |bits, count|
        "#{2 ** (bits - 1)}-#{2 ** bits - 1}:#{count}; "
      end.join

      PEROBS.log.info "SpaceManager stats: smallest: #{smallest_space}; " +
        "largest: #{largest_space}; total bytes: #{total_space_bytes}; " +
        "distribution: " + distribution

      true
    end

    # Convert the registered spaces into an Array.
    # @return [Array] Array of [ address, length ] Arrays, sorted by address
    def to_a
      spaces = []

      @index.each do |length, list_entry_addr|
        while list_entry_addr > 0
          blob = @list.retrieve_blob(list_entry_addr)
          space_address, next_entry_addr = blob.unpack('QQ')

          spaces << [ space_address, length ]

          list_entry_addr = next_entry_addr
        end
      end

      spaces.sort { |x, y| x[0] <=> y[0] }
    end

    private

    # Store a new space list entry in the list file.
    # @param space_address [Integer] address of the space
    # @param next_element_addr [Integer] list file address of the next entry
    #        of the same list or 0 if this is the last entry
    # @return [Integer] list file address of the new entry
    def insert_space_in_list(space_address, next_element_addr)
      # The order must match the unpack('QQ') calls of the readers: first
      # the space address, then the address of the next list entry.
      blob = [ space_address, next_element_addr ].pack('QQ')
      @list.store_blob(blob_addr = @list.free_address, blob)

      blob_addr
    end

    # Walk the space list for the given length and validate every entry.
    # @param length [Integer] size of the spaces of this list
    # @param list_entry_addr [Integer] list file address of the first entry
    # @param flat_file [FlatFile, nil] optional file to verify the spaces
    # @return [Integer, nil] number of entries in the list or nil if a
    #         problem was found (which is logged as error)
    def check_space_list(length, list_entry_addr, flat_file)
      # All list entry addresses seen so far. A Hash is used as an insertion
      # ordered set. Every visited address must be recorded, otherwise a
      # cycle that does not include the first entry would never terminate.
      visited = { list_entry_addr => true }
      entries = 0

      while list_entry_addr > 0
        entries += 1
        unless (blob = @list.retrieve_blob(list_entry_addr))
          PEROBS.log.error "SpaceManager points to non-existing " +
            "space list entry at address #{list_entry_addr}"
          return nil
        end
        space_address, next_entry_addr = blob.unpack('QQ')

        if visited.key?(next_entry_addr)
          PEROBS.log.error "Space list is cyclic: " +
            "#{visited.keys + [ next_entry_addr ]}"
          return nil
        end
        if flat_file &&
           !flat_file.has_space?(space_address, length)
          PEROBS.log.error "SpaceManager has space at offset " +
            "#{space_address} of size #{length} that isn't " +
            "available in the FlatFile."
          return nil
        end

        visited[next_entry_addr] = true
        list_entry_addr = next_entry_addr
      end

      entries
    end

    # Determine the position of the most significant set bit.
    # @param i [Integer]
    # @return [Integer] 1-based position of the highest set bit, 0 for 0 and
    #         63 for negative values
    def msb(i)
      return 63 if i < 0

      bit = 0
      while i > 0
        bit += 1
        i = i >> 1
      end

      bit
    end

    # Set all statistics counters back to 0.
    def reset_stats
      @added_spaces = 0
      @recycled_spaces = 0
      @failed_requests = 0
    end

  end

end
273
+