perobs 4.1.0 → 4.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -115,6 +115,10 @@ module PEROBS
115
115
  PEROBS.log.fatal "Cannot read blob header " +
116
116
  "#{id ? "for ID #{id} " : ''}at address #{addr}"
117
117
  else
118
+ if corruption_start
119
+ PEROBS.log.error "Corruption found at end of blob file at " +
120
+ "address #{addr}"
121
+ end
118
122
  # We have reached the end of the file.
119
123
  return nil
120
124
  end
@@ -122,10 +126,15 @@ module PEROBS
122
126
 
123
127
  # Did we get the full header?
124
128
  if buf_with_crc.length != LENGTH
125
- PEROBS.log.error "Incomplete FlatFileBlobHeader: Only " +
129
+ msg = "Incomplete FlatFileBlobHeader: Only " +
126
130
  "#{buf_with_crc.length} " +
127
131
  "bytes of #{LENGTH} could be read "
128
132
  "#{id ? "for ID #{id} " : ''}at address #{addr}"
133
+ if errors_are_fatal
134
+ PEROBS.log.fatal msg
135
+ else
136
+ PEROBS.log.error msg
137
+ end
129
138
  return nil
130
139
  end
131
140
 
@@ -148,10 +157,16 @@ module PEROBS
148
157
  "#{'%08x' % crc}."
149
158
  else
150
159
  if corruption_start.nil?
151
- PEROBS.log.error "FlatFile corruption found. The FlatFile " +
152
- "Header CRC mismatch at address #{addr}. Header CRC is " +
153
- "#{'%08x' % read_crc} but should be #{'%08x' % crc}. Trying " +
154
- "to find the next header."
160
+ if errors_are_fatal
161
+ PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
162
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
163
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}."
164
+ else
165
+ PEROBS.log.error "FlatFile corruption found. The FlatFile " +
166
+ "Header CRC mismatch at address #{addr}. Header CRC is " +
167
+ "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
168
+ "Trying to find the next header."
169
+ end
155
170
  corruption_start = addr
156
171
  end
157
172
  # The blob file is corrupted. There is no valid header at the
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # = FlatFileDB.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2015, 2016, 2017, 2018
5
+ # Copyright (c) 2015, 2016, 2017, 2018, 2019
6
6
  # by Chris Schlaeger <chris@taskjuggler.org>
7
7
  #
8
8
  # MIT License
@@ -161,8 +161,8 @@ module PEROBS
161
161
  # Permanently delete all objects that have not been marked. Those are
162
162
  # orphaned and are no longer referenced by any actively used object.
163
163
  # @return [Integer] Number of the removed objects from the DB.
164
- def delete_unmarked_objects
165
- @flat_file.delete_unmarked_objects
164
+ def delete_unmarked_objects(&block)
165
+ @flat_file.delete_unmarked_objects(&block)
166
166
  end
167
167
 
168
168
  # Mark an object.
@@ -184,7 +184,11 @@ module PEROBS
184
184
  # repaired.
185
185
  # @return number of errors found
186
186
  def check_db(repair = false)
187
- @flat_file.check(repair)
187
+ if repair
188
+ @flat_file.repair
189
+ else
190
+ @flat_file.check
191
+ end
188
192
  end
189
193
 
190
194
  # Check if the stored object is syntactically correct.
@@ -0,0 +1,192 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/Log'
29
+ require 'perobs/ObjectBase'
30
+
31
+ module PEROBS
32
+
33
+ # The fuzzy string matcher can be used to perform a fuzzy string search
34
+ # against a known set of strings. The dictionary of known strings does not
35
+ # store the actual strings but references to arbitrary objects. These could
36
+ # be the string, but can be something else related to the learned strings.
37
+ # To use this class a list of strings with their references must be learned.
38
+ # Once the dictionary has been established, fuzzy matches can be done.
39
class FuzzyStringMatcher

  # Create a new FuzzyStringMatcher. If the store already holds a dictionary
  # under the derived name it is reused, otherwise an empty one is created.
  # @param store [PEROBS::Store] place to store the dictionary
  # @param name [String] Unique name of the string matcher
  # @param case_sensitive [Boolean] True if case matters for matching
  # @param n [Integer] Determines what kind of n-gram is used to store the
  #        references in the dictionary. It also determines the minimum word
  #        length that can be used for fuzzy matches.
  # @raise [ArgumentError] if n is smaller than 2 or larger than 10
  def initialize(store, name, case_sensitive = false, n = 4)
    @store = store
    @dict_name = "FuzzyStringMatcher::#{name}"
    if n < 2 || n > 10
      raise ArgumentError, 'n must be between 2 and 10'
    end
    @case_sensitive = case_sensitive
    @n = n

    clear unless (@dict = @store[@dict_name])
  end

  # Wipe the dictionary by replacing it with a new, empty BigHash that is
  # registered in the store under the matcher's name.
  def clear
    @store[@dict_name] = @dict = @store.new(BigHash)
  end

  # Add a string with its reference to the dictionary. Strings that are
  # (including the two enclosing marker characters) shorter than n produce
  # no n-grams and are silently not learned.
  # @param string [String] The string to store
  # @param reference [Object] Any object that is associated with the string.
  #        Defaults to the (unmodified) string itself, also when nil is
  #        passed.
  #        NOTE(review): the per-n-gram list is created via
  #        @store.new(Hash). If that resolves to PEROBS::Hash, which only
  #        accepts String keys, non-String references will be rejected --
  #        confirm.
  # @return [nil]
  def learn(string, reference = string)
    reference = string if reference.nil?

    each_n_gramm(normalize(string)) do |n_gramm|
      unless (ng_list = @dict[n_gramm])
        @dict[n_gramm] = ng_list = @store.new(Hash)
      end

      # The stored counter starts at 0 for the first sighting of a
      # reference and is incremented for every further one. best_matches
      # does not evaluate it.
      if ng_list.include?(reference)
        ng_list[reference] += 1
      else
        ng_list[reference] = 0
      end
    end

    nil
  end

  # Find the references whose strings best match the given string.
  # @param string [String] string to search for
  # @param min_score [Float] Value between 0.01 and 1.0 that specifies how
  #        strict the matching should be done. The larger the value the
  #        closer the given string needs to be.
  # @param max_count [Integer] The maximum number of matches that should be
  #        returned.
  # @return [Array] The result is an Array of Arrays. The nested Arrays only
  #         have 2 entries. The reference and a Float value between 0 and
  #         1.0 that describes how good the match is. The matches are sorted
  #         in descending order by the match score. At most max_count
  #         entries are returned.
  def best_matches(string, min_score = 0.5, max_count = 100)
    matches = {}

    # This will be the best possible score for a perfect match: one point
    # for every n-gram of the search string.
    best_possible_score = 0
    each_n_gramm(normalize(string)) do |n_gramm|
      best_possible_score += 1
      next unless (ng_list = @dict[n_gramm])

      ng_list.each do |reference, _count|
        if matches.include?(reference)
          matches[reference] += 1
        else
          # We use internally a 10 times larger list so that we don't
          # throw away good matches too early. If the max_count value is
          # chosen too small there is a risk of not finding the best
          # matches!
          if matches.size > 10 * max_count
            matches = discard_worst_match(matches)
          end
          matches[reference] = 1
        end
      end
    end

    return [] if matches.empty?

    # Sort in the order of occurrence count downwards.
    match_list = matches.sort_by { |_reference, hits| -hits }

    # Set occurrence counters to scores relative to the best possible score.
    match_list.map! do |reference, hits|
      [ reference, hits.to_f / best_possible_score ]
    end

    # Delete all matches whose score is below the requested minimum.
    match_list.delete_if { |_reference, score| score < min_score }

    # Array#first(n) returns at most n entries. A 0..max_count range would
    # return one entry more than requested.
    match_list.first(max_count)
  end

  # Returns some internal stats about the dictionary.
  # @return [Hash] with the String keys 'dictionary_size' (number of
  #         n-grams), 'max_list_size' (largest number of references stored
  #         for a single n-gram) and 'avg_list_size' (average number of
  #         references per n-gram, 0 for an empty dictionary)
  def stats
    s = {}
    s['dictionary_size'] = @dict.size
    max = total = 0
    @dict.each do |_n_gramm, ng_list|
      size = ng_list.length
      max = size if size > max
      total += size
    end
    s['max_list_size'] = max
    s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0

    s
  end

  private

  # Bring a string into the form that is used for the dictionary: lower
  # case unless the matcher is case sensitive and enclosed in the 'start of
  # text' and 'end of text' ASCII values. The passed string is not modified.
  def normalize(string)
    string = string.downcase unless @case_sensitive
    "\002" + string + "\003"
  end

  # Yield every substring of length n of the given string, from left to
  # right. Strings shorter than n yield nothing.
  def each_n_gramm(string)
    return if string.length < @n

    0.upto(string.length - @n) do |i|
      yield(string[i, @n])
    end
  end

  # Return a new Hash that only holds the better ranked half (plus one
  # entry) of the given reference => occurrence count pairs.
  def discard_worst_match(matches)
    # Sort in the order of occurrence count downwards.
    ranked = matches.sort_by { |_reference, hits| -hits }
    ranked.first(ranked.length / 2 + 1).to_h
  end

end
190
+
191
+ end
192
+
@@ -124,6 +124,10 @@ module PEROBS
124
124
 
125
125
  # Proxy for assignment method.
126
126
  def []=(key, value)
127
+ unless key.is_a?(String)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
129
+ "#{key.class}"
130
+ end
127
131
  _check_assignment_value(value)
128
132
  @store.cache.cache_write(self)
129
133
  @data[key] = value
@@ -54,8 +54,7 @@ module PEROBS
54
54
  @file_name = File.join(dir, name + '.cache')
55
55
  @page_size = page_size
56
56
  open
57
- @pages = PersistentObjectCache.new(max_in_memory, max_in_memory / 2,
58
- IDListPage, self)
57
+ @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
59
58
  @page_counter = 0
60
59
  end
61
60
 
@@ -250,7 +250,7 @@ module PEROBS
250
250
  def _restore(level)
251
251
  # Find the most recently stored state of this object. This could be on
252
252
  # any previous stash level or in the regular object DB. If the object
253
- # was created during the transaction, there is not previous state to
253
+ # was created during the transaction, there is no previous state to
254
254
  # restore to.
255
255
  data = nil
256
256
  if @_stash_map
@@ -44,7 +44,8 @@ module PEROBS
44
44
  # cache objects.
45
45
  # @param size [Integer] Minimum number of objects to be cached at a time
46
46
  # @param flush_delay [Integer] Determines how often non-forced flushes are
47
- # ignored in a row before the flush is really done.
47
+ # ignored in a row before the flush is really done. If flush_delay
48
+ # is smaller than 0, non-forced flushes will always be ignored.
48
49
  # @param klass [Class] The class of the objects to be cached. Objects must
49
50
  # provide a uid() method that returns a unique ID for every object.
50
51
  # @param collection [] The object collection the objects belong to. It
@@ -71,8 +72,7 @@ module PEROBS
71
72
  if modified
72
73
  @modified_entries[object.uid] = object
73
74
  else
74
- index = object.uid % @size
75
- @unmodified_entries[index] = object
75
+ @unmodified_entries[object.uid % @size] = object
76
76
  end
77
77
 
78
78
  nil
@@ -111,9 +111,12 @@ module PEROBS
111
111
  # all modified objects will be written.
112
112
  # @param now [Boolean]
113
113
  def flush(now = false)
114
- if now || (@flush_counter -= 1) <= 0
114
+ if now || (@flush_delay >= 0 && (@flush_counter -= 1) <= 0)
115
115
  @modified_entries.each do |id, object|
116
116
  object.save
117
+ # Add the object to the unmodified object cache. We might still need
118
+ # it again soon.
119
+ @unmodified_entries[object.uid % @size] = object
117
120
  end
118
121
  @modified_entries = ::Hash.new
119
122
  @flush_counter = @flush_delay
@@ -0,0 +1,273 @@
1
+ # encoding: UTF-8
2
+ #
3
+ # = SpaceManager.rb -- Persistent Ruby Object Store
4
+ #
5
+ # Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
6
+ #
7
+ # MIT License
8
+ #
9
+ # Permission is hereby granted, free of charge, to any person obtaining
10
+ # a copy of this software and associated documentation files (the
11
+ # "Software"), to deal in the Software without restriction, including
12
+ # without limitation the rights to use, copy, modify, merge, publish,
13
+ # distribute, sublicense, and/or sell copies of the Software, and to
14
+ # permit persons to whom the Software is furnished to do so, subject to
15
+ # the following conditions:
16
+ #
17
+ # The above copyright notice and this permission notice shall be
18
+ # included in all copies or substantial portions of the Software.
19
+ #
20
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
+
28
+ require 'perobs/BTree'
29
+ require 'perobs/EquiBlobsFile'
30
+ require 'perobs/FlatFile'
31
+ require 'perobs/FlatFileBlobHeader'
32
+
33
+ module PEROBS
34
+
35
+ # The SpaceManager is used to keep a list of all the empty spaces in a
36
+ # FlatFileDB file. An empty space is described by its starting address and
37
+ # its length in bytes. The SpaceManager keeps a list of all the spaces and
38
+ # can find the best fit space when a new blob needs to be added to the
39
+ # FlatFileDB.
40
+ #
41
+ # The SpaceManager uses two files to store the list. The first is a file
42
+ # with the actual addresses. This is a set of linked address lists. Each
43
+ # list holds the addresses for spaces that have exactly the same size. The
44
+ # second file is a BTree file that serves as the index. It is used to map
45
+ # the length of a space to the address of the linked list for that
46
+ # particular length. The linked list consists of elements that only hold 2
47
+ # items. The actual address in the FlatFileDB and the address of the next
48
+ # entry in the linked list in the list file.
49
class SpaceManager

  attr_reader :added_spaces, :recycled_spaces, :failed_requests

  # Create a new SpaceManager. The backing files are not opened yet, see
  # open().
  # @param db_dir [String] directory handed to the BTree index and the
  #        EquiBlobsFile list
  # @param progressmeter [Object] progress meter handed to the BTree index
  #        and the EquiBlobsFile list
  # @param btree_order [Integer] order handed to the BTree index
  def initialize(db_dir, progressmeter, btree_order = 65)
    @db_dir = db_dir
    @progressmeter = progressmeter

    @index = BTree.new(@db_dir, 'space_index', btree_order, @progressmeter)
    # The space list contains blobs that have each 2 entries. The address of
    # the space in the FlatFile and the address of the next blob in the
    # space list file that is an entry for the same space size. An address
    # of 0 marks the end of the list.
    @list = EquiBlobsFile.new(@db_dir, 'space_list', @progressmeter, 2 * 8, 1)
    # Make sure the statistic counters are Integers even before open() or
    # clear() have been called.
    reset_stats
  end

  # Open the index and the list file and reset the statistic counters.
  def open
    @index.open
    @list.open
    reset_stats
  end

  # Log the usage statistics and close the list file and the index. Does
  # nothing if the index is not open.
  def close
    return unless @index.is_open?

    PEROBS.log.info "SpaceManager has currently #{@list.total_entries} " +
      "used blobs and #{@list.total_spaces} unused blobs in list " +
      "EquiBlobsFile"
    PEROBS.log.info "#{@added_spaces} were added, #{@recycled_spaces} " +
      "spaces were recycled and #{@failed_requests} requests failed"

    @list.close
    @index.close
  end

  # @return true if the index is open
  def is_open?
    @index.is_open?
  end

  # Call sync on the list file and the index.
  def sync
    @list.sync
    @index.sync
  end

  # Register a new space. The new list entry becomes the head of the linked
  # list for the given length and the index is updated to point to it.
  # @param address [Integer] address of the space
  # @param length [Integer] length of the space
  def add_space(address, length)
    # If there is already at least one more entry for this length the new
    # entry is put in front of it. Otherwise the new entry is the end of
    # the list which is marked with a next address of 0.
    next_entry_addr = @index.get(length) || 0
    @index.insert(length, insert_space_in_list(address, next_entry_addr))
    @added_spaces += 1
  end

  # Check if a space with the given address and length is registered.
  # @param address [Integer] address of the space
  # @param length [Integer] length of the space
  # @return [Boolean] true if the space is known, false otherwise
  def has_space?(address, length)
    list_entry_addr = @index.get(length)
    while list_entry_addr && list_entry_addr > 0
      space_address, list_entry_addr = read_list_entry(list_entry_addr)
      return true if space_address == address
    end

    false
  end

  # Remove a space of exactly the given length from the manager and return
  # it.
  # @param length [Integer] requested length of the space
  # @return [Array, nil] Array with the address and the length of the space
  #         or nil if no space with that length is registered
  def get_space(length)
    # We use a simple exact fit strategy. All attempts to use a more
    # elaborate scheme were actually less efficient. Non-exact matches
    # generate new spaces for the remainder and fragment the blob file with
    # lots of unusable small spaces. Most applications seem to have
    # clustered their blob sizes around a number of popular sizes. So exact
    # match is very efficient to implement and results in the highest
    # probability that a space will be reused soon.
    unless (list_entry_addr = @index.get(length))
      @failed_requests += 1
      return nil
    end

    space_address, next_entry_addr = read_list_entry(list_entry_addr)
    @list.delete_blob(list_entry_addr)

    if next_entry_addr > 0
      # Update the index entry for the length to point to the
      # following space list entry.
      @index.insert(length, next_entry_addr)
    else
      # The space list for this length is empty. Remove the entry
      # from the index.
      @index.remove(length)
    end
    @recycled_spaces += 1

    # We return the length to remain compatible with the old SpaceTree
    # API.
    [ space_address, length ]
  end

  # Clear the list file and the index and reset the statistic counters.
  def clear
    @list.clear
    @index.clear
    reset_stats
  end

  # Call erase on the list file and the index.
  def erase
    @list.erase
    @index.erase
  end

  # Check the index, the list file and all linked lists for consistency and
  # log some statistics about the registered spaces.
  # @param flat_file [Object, nil] If given, every registered space is
  #        additionally verified with flat_file.has_space?(address, length).
  # @return [Boolean] true if no error was found, false otherwise
  def check(flat_file = nil)
    sync
    return false unless @index.check
    return false unless @list.check

    smallest_space = nil
    largest_space = nil
    total_space_bytes = 0
    space_distribution = ::Hash.new(0)

    @index.each do |length, list_entry_addr|
      if list_entry_addr <= 0
        PEROBS.log.error "list_entry_addr (#{list_entry_addr}) " +
          "must be positive"
        return false
      end

      # Detect smallest and largest space
      if smallest_space.nil? || length < smallest_space
        smallest_space = length
      end
      if largest_space.nil? || length > largest_space
        largest_space = length
      end

      unless (entries = check_list(length, list_entry_addr, flat_file))
        return false
      end

      total_space_bytes += length * entries
      space_distribution[msb(length)] += entries
    end

    # Join the per-bucket strings. Interpolating the Array directly would
    # log its inspect representation.
    distribution = space_distribution.map do |bits, count|
      "#{2 ** (bits - 1)}-#{2 ** bits - 1}:#{count}; "
    end.join
    PEROBS.log.info "SpaceManager stats: smallest: #{smallest_space}; " +
      "largest: #{largest_space}; total bytes: #{total_space_bytes}; " +
      "distribution: " + distribution

    true
  end

  # @return [Array] List of all registered spaces. Each space is an Array
  #         with the address and the length. The list is sorted by address.
  def to_a
    spaces = []

    @index.each do |length, list_entry_addr|
      while list_entry_addr > 0
        space_address, list_entry_addr = read_list_entry(list_entry_addr)
        spaces << [ space_address, length ]
      end
    end

    spaces.sort_by(&:first)
  end

  private

  # Store a new list entry in the list file.
  # @param space_address [Integer] address of the space
  # @param next_entry_addr [Integer] list file address of the next entry
  #        for the same length or 0 if there is none
  # @return [Integer] address of the new entry in the list file
  def insert_space_in_list(space_address, next_entry_addr)
    blob = [ space_address, next_entry_addr ].pack('QQ')
    blob_addr = @list.free_address
    @list.store_blob(blob_addr, blob)

    blob_addr
  end

  # Read the list entry at the given list file address.
  # @return [Array] the space address and the list file address of the next
  #         entry
  def read_list_entry(list_entry_addr)
    @list.retrieve_blob(list_entry_addr).unpack('QQ')
  end

  # Walk the linked list for the given length and validate every entry.
  # All visited list addresses are recorded so that a cycle is detected no
  # matter which entry it leads back to.
  # @return [Integer, nil] number of entries in the list or nil if an error
  #         was found (and logged)
  def check_list(length, list_entry_addr, flat_file)
    visited = { list_entry_addr => true }
    entries = 0

    while list_entry_addr > 0
      entries += 1
      unless (blob = @list.retrieve_blob(list_entry_addr))
        PEROBS.log.error "SpaceManager points to non-existing " +
          "space list entry at address #{list_entry_addr}"
        return nil
      end
      space_address, next_entry_addr = blob.unpack('QQ')

      if visited.include?(next_entry_addr)
        PEROBS.log.error "Space list is cyclic: " +
          "#{visited.keys + [ next_entry_addr ]}"
        return nil
      end
      visited[next_entry_addr] = true

      if flat_file &&
         !flat_file.has_space?(space_address, length)
        PEROBS.log.error "SpaceManager has space at offset " +
          "#{space_address} of size #{length} that isn't " +
          "available in the FlatFile."
        return nil
      end
      list_entry_addr = next_entry_addr
    end

    entries
  end

  # @return [Integer] 1-based position of the most significant set bit of
  #         i, 0 if i is 0 and 63 for negative values
  def msb(i)
    i < 0 ? 63 : i.bit_length
  end

  # Set all statistic counters back to 0.
  def reset_stats
    @added_spaces = 0
    @recycled_spaces = 0
    @failed_requests = 0
  end

end
271
+
272
+ end
273
+