perobs 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -221,6 +221,7 @@ module PEROBS
221
221
  flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
222
222
  FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
223
223
  @f.write(raw_obj)
224
+ @f.flush
224
225
  if length != -1 && raw_obj_bytesize < length
225
226
  # The new object was not appended and it did not completely fill the
226
227
  # free space. So we have to write a new header to mark the remaining
@@ -247,12 +248,11 @@ module PEROBS
247
248
  # If we had an existing object stored for the ID we have to mark
248
249
  # this entry as deleted now.
249
250
  old_header.clear_flags
251
+ @f.flush
250
252
  # And register the newly freed space with the space list.
251
253
  if @space_list.is_open?
252
254
  @space_list.add_space(old_addr, old_header.length)
253
255
  end
254
- else
255
- @f.flush
256
256
  end
257
257
  rescue IOError => e
258
258
  PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -293,7 +293,7 @@ module PEROBS
293
293
  header = FlatFileBlobHeader.read(@f, addr, id)
294
294
  if header.id != id
295
295
  PEROBS.log.fatal "Database index corrupted: Index for object " +
296
- "#{id} points to object with ID #{header.id}"
296
+ "#{id} points to object with ID #{header.id} at address #{addr}"
297
297
  end
298
298
 
299
299
  buf = nil
@@ -302,7 +302,8 @@ module PEROBS
302
302
  @f.seek(addr + FlatFileBlobHeader::LENGTH)
303
303
  buf = @f.read(header.length)
304
304
  rescue IOError => e
305
- PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
305
+ PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
306
+ e.message
306
307
  end
307
308
 
308
309
  # Uncompress the data if the compression bit is set in the flags byte.
@@ -311,12 +312,13 @@ module PEROBS
311
312
  buf = Zlib.inflate(buf)
312
313
  rescue Zlib::BufError, Zlib::DataError
313
314
  PEROBS.log.fatal "Corrupted compressed block with ID " +
314
- "#{header.id} found."
315
+ "#{id} found at address #{addr}."
315
316
  end
316
317
  end
317
318
 
318
319
  if checksum(buf) != header.crc
319
- PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
320
+ PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
321
+ "at address #{addr}"
320
322
  end
321
323
 
322
324
  buf
@@ -339,7 +341,7 @@ module PEROBS
339
341
  if @marks
340
342
  @marks.clear
341
343
  else
342
- @marks = IDList.new(@db_dir, 'marks', 8)
344
+ @marks = IDList.new(@db_dir, 'marks', item_counter)
343
345
  end
344
346
  end
345
347
 
@@ -353,7 +355,7 @@ module PEROBS
353
355
  valid_blobs = 0
354
356
 
355
357
  # Iterate over all entries.
356
- @progressmeter.start('Defragmentizing blobs file', @f.size) do |pm|
358
+ @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
357
359
  each_blob_header do |header|
358
360
  # If we have stumbled over a corrupted blob we treat it similar to a
359
361
  # deleted blob and reuse the space.
@@ -452,16 +454,14 @@ module PEROBS
452
454
  regenerate_index_and_spaces
453
455
  end
454
456
 
455
- # Check (and repair) the FlatFile.
456
- # @param repair [Boolean] True if errors should be fixed.
457
+ # Check the FlatFile.
457
458
  # @return [Integer] Number of errors found
458
- def check(repair = false)
459
+ def check()
459
460
  errors = 0
460
461
  return errors unless @f
461
462
 
462
463
  t = Time.now
463
- PEROBS.log.info "Checking FlatFile database" +
464
- "#{repair ? ' in repair mode' : ''}..."
464
+ PEROBS.log.info "Checking FlatFile database..."
465
465
 
466
466
  # First check the database blob file. Each entry should be readable and
467
467
  # correct and all IDs must be unique. We use a shadow index to keep
@@ -483,7 +483,6 @@ module PEROBS
483
483
  if buf.bytesize != header.length
484
484
  PEROBS.log.error "Premature end of file in blob with ID " +
485
485
  "#{header.id}."
486
- discard_damaged_blob(header) if repair
487
486
  errors += 1
488
487
  next
489
488
  end
@@ -496,7 +495,6 @@ module PEROBS
496
495
  rescue Zlib::BufError, Zlib::DataError
497
496
  PEROBS.log.error "Corrupted compressed block with ID " +
498
497
  "#{header.id} found."
499
- discard_damaged_blob(header) if repair
500
498
  errors += 1
501
499
  next
502
500
  end
@@ -505,7 +503,6 @@ module PEROBS
505
503
  if header.crc && checksum(buf) != header.crc
506
504
  PEROBS.log.error "Checksum failure while checking blob " +
507
505
  "with ID #{header.id}"
508
- discard_damaged_blob(header) if repair
509
506
  errors += 1
510
507
  next
511
508
  end
@@ -521,22 +518,6 @@ module PEROBS
521
518
  errors += 1
522
519
  previous_header = FlatFileBlobHeader.read(@f, previous_address,
523
520
  header.id)
524
- if repair
525
- # We have two blobs with the same ID and we must discard one of
526
- # them.
527
- if header.is_outdated?
528
- discard_damaged_blob(header)
529
- elsif previous_header.is_outdated?
530
- discard_damaged_blob(previous_header)
531
- else
532
- PEROBS.log.error "None of the blobs with same ID have " +
533
- "the outdated flag set. Deleting the smaller one."
534
- errors += 1
535
- discard_damaged_blob(header.length < previous_header.length ?
536
- header : previous_header)
537
- end
538
- next
539
- end
540
521
  else
541
522
  # ID is unique so far. Add it to the shadow index.
542
523
  new_index.insert(header.id, header.addr)
@@ -553,12 +534,6 @@ module PEROBS
553
534
  PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
554
535
  'bytes found at the end of FlatFile.'
555
536
  corrupted_blobs += 1
556
- if repair
557
- PEROBS.log.error "Truncating FlatFile to " +
558
- "#{end_of_last_healthy_blob} bytes by discarding " +
559
- "#{@f.size - end_of_last_healthy_blob} bytes"
560
- @f.truncate(end_of_last_healthy_blob)
561
- end
562
537
  end
563
538
 
564
539
  errors += corrupted_blobs
@@ -568,17 +543,19 @@ module PEROBS
568
543
  new_index.close
569
544
  new_index.erase
570
545
 
571
- if repair && corrupted_blobs > 0
572
- erase_index_files
573
- defragmentize
574
- regenerate_index_and_spaces
575
- elsif corrupted_blobs == 0
546
+ if corrupted_blobs == 0
576
547
  # Now we check the index data. It must be correct and the entries must
577
548
  # match the blob file. All entries in the index must be in the blob file
578
549
  # and vice versa.
579
550
  begin
580
551
  index_ok = @index.check do |id, address|
581
- has_id_at?(id, address)
552
+ unless has_id_at?(id, address)
553
+ PEROBS.log.error "Index contains an entry for " +
554
+ "ID #{id} at address #{address} that is not in FlatFile"
555
+ false
556
+ else
557
+ true
558
+ end
582
559
  end
583
560
  x_check_errs = 0
584
561
  space_check_ok = true
@@ -586,16 +563,13 @@ module PEROBS
586
563
  (x_check_errs = cross_check_entries) == 0
587
564
  errors += 1 unless index_ok && space_check_ok
588
565
  errors += x_check_errs
589
- regenerate_index_and_spaces if repair
590
566
  end
591
567
  rescue PEROBS::FatalError
592
568
  errors += 1
593
- regenerate_index_and_spaces if repair
594
569
  end
595
570
  end
596
571
 
597
- sync if repair
598
- PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
572
+ PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
599
573
  "#{errors} errors found."
600
574
 
601
575
  errors
@@ -604,7 +578,6 @@ module PEROBS
604
578
  # Repair the FlatFile. In contrast to the repair functionality in the
605
579
  # check() method this method is much faster. It simply re-creates the
606
580
  # index and space list from the blob file.
607
- # @param repair [Boolean] True if errors should be fixed.
608
581
  # @return [Integer] Number of errors found
609
582
  def repair
610
583
  errors = 0
@@ -687,17 +660,7 @@ module PEROBS
687
660
  header.id)
688
661
  # We have two blobs with the same ID and we must discard one of
689
662
  # them.
690
- if header.is_outdated?
691
- discard_damaged_blob(header)
692
- elsif previous_header.is_outdated?
693
- discard_damaged_blob(previous_header)
694
- else
695
- PEROBS.log.error "None of the blobs with same ID have " +
696
- "the outdated flag set. Deleting the smaller one."
697
- errors += 1
698
- discard_damaged_blob(header.length < previous_header.length ?
699
- header : previous_header)
700
- end
663
+ discard_duplicate_blobs(header, previous_header)
701
664
  else
702
665
  # ID is unique so far. Add it to the shadow index.
703
666
  @index.insert(header.id, header.addr)
@@ -927,6 +890,23 @@ module PEROBS
927
890
  header.clear_flags
928
891
  end
929
892
 
893
+ def discard_duplicate_blobs(header, previous_header)
894
+ if header.is_outdated?
895
+ discard_damaged_blob(header)
896
+ elsif previous_header.is_outdated?
897
+ discard_damaged_blob(previous_header)
898
+ else
899
+ smaller, larger = header.length < previous_header.length ?
900
+ [ header, previous_header ] : [ previous_header, header ]
901
+ PEROBS.log.error "None of the blobs with same ID have " +
902
+ "the outdated flag set. Deleting the smaller one " +
903
+ "at address #{smaller.addr}"
904
+ discard_damaged_blob(smaller)
905
+ @space_list.add_space(smaller.addr, smaller.length)
906
+ @index.insert(larger.id, larger.addr)
907
+ end
908
+ end
909
+
930
910
  def open_index_files(abort_on_missing_files = false)
931
911
  begin
932
912
  @index.open(abort_on_missing_files)
@@ -26,40 +26,42 @@
26
26
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
27
 
28
28
  require 'perobs/Log'
29
- require 'perobs/ObjectBase'
29
+ require 'perobs/Object'
30
30
 
31
31
  module PEROBS
32
32
 
33
33
  # The fuzzy string matcher can be used to perform a fuzzy string search
34
34
  # against a known set of strings. The dictionary of known strings does not
35
- # store the actual strings but references to arbitrary objects. These could
36
- # be the string, but can be something else related to the learned strings.
37
- # To use this class a list of strings with their references must be learned.
38
- # Once the dictionary has been established, fuzzy matches can be done.
39
- class FuzzyStringMatcher
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
40
43
 
41
44
  # Create a new FuzzyStringMatcher.
42
- # @param store [PEROBS::Store] place to store the dictionary
43
- # @param name [String] Unique name of the string matcher
45
+ # @param p [PEROBS::Store] place to store the dictionary
44
46
  # @param case_sensitive [Boolean] True if case matters for matching
45
47
  # @param n [Integer] Determines what kind of n-gramm is used to store the
46
48
  # references in the dictionary. It also determines the minimum word
47
- # length that can be used for fuzzy matches.
48
- def initialize(store, name, case_sensitive = false, n = 4)
49
- @store = store
50
- @dict_name = "FuzzyStringMatcher::#{name}"
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
51
53
  if n < 2 || n > 10
52
54
  raise ArgumentError, 'n must be between 2 and 10'
53
55
  end
54
- @case_sensitive = case_sensitive
55
- @n = n
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
56
58
 
57
- clear unless (@dict = @store[@dict_name])
59
+ clear unless @dict
58
60
  end
59
61
 
60
62
  # Wipe the dictionary.
61
63
  def clear
62
- @store[@dict_name] = @dict = @store.new(BigHash)
64
+ self.dict = @store.new(BigHash)
63
65
  end
64
66
 
65
67
  # Add a string with its reference to the dictionary.
@@ -79,11 +81,8 @@ module PEROBS
79
81
  @dict[n_gramm] = ng_list = @store.new(Hash)
80
82
  end
81
83
 
82
- if ng_list.include?(reference)
83
- ng_list[reference] += 1
84
- else
85
- ng_list[reference] = 0
86
- end
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
87
86
  end
88
87
 
89
88
  nil
@@ -109,22 +108,12 @@ module PEROBS
109
108
 
110
109
  matches = {}
111
110
 
112
- # This will be the best possible score for a perfect match.
113
- best_possible_score = 0
114
111
  each_n_gramm(string) do |n_gramm|
115
- best_possible_score += 1
116
112
  if (ng_list = @dict[n_gramm])
117
- ng_list.each do |reference, count|
113
+ ng_list.each do |reference, dummy|
118
114
  if matches.include?(reference)
119
115
  matches[reference] += 1
120
116
  else
121
- # We use internally a 10 times larger list so that we don't
122
- # throw away good matches too early. If the max_count value is
123
- # chosen too small there is a risk of not finding the best
124
- # matches!
125
- if matches.size > 10 * max_count
126
- matches = discard_worst_match(matches)
127
- end
128
117
  matches[reference] = 1
129
118
  end
130
119
  end
@@ -133,19 +122,23 @@ module PEROBS
133
122
 
134
123
  return [] if matches.empty?
135
124
 
136
- # Sort in the order of occurance count downwards.
137
- match_list = matches.to_a.sort do |a, b|
138
- b[1] <=> a[1]
139
- end
125
+ match_list = matches.to_a
140
126
 
141
127
  # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
142
130
  match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
143
131
 
144
- # Delete all matches that occured less than half as often than the
145
- # top match.
132
+ # Delete all matches that don't have the required minimum match score.
146
133
  match_list.delete_if { |a| a[1] < min_score }
147
134
 
148
- match_list[0..max_count]
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
149
142
  end
150
143
 
151
144
  # Returns some internal stats about the dictionary.
@@ -176,16 +169,6 @@ module PEROBS
176
169
  end
177
170
  end
178
171
 
179
- def discard_worst_match(matches)
180
- # Sort in the order of occurance count downwards.
181
- match_list = matches.to_a.sort do |a, b|
182
- b[1] <=> a[1]
183
- end
184
- # Discard the lowest half of the matches
185
- match_list = match_list[0..match_list.length / 2]
186
- match_list.to_h
187
- end
188
-
189
172
  end
190
173
 
191
174
  end
data/lib/perobs/Hash.rb CHANGED
@@ -124,9 +124,9 @@ module PEROBS
124
124
 
125
125
  # Proxy for assignment method.
126
126
  def []=(key, value)
127
- unless key.is_a?(String)
128
- raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
129
- "#{key.class}"
127
+ unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
129
+ "a PEROBS object but is a #{key.class}"
130
130
  end
131
131
  _check_assignment_value(value)
132
132
  @store.cache.cache_write(self)
@@ -143,18 +143,33 @@ module PEROBS
143
143
  # is referencing.
144
144
  # @return [Array of Integer] IDs of referenced objects
145
145
  def _referenced_object_ids
146
- @data.each_value.select { |v| v && v.respond_to?(:is_poxreference?) }.
147
- map { |o| o.id }
146
+ ids = []
147
+ @data.each do |k, v|
148
+ if k && k.respond_to?(:is_poxreference?)
149
+ ids << k.id
150
+ end
151
+ if v && v.respond_to?(:is_poxreference?)
152
+ ids << v.id
153
+ end
154
+ end
155
+
156
+ ids
148
157
  end
149
158
 
150
159
  # This method should only be used during store repair operations. It will
151
160
  # delete all references to the given object ID.
152
161
  # @param id [Integer] targeted object ID
153
162
  def _delete_reference_to_id(id)
163
+ original_length = @data.length
164
+
154
165
  @data.delete_if do |k, v|
155
- v && v.respond_to?(:is_poxreference?) && v.id == id
166
+ (k && k.respond_to?(:is_poxreference?) && k.id == id) ||
167
+ (v && v.respond_to?(:is_poxreference?) && v.id == id)
168
+ end
169
+
170
+ if @data.length != original_length
171
+ @store.cache.cache_write(self)
156
172
  end
157
- @store.cache.cache_write(self)
158
173
  end
159
174
 
160
175
  # Restore the persistent data from a single data structure.
@@ -163,8 +178,18 @@ module PEROBS
163
178
  # @private
164
179
  def _deserialize(data)
165
180
  @data = {}
166
- data.each { |k, v| @data[k] = v.is_a?(POReference) ?
167
- POXReference.new(@store, v.id) : v }
181
+
182
+ data.each do |k, v|
183
+ # References to other PEROBS Objects are marshalled with our own
184
+ # format. If we detect such a marshalled String we convert it into a
185
+ # POXReference object.
186
+ if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
187
+ k = POXReference.new(@store, match[1].to_i)
188
+ end
189
+ dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
190
+ @data[k] = dv
191
+ end
192
+
168
193
  @data
169
194
  end
170
195
 
@@ -185,26 +210,46 @@ module PEROBS
185
210
  data = {}
186
211
 
187
212
  @data.each do |k, v|
188
- if v.respond_to?(:is_poxreference?)
189
- data[k] = POReference.new(v.id)
190
- else
191
- # Outside of the PEROBS library all PEROBS::ObjectBase derived
192
- # objects should not be used directly. The library only exposes them
193
- # via POXReference proxy objects.
194
- if v.is_a?(ObjectBase)
195
- PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
196
- "It is stored in a PEROBS::Hash with key #{k.inspect}. " +
197
- 'Have you used self() instead of myself() to ' +
198
- "get the reference of this PEROBS object?\n" +
199
- v.inspect
200
- end
201
- data[k] = v
213
+ if k.respond_to?(:is_poxreference?)
214
+ # JSON only supports Strings as hash keys. Since JSON is the default
215
+ # internal storage format in the database, we have to marshall
216
+ # PEROBS::Object references ourselves.
217
+ k = "#<PEROBS::POReference id=#{k.id}>"
218
+ elsif k[0..24] == '#<PEROBS::POReference id='
219
+ # This could obviously result in conflicts with 'normal' String hash
220
+ # keys. This is extremely unlikely, but we better catch this case
221
+ # before it causes hard to debug trouble.
222
+ raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
223
+ "internal representation of marshalled hash keys!"
202
224
  end
225
+ data[k] = serialize_helper(v)
203
226
  end
204
227
 
205
228
  data
206
229
  end
207
230
 
231
+ def serialize_helper(v)
232
+ if v.respond_to?(:is_poxreference?)
233
+ # References to other PEROBS objects (POXReference) are stored as
234
+ # POReference in the database.
235
+ return POReference.new(v.id)
236
+ else
237
+ # Outside of the PEROBS library all PEROBS::ObjectBase derived
238
+ # objects should not be used directly. The library only exposes them
239
+ # via POXReference proxy objects.
240
+ if v.is_a?(ObjectBase)
241
+ PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
242
+ "It is stored in a PEROBS::Hash. " +
243
+ 'Have you used self() instead of myself() to ' +
244
+ "get the reference of this PEROBS object?\n" +
245
+ v.inspect
246
+ end
247
+
248
+ # All other objects are serialized by their native methods.
249
+ return v
250
+ end
251
+ end
252
+
208
253
  end
209
254
 
210
255
  end
@@ -54,7 +54,8 @@ module PEROBS
54
54
  @file_name = File.join(dir, name + '.cache')
55
55
  @page_size = page_size
56
56
  open
57
- @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
57
+ @pages = PersistentObjectCache.new(max_in_memory, max_in_memory,
58
+ IDListPage, self)
58
59
  @page_counter = 0
59
60
  end
60
61
 
@@ -65,7 +65,7 @@ module PEROBS
65
65
  end
66
66
 
67
67
  # Insert an ID into the page.
68
- # @param ID [Integer] The ID to store
68
+ # @param id [Integer] The ID to store
69
69
  def insert(id)
70
70
  unless @min_id <= id && id <= @max_id
71
71
  raise ArgumentError, "IDs for this page must be between #{@min_id} " +
data/lib/perobs/Log.rb CHANGED
@@ -42,6 +42,11 @@ module PEROBS
42
42
  # are caused by user error rather than program logic errors.
43
43
  class UsageError < StandardError ; end
44
44
 
45
+ # This is the Exception type that will be thrown when a transaction start
46
+ # failed because there is an ongoing transaction from another thread in
47
+ # progress.
48
+ class TransactionInOtherThread < StandardError ; end
49
+
45
50
  # The ILogger class is a singleton that provides a common logging mechanism
46
51
  # to all objects. It exposes essentially the same interface as the Logger
47
52
  # class, just as a singleton and extends fatal to raise a FatalError
@@ -102,6 +102,13 @@ module PEROBS
102
102
  end
103
103
  end
104
104
 
105
+ # To allow POXReference objects to be used as Hash keys we need to
106
+ # implement this function. Conveniently, we can just use the PEROBS object
107
+ # ID since that is unique.
108
+ def hash
109
+ @id
110
+ end
111
+
105
112
  # Shortcut to access the _id() method of the referenced object.
106
113
  def _id
107
114
  @id
@@ -54,7 +54,7 @@ module PEROBS
54
54
 
55
55
  # Benchmark runs showed a cache size of 128 to be a good compromise
56
56
  # between read and write performance trade-offs and memory consumption.
57
- @cache = PersistentObjectCache.new(256, -1, SpaceTreeNode, self)
57
+ @cache = PersistentObjectCache.new(256, 256, SpaceTreeNode, self)
58
58
  end
59
59
 
60
60
  # Open the SpaceTree file.