perobs 4.2.0 → 4.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -221,6 +221,7 @@ module PEROBS
221
221
  flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
222
222
  FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
223
223
  @f.write(raw_obj)
224
+ @f.flush
224
225
  if length != -1 && raw_obj_bytesize < length
225
226
  # The new object was not appended and it did not completely fill the
226
227
  # free space. So we have to write a new header to mark the remaining
@@ -247,12 +248,11 @@ module PEROBS
247
248
  # If we had an existing object stored for the ID we have to mark
248
249
  # this entry as deleted now.
249
250
  old_header.clear_flags
251
+ @f.flush
250
252
  # And register the newly freed space with the space list.
251
253
  if @space_list.is_open?
252
254
  @space_list.add_space(old_addr, old_header.length)
253
255
  end
254
- else
255
- @f.flush
256
256
  end
257
257
  rescue IOError => e
258
258
  PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -293,7 +293,7 @@ module PEROBS
293
293
  header = FlatFileBlobHeader.read(@f, addr, id)
294
294
  if header.id != id
295
295
  PEROBS.log.fatal "Database index corrupted: Index for object " +
296
- "#{id} points to object with ID #{header.id}"
296
+ "#{id} points to object with ID #{header.id} at address #{addr}"
297
297
  end
298
298
 
299
299
  buf = nil
@@ -302,7 +302,8 @@ module PEROBS
302
302
  @f.seek(addr + FlatFileBlobHeader::LENGTH)
303
303
  buf = @f.read(header.length)
304
304
  rescue IOError => e
305
- PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
305
+ PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
306
+ e.message
306
307
  end
307
308
 
308
309
  # Uncompress the data if the compression bit is set in the flags byte.
@@ -311,12 +312,13 @@ module PEROBS
311
312
  buf = Zlib.inflate(buf)
312
313
  rescue Zlib::BufError, Zlib::DataError
313
314
  PEROBS.log.fatal "Corrupted compressed block with ID " +
314
- "#{header.id} found."
315
+ "#{id} found at address #{addr}."
315
316
  end
316
317
  end
317
318
 
318
319
  if checksum(buf) != header.crc
319
- PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
320
+ PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
321
+ "at address #{addr}"
320
322
  end
321
323
 
322
324
  buf
@@ -339,7 +341,7 @@ module PEROBS
339
341
  if @marks
340
342
  @marks.clear
341
343
  else
342
- @marks = IDList.new(@db_dir, 'marks', 8)
344
+ @marks = IDList.new(@db_dir, 'marks', item_counter)
343
345
  end
344
346
  end
345
347
 
@@ -353,7 +355,7 @@ module PEROBS
353
355
  valid_blobs = 0
354
356
 
355
357
  # Iterate over all entries.
356
- @progressmeter.start('Defragmentizing blobs file', @f.size) do |pm|
358
+ @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
357
359
  each_blob_header do |header|
358
360
  # If we have stumbled over a corrupted blob we treat it similar to a
359
361
  # deleted blob and reuse the space.
@@ -452,16 +454,14 @@ module PEROBS
452
454
  regenerate_index_and_spaces
453
455
  end
454
456
 
455
- # Check (and repair) the FlatFile.
456
- # @param repair [Boolean] True if errors should be fixed.
457
+ # Check the FlatFile.
457
458
  # @return [Integer] Number of errors found
458
- def check(repair = false)
459
+ def check()
459
460
  errors = 0
460
461
  return errors unless @f
461
462
 
462
463
  t = Time.now
463
- PEROBS.log.info "Checking FlatFile database" +
464
- "#{repair ? ' in repair mode' : ''}..."
464
+ PEROBS.log.info "Checking FlatFile database..."
465
465
 
466
466
  # First check the database blob file. Each entry should be readable and
467
467
  # correct and all IDs must be unique. We use a shadow index to keep
@@ -483,7 +483,6 @@ module PEROBS
483
483
  if buf.bytesize != header.length
484
484
  PEROBS.log.error "Premature end of file in blob with ID " +
485
485
  "#{header.id}."
486
- discard_damaged_blob(header) if repair
487
486
  errors += 1
488
487
  next
489
488
  end
@@ -496,7 +495,6 @@ module PEROBS
496
495
  rescue Zlib::BufError, Zlib::DataError
497
496
  PEROBS.log.error "Corrupted compressed block with ID " +
498
497
  "#{header.id} found."
499
- discard_damaged_blob(header) if repair
500
498
  errors += 1
501
499
  next
502
500
  end
@@ -505,7 +503,6 @@ module PEROBS
505
503
  if header.crc && checksum(buf) != header.crc
506
504
  PEROBS.log.error "Checksum failure while checking blob " +
507
505
  "with ID #{header.id}"
508
- discard_damaged_blob(header) if repair
509
506
  errors += 1
510
507
  next
511
508
  end
@@ -521,22 +518,6 @@ module PEROBS
521
518
  errors += 1
522
519
  previous_header = FlatFileBlobHeader.read(@f, previous_address,
523
520
  header.id)
524
- if repair
525
- # We have two blobs with the same ID and we must discard one of
526
- # them.
527
- if header.is_outdated?
528
- discard_damaged_blob(header)
529
- elsif previous_header.is_outdated?
530
- discard_damaged_blob(previous_header)
531
- else
532
- PEROBS.log.error "None of the blobs with same ID have " +
533
- "the outdated flag set. Deleting the smaller one."
534
- errors += 1
535
- discard_damaged_blob(header.length < previous_header.length ?
536
- header : previous_header)
537
- end
538
- next
539
- end
540
521
  else
541
522
  # ID is unique so far. Add it to the shadow index.
542
523
  new_index.insert(header.id, header.addr)
@@ -553,12 +534,6 @@ module PEROBS
553
534
  PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
554
535
  'bytes found at the end of FlatFile.'
555
536
  corrupted_blobs += 1
556
- if repair
557
- PEROBS.log.error "Truncating FlatFile to " +
558
- "#{end_of_last_healthy_blob} bytes by discarding " +
559
- "#{@f.size - end_of_last_healthy_blob} bytes"
560
- @f.truncate(end_of_last_healthy_blob)
561
- end
562
537
  end
563
538
 
564
539
  errors += corrupted_blobs
@@ -568,17 +543,19 @@ module PEROBS
568
543
  new_index.close
569
544
  new_index.erase
570
545
 
571
- if repair && corrupted_blobs > 0
572
- erase_index_files
573
- defragmentize
574
- regenerate_index_and_spaces
575
- elsif corrupted_blobs == 0
546
+ if corrupted_blobs == 0
576
547
  # Now we check the index data. It must be correct and the entries must
577
548
  # match the blob file. All entries in the index must be in the blob file
578
549
  # and vice versa.
579
550
  begin
580
551
  index_ok = @index.check do |id, address|
581
- has_id_at?(id, address)
552
+ unless has_id_at?(id, address)
553
+ PEROBS.log.error "Index contains an entry for " +
554
+ "ID #{id} at address #{address} that is not in FlatFile"
555
+ false
556
+ else
557
+ true
558
+ end
582
559
  end
583
560
  x_check_errs = 0
584
561
  space_check_ok = true
@@ -586,16 +563,13 @@ module PEROBS
586
563
  (x_check_errs = cross_check_entries) == 0
587
564
  errors += 1 unless index_ok && space_check_ok
588
565
  errors += x_check_errs
589
- regenerate_index_and_spaces if repair
590
566
  end
591
567
  rescue PEROBS::FatalError
592
568
  errors += 1
593
- regenerate_index_and_spaces if repair
594
569
  end
595
570
  end
596
571
 
597
- sync if repair
598
- PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
572
+ PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
599
573
  "#{errors} errors found."
600
574
 
601
575
  errors
@@ -604,7 +578,6 @@ module PEROBS
604
578
  # Repair the FlatFile. Unlike check(), which only reports errors, this
604
579
  # method fixes them. It simply re-creates the
605
580
  # index and space list from the blob file.
607
- # @param repair [Boolean] True if errors should be fixed.
608
581
  # @return [Integer] Number of errors found
609
582
  def repair
610
583
  errors = 0
@@ -687,17 +660,7 @@ module PEROBS
687
660
  header.id)
688
661
  # We have two blobs with the same ID and we must discard one of
689
662
  # them.
690
- if header.is_outdated?
691
- discard_damaged_blob(header)
692
- elsif previous_header.is_outdated?
693
- discard_damaged_blob(previous_header)
694
- else
695
- PEROBS.log.error "None of the blobs with same ID have " +
696
- "the outdated flag set. Deleting the smaller one."
697
- errors += 1
698
- discard_damaged_blob(header.length < previous_header.length ?
699
- header : previous_header)
700
- end
663
+ discard_duplicate_blobs(header, previous_header)
701
664
  else
702
665
  # ID is unique so far. Add it to the shadow index.
703
666
  @index.insert(header.id, header.addr)
@@ -927,6 +890,23 @@ module PEROBS
927
890
  header.clear_flags
928
891
  end
929
892
 
893
+ def discard_duplicate_blobs(header, previous_header)
894
+ if header.is_outdated?
895
+ discard_damaged_blob(header)
896
+ elsif previous_header.is_outdated?
897
+ discard_damaged_blob(previous_header)
898
+ else
899
+ smaller, larger = header.length < previous_header.length ?
900
+ [ header, previous_header ] : [ previous_header, header ]
901
+ PEROBS.log.error "None of the blobs with same ID have " +
902
+ "the outdated flag set. Deleting the smaller one " +
903
+ "at address #{smaller.addr}"
904
+ discard_damaged_blob(smaller)
905
+ @space_list.add_space(smaller.addr, smaller.length)
906
+ @index.insert(larger.id, larger.addr)
907
+ end
908
+ end
909
+
930
910
  def open_index_files(abort_on_missing_files = false)
931
911
  begin
932
912
  @index.open(abort_on_missing_files)
@@ -26,40 +26,42 @@
26
26
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
27
 
28
28
  require 'perobs/Log'
29
- require 'perobs/ObjectBase'
29
+ require 'perobs/Object'
30
30
 
31
31
  module PEROBS
32
32
 
33
33
  # The fuzzy string matcher can be used to perform a fuzzy string search
34
34
  # against a known set of strings. The dictionary of known strings does not
35
- # store the actual strings but references to arbitrary objects. These could
36
- # be the string, but can be something else related to the learned strings.
37
- # To use this class a list of strings with their references must be learned.
38
- # Once the dictionary has been established, fuzzy matches can be done.
39
- class FuzzyStringMatcher
35
+ # store the actual strings but references to String or PEROBS objects.
36
+ # Once the dictionary has been established, fuzzy matches can be done. Since
37
+ # the actual input strings are not directly stored, you cannot remove or
38
+ # modify already stored strings. To remove strings, you have to clear the
39
+ # matcher and add the strings again that you want to keep.
40
+ class FuzzyStringMatcher < PEROBS::Object
41
+
42
+ attr_persist :case_sensitive, :n, :dict
40
43
 
41
44
  # Create a new FuzzyStringMatcher.
42
- # @param store [PEROBS::Store] place to store the dictionary
43
- # @param name [String] Unique name of the string matcher
45
+ # @param p [PEROBS::Store] place to store the dictionary
44
46
  # @param case_sensitive [Boolean] True if case matters for matching
45
47
  # @param n [Integer] Determines what kind of n-gramm is used to store the
46
48
  # references in the dictionary. It also determines the minimum word
47
- # length that can be used for fuzzy matches.
48
- def initialize(store, name, case_sensitive = false, n = 4)
49
- @store = store
50
- @dict_name = "FuzzyStringMatcher::#{name}"
49
+ # length that can be used for fuzzy matches. Values between 2 and
50
+ # 10 are supported. The default is 4.
51
+ def initialize(p, case_sensitive = false, n = 4)
52
+ super(p)
51
53
  if n < 2 || n > 10
52
54
  raise ArgumentError, 'n must be between 2 and 10'
53
55
  end
54
- @case_sensitive = case_sensitive
55
- @n = n
56
+ self.case_sensitive = case_sensitive
57
+ self.n = n
56
58
 
57
- clear unless (@dict = @store[@dict_name])
59
+ clear unless @dict
58
60
  end
59
61
 
60
62
  # Wipe the dictionary.
61
63
  def clear
62
- @store[@dict_name] = @dict = @store.new(BigHash)
64
+ self.dict = @store.new(BigHash)
63
65
  end
64
66
 
65
67
  # Add a string with its reference to the dictionary.
@@ -79,11 +81,8 @@ module PEROBS
79
81
  @dict[n_gramm] = ng_list = @store.new(Hash)
80
82
  end
81
83
 
82
- if ng_list.include?(reference)
83
- ng_list[reference] += 1
84
- else
85
- ng_list[reference] = 0
86
- end
84
+ # We use the Hash as a Set. The value doesn't matter.
85
+ ng_list[reference] = true unless ng_list.include?(reference)
87
86
  end
88
87
 
89
88
  nil
@@ -109,22 +108,12 @@ module PEROBS
109
108
 
110
109
  matches = {}
111
110
 
112
- # This will be the best possible score for a perfect match.
113
- best_possible_score = 0
114
111
  each_n_gramm(string) do |n_gramm|
115
- best_possible_score += 1
116
112
  if (ng_list = @dict[n_gramm])
117
- ng_list.each do |reference, count|
113
+ ng_list.each do |reference, dummy|
118
114
  if matches.include?(reference)
119
115
  matches[reference] += 1
120
116
  else
121
- # We use internally a 10 times larger list so that we don't
122
- # throw away good matches too early. If the max_count value is
123
- # chosen too small there is a risk of not finding the best
124
- # matches!
125
- if matches.size > 10 * max_count
126
- matches = discard_worst_match(matches)
127
- end
128
117
  matches[reference] = 1
129
118
  end
130
119
  end
@@ -133,19 +122,23 @@ module PEROBS
133
122
 
134
123
  return [] if matches.empty?
135
124
 
136
- # Sort in the order of occurance count downwards.
137
- match_list = matches.to_a.sort do |a, b|
138
- b[1] <=> a[1]
139
- end
125
+ match_list = matches.to_a
140
126
 
141
127
  # Set occurrence counters to scores relative to the best possible score.
128
+ # This will be the best possible score for a perfect match.
129
+ best_possible_score = string.length - @n + 1
142
130
  match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
143
131
 
144
- # Delete all matches that occured less than half as often than the
145
- # top match.
132
+ # Delete all matches that don't have the required minimum match score.
146
133
  match_list.delete_if { |a| a[1] < min_score }
147
134
 
148
- match_list[0..max_count]
135
+ # Sort the list best to worst match
136
+ match_list.sort! do |a, b|
137
+ b[1] <=> a[1]
138
+ end
139
+
140
+ # Return the top max_count matches.
141
+ match_list[0..max_count - 1]
149
142
  end
150
143
 
151
144
  # Returns some internal stats about the dictionary.
@@ -176,16 +169,6 @@ module PEROBS
176
169
  end
177
170
  end
178
171
 
179
- def discard_worst_match(matches)
180
- # Sort in the order of occurance count downwards.
181
- match_list = matches.to_a.sort do |a, b|
182
- b[1] <=> a[1]
183
- end
184
- # Discard the lowest half of the matches
185
- match_list = match_list[0..match_list.length / 2]
186
- match_list.to_h
187
- end
188
-
189
172
  end
190
173
 
191
174
  end
data/lib/perobs/Hash.rb CHANGED
@@ -124,9 +124,9 @@ module PEROBS
124
124
 
125
125
  # Proxy for assignment method.
126
126
  def []=(key, value)
127
- unless key.is_a?(String)
128
- raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
129
- "#{key.class}"
127
+ unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
128
+ raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
129
+ "a PEROBS object but is a #{key.class}"
130
130
  end
131
131
  _check_assignment_value(value)
132
132
  @store.cache.cache_write(self)
@@ -143,18 +143,33 @@ module PEROBS
143
143
  # is referencing.
144
144
  # @return [Array of Integer] IDs of referenced objects
145
145
  def _referenced_object_ids
146
- @data.each_value.select { |v| v && v.respond_to?(:is_poxreference?) }.
147
- map { |o| o.id }
146
+ ids = []
147
+ @data.each do |k, v|
148
+ if k && k.respond_to?(:is_poxreference?)
149
+ ids << k.id
150
+ end
151
+ if v && v.respond_to?(:is_poxreference?)
152
+ ids << v.id
153
+ end
154
+ end
155
+
156
+ ids
148
157
  end
149
158
 
150
159
  # This method should only be used during store repair operations. It will
151
160
  # delete all references to the given object ID.
152
161
  # @param id [Integer] targeted object ID
153
162
  def _delete_reference_to_id(id)
163
+ original_length = @data.length
164
+
154
165
  @data.delete_if do |k, v|
155
- v && v.respond_to?(:is_poxreference?) && v.id == id
166
+ (k && k.respond_to?(:is_poxreference?) && k.id == id) ||
167
+ (v && v.respond_to?(:is_poxreference?) && v.id == id)
168
+ end
169
+
170
+ if @data.length != original_length
171
+ @store.cache.cache_write(self)
156
172
  end
157
- @store.cache.cache_write(self)
158
173
  end
159
174
 
160
175
  # Restore the persistent data from a single data structure.
@@ -163,8 +178,18 @@ module PEROBS
163
178
  # @private
164
179
  def _deserialize(data)
165
180
  @data = {}
166
- data.each { |k, v| @data[k] = v.is_a?(POReference) ?
167
- POXReference.new(@store, v.id) : v }
181
+
182
+ data.each do |k, v|
183
+ # References to other PEROBS Objects are marshalled with our own
184
+ # format. If we detect such a marshalled String we convert it into a
185
+ # POXReference object.
186
+ if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
187
+ k = POXReference.new(@store, match[1].to_i)
188
+ end
189
+ dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
190
+ @data[k] = dv
191
+ end
192
+
168
193
  @data
169
194
  end
170
195
 
@@ -185,26 +210,46 @@ module PEROBS
185
210
  data = {}
186
211
 
187
212
  @data.each do |k, v|
188
- if v.respond_to?(:is_poxreference?)
189
- data[k] = POReference.new(v.id)
190
- else
191
- # Outside of the PEROBS library all PEROBS::ObjectBase derived
192
- # objects should not be used directly. The library only exposes them
193
- # via POXReference proxy objects.
194
- if v.is_a?(ObjectBase)
195
- PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
196
- "It is stored in a PEROBS::Hash with key #{k.inspect}. " +
197
- 'Have you used self() instead of myself() to ' +
198
- "get the reference of this PEROBS object?\n" +
199
- v.inspect
200
- end
201
- data[k] = v
213
+ if k.respond_to?(:is_poxreference?)
214
+ # JSON only supports Strings as hash keys. Since JSON is the default
215
+ # internal storage format in the database, we have to marshall
216
+ # PEROBS::Object references ourselves.
217
+ k = "#<PEROBS::POReference id=#{k.id}>"
218
+ elsif k[0..24] == '#<PEROBS::POReference id='
219
+ # This could obviously result in conflicts with 'normal' String hash
220
+ # keys. This is extremely unlikely, but we better catch this case
221
+ # before it causes hard to debug trouble.
222
+ raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
223
+ "internal representation of marshalled hash keys!"
202
224
  end
225
+ data[k] = serialize_helper(v)
203
226
  end
204
227
 
205
228
  data
206
229
  end
207
230
 
231
+ def serialize_helper(v)
232
+ if v.respond_to?(:is_poxreference?)
233
+ # References to other PEROBS objects (POXReference) are stored as
234
+ # POReference in the database.
235
+ return POReference.new(v.id)
236
+ else
237
+ # Outside of the PEROBS library all PEROBS::ObjectBase derived
238
+ # objects should not be used directly. The library only exposes them
239
+ # via POXReference proxy objects.
240
+ if v.is_a?(ObjectBase)
241
+ PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
242
+ "It is stored in a PEROBS::Hash. " +
243
+ 'Have you used self() instead of myself() to ' +
244
+ "get the reference of this PEROBS object?\n" +
245
+ v.inspect
246
+ end
247
+
248
+ # All other objects are serialized by their native methods.
249
+ return v
250
+ end
251
+ end
252
+
208
253
  end
209
254
 
210
255
  end
@@ -54,7 +54,8 @@ module PEROBS
54
54
  @file_name = File.join(dir, name + '.cache')
55
55
  @page_size = page_size
56
56
  open
57
- @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
57
+ @pages = PersistentObjectCache.new(max_in_memory, max_in_memory,
58
+ IDListPage, self)
58
59
  @page_counter = 0
59
60
  end
60
61
 
@@ -65,7 +65,7 @@ module PEROBS
65
65
  end
66
66
 
67
67
  # Insert an ID into the page.
68
- # @param ID [Integer] The ID to store
68
+ # @param id [Integer] The ID to store
69
69
  def insert(id)
70
70
  unless @min_id <= id && id <= @max_id
71
71
  raise ArgumentError, "IDs for this page must be between #{@min_id} " +
data/lib/perobs/Log.rb CHANGED
@@ -42,6 +42,11 @@ module PEROBS
42
42
  # are caused by user error rather than program logic errors.
43
43
  class UsageError < StandardError ; end
44
44
 
45
+ # This is the Exception type that will be thrown when a transaction start
46
+ # failed because there is an ongoing transaction from another thread in
47
+ # progress.
48
+ class TransactionInOtherThread < StandardError ; end
49
+
45
50
  # The ILogger class is a singleton that provides a common logging mechanism
46
51
  # to all objects. It exposes essentially the same interface as the Logger
47
52
  # class, just as a singleton and extends fatal to raise a FatalError
@@ -102,6 +102,13 @@ module PEROBS
102
102
  end
103
103
  end
104
104
 
105
+ # To allow POXReference objects to be used as Hash keys we need to
106
+ # implement this function. Conveniently, we can just use the PEROBS object
107
+ # ID since that is unique.
108
+ def hash
109
+ @id
110
+ end
111
+
105
112
  # Shortcut to access the _id() method of the referenced object.
106
113
  def _id
107
114
  @id
@@ -54,7 +54,7 @@ module PEROBS
54
54
 
55
55
  # Benchmark runs showed a cache size of 128 to be a good compromise
56
56
  # between read and write performance trade-offs and memory consumption.
57
- @cache = PersistentObjectCache.new(256, -1, SpaceTreeNode, self)
57
+ @cache = PersistentObjectCache.new(256, 256, SpaceTreeNode, self)
58
58
  end
59
59
 
60
60
  # Open the SpaceTree file.