perobs 4.0.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +27 -16
  3. data/lib/perobs/Array.rb +66 -19
  4. data/lib/perobs/BTree.rb +106 -15
  5. data/lib/perobs/BTreeBlob.rb +4 -3
  6. data/lib/perobs/BTreeDB.rb +5 -4
  7. data/lib/perobs/BTreeNode.rb +482 -156
  8. data/lib/perobs/BTreeNodeLink.rb +10 -0
  9. data/lib/perobs/BigArray.rb +285 -0
  10. data/lib/perobs/BigArrayNode.rb +1002 -0
  11. data/lib/perobs/BigHash.rb +246 -0
  12. data/lib/perobs/BigTree.rb +197 -0
  13. data/lib/perobs/BigTreeNode.rb +873 -0
  14. data/lib/perobs/Cache.rb +48 -10
  15. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  16. data/lib/perobs/DataBase.rb +4 -3
  17. data/lib/perobs/DynamoDB.rb +57 -15
  18. data/lib/perobs/EquiBlobsFile.rb +155 -50
  19. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  20. data/lib/perobs/FlatFile.rb +519 -227
  21. data/lib/perobs/FlatFileBlobHeader.rb +113 -54
  22. data/lib/perobs/FlatFileDB.rb +49 -23
  23. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  24. data/lib/perobs/Hash.rb +127 -33
  25. data/lib/perobs/IDList.rb +144 -0
  26. data/lib/perobs/IDListPage.rb +107 -0
  27. data/lib/perobs/IDListPageFile.rb +180 -0
  28. data/lib/perobs/IDListPageRecord.rb +142 -0
  29. data/lib/perobs/Object.rb +18 -15
  30. data/lib/perobs/ObjectBase.rb +46 -5
  31. data/lib/perobs/PersistentObjectCache.rb +57 -68
  32. data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
  33. data/lib/perobs/ProgressMeter.rb +97 -0
  34. data/lib/perobs/SpaceManager.rb +273 -0
  35. data/lib/perobs/SpaceTree.rb +21 -12
  36. data/lib/perobs/SpaceTreeNode.rb +53 -61
  37. data/lib/perobs/Store.rb +264 -145
  38. data/lib/perobs/version.rb +1 -1
  39. data/lib/perobs.rb +2 -0
  40. data/perobs.gemspec +4 -4
  41. data/test/Array_spec.rb +15 -6
  42. data/test/BTree_spec.rb +6 -2
  43. data/test/BigArray_spec.rb +261 -0
  44. data/test/BigHash_spec.rb +152 -0
  45. data/test/BigTreeNode_spec.rb +153 -0
  46. data/test/BigTree_spec.rb +259 -0
  47. data/test/EquiBlobsFile_spec.rb +105 -1
  48. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  49. data/test/FlatFileDB_spec.rb +198 -14
  50. data/test/FuzzyStringMatcher_spec.rb +261 -0
  51. data/test/Hash_spec.rb +13 -3
  52. data/test/IDList_spec.rb +77 -0
  53. data/test/LegacyDBs/LegacyDB.rb +155 -0
  54. data/test/LegacyDBs/version_3/class_map.json +1 -0
  55. data/test/LegacyDBs/version_3/config.json +1 -0
  56. data/test/LegacyDBs/version_3/database.blobs +0 -0
  57. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  58. data/test/LegacyDBs/version_3/index.blobs +0 -0
  59. data/test/LegacyDBs/version_3/version +1 -0
  60. data/test/LockFile_spec.rb +9 -6
  61. data/test/SpaceManager_spec.rb +176 -0
  62. data/test/SpaceTree_spec.rb +4 -1
  63. data/test/Store_spec.rb +305 -203
  64. data/test/spec_helper.rb +9 -4
  65. metadata +57 -16
  66. data/lib/perobs/BTreeNodeCache.rb +0 -109
  67. data/lib/perobs/TreeDB.rb +0 -277
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # = FlatFile.rb -- Persistent Ruby Object Store
4
4
  #
5
- # Copyright (c) 2016 by Chris Schlaeger <chris@taskjuggler.org>
5
+ # Copyright (c) 2016, 2018, 2019 by Chris Schlaeger <chris@taskjuggler.org>
6
6
  #
7
7
  # MIT License
8
8
  #
@@ -31,6 +31,8 @@ require 'perobs/Log'
31
31
  require 'perobs/FlatFileBlobHeader'
32
32
  require 'perobs/BTree'
33
33
  require 'perobs/SpaceTree'
34
+ require 'perobs/SpaceManager'
35
+ require 'perobs/IDList'
34
36
 
35
37
  module PEROBS
36
38
 
@@ -44,12 +46,20 @@ module PEROBS
44
46
 
45
47
  # Create a new FlatFile object for a database in the given path.
46
48
  # @param dir [String] Directory path for the data base file
47
- def initialize(dir)
49
+ def initialize(dir, progressmeter)
48
50
  @db_dir = dir
51
+ @progressmeter = progressmeter
49
52
  @f = nil
50
- @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER)
51
- @marks = BTree.new(@db_dir, 'marks', INDEX_BTREE_ORDER)
52
- @space_list = SpaceTree.new(@db_dir)
53
+ @marks = nil
54
+ @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER, @progressmeter)
55
+ old_spaces_file = File.join(@db_dir, 'database_spaces.blobs')
56
+ if File.exist?(old_spaces_file)
57
+ # PEROBS version 4.1.0 and earlier used this space list format. It is
58
+ # deprecated now. Newly created DBs use the SpaceManager format.
59
+ @space_list = SpaceTree.new(@db_dir, @progressmeter)
60
+ else
61
+ @space_list = SpaceManager.new(@db_dir, @progressmeter)
62
+ end
53
63
  end
54
64
 
55
65
  # Open the flat file for reading and writing.
@@ -74,33 +84,19 @@ module PEROBS
74
84
  end
75
85
  @f.sync = true
76
86
 
77
- begin
78
- @index.open(!new_db_created)
79
- @space_list.open
80
- rescue FatalError
81
- # Ensure that the index is really closed.
82
- @index.close
83
- # Erase it completely
84
- @index.erase
85
- # Then create it again.
86
- @index.open
87
-
88
- # Ensure that the spaces list is really closed.
89
- @space_list.close
90
- # Erase it completely
91
- @space_list.erase
92
- # Then create it again
93
- @space_list.open
94
-
95
- regenerate_index_and_spaces
96
- end
87
+ open_index_files(!new_db_created)
97
88
  end
98
89
 
99
90
  # Close the flat file. This method must be called to ensure that all data
100
91
  # is really written into the filesystem.
101
92
  def close
102
- @space_list.close
103
- @index.close
93
+ @space_list.close if @space_list.is_open?
94
+ @index.close if @index.is_open?
95
+
96
+ if @marks
97
+ @marks.erase
98
+ @marks = nil
99
+ end
104
100
 
105
101
  if @f
106
102
  @f.flush
@@ -139,29 +135,37 @@ module PEROBS
139
135
  # @param addr [Integer] Address of the blob to delete
140
136
  # @param id [Integer] ID of the blob to delete
141
137
  def delete_obj_by_address(addr, id)
142
- @index.remove(id)
143
- header = FlatFileBlobHeader.read_at(@f, addr, id)
138
+ @index.remove(id) if @index.is_open?
139
+ header = FlatFileBlobHeader.read(@f, addr, id)
144
140
  header.clear_flags
145
- @space_list.add_space(addr, header.length)
141
+ @space_list.add_space(addr, header.length) if @space_list.is_open?
146
142
  end
147
143
 
148
144
  # Delete all unmarked objects.
149
- def delete_unmarked_objects
150
- PEROBS.log.info "Deleting unmarked objects..."
151
- t = Time.now
145
+ def delete_unmarked_objects(&block)
146
+ # We don't update the index and the space list during this operation as
147
+ # we defragmentize the blob file at the end. We'll end the operation
148
+ # with an empty space list.
149
+ clear_index_files
150
+
151
+ deleted_objects_count = 0
152
+ @progressmeter.start('Sweeping unmarked objects', @f.size) do |pm|
153
+ each_blob_header do |header|
154
+ if header.is_valid? && !@marks.include?(header.id)
155
+ delete_obj_by_address(header.addr, header.id)
156
+ yield(header.id) if block_given?
157
+ deleted_objects_count += 1
158
+ end
152
159
 
153
- deleted_ids = []
154
- each_blob_header do |pos, header|
155
- if header.is_valid? && @marks.get(header.id).nil?
156
- delete_obj_by_address(pos, header.id)
157
- deleted_ids << header.id
160
+ pm.update(header.addr)
158
161
  end
159
162
  end
160
163
  defragmentize
161
164
 
162
- PEROBS.log.info "#{deleted_ids.length} unmarked objects deleted " +
163
- "in #{Time.now - t} seconds"
164
- deleted_ids
165
+ # Update the index file and create a new, empty space list.
166
+ regenerate_index_and_spaces
167
+
168
+ deleted_objects_count
165
169
  end
166
170
 
167
171
  # Write the given object into the file. This method never uses in-place
@@ -177,7 +181,7 @@ module PEROBS
177
181
  # operation is aborted or interrupted we ensure that we either have the
178
182
  # old or the new version available.
179
183
  if (old_addr = find_obj_addr_by_id(id))
180
- old_header = FlatFileBlobHeader.read_at(@f, old_addr)
184
+ old_header = FlatFileBlobHeader.read(@f, old_addr)
181
185
  old_header.set_outdated_flag
182
186
  end
183
187
 
@@ -188,22 +192,24 @@ module PEROBS
188
192
  # performance impact of compression is not compensated by writing
189
193
  # less data to the storage.
190
194
  compressed = false
191
- if raw_obj.bytesize > 256
195
+ raw_obj_bytesize = raw_obj.bytesize
196
+ if raw_obj_bytesize > 256
192
197
  raw_obj = Zlib.deflate(raw_obj)
198
+ raw_obj_bytesize = raw_obj.bytesize
193
199
  compressed = true
194
200
  end
195
201
 
196
- addr, length = find_free_blob(raw_obj.bytesize)
202
+ addr, length = find_free_blob(raw_obj_bytesize)
197
203
  begin
198
204
  if length != -1
199
205
  # Just a safeguard so we don't overwrite current data.
200
- header = FlatFileBlobHeader.read_at(@f, addr)
206
+ header = FlatFileBlobHeader.read(@f, addr)
201
207
  if header.length != length
202
208
  PEROBS.log.fatal "Length in free list (#{length}) and header " +
203
209
  "(#{header.length}) for address #{addr} don't match."
204
210
  end
205
- if raw_obj.bytesize > header.length
206
- PEROBS.log.fatal "Object (#{raw_obj.bytesize}) is longer than " +
211
+ if raw_obj_bytesize > header.length
212
+ PEROBS.log.fatal "Object (#{raw_obj_bytesize}) is longer than " +
207
213
  "blob space (#{header.length})."
208
214
  end
209
215
  if header.is_valid?
@@ -213,36 +219,40 @@ module PEROBS
213
219
  end
214
220
  flags = 1 << FlatFileBlobHeader::VALID_FLAG_BIT
215
221
  flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
216
- FlatFileBlobHeader.new(@f, addr, flags, raw_obj.bytesize, id, crc).write
222
+ FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
217
223
  @f.write(raw_obj)
218
- if length != -1 && raw_obj.bytesize < length
224
+ @f.flush
225
+ if length != -1 && raw_obj_bytesize < length
219
226
  # The new object was not appended and it did not completely fill the
220
227
  # free space. So we have to write a new header to mark the remaining
221
228
  # empty space.
222
- unless length - raw_obj.bytesize >= FlatFileBlobHeader::LENGTH
229
+ unless length - raw_obj_bytesize >= FlatFileBlobHeader::LENGTH
223
230
  PEROBS.log.fatal "Not enough space to append the empty space " +
224
- "header (space: #{length} bytes, object: #{raw_obj.bytesize} " +
231
+ "header (space: #{length} bytes, object: #{raw_obj_bytesize} " +
225
232
  "bytes)."
226
233
  end
227
234
  space_address = @f.pos
228
- space_length = length - FlatFileBlobHeader::LENGTH - raw_obj.bytesize
235
+ space_length = length - FlatFileBlobHeader::LENGTH - raw_obj_bytesize
229
236
  FlatFileBlobHeader.new(@f, space_address, 0, space_length,
230
237
  0, 0).write
231
238
  # Register the new space with the space list.
232
- @space_list.add_space(space_address, space_length) if space_length > 0
239
+ if @space_list.is_open? && space_length > 0
240
+ @space_list.add_space(space_address, space_length)
241
+ end
233
242
  end
234
243
 
235
244
  # Once the blob has been written we can update the index as well.
236
- @index.insert(id, addr)
245
+ @index.insert(id, addr) if @index.is_open?
237
246
 
238
247
  if old_addr
239
248
  # If we had an existing object stored for the ID we have to mark
240
249
  # this entry as deleted now.
241
250
  old_header.clear_flags
242
- # And register the newly freed space with the space list.
243
- @space_list.add_space(old_addr, old_header.length)
244
- else
245
251
  @f.flush
252
+ # And register the newly freed space with the space list.
253
+ if @space_list.is_open?
254
+ @space_list.add_space(old_addr, old_header.length)
255
+ end
246
256
  end
247
257
  rescue IOError => e
248
258
  PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -270,24 +280,20 @@ module PEROBS
270
280
  nil
271
281
  end
272
282
 
273
- def search_object(id)
274
- each_blob_header do |pos, header|
275
- return read_obj_by_address(pos, id)
276
- end
277
-
278
- nil
283
+ # @return [Integer] Number of items stored in the DB.
284
+ def item_counter
285
+ @index.entries_count
279
286
  end
280
287
 
281
-
282
288
  # Read the object at the specified address.
283
289
  # @param addr [Integer] Offset in the flat file
284
290
  # @param id [Integer] ID of the data blob
285
291
  # @return [String] Raw object data
286
292
  def read_obj_by_address(addr, id)
287
- header = FlatFileBlobHeader.read_at(@f, addr, id)
293
+ header = FlatFileBlobHeader.read(@f, addr, id)
288
294
  if header.id != id
289
295
  PEROBS.log.fatal "Database index corrupted: Index for object " +
290
- "#{id} points to object with ID #{header.id}"
296
+ "#{id} points to object with ID #{header.id} at address #{addr}"
291
297
  end
292
298
 
293
299
  buf = nil
@@ -296,7 +302,8 @@ module PEROBS
296
302
  @f.seek(addr + FlatFileBlobHeader::LENGTH)
297
303
  buf = @f.read(header.length)
298
304
  rescue IOError => e
299
- PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
305
+ PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
306
+ e.message
300
307
  end
301
308
 
302
309
  # Uncompress the data if the compression bit is set in the flags byte.
@@ -305,12 +312,13 @@ module PEROBS
305
312
  buf = Zlib.inflate(buf)
306
313
  rescue Zlib::BufError, Zlib::DataError
307
314
  PEROBS.log.fatal "Corrupted compressed block with ID " +
308
- "#{header.id} found."
315
+ "#{id} found at address #{addr}."
309
316
  end
310
317
  end
311
318
 
312
319
  if checksum(buf) != header.crc
313
- PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
320
+ PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
321
+ "at address #{addr}"
314
322
  end
315
323
 
316
324
  buf
@@ -319,19 +327,22 @@ module PEROBS
319
327
  # Mark the object with the given ID.
320
328
  # @param id [Integer] ID of the object
321
329
  def mark_obj_by_id(id)
322
- @marks.insert(id, 0)
330
+ @marks.insert(id)
323
331
  end
324
332
 
325
333
  # Return true if the object with the given ID is marked, false otherwise.
326
334
  # @param id [Integer] ID of the object
327
335
  def is_marked_by_id?(id)
328
- !@marks.get(id).nil?
336
+ @marks.include?(id)
329
337
  end
330
338
 
331
339
  # Clear all marks.
332
340
  def clear_all_marks
333
- @marks.erase
334
- @marks.open
341
+ if @marks
342
+ @marks.clear
343
+ else
344
+ @marks = IDList.new(@db_dir, 'marks', item_counter)
345
+ end
335
346
  end
336
347
 
337
348
  # Eliminate all the holes in the file. This is an in-place
@@ -340,59 +351,72 @@ module PEROBS
340
351
  distance = 0
341
352
  new_file_size = 0
342
353
  deleted_blobs = 0
354
+ corrupted_blobs = 0
343
355
  valid_blobs = 0
344
- t = Time.now
345
- PEROBS.log.info "Defragmenting FlatFile"
356
+
346
357
  # Iterate over all entries.
347
- each_blob_header do |pos, header|
348
- # Total size of the current entry
349
- entry_bytes = FlatFileBlobHeader::LENGTH + header.length
350
- if header.is_valid?
351
- # We have found a valid entry.
352
- valid_blobs += 1
353
- if distance > 0
354
- begin
355
- # Read current entry into a buffer
356
- @f.seek(pos)
357
- buf = @f.read(entry_bytes)
358
- # Write the buffer right after the end of the previous entry.
359
- @f.seek(pos - distance)
360
- @f.write(buf)
361
- # Update the index with the new position
362
- @index.insert(header.id, pos - distance)
363
- # Mark the space between the relocated current entry and the
364
- # next valid entry as deleted space.
365
- FlatFileBlobHeader.new(@f, @f.pos, 0,
366
- distance - FlatFileBlobHeader::LENGTH,
367
- 0, 0).write
368
- @f.flush
369
- rescue IOError => e
370
- PEROBS.log.fatal "Error while moving blob for ID #{header.id}: " +
371
- e.message
358
+ @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
359
+ each_blob_header do |header|
360
+ # If we have stumbled over a corrupted blob we treat it similar to a
361
+ # deleted blob and reuse the space.
362
+ if header.corruption_start
363
+ distance += header.addr - header.corruption_start
364
+ corrupted_blobs += 1
365
+ end
366
+
367
+ # Total size of the current entry
368
+ entry_bytes = FlatFileBlobHeader::LENGTH + header.length
369
+ if header.is_valid?
370
+ # We have found a valid entry.
371
+ valid_blobs += 1
372
+ if distance > 0
373
+ begin
374
+ # Read current entry into a buffer
375
+ @f.seek(header.addr)
376
+ buf = @f.read(entry_bytes)
377
+ # Write the buffer right after the end of the previous entry.
378
+ @f.seek(header.addr - distance)
379
+ @f.write(buf)
380
+ # Mark the space between the relocated current entry and the
381
+ # next valid entry as deleted space.
382
+ FlatFileBlobHeader.new(@f, @f.pos, 0,
383
+ distance - FlatFileBlobHeader::LENGTH,
384
+ 0, 0).write
385
+ @f.flush
386
+ rescue IOError => e
387
+ PEROBS.log.fatal "Error while moving blob for ID " +
388
+ "#{header.id}: #{e.message}"
389
+ end
372
390
  end
391
+ new_file_size = header.addr - distance +
392
+ FlatFileBlobHeader::LENGTH + header.length
393
+ else
394
+ deleted_blobs += 1
395
+ distance += entry_bytes
373
396
  end
374
- new_file_size = pos + FlatFileBlobHeader::LENGTH + header.length
375
- else
376
- deleted_blobs += 1
377
- distance += entry_bytes
397
+
398
+ pm.update(header.addr)
378
399
  end
379
400
  end
380
- PEROBS.log.info "FlatFile defragmented in #{Time.now - t} seconds"
401
+
381
402
  PEROBS.log.info "#{distance / 1000} KiB/#{deleted_blobs} blobs of " +
382
403
  "#{@f.size / 1000} KiB/#{valid_blobs} blobs or " +
383
404
  "#{'%.1f' % (distance.to_f / @f.size * 100.0)}% reclaimed"
405
+ if corrupted_blobs > 0
406
+ PEROBS.log.info "#{corrupted_blobs} corrupted blob(s) found. Space " +
407
+ "was recycled."
408
+ end
384
409
 
385
410
  @f.flush
386
411
  @f.truncate(new_file_size)
387
412
  @f.flush
388
- @space_list.clear
389
413
 
390
414
  sync
391
415
  end
392
416
 
393
417
  # This method iterates over all entries in the FlatFile and removes the
394
418
  # entry and inserts it again. This is useful to update all entries in
395
- # cased the storage format has changed.
419
+ # case the storage format has changed.
396
420
  def refresh
397
421
  # This iteration might look scary as we iterate over the entries while
398
422
  # we are rearranging them. Re-inserted items may be inserted
@@ -400,132 +424,276 @@ module PEROBS
400
424
  # inserted after the current entry and will be re-read again unless they
401
425
  # are inserted after the original file end.
402
426
  file_size = @f.size
403
- PEROBS.log.info "Refreshing the DB..."
404
- t = Time.now
405
- each_blob_header do |pos, header|
406
- if header.is_valid?
407
- buf = read_obj_by_address(pos, header.id)
408
- delete_obj_by_address(pos, header.id)
409
- write_obj_by_id(header.id, buf)
410
- end
411
427
 
412
- # Some re-inserted blobs may be inserted after the original file end.
413
- # No need to process those blobs again.
414
- break if pos >= file_size
428
+ # We don't update the index and the space list during this operation as
429
+ # we defragmentize the blob file at the end. We'll end the operation
430
+ # with an empty space list.
431
+ clear_index_files
432
+
433
+ @progressmeter.start('Converting objects to new storage format',
434
+ @f.size) do |pm|
435
+ each_blob_header do |header|
436
+ if header.is_valid?
437
+ buf = read_obj_by_address(header.addr, header.id)
438
+ delete_obj_by_address(header.addr, header.id)
439
+ write_obj_by_id(header.id, buf)
440
+ end
441
+
442
+ # Some re-inserted blobs may be inserted after the original file end.
443
+ # No need to process those blobs again.
444
+ break if header.addr >= file_size
445
+
446
+ pm.update(header.addr)
447
+ end
415
448
  end
416
- PEROBS.log.info "DB refresh completed in #{Time.now - t} seconds"
417
449
 
418
450
  # Reclaim the space saved by compressing entries.
419
451
  defragmentize
452
+
453
+ # Recreate the index file and create an empty space list.
454
+ regenerate_index_and_spaces
420
455
  end
421
456
 
422
- # Check (and repair) the FlatFile.
423
- # @param repair [Boolean] True if errors should be fixed.
457
+ # Check the FlatFile.
424
458
  # @return [Integer] Number of errors found
425
- def check(repair = false)
459
+ def check()
426
460
  errors = 0
427
461
  return errors unless @f
428
462
 
429
463
  t = Time.now
430
- PEROBS.log.info "Checking FlatFile database" +
431
- "#{repair ? ' in repair mode' : ''}..."
464
+ PEROBS.log.info "Checking FlatFile database..."
432
465
 
433
466
  # First check the database blob file. Each entry should be readable and
434
467
  # correct and all IDs must be unique. We use a shadow index to keep
435
468
  # track of the already found IDs.
436
- new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER)
469
+ new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER,
470
+ @progressmeter)
437
471
  new_index.erase
438
472
  new_index.open
439
473
 
440
- each_blob_header do |pos, header|
441
- if header.is_valid?
442
- # We have a non-deleted entry.
443
- begin
444
- @f.seek(pos + FlatFileBlobHeader::LENGTH)
445
- buf = @f.read(header.length)
446
- if buf.bytesize != header.length
447
- PEROBS.log.error "Premature end of file in blob with ID " +
448
- "#{header.id}."
449
- discard_damaged_blob(header) if repair
450
- errors += 1
451
- next
452
- end
474
+ corrupted_blobs = 0
475
+ end_of_last_healthy_blob = nil
476
+ @progressmeter.start('Checking blobs file', @f.size) do |pm|
477
+ corrupted_blobs = each_blob_header do |header|
478
+ if header.is_valid?
479
+ # We have a non-deleted entry.
480
+ begin
481
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
482
+ buf = @f.read(header.length)
483
+ if buf.bytesize != header.length
484
+ PEROBS.log.error "Premature end of file in blob with ID " +
485
+ "#{header.id}."
486
+ errors += 1
487
+ next
488
+ end
453
489
 
454
- # Uncompress the data if the compression bit is set in the mark
455
- # byte.
456
- if header.is_compressed?
457
- begin
458
- buf = Zlib.inflate(buf)
459
- rescue Zlib::BufError, Zlib::DataError
460
- PEROBS.log.error "Corrupted compressed block with ID " +
461
- "#{header.id} found."
462
- discard_damaged_blob(header) if repair
490
+ # Uncompress the data if the compression bit is set in the mark
491
+ # byte.
492
+ if header.is_compressed?
493
+ begin
494
+ buf = Zlib.inflate(buf)
495
+ rescue Zlib::BufError, Zlib::DataError
496
+ PEROBS.log.error "Corrupted compressed block with ID " +
497
+ "#{header.id} found."
498
+ errors += 1
499
+ next
500
+ end
501
+ end
502
+
503
+ if header.crc && checksum(buf) != header.crc
504
+ PEROBS.log.error "Checksum failure while checking blob " +
505
+ "with ID #{header.id}"
463
506
  errors += 1
464
507
  next
465
508
  end
509
+ rescue IOError => e
510
+ PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
511
+ e.message
466
512
  end
467
513
 
468
- if header.crc && checksum(buf) != header.crc
469
- PEROBS.log.error "Checksum failure while checking blob " +
470
- "with ID #{header.id}"
471
- discard_damaged_blob(header) if repair
514
+ # Check if the ID has already been found in the file.
515
+ if (previous_address = new_index.get(header.id))
516
+ PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
517
+ "Addresses: #{previous_address}, #{header.addr}"
472
518
  errors += 1
473
- next
519
+ previous_header = FlatFileBlobHeader.read(@f, previous_address,
520
+ header.id)
521
+ else
522
+ # ID is unique so far. Add it to the shadow index.
523
+ new_index.insert(header.id, header.addr)
474
524
  end
475
- rescue IOError => e
476
- PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
477
- e.message
478
525
  end
526
+ end_of_last_healthy_blob = header.addr +
527
+ FlatFileBlobHeader::LENGTH + header.length
479
528
 
480
- # Check if the ID has already been found in the file.
481
- if (previous_address = new_index.get(header.id))
482
- PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
483
- "Addresses: #{previous_address}, #{pos}"
484
- previous_header = FlatFileBlobHeader.read_at(@f, previous_address,
485
- header.id)
486
- if repair
487
- # We have two blobs with the same ID and we must discard one of
488
- # them.
489
- if header.is_outdated?
490
- discard_damaged_blob(header)
491
- elsif previous_header.is_outdated?
492
- discard_damaged_blob(previous_header)
493
- else
494
- PEROBS.log.error "None of the blobs with same ID have " +
495
- "the outdated flag set. Deleting the smaller one."
496
- discard_damaged_blob(header.length < previous_header.length ?
497
- header : previous_header)
498
- end
499
- next
500
- end
501
- else
502
- # ID is unique so far. Add it to the shadow index.
503
- new_index.insert(header.id, pos)
504
- end
529
+ pm.update(header.addr)
530
+ end
505
531
 
532
+ if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
533
+ # The blob file ends with a corrupted blob header.
534
+ PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
535
+ 'bytes found at the end of FlatFile.'
536
+ corrupted_blobs += 1
506
537
  end
538
+
539
+ errors += corrupted_blobs
507
540
  end
541
+
508
542
  # We no longer need the new index.
509
543
  new_index.close
510
544
  new_index.erase
511
545
 
512
- # Now we check the index data. It must be correct and the entries must
513
- # match the blob file. All entries in the index must be in the blob file
514
- # and vise versa.
515
- begin
516
- index_ok = @index.check do |id, address|
517
- has_id_at?(id, address)
546
+ if corrupted_blobs == 0
547
+ # Now we check the index data. It must be correct and the entries must
548
+ # match the blob file. All entries in the index must be in the blob file
549
+ # and vice versa.
550
+ begin
551
+ index_ok = @index.check do |id, address|
552
+ unless has_id_at?(id, address)
553
+ PEROBS.log.error "Index contains an entry for " +
554
+ "ID #{id} at address #{address} that is not in FlatFile"
555
+ false
556
+ else
557
+ true
558
+ end
559
+ end
560
+ x_check_errs = 0
561
+ space_check_ok = true
562
+ unless index_ok && (space_check_ok = @space_list.check(self)) &&
563
+ (x_check_errs = cross_check_entries) == 0
564
+ errors += 1 unless index_ok && space_check_ok
565
+ errors += x_check_errs
566
+ end
567
+ rescue PEROBS::FatalError
568
+ errors += 1
518
569
  end
519
- unless index_ok && @space_list.check(self) && cross_check_entries
520
- regenerate_index_and_spaces if repair
570
+ end
571
+
572
+ PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
573
+ "#{errors} errors found."
574
+
575
+ errors
576
+ end
577
+
578
+ # Repair the FlatFile. In contrast to the repair functionality in the
579
+ # check() method this method is much faster. It simply re-creates the
580
+ # index and space list from the blob file.
581
+ # @return [Integer] Number of errors found
582
+ def repair
583
+ errors = 0
584
+ return errors unless @f
585
+
586
+ t = Time.now
587
+ PEROBS.log.info "Repairing FlatFile database"
588
+
589
+ # Erase and re-open the index and space list files. We purposely don't
590
+ # close the files at it would trigger needless flushing.
591
+ clear_index_files(true)
592
+
593
+ # Now we scan the blob file and re-index all blobs and spaces. Corrupted
594
+ # blobs will be skipped.
595
+ corrupted_blobs = 0
596
+ end_of_last_healthy_blob = nil
597
+ @progressmeter.start('Re-indexing blobs file', @f.size) do |pm|
598
+ corrupted_blobs = each_blob_header do |header|
599
+ if header.corruption_start
600
+ # The blob is preceded by a corrupted area. We create a new
601
+ # header of a deleted blob for this area and write the new blob
602
+ # over it.
603
+ if (data_length = header.addr - header.corruption_start -
604
+ FlatFileBlobHeader::LENGTH) <= 0
605
+ PEROBS.log.error "Found a corrupted blob that is too small to " +
606
+ "fit a header (#{data_length}). File must be defragmented."
607
+ else
608
+ new_header = FlatFileBlobHeader.new(@f, header.corruption_start,
609
+ 0, data_length, 0, 0)
610
+ new_header.write
611
+ @space_list.add_space(header.corruption_start, data_length)
612
+ end
613
+ end
614
+
615
+ if header.is_valid?
616
+ # We have a non-deleted entry.
617
+ begin
618
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
619
+ buf = @f.read(header.length)
620
+ if buf.bytesize != header.length
621
+ PEROBS.log.error "Premature end of file in blob with ID " +
622
+ "#{header.id}."
623
+ discard_damaged_blob(header)
624
+ errors += 1
625
+ next
626
+ end
627
+
628
+ # Uncompress the data if the compression bit is set in the mark
629
+ # byte.
630
+ if header.is_compressed?
631
+ begin
632
+ buf = Zlib.inflate(buf)
633
+ rescue Zlib::BufError, Zlib::DataError
634
+ PEROBS.log.error "Corrupted compressed block with ID " +
635
+ "#{header.id} found."
636
+ discard_damaged_blob(header)
637
+ errors += 1
638
+ next
639
+ end
640
+ end
641
+
642
+ if header.crc && checksum(buf) != header.crc
643
+ PEROBS.log.error "Checksum failure while checking blob " +
644
+ "with ID #{header.id}"
645
+ discard_damaged_blob(header)
646
+ errors += 1
647
+ next
648
+ end
649
+ rescue IOError => e
650
+ PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
651
+ e.message
652
+ end
653
+
654
+ # Check if the ID has already been found in the file.
655
+ if (previous_address = @index.get(header.id))
656
+ PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
657
+ "Addresses: #{previous_address}, #{header.addr}"
658
+ errors += 1
659
+ previous_header = FlatFileBlobHeader.read(@f, previous_address,
660
+ header.id)
661
+ # We have two blobs with the same ID and we must discard one of
662
+ # them.
663
+ discard_duplicate_blobs(header, previous_header)
664
+ else
665
+ # ID is unique so far. Add it to the index.
666
+ @index.insert(header.id, header.addr)
667
+ end
668
+
669
+ else
670
+ if header.length > 0
671
+ @space_list.add_space(header.addr, header.length)
672
+ end
673
+ end
674
+ end_of_last_healthy_blob = header.addr +
675
+ FlatFileBlobHeader::LENGTH + header.length
676
+
677
+ pm.update(header.addr)
521
678
  end
522
- rescue PEROBS::FatalError
523
- errors += 1
524
- regenerate_index_and_spaces if repair
679
+
680
+ if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
681
+ # The blob file ends with a corrupted blob header.
682
+ PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
683
+ 'bytes found at the end of FlatFile.'
684
+ corrupted_blobs += 1
685
+
686
+ PEROBS.log.error "Truncating FlatFile to " +
687
+ "#{end_of_last_healthy_blob} bytes by discarding " +
688
+ "#{@f.size - end_of_last_healthy_blob} bytes"
689
+ @f.truncate(end_of_last_healthy_blob)
690
+ end
691
+
692
+ errors += corrupted_blobs
525
693
  end
526
694
 
527
- sync if repair
528
- PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
695
+ sync
696
+ PEROBS.log.info "FlatFile repair completed in #{Time.now - t} seconds. " +
529
697
  "#{errors} errors found."
530
698
 
531
699
  errors
@@ -535,22 +703,32 @@ module PEROBS
535
703
  # regenerates them from the FlatFile.
536
704
  def regenerate_index_and_spaces
537
705
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
706
+ @index.open unless @index.is_open?
538
707
  @index.clear
708
+ @space_list.open unless @space_list.is_open?
539
709
  @space_list.clear
540
710
 
541
- each_blob_header do |pos, header|
542
- if header.is_valid?
543
- if (duplicate_pos = @index.get(header.id))
544
- PEROBS.log.error "FlatFile contains multiple blobs for ID " +
545
- "#{header.id}. First blob is at address #{duplicate_pos}. " +
546
- "Other blob found at address #{pos}."
547
- @space_list.add_space(pos, header.length) if header.length > 0
548
- discard_damaged_blob(header)
711
+ @progressmeter.start('Re-generating database index', @f.size) do |pm|
712
+ each_blob_header do |header|
713
+ if header.is_valid?
714
+ if (duplicate_pos = @index.get(header.id))
715
+ PEROBS.log.error "FlatFile contains multiple blobs for ID " +
716
+ "#{header.id}. First blob is at address #{duplicate_pos}. " +
717
+ "Other blob found at address #{header.addr}."
718
+ if header.length > 0
719
+ @space_list.add_space(header.addr, header.length)
720
+ end
721
+ discard_damaged_blob(header)
722
+ else
723
+ @index.insert(header.id, header.addr)
724
+ end
549
725
  else
550
- @index.insert(header.id, pos)
726
+ if header.length > 0
727
+ @space_list.add_space(header.addr, header.length)
728
+ end
551
729
  end
552
- else
553
- @space_list.add_space(pos, header.length) if header.length > 0
730
+
731
+ pm.update(header.addr)
554
732
  end
555
733
  end
556
734
 
@@ -558,19 +736,23 @@ module PEROBS
558
736
  end
559
737
 
560
738
  def has_space?(address, size)
561
- header = FlatFileBlobHeader.read_at(@f, address)
739
+ header = FlatFileBlobHeader.read(@f, address)
562
740
  !header.is_valid? && header.length == size
563
741
  end
564
742
 
565
743
  def has_id_at?(id, address)
566
- header = FlatFileBlobHeader.read_at(@f, address)
744
+ begin
745
+ header = FlatFileBlobHeader.read(@f, address)
746
+ rescue PEROBS::FatalError
747
+ return false
748
+ end
567
749
  header.is_valid? && header.id == id
568
750
  end
569
751
 
570
752
  def inspect
571
753
  s = '['
572
- each_blob_header do |pos, header|
573
- s << "{ :pos => #{pos}, :flags => #{header.flags}, " +
754
+ each_blob_header do |header|
755
+ s << "{ :pos => #{header.addr}, :flags => #{header.flags}, " +
574
756
  ":length => #{header.length}, :id => #{header.id}, " +
575
757
  ":crc => #{header.crc}"
576
758
  if header.is_valid?
@@ -581,21 +763,68 @@ module PEROBS
581
763
  s + ']'
582
764
  end
583
765
 
766
# Rewrite the blob file in db_dir so that every blob header carries an
# additional CRC32 checksum of the header itself. Only blobs that are valid
# and not outdated are copied into the new file. The previous file is kept as
# 'database_v3.blobs'.
# @param db_dir [String] directory that contains 'database.blobs'
def FlatFile::insert_header_checksums(db_dir)
  old_file_name = File.join(db_dir, 'database.blobs')
  new_file_name = File.join(db_dir, 'database_v4.blobs')
  bak_file_name = File.join(db_dir, 'database_v3.blobs')

  # A header without checksum consists of the flags (1 byte), the blob
  # length (8 bytes), the ID (8 bytes) and the blob CRC (4 bytes).
  old_header_format = 'CQQL'
  old_header_length = 21

  entries = 0
  # The block form guarantees that both files are closed again, even if the
  # conversion is aborted by an exception.
  File.open(old_file_name, 'rb') do |old_file|
    File.open(new_file_name, 'wb') do |new_file|
      loop do
        # Remember where this header starts before anything is read so that
        # error messages can report the correct address.
        header_addr = old_file.pos
        break unless (buf = old_file.read(old_header_length))

        flags, length, id, crc = *buf.unpack(old_header_format)
        blob_data = old_file.read(length)

        # Some basic sanity checking to ensure all reserved bits are 0. Older
        # versions of PEROBS used to set bit 1 despite it being reserved now.
        unless flags & 0xF0 == 0
          PEROBS.log.fatal "Blob file #{old_file_name} contains illegal " +
            "flag byte #{'%02x' % flags} at #{header_addr}"
        end

        # Check if the blob is valid and current.
        if flags & 0x1 == 1 && flags & 0x8 == 0
          # Make sure the bit 1 is not set anymore.
          flags = flags & 0x05
          header_str = [ flags, length, id, crc ].pack(old_header_format)
          header_crc = Zlib.crc32(header_str, 0)
          header_str += [ header_crc ].pack('L')

          new_file.write(header_str + blob_data)
          entries += 1
        end
      end
    end
  end
  PEROBS.log.info "Header checksum added to #{entries} entries"

  # Both files are closed at this point. Keep the old file as backup and
  # put the converted file into its place.
  File.rename(old_file_name, bak_file_name)
  File.rename(new_file_name, old_file_name)
end
806
+
584
807
  private
585
808
 
586
809
  def each_blob_header(&block)
587
- pos = 0
810
+ corrupted_blobs = 0
811
+
588
812
  begin
589
813
  @f.seek(0)
590
814
  while (header = FlatFileBlobHeader.read(@f))
591
- yield(pos, header)
815
+ if header.corruption_start
816
+ corrupted_blobs += 1
817
+ end
818
+
819
+ yield(header)
592
820
 
593
- pos += FlatFileBlobHeader::LENGTH + header.length
594
- @f.seek(pos)
821
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH + header.length)
595
822
  end
596
823
  rescue IOError => e
597
824
  PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
598
825
  end
826
+
827
+ corrupted_blobs
599
828
  end
600
829
 
601
830
  def find_free_blob(bytes)
@@ -625,26 +854,34 @@ module PEROBS
625
854
  def cross_check_entries
626
855
  errors = 0
627
856
 
628
- each_blob_header do |pos, header|
629
- if !header.is_valid?
630
- if header.length > 0
631
- unless @space_list.has_space?(pos, header.length)
632
- PEROBS.log.error "FlatFile has free space " +
633
- "(addr: #{pos}, len: #{header.length}) that is not in " +
634
- "FreeSpaceManager"
635
- errors += 1
857
+ @progressmeter.start('Cross checking blobs and index', @f.size) do |pm|
858
+ each_blob_header do |header|
859
+ if !header.is_valid?
860
+ if header.length > 0
861
+ unless @space_list.has_space?(header.addr, header.length)
862
+ PEROBS.log.error "FlatFile has free space " +
863
+ "(addr: #{header.addr}, len: #{header.length}) that is " +
864
+ "not in SpaceManager"
865
+ errors += 1
866
+ end
867
+ end
868
+ else
869
+ if (index_address = @index.get(header.id)).nil?
870
+ PEROBS.log.error "FlatFile blob at address #{header.addr} " +
871
+ "is not listed in the index"
872
+ errors +=1
873
+ elsif index_address != header.addr
874
+ PEROBS.log.error "FlatFile blob at address #{header.addr} " +
875
+ "is listed in index with address #{index_address}"
876
+ errors += 1
636
877
  end
637
878
  end
638
- else
639
- unless @index.get(header.id) == pos
640
- PEROBS.log.error "FlatFile blob at address #{pos} is listed " +
641
- "in index with address #{@index.get(header.id)}"
642
- errors += 1
643
- end
879
+
880
+ pm.update(header.addr)
644
881
  end
645
882
  end
646
883
 
647
- errors == 0
884
+ errors
648
885
  end
649
886
 
650
887
  def discard_damaged_blob(header)
@@ -653,6 +890,61 @@ module PEROBS
653
890
  header.clear_flags
654
891
  end
655
892
 
893
# Resolve the situation that two blobs in the FlatFile carry the same ID.
# The blob that is marked as outdated is discarded. If neither is marked,
# the smaller one is discarded, its space is registered in the space list
# and the index is pointed at the larger one.
# @param header [FlatFileBlobHeader] header of the blob found last
# @param previous_header [FlatFileBlobHeader] header of the blob with the
#        same ID that was found earlier
def discard_duplicate_blobs(header, previous_header)
  if header.is_outdated?
    discard_damaged_blob(header)
  elsif previous_header.is_outdated?
    discard_damaged_blob(previous_header)
  else
    # On equal length the previously found blob is the one that goes.
    smaller, larger =
      if header.length < previous_header.length
        [ header, previous_header ]
      else
        [ previous_header, header ]
      end
    PEROBS.log.error "None of the blobs with same ID have " +
      "the outdated flag set. Deleting the smaller one " +
      "at address #{smaller.addr}"
    discard_damaged_blob(smaller)
    # Like everywhere else in this class, only blobs that actually occupy
    # bytes are registered as free space.
    if smaller.length > 0
      @space_list.add_space(smaller.addr, smaller.length)
    end
    @index.insert(larger.id, larger.addr)
  end
end
909
+
910
# Open the index and the space list. If opening either of them fails with a
# FatalError, both are cleared and regenerated.
# @param abort_on_missing_files [Boolean] passed on to the open call of the
#        index
def open_index_files(abort_on_missing_files = false)
  @index.open(abort_on_missing_files)
  @space_list.open
rescue FatalError
  # The files are unusable. Start over with empty ones and rebuild their
  # content.
  clear_index_files
  regenerate_index_and_spaces
end
919
+
920
# Erase the index and the space list. A space list that is still a SpaceTree
# is replaced by a new SpaceManager afterwards.
# @param dont_close_files [Boolean] when true, the files are erased without
#        being closed first
def erase_index_files(dont_close_files = false)
  # Index first, then the space list: make sure each one is really closed
  # before it is erased completely.
  [ @index, @space_list ].each do |index_file|
    index_file.close unless dont_close_files
    index_file.erase
  end

  return unless @space_list.is_a?(SpaceTree)

  # If we still use the old SpaceTree format, this is the moment to convert
  # it to the new SpaceManager format.
  @space_list = SpaceManager.new(@db_dir, @progressmeter)
  PEROBS.log.warn "Converting space list from SpaceTree format " +
    "to SpaceManager format"
end
939
+
940
# Replace the index and the space list with freshly opened ones.
# @param dont_close_files [Boolean] passed on to erase_index_files
def clear_index_files(dont_close_files = false)
  # Get rid of the current files ...
  erase_index_files(dont_close_files)

  # ... and open both of them again. The space list must be opened after
  # the erase step since that step may have replaced the object.
  @index.open
  @space_list.open
end
947
+
656
948
  end
657
949
 
658
950
  end