perobs 4.0.0 → 4.4.0

Files changed (67)
  1. checksums.yaml +5 -5
  2. data/README.md +27 -16
  3. data/lib/perobs/Array.rb +66 -19
  4. data/lib/perobs/BTree.rb +106 -15
  5. data/lib/perobs/BTreeBlob.rb +4 -3
  6. data/lib/perobs/BTreeDB.rb +5 -4
  7. data/lib/perobs/BTreeNode.rb +482 -156
  8. data/lib/perobs/BTreeNodeLink.rb +10 -0
  9. data/lib/perobs/BigArray.rb +285 -0
  10. data/lib/perobs/BigArrayNode.rb +1002 -0
  11. data/lib/perobs/BigHash.rb +246 -0
  12. data/lib/perobs/BigTree.rb +197 -0
  13. data/lib/perobs/BigTreeNode.rb +873 -0
  14. data/lib/perobs/Cache.rb +48 -10
  15. data/lib/perobs/ConsoleProgressMeter.rb +61 -0
  16. data/lib/perobs/DataBase.rb +4 -3
  17. data/lib/perobs/DynamoDB.rb +57 -15
  18. data/lib/perobs/EquiBlobsFile.rb +155 -50
  19. data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
  20. data/lib/perobs/FlatFile.rb +519 -227
  21. data/lib/perobs/FlatFileBlobHeader.rb +113 -54
  22. data/lib/perobs/FlatFileDB.rb +49 -23
  23. data/lib/perobs/FuzzyStringMatcher.rb +175 -0
  24. data/lib/perobs/Hash.rb +127 -33
  25. data/lib/perobs/IDList.rb +144 -0
  26. data/lib/perobs/IDListPage.rb +107 -0
  27. data/lib/perobs/IDListPageFile.rb +180 -0
  28. data/lib/perobs/IDListPageRecord.rb +142 -0
  29. data/lib/perobs/Object.rb +18 -15
  30. data/lib/perobs/ObjectBase.rb +46 -5
  31. data/lib/perobs/PersistentObjectCache.rb +57 -68
  32. data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
  33. data/lib/perobs/ProgressMeter.rb +97 -0
  34. data/lib/perobs/SpaceManager.rb +273 -0
  35. data/lib/perobs/SpaceTree.rb +21 -12
  36. data/lib/perobs/SpaceTreeNode.rb +53 -61
  37. data/lib/perobs/Store.rb +264 -145
  38. data/lib/perobs/version.rb +1 -1
  39. data/lib/perobs.rb +2 -0
  40. data/perobs.gemspec +4 -4
  41. data/test/Array_spec.rb +15 -6
  42. data/test/BTree_spec.rb +6 -2
  43. data/test/BigArray_spec.rb +261 -0
  44. data/test/BigHash_spec.rb +152 -0
  45. data/test/BigTreeNode_spec.rb +153 -0
  46. data/test/BigTree_spec.rb +259 -0
  47. data/test/EquiBlobsFile_spec.rb +105 -1
  48. data/test/FNV_Hash_1a_64_spec.rb +59 -0
  49. data/test/FlatFileDB_spec.rb +198 -14
  50. data/test/FuzzyStringMatcher_spec.rb +261 -0
  51. data/test/Hash_spec.rb +13 -3
  52. data/test/IDList_spec.rb +77 -0
  53. data/test/LegacyDBs/LegacyDB.rb +155 -0
  54. data/test/LegacyDBs/version_3/class_map.json +1 -0
  55. data/test/LegacyDBs/version_3/config.json +1 -0
  56. data/test/LegacyDBs/version_3/database.blobs +0 -0
  57. data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
  58. data/test/LegacyDBs/version_3/index.blobs +0 -0
  59. data/test/LegacyDBs/version_3/version +1 -0
  60. data/test/LockFile_spec.rb +9 -6
  61. data/test/SpaceManager_spec.rb +176 -0
  62. data/test/SpaceTree_spec.rb +4 -1
  63. data/test/Store_spec.rb +305 -203
  64. data/test/spec_helper.rb +9 -4
  65. metadata +57 -16
  66. data/lib/perobs/BTreeNodeCache.rb +0 -109
  67. data/lib/perobs/TreeDB.rb +0 -277
data/lib/perobs/FlatFile.rb +519 -227

@@ -2,7 +2,7 @@
  #
  # = FlatFile.rb -- Persistent Ruby Object Store
  #
- # Copyright (c) 2016 by Chris Schlaeger <chris@taskjuggler.org>
+ # Copyright (c) 2016, 2018, 2019 by Chris Schlaeger <chris@taskjuggler.org>
  #
  # MIT License
  #
@@ -31,6 +31,8 @@ require 'perobs/Log'
  require 'perobs/FlatFileBlobHeader'
  require 'perobs/BTree'
  require 'perobs/SpaceTree'
+ require 'perobs/SpaceManager'
+ require 'perobs/IDList'

  module PEROBS

@@ -44,12 +46,20 @@ module PEROBS

  # Create a new FlatFile object for a database in the given path.
  # @param dir [String] Directory path for the data base file
- def initialize(dir)
+ def initialize(dir, progressmeter)
  @db_dir = dir
+ @progressmeter = progressmeter
  @f = nil
- @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER)
- @marks = BTree.new(@db_dir, 'marks', INDEX_BTREE_ORDER)
- @space_list = SpaceTree.new(@db_dir)
+ @marks = nil
+ @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER, @progressmeter)
+ old_spaces_file = File.join(@db_dir, 'database_spaces.blobs')
+ if File.exist?(old_spaces_file)
+ # PEROBS version 4.1.0 and earlier used this space list format. It is
+ # deprecated now. Newly created DBs use the SpaceManager format.
+ @space_list = SpaceTree.new(@db_dir, @progressmeter)
+ else
+ @space_list = SpaceManager.new(@db_dir, @progressmeter)
+ end
  end

  # Open the flat file for reading and writing.
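The hunk above changes the constructor contract: FlatFile.new now takes a progress meter in addition to the directory, and it picks the space-list backend by probing for the legacy database_spaces.blobs file. A minimal sketch of the new call, treating ConsoleProgressMeter.new without arguments as an assumption (applications normally reach FlatFile through Store rather than directly):

    require 'perobs'

    # Hypothetical direct use of the internal FlatFile class.
    pm = PEROBS::ConsoleProgressMeter.new   # assumed constructor signature
    ff = PEROBS::FlatFile.new('/tmp/perobs-db', pm)
    ff.open
    # Databases written by 4.1.0 or earlier still carry database_spaces.blobs
    # and keep the old SpaceTree; new databases get a SpaceManager instead.
    ff.close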
@@ -74,33 +84,19 @@
  end
  @f.sync = true

- begin
- @index.open(!new_db_created)
- @space_list.open
- rescue FatalError
- # Ensure that the index is really closed.
- @index.close
- # Erase it completely
- @index.erase
- # Then create it again.
- @index.open
-
- # Ensure that the spaces list is really closed.
- @space_list.close
- # Erase it completely
- @space_list.erase
- # Then create it again
- @space_list.open
-
- regenerate_index_and_spaces
- end
+ open_index_files(!new_db_created)
  end

  # Close the flat file. This method must be called to ensure that all data
  # is really written into the filesystem.
  def close
- @space_list.close
- @index.close
+ @space_list.close if @space_list.is_open?
+ @index.close if @index.is_open?
+
+ if @marks
+ @marks.erase
+ @marks = nil
+ end

  if @f
  @f.flush
@@ -139,29 +135,37 @@
  # @param addr [Integer] Address of the blob to delete
  # @param id [Integer] ID of the blob to delete
  def delete_obj_by_address(addr, id)
- @index.remove(id)
- header = FlatFileBlobHeader.read_at(@f, addr, id)
+ @index.remove(id) if @index.is_open?
+ header = FlatFileBlobHeader.read(@f, addr, id)
  header.clear_flags
- @space_list.add_space(addr, header.length)
+ @space_list.add_space(addr, header.length) if @space_list.is_open?
  end

  # Delete all unmarked objects.
- def delete_unmarked_objects
- PEROBS.log.info "Deleting unmarked objects..."
- t = Time.now
+ def delete_unmarked_objects(&block)
+ # We don't update the index and the space list during this operation as
+ # we defragmentize the blob file at the end. We'll end the operation
+ # with an empty space list.
+ clear_index_files
+
+ deleted_objects_count = 0
+ @progressmeter.start('Sweeping unmarked objects', @f.size) do |pm|
+ each_blob_header do |header|
+ if header.is_valid? && !@marks.include?(header.id)
+ delete_obj_by_address(header.addr, header.id)
+ yield(header.id) if block_given?
+ deleted_objects_count += 1
+ end

- deleted_ids = []
- each_blob_header do |pos, header|
- if header.is_valid? && @marks.get(header.id).nil?
- delete_obj_by_address(pos, header.id)
- deleted_ids << header.id
+ pm.update(header.addr)
  end
  end
  defragmentize

- PEROBS.log.info "#{deleted_ids.length} unmarked objects deleted " +
- "in #{Time.now - t} seconds"
- deleted_ids
+ # Update the index file and create a new, empty space list.
+ regenerate_index_and_spaces
+
+ deleted_objects_count
  end

  # Write the given object into the file. This method never uses in-place
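delete_unmarked_objects changes its calling convention here: 4.0.0 returned the Array of deleted IDs, while this version returns only their count and hands the individual IDs to an optional block. A sketch of the difference (flat_file stands for any open FlatFile instance):

    # perobs 4.0.0
    deleted_ids = flat_file.delete_unmarked_objects

    # perobs 4.4.0
    count = flat_file.delete_unmarked_objects { |id| puts "swept object #{id}" }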
@@ -177,7 +181,7 @@
  # operation is aborted or interrupted we ensure that we either have the
  # old or the new version available.
  if (old_addr = find_obj_addr_by_id(id))
- old_header = FlatFileBlobHeader.read_at(@f, old_addr)
+ old_header = FlatFileBlobHeader.read(@f, old_addr)
  old_header.set_outdated_flag
  end

@@ -188,22 +192,24 @@
  # performance impact of compression is not compensated by writing
  # less data to the storage.
  compressed = false
- if raw_obj.bytesize > 256
+ raw_obj_bytesize = raw_obj.bytesize
+ if raw_obj_bytesize > 256
  raw_obj = Zlib.deflate(raw_obj)
+ raw_obj_bytesize = raw_obj.bytesize
  compressed = true
  end

- addr, length = find_free_blob(raw_obj.bytesize)
+ addr, length = find_free_blob(raw_obj_bytesize)
  begin
  if length != -1
  # Just a safeguard so we don't overwrite current data.
- header = FlatFileBlobHeader.read_at(@f, addr)
+ header = FlatFileBlobHeader.read(@f, addr)
  if header.length != length
  PEROBS.log.fatal "Length in free list (#{length}) and header " +
  "(#{header.length}) for address #{addr} don't match."
  end
- if raw_obj.bytesize > header.length
- PEROBS.log.fatal "Object (#{raw_obj.bytesize}) is longer than " +
+ if raw_obj_bytesize > header.length
+ PEROBS.log.fatal "Object (#{raw_obj_bytesize}) is longer than " +
  "blob space (#{header.length})."
  end
  if header.is_valid?
@@ -213,36 +219,40 @@
  end
  flags = 1 << FlatFileBlobHeader::VALID_FLAG_BIT
  flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
- FlatFileBlobHeader.new(@f, addr, flags, raw_obj.bytesize, id, crc).write
+ FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
  @f.write(raw_obj)
- if length != -1 && raw_obj.bytesize < length
+ @f.flush
+ if length != -1 && raw_obj_bytesize < length
  # The new object was not appended and it did not completely fill the
  # free space. So we have to write a new header to mark the remaining
  # empty space.
- unless length - raw_obj.bytesize >= FlatFileBlobHeader::LENGTH
+ unless length - raw_obj_bytesize >= FlatFileBlobHeader::LENGTH
  PEROBS.log.fatal "Not enough space to append the empty space " +
- "header (space: #{length} bytes, object: #{raw_obj.bytesize} " +
+ "header (space: #{length} bytes, object: #{raw_obj_bytesize} " +
  "bytes)."
  end
  space_address = @f.pos
- space_length = length - FlatFileBlobHeader::LENGTH - raw_obj.bytesize
+ space_length = length - FlatFileBlobHeader::LENGTH - raw_obj_bytesize
  FlatFileBlobHeader.new(@f, space_address, 0, space_length,
  0, 0).write
  # Register the new space with the space list.
- @space_list.add_space(space_address, space_length) if space_length > 0
+ if @space_list.is_open? && space_length > 0
+ @space_list.add_space(space_address, space_length)
+ end
  end

  # Once the blob has been written we can update the index as well.
- @index.insert(id, addr)
+ @index.insert(id, addr) if @index.is_open?

  if old_addr
  # If we had an existing object stored for the ID we have to mark
  # this entry as deleted now.
  old_header.clear_flags
- # And register the newly freed space with the space list.
- @space_list.add_space(old_addr, old_header.length)
- else
  @f.flush
+ # And register the newly freed space with the space list.
+ if @space_list.is_open?
+ @space_list.add_space(old_addr, old_header.length)
+ end
  end
  rescue IOError => e
  PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -270,24 +280,20 @@
  nil
  end

- def search_object(id)
- each_blob_header do |pos, header|
- return read_obj_by_address(pos, id)
- end
-
- nil
+ # @return [Integer] Number of items stored in the DB.
+ def item_counter
+ @index.entries_count
  end

-
  # Read the object at the specified address.
  # @param addr [Integer] Offset in the flat file
  # @param id [Integer] ID of the data blob
  # @return [String] Raw object data
  def read_obj_by_address(addr, id)
- header = FlatFileBlobHeader.read_at(@f, addr, id)
+ header = FlatFileBlobHeader.read(@f, addr, id)
  if header.id != id
  PEROBS.log.fatal "Database index corrupted: Index for object " +
- "#{id} points to object with ID #{header.id}"
+ "#{id} points to object with ID #{header.id} at address #{addr}"
  end

  buf = nil
@@ -296,7 +302,8 @@
  @f.seek(addr + FlatFileBlobHeader::LENGTH)
  buf = @f.read(header.length)
  rescue IOError => e
- PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
+ PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
+ e.message
  end

  # Uncompress the data if the compression bit is set in the flags byte.
@@ -305,12 +312,13 @@
  buf = Zlib.inflate(buf)
  rescue Zlib::BufError, Zlib::DataError
  PEROBS.log.fatal "Corrupted compressed block with ID " +
- "#{header.id} found."
+ "#{id} found at address #{addr}."
  end
  end

  if checksum(buf) != header.crc
- PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
+ PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
+ "at address #{addr}"
  end

  buf
@@ -319,19 +327,22 @@
  # Mark the object with the given ID.
  # @param id [Integer] ID of the object
  def mark_obj_by_id(id)
- @marks.insert(id, 0)
+ @marks.insert(id)
  end

  # Return true if the object with the given ID is marked, false otherwise.
  # @param id [Integer] ID of the object
  def is_marked_by_id?(id)
- !@marks.get(id).nil?
+ @marks.include?(id)
  end

  # Clear alls marks.
  def clear_all_marks
- @marks.erase
- @marks.open
+ if @marks
+ @marks.clear
+ else
+ @marks = IDList.new(@db_dir, 'marks', item_counter)
+ end
  end

  # Eliminate all the holes in the file. This is an in-place
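The mark phase now runs on the new IDList instead of a second BTree, and the list is created lazily and sized from item_counter. A sketch of the mark/sweep sequence using only methods from the hunks above:

    flat_file.clear_all_marks          # creates or resets the IDList
    flat_file.mark_obj_by_id(42)
    flat_file.is_marked_by_id?(42)     # => true
    flat_file.is_marked_by_id?(7)      # => false
    flat_file.delete_unmarked_objects  # sweeps everything left unmarked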
@@ -340,59 +351,72 @@
  distance = 0
  new_file_size = 0
  deleted_blobs = 0
+ corrupted_blobs = 0
  valid_blobs = 0
- t = Time.now
- PEROBS.log.info "Defragmenting FlatFile"
+
  # Iterate over all entries.
- each_blob_header do |pos, header|
- # Total size of the current entry
- entry_bytes = FlatFileBlobHeader::LENGTH + header.length
- if header.is_valid?
- # We have found a valid entry.
- valid_blobs += 1
- if distance > 0
- begin
- # Read current entry into a buffer
- @f.seek(pos)
- buf = @f.read(entry_bytes)
- # Write the buffer right after the end of the previous entry.
- @f.seek(pos - distance)
- @f.write(buf)
- # Update the index with the new position
- @index.insert(header.id, pos - distance)
- # Mark the space between the relocated current entry and the
- # next valid entry as deleted space.
- FlatFileBlobHeader.new(@f, @f.pos, 0,
- distance - FlatFileBlobHeader::LENGTH,
- 0, 0).write
- @f.flush
- rescue IOError => e
- PEROBS.log.fatal "Error while moving blob for ID #{header.id}: " +
- e.message
+ @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
+ each_blob_header do |header|
+ # If we have stumbled over a corrupted blob we treat it similar to a
+ # deleted blob and reuse the space.
+ if header.corruption_start
+ distance += header.addr - header.corruption_start
+ corrupted_blobs += 1
+ end
+
+ # Total size of the current entry
+ entry_bytes = FlatFileBlobHeader::LENGTH + header.length
+ if header.is_valid?
+ # We have found a valid entry.
+ valid_blobs += 1
+ if distance > 0
+ begin
+ # Read current entry into a buffer
+ @f.seek(header.addr)
+ buf = @f.read(entry_bytes)
+ # Write the buffer right after the end of the previous entry.
+ @f.seek(header.addr - distance)
+ @f.write(buf)
+ # Mark the space between the relocated current entry and the
+ # next valid entry as deleted space.
+ FlatFileBlobHeader.new(@f, @f.pos, 0,
+ distance - FlatFileBlobHeader::LENGTH,
+ 0, 0).write
+ @f.flush
+ rescue IOError => e
+ PEROBS.log.fatal "Error while moving blob for ID " +
+ "#{header.id}: #{e.message}"
+ end
  end
+ new_file_size = header.addr - distance +
+ FlatFileBlobHeader::LENGTH + header.length
+ else
+ deleted_blobs += 1
+ distance += entry_bytes
  end
- new_file_size = pos + FlatFileBlobHeader::LENGTH + header.length
- else
- deleted_blobs += 1
- distance += entry_bytes
+
+ pm.update(header.addr)
  end
  end
- PEROBS.log.info "FlatFile defragmented in #{Time.now - t} seconds"
+
  PEROBS.log.info "#{distance / 1000} KiB/#{deleted_blobs} blobs of " +
  "#{@f.size / 1000} KiB/#{valid_blobs} blobs or " +
  "#{'%.1f' % (distance.to_f / @f.size * 100.0)}% reclaimed"
+ if corrupted_blobs > 0
+ PEROBS.log.info "#{corrupted_blobs} corrupted blob(s) found. Space " +
+ "was recycled."
+ end

  @f.flush
  @f.truncate(new_file_size)
  @f.flush
- @space_list.clear

  sync
  end

  # This method iterates over all entries in the FlatFile and removes the
  # entry and inserts it again. This is useful to update all entries in
- # cased the storage format has changed.
+ # case the storage format has changed.
  def refresh
  # This iteration might look scary as we iterate over the entries while
  # while we are rearranging them. Re-inserted items may be inserted
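Throughout the following hunks each_blob_header switches from yielding a (position, header) pair to yielding just the header, which now carries its own address, and the iterator reports how many corrupted blobs it skipped over. A sketch of the changed block signature (process_blob is a hypothetical callback):

    # perobs 4.0.0
    each_blob_header do |pos, header|
      process_blob(pos, header.id)
    end

    # perobs 4.4.0
    corrupted = each_blob_header do |header|
      process_blob(header.addr, header.id)
    end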
@@ -400,132 +424,276 @@
  # inserted after the current entry and will be re-read again unless they
  # are inserted after the original file end.
  file_size = @f.size
- PEROBS.log.info "Refreshing the DB..."
- t = Time.now
- each_blob_header do |pos, header|
- if header.is_valid?
- buf = read_obj_by_address(pos, header.id)
- delete_obj_by_address(pos, header.id)
- write_obj_by_id(header.id, buf)
- end

- # Some re-inserted blobs may be inserted after the original file end.
- # No need to process those blobs again.
- break if pos >= file_size
+ # We don't update the index and the space list during this operation as
+ # we defragmentize the blob file at the end. We'll end the operation
+ # with an empty space list.
+ clear_index_files
+
+ @progressmeter.start('Converting objects to new storage format',
+ @f.size) do |pm|
+ each_blob_header do |header|
+ if header.is_valid?
+ buf = read_obj_by_address(header.addr, header.id)
+ delete_obj_by_address(header.addr, header.id)
+ write_obj_by_id(header.id, buf)
+ end
+
+ # Some re-inserted blobs may be inserted after the original file end.
+ # No need to process those blobs again.
+ break if header.addr >= file_size
+
+ pm.update(header.addr)
+ end
  end
- PEROBS.log.info "DB refresh completed in #{Time.now - t} seconds"

  # Reclaim the space saved by compressing entries.
  defragmentize
+
+ # Recreate the index file and create an empty space list.
+ regenerate_index_and_spaces
  end

- # Check (and repair) the FlatFile.
- # @param repair [Boolean] True if errors should be fixed.
+ # Check the FlatFile.
  # @return [Integer] Number of errors found
- def check(repair = false)
+ def check()
  errors = 0
  return errors unless @f

  t = Time.now
- PEROBS.log.info "Checking FlatFile database" +
- "#{repair ? ' in repair mode' : ''}..."
+ PEROBS.log.info "Checking FlatFile database..."

  # First check the database blob file. Each entry should be readable and
  # correct and all IDs must be unique. We use a shadow index to keep
  # track of the already found IDs.
- new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER)
+ new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER,
+ @progressmeter)
  new_index.erase
  new_index.open

- each_blob_header do |pos, header|
- if header.is_valid?
- # We have a non-deleted entry.
- begin
- @f.seek(pos + FlatFileBlobHeader::LENGTH)
- buf = @f.read(header.length)
- if buf.bytesize != header.length
- PEROBS.log.error "Premature end of file in blob with ID " +
- "#{header.id}."
- discard_damaged_blob(header) if repair
- errors += 1
- next
- end
+ corrupted_blobs = 0
+ end_of_last_healthy_blob = nil
+ @progressmeter.start('Checking blobs file', @f.size) do |pm|
+ corrupted_blobs = each_blob_header do |header|
+ if header.is_valid?
+ # We have a non-deleted entry.
+ begin
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
+ buf = @f.read(header.length)
+ if buf.bytesize != header.length
+ PEROBS.log.error "Premature end of file in blob with ID " +
+ "#{header.id}."
+ errors += 1
+ next
+ end

- # Uncompress the data if the compression bit is set in the mark
- # byte.
- if header.is_compressed?
- begin
- buf = Zlib.inflate(buf)
- rescue Zlib::BufError, Zlib::DataError
- PEROBS.log.error "Corrupted compressed block with ID " +
- "#{header.id} found."
- discard_damaged_blob(header) if repair
+ # Uncompress the data if the compression bit is set in the mark
+ # byte.
+ if header.is_compressed?
+ begin
+ buf = Zlib.inflate(buf)
+ rescue Zlib::BufError, Zlib::DataError
+ PEROBS.log.error "Corrupted compressed block with ID " +
+ "#{header.id} found."
+ errors += 1
+ next
+ end
+ end
+
+ if header.crc && checksum(buf) != header.crc
+ PEROBS.log.error "Checksum failure while checking blob " +
+ "with ID #{header.id}"
  errors += 1
  next
  end
+ rescue IOError => e
+ PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
+ e.message
  end

- if header.crc && checksum(buf) != header.crc
- PEROBS.log.error "Checksum failure while checking blob " +
- "with ID #{header.id}"
- discard_damaged_blob(header) if repair
+ # Check if the ID has already been found in the file.
+ if (previous_address = new_index.get(header.id))
+ PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
+ "Addresses: #{previous_address}, #{header.addr}"
  errors += 1
- next
+ previous_header = FlatFileBlobHeader.read(@f, previous_address,
+ header.id)
+ else
+ # ID is unique so far. Add it to the shadow index.
+ new_index.insert(header.id, header.addr)
  end
- rescue IOError => e
- PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
- e.message
  end
+ end_of_last_healthy_blob = header.addr +
+ FlatFileBlobHeader::LENGTH + header.length

- # Check if the ID has already been found in the file.
- if (previous_address = new_index.get(header.id))
- PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
- "Addresses: #{previous_address}, #{pos}"
- previous_header = FlatFileBlobHeader.read_at(@f, previous_address,
- header.id)
- if repair
- # We have two blobs with the same ID and we must discard one of
- # them.
- if header.is_outdated?
- discard_damaged_blob(header)
- elsif previous_header.is_outdated?
- discard_damaged_blob(previous_header)
- else
- PEROBS.log.error "None of the blobs with same ID have " +
- "the outdated flag set. Deleting the smaller one."
- discard_damaged_blob(header.length < previous_header.length ?
- header : previous_header)
- end
- next
- end
- else
- # ID is unique so far. Add it to the shadow index.
- new_index.insert(header.id, pos)
- end
+ pm.update(header.addr)
+ end

+ if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
+ # The blob file ends with a corrupted blob header.
+ PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
+ 'bytes found at the end of FlatFile.'
+ corrupted_blobs += 1
  end
+
+ errors += corrupted_blobs
  end
+
  # We no longer need the new index.
  new_index.close
  new_index.erase

- # Now we check the index data. It must be correct and the entries must
- # match the blob file. All entries in the index must be in the blob file
- # and vise versa.
- begin
- index_ok = @index.check do |id, address|
- has_id_at?(id, address)
+ if corrupted_blobs == 0
+ # Now we check the index data. It must be correct and the entries must
+ # match the blob file. All entries in the index must be in the blob file
+ # and vise versa.
+ begin
+ index_ok = @index.check do |id, address|
+ unless has_id_at?(id, address)
+ PEROBS.log.error "Index contains an entry for " +
+ "ID #{id} at address #{address} that is not in FlatFile"
+ false
+ else
+ true
+ end
+ end
+ x_check_errs = 0
+ space_check_ok = true
+ unless index_ok && (space_check_ok = @space_list.check(self)) &&
+ (x_check_errs = cross_check_entries) == 0
+ errors += 1 unless index_ok && space_check_ok
+ errors += x_check_errs
+ end
+ rescue PEROBS::FatalError
+ errors += 1
  end
- unless index_ok && @space_list.check(self) && cross_check_entries
- regenerate_index_and_spaces if repair
+ end
+
+ PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
+ "#{errors} errors found."
+
+ errors
+ end
+
+ # Repair the FlatFile. In contrast to the repair functionality in the
+ # check() method this method is much faster. It simply re-creates the
+ # index and space list from the blob file.
+ # @return [Integer] Number of errors found
+ def repair
+ errors = 0
+ return errors unless @f
+
+ t = Time.now
+ PEROBS.log.info "Repairing FlatFile database"
+
+ # Erase and re-open the index and space list files. We purposely don't
+ # close the files at it would trigger needless flushing.
+ clear_index_files(true)
+
+ # Now we scan the blob file and re-index all blobs and spaces. Corrupted
+ # blobs will be skipped.
+ corrupted_blobs = 0
+ end_of_last_healthy_blob = nil
+ @progressmeter.start('Re-indexing blobs file', @f.size) do |pm|
+ corrupted_blobs = each_blob_header do |header|
+ if header.corruption_start
+ # The blob is preceeded by a corrupted area. We create a new
+ # header of a deleted blob for this area and write the new blob
+ # over it.
+ if (data_length = header.addr - header.corruption_start -
+ FlatFileBlobHeader::LENGTH) <= 0
+ PEROBS.log.error "Found a corrupted blob that is too small to " +
+ "fit a header (#{data_length}). File must be defragmented."
+ else
+ new_header = FlatFileBlobHeader.new(@f, header.corruption_start,
+ 0, data_length, 0, 0)
+ new_header.write
+ @space_list.add_space(header.corruption_start, data_length)
+ end
+ end
+
+ if header.is_valid?
+ # We have a non-deleted entry.
+ begin
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
+ buf = @f.read(header.length)
+ if buf.bytesize != header.length
+ PEROBS.log.error "Premature end of file in blob with ID " +
+ "#{header.id}."
+ discard_damaged_blob(header)
+ errors += 1
+ next
+ end
+
+ # Uncompress the data if the compression bit is set in the mark
+ # byte.
+ if header.is_compressed?
+ begin
+ buf = Zlib.inflate(buf)
+ rescue Zlib::BufError, Zlib::DataError
+ PEROBS.log.error "Corrupted compressed block with ID " +
+ "#{header.id} found."
+ discard_damaged_blob(header)
+ errors += 1
+ next
+ end
+ end
+
+ if header.crc && checksum(buf) != header.crc
+ PEROBS.log.error "Checksum failure while checking blob " +
+ "with ID #{header.id}"
+ discard_damaged_blob(header)
+ errors += 1
+ next
+ end
+ rescue IOError => e
+ PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
+ e.message
+ end
+
+ # Check if the ID has already been found in the file.
+ if (previous_address = @index.get(header.id))
+ PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
+ "Addresses: #{previous_address}, #{header.addr}"
+ errors += 1
+ previous_header = FlatFileBlobHeader.read(@f, previous_address,
+ header.id)
+ # We have two blobs with the same ID and we must discard one of
+ # them.
+ discard_duplicate_blobs(header, previous_header)
+ else
+ # ID is unique so far. Add it to the shadow index.
+ @index.insert(header.id, header.addr)
+ end
+
+ else
+ if header.length > 0
+ @space_list.add_space(header.addr, header.length)
+ end
+ end
+ end_of_last_healthy_blob = header.addr +
+ FlatFileBlobHeader::LENGTH + header.length
+
+ pm.update(header.addr)
  end
- rescue PEROBS::FatalError
- errors += 1
- regenerate_index_and_spaces if repair
+
+ if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
+ # The blob file ends with a corrupted blob header.
+ PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
+ 'bytes found at the end of FlatFile.'
+ corrupted_blobs += 1
+
+ PEROBS.log.error "Truncating FlatFile to " +
+ "#{end_of_last_healthy_blob} bytes by discarding " +
+ "#{@f.size - end_of_last_healthy_blob} bytes"
+ @f.truncate(end_of_last_healthy_blob)
+ end
+
+ errors += corrupted_blobs
  end

- sync if repair
- PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
+ sync
+ PEROBS.log.info "FlatFile repair completed in #{Time.now - t} seconds. " +
  "#{errors} errors found."

  errors
@@ -535,22 +703,32 @@
  # regenerates them from the FlatFile.
  def regenerate_index_and_spaces
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
+ @index.open unless @index.is_open?
  @index.clear
+ @space_list.open unless @space_list.is_open?
  @space_list.clear

- each_blob_header do |pos, header|
- if header.is_valid?
- if (duplicate_pos = @index.get(header.id))
- PEROBS.log.error "FlatFile contains multiple blobs for ID " +
- "#{header.id}. First blob is at address #{duplicate_pos}. " +
- "Other blob found at address #{pos}."
- @space_list.add_space(pos, header.length) if header.length > 0
- discard_damaged_blob(header)
+ @progressmeter.start('Re-generating database index', @f.size) do |pm|
+ each_blob_header do |header|
+ if header.is_valid?
+ if (duplicate_pos = @index.get(header.id))
+ PEROBS.log.error "FlatFile contains multiple blobs for ID " +
+ "#{header.id}. First blob is at address #{duplicate_pos}. " +
+ "Other blob found at address #{header.addr}."
+ if header.length > 0
+ @space_list.add_space(header.addr, header.length)
+ end
+ discard_damaged_blob(header)
+ else
+ @index.insert(header.id, header.addr)
+ end
  else
- @index.insert(header.id, pos)
+ if header.length > 0
+ @space_list.add_space(header.addr, header.length)
+ end
  end
- else
- @space_list.add_space(pos, header.length) if header.length > 0
+
+ pm.update(header.addr)
  end
  end

@@ -558,19 +736,23 @@
  end

  def has_space?(address, size)
- header = FlatFileBlobHeader.read_at(@f, address)
+ header = FlatFileBlobHeader.read(@f, address)
  !header.is_valid? && header.length == size
  end

  def has_id_at?(id, address)
- header = FlatFileBlobHeader.read_at(@f, address)
+ begin
+ header = FlatFileBlobHeader.read(@f, address)
+ rescue PEROBS::FatalError
+ return false
+ end
  header.is_valid? && header.id == id
  end

  def inspect
  s = '['
- each_blob_header do |pos, header|
- s << "{ :pos => #{pos}, :flags => #{header.flags}, " +
+ each_blob_header do |header|
+ s << "{ :pos => #{header.addr}, :flags => #{header.flags}, " +
  ":length => #{header.length}, :id => #{header.id}, " +
  ":crc => #{header.crc}"
  if header.is_valid?
@@ -581,21 +763,68 @@
  s + ']'
  end

+ def FlatFile::insert_header_checksums(db_dir)
+ old_file_name = File.join(db_dir, 'database.blobs')
+ new_file_name = File.join(db_dir, 'database_v4.blobs')
+ bak_file_name = File.join(db_dir, 'database_v3.blobs')
+
+ old_file = File.open(old_file_name, 'rb')
+ new_file = File.open(new_file_name, 'wb')
+
+ entries = 0
+ while (buf = old_file.read(21))
+ flags, length, id, crc = *buf.unpack('CQQL')
+ blob_data = old_file.read(length)
+
+ # Some basic sanity checking to ensure all reserved bits are 0. Older
+ # versions of PEROBS used to set bit 1 despite it being reserved now.
+ unless flags & 0xF0 == 0
+ PEROBS.log.fatal "Blob file #{old_file_name} contains illegal " +
+ "flag byte #{'%02x' % flags} at #{old_file.pos - 21}"
+ end
+
+ # Check if the blob is valid and current.
+ if flags & 0x1 == 1 && flags & 0x8 == 0
+ # Make sure the bit 1 is not set anymore.
+ flags = flags & 0x05
+ header_str = [ flags, length, id, crc ].pack('CQQL')
+ header_crc = Zlib.crc32(header_str, 0)
+ header_str += [ header_crc ].pack('L')
+
+ new_file.write(header_str + blob_data)
+ entries += 1
+ end
+ end
+ PEROBS.log.info "Header checksum added to #{entries} entries"
+
+ old_file.close
+ new_file.close
+
+ File.rename(old_file_name, bak_file_name)
+ File.rename(new_file_name, old_file_name)
+ end
+
  private

  def each_blob_header(&block)
- pos = 0
+ corrupted_blobs = 0
+
  begin
  @f.seek(0)
  while (header = FlatFileBlobHeader.read(@f))
- yield(pos, header)
+ if header.corruption_start
+ corrupted_blobs += 1
+ end
+
+ yield(header)

- pos += FlatFileBlobHeader::LENGTH + header.length
- @f.seek(pos)
+ @f.seek(header.addr + FlatFileBlobHeader::LENGTH + header.length)
  end
  rescue IOError => e
  PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
  end
+
+ corrupted_blobs
  end

  def find_free_blob(bytes)
@@ -625,26 +854,34 @@
  def cross_check_entries
  errors = 0

- each_blob_header do |pos, header|
- if !header.is_valid?
- if header.length > 0
- unless @space_list.has_space?(pos, header.length)
- PEROBS.log.error "FlatFile has free space " +
- "(addr: #{pos}, len: #{header.length}) that is not in " +
- "FreeSpaceManager"
- errors += 1
+ @progressmeter.start('Cross checking blobs and index', @f.size) do |pm|
+ each_blob_header do |header|
+ if !header.is_valid?
+ if header.length > 0
+ unless @space_list.has_space?(header.addr, header.length)
+ PEROBS.log.error "FlatFile has free space " +
+ "(addr: #{header.addr}, len: #{header.length}) that is " +
+ "not in SpaceManager"
+ errors += 1
+ end
+ end
+ else
+ if (index_address = @index.get(header.id)).nil?
+ PEROBS.log.error "FlatFile blob at address #{header.addr} " +
+ "is not listed in the index"
+ errors +=1
+ elsif index_address != header.addr
+ PEROBS.log.error "FlatFile blob at address #{header.addr} " +
+ "is listed in index with address #{index_address}"
+ errors += 1
  end
  end
- else
- unless @index.get(header.id) == pos
- PEROBS.log.error "FlatFile blob at address #{pos} is listed " +
- "in index with address #{@index.get(header.id)}"
- errors += 1
- end
+
+ pm.update(header.addr)
  end
  end

- errors == 0
+ errors
  end

  def discard_damaged_blob(header)
@@ -653,6 +890,61 @@
  header.clear_flags
  end

+ def discard_duplicate_blobs(header, previous_header)
+ if header.is_outdated?
+ discard_damaged_blob(header)
+ elsif previous_header.is_outdated?
+ discard_damaged_blob(previous_header)
+ else
+ smaller, larger = header.length < previous_header.length ?
+ [ header, previous_header ] : [ previous_header, header ]
+ PEROBS.log.error "None of the blobs with same ID have " +
+ "the outdated flag set. Deleting the smaller one " +
+ "at address #{smaller.addr}"
+ discard_damaged_blob(smaller)
+ @space_list.add_space(smaller.addr, smaller.length)
+ @index.insert(larger.id, larger.addr)
+ end
+ end
+
+ def open_index_files(abort_on_missing_files = false)
+ begin
+ @index.open(abort_on_missing_files)
+ @space_list.open
+ rescue FatalError
+ clear_index_files
+ regenerate_index_and_spaces
+ end
+ end
+
+ def erase_index_files(dont_close_files = false)
+ # Ensure that the index is really closed.
+ @index.close unless dont_close_files
+ # Erase it completely
+ @index.erase
+
+ # Ensure that the spaces list is really closed.
+ @space_list.close unless dont_close_files
+ # Erase it completely
+ @space_list.erase
+
+ if @space_list.is_a?(SpaceTree)
+ # If we still use the old SpaceTree format, this is the moment to
+ # convert it to the new SpaceManager format.
+ @space_list = SpaceManager.new(@db_dir, @progressmeter)
+ PEROBS.log.warn "Converting space list from SpaceTree format " +
+ "to SpaceManager format"
+ end
+ end
+
+ def clear_index_files(dont_close_files = false)
+ erase_index_files(dont_close_files)
+
+ # Then create them again.
+ @index.open
+ @space_list.open
+ end
+
  end

  end
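The net effect of the check()/repair split above for callers: checking no longer mutates the database, and repairs are requested explicitly through the new, faster repair method. A sketch of how the two calls would now be combined (error handling omitted; whether FlatFileDB drives this automatically is outside this diff):

    # perobs 4.0.0
    flat_file.check(true)              # check and repair in one pass

    # perobs 4.4.0
    errors = flat_file.check           # report only
    flat_file.repair if errors > 0     # rebuild index/space list, drop trailing corruption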