RubyGems - perobs - Versions diffs - 4.0.0 → 4.4.0 - Mend

perobs 4.0.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

checksums.yaml +5 -5
data/README.md +27 -16
data/lib/perobs/Array.rb +66 -19
data/lib/perobs/BTree.rb +106 -15
data/lib/perobs/BTreeBlob.rb +4 -3
data/lib/perobs/BTreeDB.rb +5 -4
data/lib/perobs/BTreeNode.rb +482 -156
data/lib/perobs/BTreeNodeLink.rb +10 -0
data/lib/perobs/BigArray.rb +285 -0
data/lib/perobs/BigArrayNode.rb +1002 -0
data/lib/perobs/BigHash.rb +246 -0
data/lib/perobs/BigTree.rb +197 -0
data/lib/perobs/BigTreeNode.rb +873 -0
data/lib/perobs/Cache.rb +48 -10
data/lib/perobs/ConsoleProgressMeter.rb +61 -0
data/lib/perobs/DataBase.rb +4 -3
data/lib/perobs/DynamoDB.rb +57 -15
data/lib/perobs/EquiBlobsFile.rb +155 -50
data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
data/lib/perobs/FlatFile.rb +519 -227
data/lib/perobs/FlatFileBlobHeader.rb +113 -54
data/lib/perobs/FlatFileDB.rb +49 -23
data/lib/perobs/FuzzyStringMatcher.rb +175 -0
data/lib/perobs/Hash.rb +127 -33
data/lib/perobs/IDList.rb +144 -0
data/lib/perobs/IDListPage.rb +107 -0
data/lib/perobs/IDListPageFile.rb +180 -0
data/lib/perobs/IDListPageRecord.rb +142 -0
data/lib/perobs/Object.rb +18 -15
data/lib/perobs/ObjectBase.rb +46 -5
data/lib/perobs/PersistentObjectCache.rb +57 -68
data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
data/lib/perobs/ProgressMeter.rb +97 -0
data/lib/perobs/SpaceManager.rb +273 -0
data/lib/perobs/SpaceTree.rb +21 -12
data/lib/perobs/SpaceTreeNode.rb +53 -61
data/lib/perobs/Store.rb +264 -145
data/lib/perobs/version.rb +1 -1
data/lib/perobs.rb +2 -0
data/perobs.gemspec +4 -4
data/test/Array_spec.rb +15 -6
data/test/BTree_spec.rb +6 -2
data/test/BigArray_spec.rb +261 -0
data/test/BigHash_spec.rb +152 -0
data/test/BigTreeNode_spec.rb +153 -0
data/test/BigTree_spec.rb +259 -0
data/test/EquiBlobsFile_spec.rb +105 -1
data/test/FNV_Hash_1a_64_spec.rb +59 -0
data/test/FlatFileDB_spec.rb +198 -14
data/test/FuzzyStringMatcher_spec.rb +261 -0
data/test/Hash_spec.rb +13 -3
data/test/IDList_spec.rb +77 -0
data/test/LegacyDBs/LegacyDB.rb +155 -0
data/test/LegacyDBs/version_3/class_map.json +1 -0
data/test/LegacyDBs/version_3/config.json +1 -0
data/test/LegacyDBs/version_3/database.blobs +0 -0
data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
data/test/LegacyDBs/version_3/index.blobs +0 -0
data/test/LegacyDBs/version_3/version +1 -0
data/test/LockFile_spec.rb +9 -6
data/test/SpaceManager_spec.rb +176 -0
data/test/SpaceTree_spec.rb +4 -1
data/test/Store_spec.rb +305 -203
data/test/spec_helper.rb +9 -4
metadata +57 -16
data/lib/perobs/BTreeNodeCache.rb +0 -109
data/lib/perobs/TreeDB.rb +0 -277

data/lib/perobs/FlatFile.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 #
 # = FlatFile.rb -- Persistent Ruby Object Store
 #
-# Copyright (c) 2016 by Chris Schlaeger <chris@taskjuggler.org>
+# Copyright (c) 2016, 2018, 2019 by Chris Schlaeger <chris@taskjuggler.org>
 #
 # MIT License
 #
@@ -31,6 +31,8 @@ require 'perobs/Log'
 require 'perobs/FlatFileBlobHeader'
 require 'perobs/BTree'
 require 'perobs/SpaceTree'
+require 'perobs/SpaceManager'
+require 'perobs/IDList'
 module PEROBS
@@ -44,12 +46,20 @@ module PEROBS
     # Create a new FlatFile object for a database in the given path.
     # @param dir [String] Directory path for the data base file
-    def initialize(dir)
+    def initialize(dir, progressmeter)
       @db_dir = dir
+      @progressmeter = progressmeter
       @f = nil
-      @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER)
-      @marks = BTree.new(@db_dir, 'marks', INDEX_BTREE_ORDER)
-      @space_list = SpaceTree.new(@db_dir)
+      @marks = nil
+      @index = BTree.new(@db_dir, 'index', INDEX_BTREE_ORDER, @progressmeter)
+      old_spaces_file = File.join(@db_dir, 'database_spaces.blobs')
+      if File.exist?(old_spaces_file)
+        # PEROBS version 4.1.0 and earlier used this space list format. It is
+        # deprecated now. Newly created DBs use the SpaceManager format.
+        @space_list = SpaceTree.new(@db_dir, @progressmeter)
+      else
+        @space_list = SpaceManager.new(@db_dir, @progressmeter)
+      end
     end
     # Open the flat file for reading and writing.
@@ -74,33 +84,19 @@ module PEROBS
       end
       @f.sync = true
-      begin
-        @index.open(!new_db_created)
-        @space_list.open
-      rescue FatalError
-        # Ensure that the index is really closed.
-        @index.close
-        # Erase it completely
-        @index.erase
-        # Then create it again.
-        @index.open
-        # Ensure that the spaces list is really closed.
-        @space_list.close
-        # Erase it completely
-        @space_list.erase
-        # Then create it again
-        @space_list.open
-        regenerate_index_and_spaces
-      end
+      open_index_files(!new_db_created)
     end
     # Close the flat file. This method must be called to ensure that all data
     # is really written into the filesystem.
     def close
-      @space_list.close
-      @index.close
+      @space_list.close if @space_list.is_open?
+      @index.close if @index.is_open?
+      if @marks
+        @marks.erase
+        @marks = nil
+      end
       if @f
         @f.flush
@@ -139,29 +135,37 @@ module PEROBS
     # @param addr [Integer] Address of the blob to delete
     # @param id [Integer] ID of the blob to delete
     def delete_obj_by_address(addr, id)
-      @index.remove(id)
-      header = FlatFileBlobHeader.read_at(@f, addr, id)
+      @index.remove(id) if @index.is_open?
+      header = FlatFileBlobHeader.read(@f, addr, id)
       header.clear_flags
-      @space_list.add_space(addr, header.length)
+      @space_list.add_space(addr, header.length) if @space_list.is_open?
     end
     # Delete all unmarked objects.
-    def delete_unmarked_objects
-      PEROBS.log.info "Deleting unmarked objects..."
-      t = Time.now
+    def delete_unmarked_objects(&block)
+      # We don't update the index and the space list during this operation as
+      # we defragmentize the blob file at the end. We'll end the operation
+      # with an empty space list.
+      clear_index_files
+      deleted_objects_count = 0
+      @progressmeter.start('Sweeping unmarked objects', @f.size) do |pm|
+        each_blob_header do |header|
+          if header.is_valid? && !@marks.include?(header.id)
+            delete_obj_by_address(header.addr, header.id)
+            yield(header.id) if block_given?
+            deleted_objects_count += 1
+          end
-      deleted_ids = []
-      each_blob_header do |pos, header|
-        if header.is_valid? && @marks.get(header.id).nil?
-          delete_obj_by_address(pos, header.id)
-          deleted_ids << header.id
+          pm.update(header.addr)
         end
       end
       defragmentize
-      PEROBS.log.info "#{deleted_ids.length} unmarked objects deleted " +
-        "in #{Time.now - t} seconds"
-      deleted_ids
+      # Update the index file and create a new, empty space list.
+      regenerate_index_and_spaces
+      deleted_objects_count
     end
     # Write the given object into the file. This method never uses in-place
@@ -177,7 +181,7 @@ module PEROBS
       # operation is aborted or interrupted we ensure that we either have the
       # old or the new version available.
       if (old_addr = find_obj_addr_by_id(id))
-        old_header = FlatFileBlobHeader.read_at(@f, old_addr)
+        old_header = FlatFileBlobHeader.read(@f, old_addr)
         old_header.set_outdated_flag
       end
@@ -188,22 +192,24 @@ module PEROBS
       # performance impact of compression is not compensated by writing
       # less data to the storage.
       compressed = false
-      if raw_obj.bytesize > 256
+      raw_obj_bytesize = raw_obj.bytesize
+      if raw_obj_bytesize > 256
         raw_obj = Zlib.deflate(raw_obj)
+        raw_obj_bytesize = raw_obj.bytesize
         compressed = true
       end
-      addr, length = find_free_blob(raw_obj.bytesize)
+      addr, length = find_free_blob(raw_obj_bytesize)
       begin
         if length != -1
           # Just a safeguard so we don't overwrite current data.
-          header = FlatFileBlobHeader.read_at(@f, addr)
+          header = FlatFileBlobHeader.read(@f, addr)
           if header.length != length
             PEROBS.log.fatal "Length in free list (#{length}) and header " +
               "(#{header.length}) for address #{addr} don't match."
           end
-          if raw_obj.bytesize > header.length
-            PEROBS.log.fatal "Object (#{raw_obj.bytesize}) is longer than " +
+          if raw_obj_bytesize > header.length
+            PEROBS.log.fatal "Object (#{raw_obj_bytesize}) is longer than " +
               "blob space (#{header.length})."
           end
           if header.is_valid?
@@ -213,36 +219,40 @@ module PEROBS
         end
         flags = 1 << FlatFileBlobHeader::VALID_FLAG_BIT
         flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
-        FlatFileBlobHeader.new(@f, addr, flags, raw_obj.bytesize, id, crc).write
+        FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
         @f.write(raw_obj)
-        if length != -1 && raw_obj.bytesize < length
+        @f.flush
+        if length != -1 && raw_obj_bytesize < length
           # The new object was not appended and it did not completely fill the
           # free space. So we have to write a new header to mark the remaining
           # empty space.
-          unless length - raw_obj.bytesize >= FlatFileBlobHeader::LENGTH
+          unless length - raw_obj_bytesize >= FlatFileBlobHeader::LENGTH
             PEROBS.log.fatal "Not enough space to append the empty space " +
-              "header (space: #{length} bytes, object: #{raw_obj.bytesize} " +
+              "header (space: #{length} bytes, object: #{raw_obj_bytesize} " +
               "bytes)."
           end
           space_address = @f.pos
-          space_length = length - FlatFileBlobHeader::LENGTH - raw_obj.bytesize
+          space_length = length - FlatFileBlobHeader::LENGTH - raw_obj_bytesize
           FlatFileBlobHeader.new(@f, space_address, 0, space_length,
                                  0, 0).write
           # Register the new space with the space list.
-          @space_list.add_space(space_address, space_length) if space_length > 0
+          if @space_list.is_open? && space_length > 0
+            @space_list.add_space(space_address, space_length)
+          end
         end
         # Once the blob has been written we can update the index as well.
-        @index.insert(id, addr)
+        @index.insert(id, addr) if @index.is_open?
         if old_addr
           # If we had an existing object stored for the ID we have to mark
           # this entry as deleted now.
           old_header.clear_flags
-          # And register the newly freed space with the space list.
-          @space_list.add_space(old_addr, old_header.length)
-        else
           @f.flush
+          # And register the newly freed space with the space list.
+          if @space_list.is_open?
+            @space_list.add_space(old_addr, old_header.length)
+          end
         end
       rescue IOError => e
         PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -270,24 +280,20 @@ module PEROBS
       nil
     end
-    def search_object(id)
-      each_blob_header do |pos, header|
-        return read_obj_by_address(pos, id)
-      end
-      nil
+    # @return [Integer] Number of items stored in the DB.
+    def item_counter
+      @index.entries_count
     end
     # Read the object at the specified address.
     # @param addr [Integer] Offset in the flat file
     # @param id [Integer] ID of the data blob
     # @return [String] Raw object data
     def read_obj_by_address(addr, id)
-      header = FlatFileBlobHeader.read_at(@f, addr, id)
+      header = FlatFileBlobHeader.read(@f, addr, id)
       if header.id != id
         PEROBS.log.fatal "Database index corrupted: Index for object " +
-          "#{id} points to object with ID #{header.id}"
+          "#{id} points to object with ID #{header.id} at address #{addr}"
       end
       buf = nil
@@ -296,7 +302,8 @@ module PEROBS
         @f.seek(addr + FlatFileBlobHeader::LENGTH)
         buf = @f.read(header.length)
       rescue IOError => e
-        PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
+        PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
+          e.message
       end
       # Uncompress the data if the compression bit is set in the flags byte.
@@ -305,12 +312,13 @@ module PEROBS
           buf = Zlib.inflate(buf)
         rescue Zlib::BufError, Zlib::DataError
           PEROBS.log.fatal "Corrupted compressed block with ID " +
-            "#{header.id} found."
+            "#{id} found at address #{addr}."
         end
       end
       if checksum(buf) != header.crc
-        PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
+        PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
+          "at address #{addr}"
       end
       buf
@@ -319,19 +327,22 @@ module PEROBS
     # Mark the object with the given ID.
     # @param id [Integer] ID of the object
     def mark_obj_by_id(id)
-      @marks.insert(id, 0)
+      @marks.insert(id)
     end
     # Return true if the object with the given ID is marked, false otherwise.
     # @param id [Integer] ID of the object
     def is_marked_by_id?(id)
-      !@marks.get(id).nil?
+      @marks.include?(id)
     end
     # Clear all marks.
     def clear_all_marks
-      @marks.erase
-      @marks.open
+      if @marks
+        @marks.clear
+      else
+        @marks = IDList.new(@db_dir, 'marks', item_counter)
+      end
     end
     # Eliminate all the holes in the file. This is an in-place
@@ -340,59 +351,72 @@ module PEROBS
       distance = 0
       new_file_size = 0
       deleted_blobs = 0
+      corrupted_blobs = 0
       valid_blobs = 0
-      t = Time.now
-      PEROBS.log.info "Defragmenting FlatFile"
       # Iterate over all entries.
-      each_blob_header do |pos, header|
-        # Total size of the current entry
-        entry_bytes = FlatFileBlobHeader::LENGTH + header.length
-        if header.is_valid?
-          # We have found a valid entry.
-          valid_blobs += 1
-          if distance > 0
-            begin
-              # Read current entry into a buffer
-              @f.seek(pos)
-              buf = @f.read(entry_bytes)
-              # Write the buffer right after the end of the previous entry.
-              @f.seek(pos - distance)
-              @f.write(buf)
-              # Update the index with the new position
-              @index.insert(header.id, pos - distance)
-              # Mark the space between the relocated current entry and the
-              # next valid entry as deleted space.
-              FlatFileBlobHeader.new(@f, @f.pos, 0,
-                                     distance - FlatFileBlobHeader::LENGTH,
-                                     0, 0).write
-              @f.flush
-            rescue IOError => e
-              PEROBS.log.fatal "Error while moving blob for ID #{header.id}: " +
-                e.message
+      @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
+        each_blob_header do |header|
+          # If we have stumbled over a corrupted blob we treat it similar to a
+          # deleted blob and reuse the space.
+          if header.corruption_start
+            distance += header.addr - header.corruption_start
+            corrupted_blobs += 1
+          end
+          # Total size of the current entry
+          entry_bytes = FlatFileBlobHeader::LENGTH + header.length
+          if header.is_valid?
+            # We have found a valid entry.
+            valid_blobs += 1
+            if distance > 0
+              begin
+                # Read current entry into a buffer
+                @f.seek(header.addr)
+                buf = @f.read(entry_bytes)
+                # Write the buffer right after the end of the previous entry.
+                @f.seek(header.addr - distance)
+                @f.write(buf)
+                # Mark the space between the relocated current entry and the
+                # next valid entry as deleted space.
+                FlatFileBlobHeader.new(@f, @f.pos, 0,
+                                       distance - FlatFileBlobHeader::LENGTH,
+                                       0, 0).write
+                @f.flush
+              rescue IOError => e
+                PEROBS.log.fatal "Error while moving blob for ID " +
+                  "#{header.id}: #{e.message}"
+              end
             end
+            new_file_size = header.addr - distance +
+              FlatFileBlobHeader::LENGTH + header.length
+          else
+            deleted_blobs += 1
+            distance += entry_bytes
           end
-          new_file_size = pos + FlatFileBlobHeader::LENGTH + header.length
-        else
-          deleted_blobs += 1
-          distance += entry_bytes
+          pm.update(header.addr)
         end
       end
-      PEROBS.log.info "FlatFile defragmented in #{Time.now - t} seconds"
       PEROBS.log.info "#{distance / 1000} KiB/#{deleted_blobs} blobs of " +
         "#{@f.size / 1000} KiB/#{valid_blobs} blobs or " +
         "#{'%.1f' % (distance.to_f / @f.size * 100.0)}% reclaimed"
+      if corrupted_blobs > 0
+        PEROBS.log.info "#{corrupted_blobs} corrupted blob(s) found. Space " +
+          "was recycled."
+      end
       @f.flush
       @f.truncate(new_file_size)
       @f.flush
-      @space_list.clear
       sync
     end
     # This method iterates over all entries in the FlatFile and removes the
     # entry and inserts it again. This is useful to update all entries in
-    # cased the storage format has changed.
+    # case the storage format has changed.
     def refresh
       # This iteration might look scary as we iterate over the entries while
       # we are rearranging them. Re-inserted items may be inserted
@@ -400,132 +424,276 @@ module PEROBS
       # inserted after the current entry and will be re-read again unless they
       # are inserted after the original file end.
       file_size = @f.size
-      PEROBS.log.info "Refreshing the DB..."
-      t = Time.now
-      each_blob_header do |pos, header|
-        if header.is_valid?
-          buf = read_obj_by_address(pos, header.id)
-          delete_obj_by_address(pos, header.id)
-          write_obj_by_id(header.id, buf)
-        end
-        # Some re-inserted blobs may be inserted after the original file end.
-        # No need to process those blobs again.
-        break if pos >= file_size
+      # We don't update the index and the space list during this operation as
+      # we defragmentize the blob file at the end. We'll end the operation
+      # with an empty space list.
+      clear_index_files
+      @progressmeter.start('Converting objects to new storage format',
+                           @f.size) do |pm|
+        each_blob_header do |header|
+          if header.is_valid?
+            buf = read_obj_by_address(header.addr, header.id)
+            delete_obj_by_address(header.addr, header.id)
+            write_obj_by_id(header.id, buf)
+          end
+          # Some re-inserted blobs may be inserted after the original file end.
+          # No need to process those blobs again.
+          break if header.addr >= file_size
+          pm.update(header.addr)
+        end
       end
-      PEROBS.log.info "DB refresh completed in #{Time.now - t} seconds"
       # Reclaim the space saved by compressing entries.
       defragmentize
+      # Recreate the index file and create an empty space list.
+      regenerate_index_and_spaces
     end
-    # Check (and repair) the FlatFile.
-    # @param repair [Boolean] True if errors should be fixed.
+    # Check the FlatFile.
     # @return [Integer] Number of errors found
-    def check(repair = false)
+    def check()
       errors = 0
       return errors unless @f
       t = Time.now
-      PEROBS.log.info "Checking FlatFile database" +
-        "#{repair ? ' in repair mode' : ''}..."
+      PEROBS.log.info "Checking FlatFile database..."
       # First check the database blob file. Each entry should be readable and
       # correct and all IDs must be unique. We use a shadow index to keep
       # track of the already found IDs.
-      new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER)
+      new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER,
+                            @progressmeter)
       new_index.erase
       new_index.open
-      each_blob_header do |pos, header|
-        if header.is_valid?
-          # We have a non-deleted entry.
-          begin
-            @f.seek(pos + FlatFileBlobHeader::LENGTH)
-            buf = @f.read(header.length)
-            if buf.bytesize != header.length
-              PEROBS.log.error "Premature end of file in blob with ID " +
-                "#{header.id}."
-              discard_damaged_blob(header) if repair
-              errors += 1
-              next
-            end
+      corrupted_blobs = 0
+      end_of_last_healthy_blob = nil
+      @progressmeter.start('Checking blobs file', @f.size) do |pm|
+        corrupted_blobs = each_blob_header do |header|
+          if header.is_valid?
+            # We have a non-deleted entry.
+            begin
+              @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
+              buf = @f.read(header.length)
+              if buf.bytesize != header.length
+                PEROBS.log.error "Premature end of file in blob with ID " +
+                  "#{header.id}."
+                errors += 1
+                next
+              end
-            # Uncompress the data if the compression bit is set in the mark
-            # byte.
-            if header.is_compressed?
-              begin
-                buf = Zlib.inflate(buf)
-              rescue Zlib::BufError, Zlib::DataError
-                PEROBS.log.error "Corrupted compressed block with ID " +
-                  "#{header.id} found."
-                discard_damaged_blob(header) if repair
+              # Uncompress the data if the compression bit is set in the mark
+              # byte.
+              if header.is_compressed?
+                begin
+                  buf = Zlib.inflate(buf)
+                rescue Zlib::BufError, Zlib::DataError
+                  PEROBS.log.error "Corrupted compressed block with ID " +
+                    "#{header.id} found."
+                  errors += 1
+                  next
+                end
+              end
+              if header.crc && checksum(buf) != header.crc
+                PEROBS.log.error "Checksum failure while checking blob " +
+                  "with ID #{header.id}"
                 errors += 1
                 next
               end
+            rescue IOError => e
+              PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
+                e.message
             end
-            if header.crc && checksum(buf) != header.crc
-              PEROBS.log.error "Checksum failure while checking blob " +
-                "with ID #{header.id}"
-              discard_damaged_blob(header) if repair
+            # Check if the ID has already been found in the file.
+            if (previous_address = new_index.get(header.id))
+              PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
+                "Addresses: #{previous_address}, #{header.addr}"
               errors += 1
-              next
+              previous_header = FlatFileBlobHeader.read(@f, previous_address,
+                                                        header.id)
+            else
+              # ID is unique so far. Add it to the shadow index.
+              new_index.insert(header.id, header.addr)
             end
-          rescue IOError => e
-            PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
-              e.message
           end
+          end_of_last_healthy_blob = header.addr +
+            FlatFileBlobHeader::LENGTH + header.length
-          # Check if the ID has already been found in the file.
-          if (previous_address = new_index.get(header.id))
-            PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
-              "Addresses: #{previous_address}, #{pos}"
-            previous_header = FlatFileBlobHeader.read_at(@f, previous_address,
-                                                         header.id)
-            if repair
-              # We have two blobs with the same ID and we must discard one of
-              # them.
-              if header.is_outdated?
-                discard_damaged_blob(header)
-              elsif previous_header.is_outdated?
-                discard_damaged_blob(previous_header)
-              else
-                PEROBS.log.error "None of the blobs with same ID have " +
-                  "the outdated flag set. Deleting the smaller one."
-                discard_damaged_blob(header.length < previous_header.length ?
-                                     header : previous_header)
-              end
-              next
-            end
-          else
-            # ID is unique so far. Add it to the shadow index.
-            new_index.insert(header.id, pos)
-          end
+          pm.update(header.addr)
+        end
+        if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
+          # The blob file ends with a corrupted blob header.
+          PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
+            'bytes found at the end of FlatFile.'
+          corrupted_blobs += 1
         end
+        errors += corrupted_blobs
       end
       # We no longer need the new index.
       new_index.close
       new_index.erase
-      # Now we check the index data. It must be correct and the entries must
-      # match the blob file. All entries in the index must be in the blob file
-      # and vise versa.
-      begin
-        index_ok = @index.check do |id, address|
-          has_id_at?(id, address)
+      if corrupted_blobs == 0
+        # Now we check the index data. It must be correct and the entries must
+        # match the blob file. All entries in the index must be in the blob file
+        # and vice versa.
+        begin
+          index_ok = @index.check do |id, address|
+            unless has_id_at?(id, address)
+              PEROBS.log.error "Index contains an entry for " +
+                "ID #{id} at address #{address} that is not in FlatFile"
+              false
+            else
+              true
+            end
+          end
+          x_check_errs = 0
+          space_check_ok = true
+          unless index_ok && (space_check_ok = @space_list.check(self)) &&
+            (x_check_errs = cross_check_entries) == 0
+            errors += 1 unless index_ok && space_check_ok
+            errors += x_check_errs
+          end
+        rescue PEROBS::FatalError
+          errors += 1
         end
-        unless index_ok && @space_list.check(self) && cross_check_entries
-          regenerate_index_and_spaces if repair
+      end
+      PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
+        "#{errors} errors found."
+      errors
+    end
+    # Repair the FlatFile. In contrast to the repair functionality in the
+    # check() method this method is much faster. It simply re-creates the
+    # index and space list from the blob file.
+    # @return [Integer] Number of errors found
+    def repair
+      errors = 0
+      return errors unless @f
+      t = Time.now
+      PEROBS.log.info "Repairing FlatFile database"
+      # Erase and re-open the index and space list files. We purposely don't
+      # close the files as it would trigger needless flushing.
+      clear_index_files(true)
+      # Now we scan the blob file and re-index all blobs and spaces. Corrupted
+      # blobs will be skipped.
+      corrupted_blobs = 0
+      end_of_last_healthy_blob = nil
+      @progressmeter.start('Re-indexing blobs file', @f.size) do |pm|
+        corrupted_blobs = each_blob_header do |header|
+          if header.corruption_start
+            # The blob is preceded by a corrupted area. We create a new
+            # header of a deleted blob for this area and write the new blob
+            # over it.
+            if (data_length = header.addr - header.corruption_start -
+                FlatFileBlobHeader::LENGTH) <= 0
+              PEROBS.log.error "Found a corrupted blob that is too small to " +
+                "fit a header (#{data_length}). File must be defragmented."
+            else
+              new_header = FlatFileBlobHeader.new(@f, header.corruption_start,
+                                                  0, data_length, 0, 0)
+              new_header.write
+              @space_list.add_space(header.corruption_start, data_length)
+            end
+          end
+          if header.is_valid?
+            # We have a non-deleted entry.
+            begin
+              @f.seek(header.addr + FlatFileBlobHeader::LENGTH)
+              buf = @f.read(header.length)
+              if buf.bytesize != header.length
+                PEROBS.log.error "Premature end of file in blob with ID " +
+                  "#{header.id}."
+                discard_damaged_blob(header)
+                errors += 1
+                next
+              end
+              # Uncompress the data if the compression bit is set in the mark
+              # byte.
+              if header.is_compressed?
+                begin
+                  buf = Zlib.inflate(buf)
+                rescue Zlib::BufError, Zlib::DataError
+                  PEROBS.log.error "Corrupted compressed block with ID " +
+                    "#{header.id} found."
+                  discard_damaged_blob(header)
+                  errors += 1
+                  next
+                end
+              end
+              if header.crc && checksum(buf) != header.crc
+                PEROBS.log.error "Checksum failure while checking blob " +
+                  "with ID #{header.id}"
+                discard_damaged_blob(header)
+                errors += 1
+                next
+              end
+            rescue IOError => e
+              PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
+                e.message
+            end
+            # Check if the ID has already been found in the file.
+            if (previous_address = @index.get(header.id))
+              PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
+                "Addresses: #{previous_address}, #{header.addr}"
+              errors += 1
+              previous_header = FlatFileBlobHeader.read(@f, previous_address,
+                                                        header.id)
+              # We have two blobs with the same ID and we must discard one of
+              # them.
+              discard_duplicate_blobs(header, previous_header)
+            else
+              # ID is unique so far. Add it to the index.
+              @index.insert(header.id, header.addr)
+            end
+          else
+            if header.length > 0
+              @space_list.add_space(header.addr, header.length)
+            end
+          end
+          end_of_last_healthy_blob = header.addr +
+            FlatFileBlobHeader::LENGTH + header.length
+          pm.update(header.addr)
         end
-      rescue PEROBS::FatalError
-        errors += 1
-        regenerate_index_and_spaces if repair
+        if end_of_last_healthy_blob && end_of_last_healthy_blob != @f.size
+          # The blob file ends with a corrupted blob header.
+          PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
+            'bytes found at the end of FlatFile.'
+          corrupted_blobs += 1
+          PEROBS.log.error "Truncating FlatFile to " +
+            "#{end_of_last_healthy_blob} bytes by discarding " +
+            "#{@f.size - end_of_last_healthy_blob} bytes"
+          @f.truncate(end_of_last_healthy_blob)
+        end
+        errors += corrupted_blobs
       end
-      sync if repair
-      PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
+      sync
+      PEROBS.log.info "FlatFile repair completed in #{Time.now - t} seconds. " +
         "#{errors} errors found."
       errors
@@ -535,22 +703,32 @@ module PEROBS
     # regenerates them from the FlatFile.
     def regenerate_index_and_spaces
       PEROBS.log.warn "Re-generating FlatFileDB index and space files"
+      @index.open unless @index.is_open?
       @index.clear
+      @space_list.open unless @space_list.is_open?
       @space_list.clear
-      each_blob_header do |pos, header|
-        if header.is_valid?
-          if (duplicate_pos = @index.get(header.id))
-            PEROBS.log.error "FlatFile contains multiple blobs for ID " +
-              "#{header.id}. First blob is at address #{duplicate_pos}. " +
-              "Other blob found at address #{pos}."
-            @space_list.add_space(pos, header.length) if header.length > 0
-            discard_damaged_blob(header)
+      @progressmeter.start('Re-generating database index', @f.size) do |pm|
+        each_blob_header do |header|
+          if header.is_valid?
+            if (duplicate_pos = @index.get(header.id))
+              PEROBS.log.error "FlatFile contains multiple blobs for ID " +
+                "#{header.id}. First blob is at address #{duplicate_pos}. " +
+                "Other blob found at address #{header.addr}."
+              if header.length > 0
+                @space_list.add_space(header.addr, header.length)
+              end
+              discard_damaged_blob(header)
+            else
+              @index.insert(header.id, header.addr)
+            end
           else
-            @index.insert(header.id, pos)
+            if header.length > 0
+              @space_list.add_space(header.addr, header.length)
+            end
           end
-        else
-          @space_list.add_space(pos, header.length) if header.length > 0
+          pm.update(header.addr)
         end
       end
@@ -558,19 +736,23 @@ module PEROBS
     end
     def has_space?(address, size)
-      header = FlatFileBlobHeader.read_at(@f, address)
+      header = FlatFileBlobHeader.read(@f, address)
       !header.is_valid? && header.length == size
     end
     def has_id_at?(id, address)
+      # Reports whether the blob header stored at address is valid and
+      # carries the given id. A header that cannot be read
+      # (PEROBS::FatalError) is treated as "no such blob here" and yields
+      # false instead of propagating the error.
-      header = FlatFileBlobHeader.read_at(@f, address)
+      begin
+        header = FlatFileBlobHeader.read(@f, address)
+      rescue PEROBS::FatalError
+        return false
+      end
       header.is_valid? && header.id == id
     end
     def inspect
       s = '['
-      each_blob_header do |pos, header|
-        s << "{ :pos => #{pos}, :flags => #{header.flags}, " +
+      each_blob_header do |header|
+        s << "{ :pos => #{header.addr}, :flags => #{header.flags}, " +
              ":length => #{header.length}, :id => #{header.id}, " +
              ":crc => #{header.crc}"
         if header.is_valid?
@@ -581,21 +763,68 @@ module PEROBS
       s + ']'
     end
+    def FlatFile::insert_header_checksums(db_dir)
+      old_file_name = File.join(db_dir, 'database.blobs')
+      new_file_name = File.join(db_dir, 'database_v4.blobs')
+      bak_file_name = File.join(db_dir, 'database_v3.blobs')
+      old_file = File.open(old_file_name, 'rb')
+      new_file = File.open(new_file_name, 'wb')
+      entries = 0
+      while (buf = old_file.read(21))
+        flags, length, id, crc = *buf.unpack('CQQL')
+        blob_data = old_file.read(length)
+        # Some basic sanity checking to ensure all reserved bits are 0. Older
+        # versions of PEROBS used to set bit 1 despite it being reserved now.
+        unless flags & 0xF0 == 0
+          PEROBS.log.fatal "Blob file #{old_file_name} contains illegal " +
+            "flag byte #{'%02x' % flags} at #{old_file.pos - 21}"
+        end
+        # Check if the blob is valid and current.
+        if flags & 0x1 == 1 && flags & 0x8 == 0
+          # Make sure the bit 1 is not set anymore.
+          flags = flags & 0x05
+          header_str = [ flags, length, id, crc ].pack('CQQL')
+          header_crc = Zlib.crc32(header_str, 0)
+          header_str += [ header_crc ].pack('L')
+          new_file.write(header_str + blob_data)
+          entries += 1
+        end
+      end
+      PEROBS.log.info "Header checksum added to #{entries} entries"
+      old_file.close
+      new_file.close
+      File.rename(old_file_name, bak_file_name)
+      File.rename(new_file_name, old_file_name)
+    end
     private
     def each_blob_header(&block)
+      # Walks the blob file from offset 0 and yields every blob header that
+      # FlatFileBlobHeader.read returns, until it returns a false value.
+      # After each yield the file position is moved to the end of the blob
+      # that belongs to the header. Returns the number of yielded headers
+      # whose corruption_start is set. An IOError while reading is reported
+      # via PEROBS.log.fatal.
-      pos = 0
+      corrupted_blobs = 0
       begin
         @f.seek(0)
         while (header = FlatFileBlobHeader.read(@f))
-          yield(pos, header)
+          if header.corruption_start
+            corrupted_blobs += 1
+          end
+          yield(header)
-          pos += FlatFileBlobHeader::LENGTH + header.length
-          @f.seek(pos)
+          @f.seek(header.addr + FlatFileBlobHeader::LENGTH + header.length)
         end
       rescue IOError => e
         PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
       end
+      corrupted_blobs
     end
     def find_free_blob(bytes)
@@ -625,26 +854,34 @@ module PEROBS
     def cross_check_entries
+      # Compares every blob header in the file against the space list and
+      # the index and logs each mismatch. Entries that are not valid and
+      # have a non-zero length must be known to the space list; valid
+      # entries must be listed in the index under their own address.
+      # Returns the number of mismatches found.
       errors = 0
-      each_blob_header do |pos, header|
-        if !header.is_valid?
-          if header.length > 0
-            unless @space_list.has_space?(pos, header.length)
-              PEROBS.log.error "FlatFile has free space " +
-                "(addr: #{pos}, len: #{header.length}) that is not in " +
-                "FreeSpaceManager"
-              errors += 1
+      @progressmeter.start('Cross checking blobs and index', @f.size) do |pm|
+        each_blob_header do |header|
+          if !header.is_valid?
+            if header.length > 0
+              unless @space_list.has_space?(header.addr, header.length)
+                PEROBS.log.error "FlatFile has free space " +
+                  "(addr: #{header.addr}, len: #{header.length}) that is " +
+                  "not in SpaceManager"
+                errors += 1
+              end
+            end
+          else
+            # A missing index entry and an entry with a different address
+            # are reported as separate problems.
+            if (index_address = @index.get(header.id)).nil?
+              PEROBS.log.error "FlatFile blob at address #{header.addr} " +
+                "is not listed in the index"
+              errors +=1
+            elsif index_address != header.addr
+                PEROBS.log.error "FlatFile blob at address #{header.addr} " +
+                  "is listed in index with address #{index_address}"
+                errors += 1
             end
           end
-        else
-          unless @index.get(header.id) == pos
-            PEROBS.log.error "FlatFile blob at address #{pos} is listed " +
-              "in index with address #{@index.get(header.id)}"
-            errors += 1
-          end
+          pm.update(header.addr)
         end
       end
-      errors == 0
+      errors
     end
     def discard_damaged_blob(header)
@@ -653,6 +890,61 @@ module PEROBS
       header.clear_flags
     end
+    # Resolve the situation where two blobs in the file carry the same ID by
+    # discarding one of them. A blob with the outdated flag is discarded in
+    # favor of the other one. If neither is flagged, the smaller blob is
+    # discarded. In every case the index is left pointing at the surviving
+    # blob.
+    # @param header [FlatFileBlobHeader] header of the blob found last
+    # @param previous_header [FlatFileBlobHeader] header of the blob with
+    #        the same ID that was found earlier; presumably the one the
+    #        index currently refers to -- confirm against the caller
+    # NOTE(review): only the last branch registers the discarded blob with
+    # @space_list. Whether the first two branches should do the same could
+    # not be confirmed from this method alone.
+    def discard_duplicate_blobs(header, previous_header)
+      if header.is_outdated?
+        discard_damaged_blob(header)
+      elsif previous_header.is_outdated?
+        discard_damaged_blob(previous_header)
+        # The blob that was just discarded is the one that was found first.
+        # Re-point the index entry for this ID at the surviving blob, the
+        # same way the branch below does, so the ID does not resolve to the
+        # discarded blob.
+        @index.insert(header.id, header.addr)
+      else
+        smaller, larger = header.length < previous_header.length ?
+          [ header, previous_header ] : [ previous_header, header ]
+        PEROBS.log.error "None of the blobs with same ID have " +
+          "the outdated flag set. Deleting the smaller one " +
+          "at address #{smaller.addr}"
+        discard_damaged_blob(smaller)
+        @space_list.add_space(smaller.addr, smaller.length)
+        @index.insert(larger.id, larger.addr)
+      end
+    end
+    def open_index_files(abort_on_missing_files = false)
+      begin
+        @index.open(abort_on_missing_files)
+        @space_list.open
+      rescue FatalError
+        clear_index_files
+        regenerate_index_and_spaces
+      end
+    end
+    def erase_index_files(dont_close_files = false)
+      # Ensure that the index is really closed.
+      @index.close unless dont_close_files
+      # Erase it completely
+      @index.erase
+      # Ensure that the spaces list is really closed.
+      @space_list.close unless dont_close_files
+      # Erase it completely
+      @space_list.erase
+      if @space_list.is_a?(SpaceTree)
+        # If we still use the old SpaceTree format, this is the moment to
+        # convert it to the new SpaceManager format.
+        @space_list = SpaceManager.new(@db_dir, @progressmeter)
+        PEROBS.log.warn "Converting space list from SpaceTree format " +
+          "to SpaceManager format"
+      end
+    end
+    # Reset the index and the space list to an empty state: both are erased
+    # via erase_index_files and then opened again.
+    # @param dont_close_files [Boolean] handed through to erase_index_files
+    def clear_index_files(dont_close_files = false)
+      erase_index_files(dont_close_files)
+      # Then create them again.
+      @index.open
+      @space_list.open
+    end
   end
 end