RubyGems - perobs - Versions diffs - 4.2.0 → 4.5.0 - Mend

perobs 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/README.md +27 -16
data/lib/perobs/BTree.rb +2 -2
data/lib/perobs/BTreeNode.rb +46 -29
data/lib/perobs/BigArrayNode.rb +11 -9
data/lib/perobs/Cache.rb +32 -6
data/lib/perobs/EquiBlobsFile.rb +2 -0
data/lib/perobs/FlatFile.rb +40 -60
data/lib/perobs/FuzzyStringMatcher.rb +32 -49
data/lib/perobs/Hash.rb +68 -23
data/lib/perobs/IDListPageFile.rb +2 -1
data/lib/perobs/IDListPageRecord.rb +1 -1
data/lib/perobs/Log.rb +5 -0
data/lib/perobs/ObjectBase.rb +7 -0
data/lib/perobs/SpaceTree.rb +1 -1
data/lib/perobs/Store.rb +177 -125
data/lib/perobs/version.rb +1 -1
data/lib/perobs.rb +1 -0
data/perobs.gemspec +1 -1
data/test/FlatFileDB_spec.rb +30 -0
data/test/FuzzyStringMatcher_spec.rb +94 -4
data/test/Hash_spec.rb +12 -1
data/test/Store_spec.rb +14 -0
metadata +8 -10
data/lib/perobs/BTreeNodeCache.rb +0 -109

data/lib/perobs/FlatFile.rb CHANGED Viewed

@@ -221,6 +221,7 @@ module PEROBS
         flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
         FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytesize, id, crc).write
         @f.write(raw_obj)
+        @f.flush
         if length != -1 && raw_obj_bytesize < length
           # The new object was not appended and it did not completely fill the
           # free space. So we have to write a new header to mark the remaining
@@ -247,12 +248,11 @@ module PEROBS
           # If we had an existing object stored for the ID we have to mark
           # this entry as deleted now.
           old_header.clear_flags
+          @f.flush
           # And register the newly freed space with the space list.
           if @space_list.is_open?
             @space_list.add_space(old_addr, old_header.length)
           end
-        else
-          @f.flush
         end
       rescue IOError => e
         PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
@@ -293,7 +293,7 @@ module PEROBS
       header = FlatFileBlobHeader.read(@f, addr, id)
       if header.id != id
         PEROBS.log.fatal "Database index corrupted: Index for object " +
-          "#{id} points to object with ID #{header.id}"
+          "#{id} points to object with ID #{header.id} at address #{addr}"
       end
       buf = nil
@@ -302,7 +302,8 @@ module PEROBS
         @f.seek(addr + FlatFileBlobHeader::LENGTH)
         buf = @f.read(header.length)
       rescue IOError => e
-        PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
+        PEROBS.log.fatal "Cannot read blob for ID #{id} at address #{addr}: " +
+          e.message
       end
       # Uncompress the data if the compression bit is set in the flags byte.
@@ -311,12 +312,13 @@ module PEROBS
           buf = Zlib.inflate(buf)
         rescue Zlib::BufError, Zlib::DataError
           PEROBS.log.fatal "Corrupted compressed block with ID " +
-            "#{header.id} found."
+            "#{id} found at address #{addr}."
         end
       end
       if checksum(buf) != header.crc
-        PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
+        PEROBS.log.fatal "Checksum failure while reading blob ID #{id} " +
+          "at address #{addr}"
       end
       buf
@@ -339,7 +341,7 @@ module PEROBS
       if @marks
         @marks.clear
       else
-        @marks = IDList.new(@db_dir, 'marks', 8)
+        @marks = IDList.new(@db_dir, 'marks', item_counter)
       end
     end
@@ -353,7 +355,7 @@ module PEROBS
       valid_blobs = 0
       # Iterate over all entries.
-      @progressmeter.start('Defragmentizing blobs file', @f.size) do |pm|
+      @progressmeter.start('Defragmenting blobs file', @f.size) do |pm|
         each_blob_header do |header|
           # If we have stumbled over a corrupted blob we treat it similar to a
           # deleted blob and reuse the space.
@@ -452,16 +454,14 @@ module PEROBS
       regenerate_index_and_spaces
     end
-    # Check (and repair) the FlatFile.
-    # @param repair [Boolean] True if errors should be fixed.
+    # Check the FlatFile.
     # @return [Integer] Number of errors found
-    def check(repair = false)
+    def check()
       errors = 0
       return errors unless @f
       t = Time.now
-      PEROBS.log.info "Checking FlatFile database" +
-        "#{repair ? ' in repair mode' : ''}..."
+      PEROBS.log.info "Checking FlatFile database..."
       # First check the database blob file. Each entry should be readable and
       # correct and all IDs must be unique. We use a shadow index to keep
@@ -483,7 +483,6 @@ module PEROBS
               if buf.bytesize != header.length
                 PEROBS.log.error "Premature end of file in blob with ID " +
                   "#{header.id}."
-                discard_damaged_blob(header) if repair
                 errors += 1
                 next
               end
@@ -496,7 +495,6 @@ module PEROBS
                 rescue Zlib::BufError, Zlib::DataError
                   PEROBS.log.error "Corrupted compressed block with ID " +
                     "#{header.id} found."
-                  discard_damaged_blob(header) if repair
                   errors += 1
                   next
                 end
@@ -505,7 +503,6 @@ module PEROBS
               if header.crc && checksum(buf) != header.crc
                 PEROBS.log.error "Checksum failure while checking blob " +
                   "with ID #{header.id}"
-                discard_damaged_blob(header) if repair
                 errors += 1
                 next
               end
@@ -521,22 +518,6 @@ module PEROBS
               errors += 1
               previous_header = FlatFileBlobHeader.read(@f, previous_address,
                                                         header.id)
-              if repair
-                # We have two blobs with the same ID and we must discard one of
-                # them.
-                if header.is_outdated?
-                  discard_damaged_blob(header)
-                elsif previous_header.is_outdated?
-                  discard_damaged_blob(previous_header)
-                else
-                  PEROBS.log.error "None of the blobs with same ID have " +
-                    "the outdated flag set. Deleting the smaller one."
-                  errors += 1
-                  discard_damaged_blob(header.length < previous_header.length ?
-                                       header : previous_header)
-                end
-                next
-              end
             else
               # ID is unique so far. Add it to the shadow index.
               new_index.insert(header.id, header.addr)
@@ -553,12 +534,6 @@ module PEROBS
           PEROBS.log.error "#{@f.size - end_of_last_healthy_blob} corrupted " +
             'bytes found at the end of FlatFile.'
           corrupted_blobs += 1
-          if repair
-            PEROBS.log.error "Truncating FlatFile to " +
-              "#{end_of_last_healthy_blob} bytes by discarding " +
-              "#{@f.size - end_of_last_healthy_blob} bytes"
-            @f.truncate(end_of_last_healthy_blob)
-          end
         end
         errors += corrupted_blobs
@@ -568,17 +543,19 @@ module PEROBS
       new_index.close
       new_index.erase
-      if repair && corrupted_blobs > 0
-        erase_index_files
-        defragmentize
-        regenerate_index_and_spaces
-      elsif corrupted_blobs == 0
+      if corrupted_blobs == 0
         # Now we check the index data. It must be correct and the entries must
         # match the blob file. All entries in the index must be in the blob file
         # and vice versa.
         begin
           index_ok = @index.check do |id, address|
-            has_id_at?(id, address)
+            unless has_id_at?(id, address)
+              PEROBS.log.error "Index contains an entry for " +
+                "ID #{id} at address #{address} that is not in FlatFile"
+              false
+            else
+              true
+            end
           end
           x_check_errs = 0
           space_check_ok = true
@@ -586,16 +563,13 @@ module PEROBS
             (x_check_errs = cross_check_entries) == 0
             errors += 1 unless index_ok && space_check_ok
             errors += x_check_errs
-            regenerate_index_and_spaces if repair
           end
         rescue PEROBS::FatalError
           errors += 1
-          regenerate_index_and_spaces if repair
         end
       end
-      sync if repair
-      PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
+      PEROBS.log.info "FlatFile check completed in #{Time.now - t} seconds. " +
         "#{errors} errors found."
       errors
@@ -604,7 +578,6 @@ module PEROBS
     # Repair the FlatFile. Unlike check(), which only reports errors, this
     # method re-creates the index and space list from the blob file.
-    # @param repair [Boolean] True if errors should be fixed.
     # @return [Integer] Number of errors found
     def repair
       errors = 0
@@ -687,17 +660,7 @@ module PEROBS
                                                         header.id)
               # We have two blobs with the same ID and we must discard one of
               # them.
-              if header.is_outdated?
-                discard_damaged_blob(header)
-              elsif previous_header.is_outdated?
-                discard_damaged_blob(previous_header)
-              else
-                PEROBS.log.error "None of the blobs with same ID have " +
-                  "the outdated flag set. Deleting the smaller one."
-                errors += 1
-                discard_damaged_blob(header.length < previous_header.length ?
-                                     header : previous_header)
-              end
+              discard_duplicate_blobs(header, previous_header)
             else
               # ID is unique so far. Add it to the shadow index.
               @index.insert(header.id, header.addr)
@@ -927,6 +890,23 @@ module PEROBS
       header.clear_flags
     end
+    def discard_duplicate_blobs(header, previous_header)
+      if header.is_outdated?
+        discard_damaged_blob(header)
+      elsif previous_header.is_outdated?
+        discard_damaged_blob(previous_header)
+      else
+        smaller, larger = header.length < previous_header.length ?
+          [ header, previous_header ] : [ previous_header, header ]
+        PEROBS.log.error "None of the blobs with same ID have " +
+          "the outdated flag set. Deleting the smaller one " +
+          "at address #{smaller.addr}"
+        discard_damaged_blob(smaller)
+        @space_list.add_space(smaller.addr, smaller.length)
+        @index.insert(larger.id, larger.addr)
+      end
+    end
     def open_index_files(abort_on_missing_files = false)
       begin
         @index.open(abort_on_missing_files)

data/lib/perobs/FuzzyStringMatcher.rb CHANGED Viewed

@@ -26,40 +26,42 @@
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 require 'perobs/Log'
-require 'perobs/ObjectBase'
+require 'perobs/Object'
 module PEROBS
   # The fuzzy string matcher can be used to perform a fuzzy string search
   # against a known set of strings. The dictionary of known strings does not
-  # store the actual strings but references to arbitrary objects. These could
-  # be the string, but can be something else related to the learned strings.
-  # To use this class a list of strings with their references must be learned.
-  # Once the dictionary has been established, fuzzy matches can be done.
-  class FuzzyStringMatcher
+  # store the actual strings but references to String or PEROBS objects.
+  # Once the dictionary has been established, fuzzy matches can be done. Since
+  # the actual input strings are not directly stored, you cannot remove or
+  # modify already stored strings. To remove strings, you have to clear the
+  # matcher and add the strings again that you want to keep.
+  class FuzzyStringMatcher < PEROBS::Object
+    attr_persist :case_sensitive, :n, :dict
     # Create a new FuzzyStringMatcher.
-    # @param store [PEROBS::Store] place to store the dictionary
-    # @param name [String] Unique name of the string matcher
+    # @param p [PEROBS::Store] place to store the dictionary
     # @param case_sensitive [Boolean] True if case matters for matching
     # @param n [Integer] Determines what kind of n-gramm is used to store the
     #        references in the dictionary. It also determines the minimum word
-    #        length that can be used for fuzzy matches.
-    def initialize(store, name, case_sensitive = false, n = 4)
-      @store = store
-      @dict_name = "FuzzyStringMatcher::#{name}"
+    #        length that can be used for fuzzy matches. Values between 2 and
+    #        10 are supported. The default is 4.
+    def initialize(p, case_sensitive = false, n = 4)
+      super(p)
       if n < 2 || n > 10
         raise ArgumentError, 'n must be between 2 and 10'
       end
-      @case_sensitive = case_sensitive
-      @n = n
+      self.case_sensitive = case_sensitive
+      self.n = n
-      clear unless (@dict = @store[@dict_name])
+      clear unless @dict
     end
     # Wipe the dictionary.
     def clear
-      @store[@dict_name] = @dict = @store.new(BigHash)
+      self.dict = @store.new(BigHash)
     end
     # Add a string with its reference to the dictionary.
@@ -79,11 +81,8 @@ module PEROBS
           @dict[n_gramm] = ng_list = @store.new(Hash)
         end
-        if ng_list.include?(reference)
-          ng_list[reference] += 1
-        else
-          ng_list[reference] = 0
-        end
+        # We use the Hash as a Set. The value doesn't matter.
+        ng_list[reference] = true unless ng_list.include?(reference)
       end
       nil
@@ -109,22 +108,12 @@ module PEROBS
       matches = {}
-      # This will be the best possible score for a perfect match.
-      best_possible_score = 0
       each_n_gramm(string) do |n_gramm|
-        best_possible_score += 1
         if (ng_list = @dict[n_gramm])
-          ng_list.each do |reference, count|
+          ng_list.each do |reference, dummy|
             if matches.include?(reference)
               matches[reference] += 1
             else
-              # We use internally a 10 times larger list so that we don't
-              # throw away good matches too early. If the max_count value is
-              # chosen too small there is a risk of not finding the best
-              # matches!
-              if matches.size > 10 * max_count
-                matches = discard_worst_match(matches)
-              end
               matches[reference] = 1
             end
           end
@@ -133,19 +122,23 @@ module PEROBS
       return [] if matches.empty?
-      # Sort in the order of occurance count downwards.
-      match_list = matches.to_a.sort do |a, b|
-        b[1] <=> a[1]
-      end
+      match_list = matches.to_a
       # Set occurrence counters to scores relative to the best possible score.
+      # This will be the best possible score for a perfect match.
+      best_possible_score = string.length - @n + 1
       match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
-      # Delete all matches that occured less than half as often than the
-      # top match.
+      # Delete all matches that don't have the required minimum match score.
       match_list.delete_if { |a| a[1] < min_score }
-      match_list[0..max_count]
+      # Sort the list best to worst match
+      match_list.sort! do |a, b|
+        b[1] <=> a[1]
+      end
+      # Return the top max_count matches.
+      match_list[0..max_count - 1]
     end
     # Returns some internal stats about the dictionary.
@@ -176,16 +169,6 @@ module PEROBS
       end
     end
-    def discard_worst_match(matches)
-      # Sort in the order of occurance count downwards.
-      match_list = matches.to_a.sort do |a, b|
-        b[1] <=> a[1]
-      end
-      # Discard the lowest half of the matches
-      match_list = match_list[0..match_list.length / 2]
-      match_list.to_h
-    end
   end
 end

data/lib/perobs/Hash.rb CHANGED Viewed

@@ -124,9 +124,9 @@ module PEROBS
     # Proxy for assignment method.
     def []=(key, value)
-      unless key.is_a?(String)
-        raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
-          "#{key.class}"
+      unless key.is_a?(String) || key.respond_to?(:is_poxreference?)
+        raise ArgumentError, "PEROBS::Hash[] key must be a String or " +
+          "a PEROBS object but is a #{key.class}"
       end
       _check_assignment_value(value)
       @store.cache.cache_write(self)
@@ -143,18 +143,33 @@ module PEROBS
     # is referencing.
     # @return [Array of Integer] IDs of referenced objects
     def _referenced_object_ids
-      @data.each_value.select { |v| v && v.respond_to?(:is_poxreference?) }.
-        map { |o| o.id }
+      ids = []
+      @data.each do |k, v|
+        if k && k.respond_to?(:is_poxreference?)
+          ids << k.id
+        end
+        if v && v.respond_to?(:is_poxreference?)
+          ids << v.id
+        end
+      end
+      ids
     end
     # This method should only be used during store repair operations. It will
     # delete all referenced to the given object ID.
     # @param id [Integer] targeted object ID
     def _delete_reference_to_id(id)
+      original_length = @data.length
       @data.delete_if do |k, v|
-        v && v.respond_to?(:is_poxreference?) && v.id == id
+        (k && k.respond_to?(:is_poxreference?) && k.id == id) ||
+        (v && v.respond_to?(:is_poxreference?) && v.id == id)
+      end
+      if @data.length != original_length
+        @store.cache.cache_write(self)
       end
-      @store.cache.cache_write(self)
     end
     # Restore the persistent data from a single data structure.
@@ -163,8 +178,18 @@ module PEROBS
     # @private
     def _deserialize(data)
       @data = {}
-      data.each { |k, v| @data[k] = v.is_a?(POReference) ?
-                                    POXReference.new(@store, v.id) : v }
+      data.each do |k, v|
+        # References to other PEROBS Objects are marshalled with our own
+        # format. If we detect such a marshalled String we convert it into a
+        # POXReference object.
+        if (match = /^#<PEROBS::POReference id=([0-9]+)>$/.match(k))
+          k = POXReference.new(@store, match[1].to_i)
+        end
+        dv = v.is_a?(POReference) ? POXReference.new(@store, v.id) : v
+        @data[k] = dv
+      end
       @data
     end
@@ -185,26 +210,46 @@ module PEROBS
       data = {}
       @data.each do |k, v|
-        if v.respond_to?(:is_poxreference?)
-          data[k] = POReference.new(v.id)
-        else
-          # Outside of the PEROBS library all PEROBS::ObjectBase derived
-          # objects should not be used directly. The library only exposes them
-          # via POXReference proxy objects.
-          if v.is_a?(ObjectBase)
-            PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
-              "It is stored in a PEROBS::Hash with key #{k.inspect}. " +
-              'Have you used self() instead of myself() to ' +
-              "get the reference of this PEROBS object?\n" +
-              v.inspect
-          end
-          data[k] = v
+        if k.respond_to?(:is_poxreference?)
+          # JSON only supports Strings as hash keys. Since JSON is the default
+          # internal storage format in the database, we have to marshall
+          # PEROBS::Object references ourselves.
+          k = "#<PEROBS::POReference id=#{k.id}>"
+        elsif k[0..24] == '#<PEROBS::POReference id='
+          # This could obviously result in conflicts with 'normal' String hash
+          # keys. This is extremely unlikely, but we better catch this case
+          # before it causes hard to debug trouble.
+          raise ArgumentError, "Hash key #{k} conflicts with PEROBS " +
+            "internal representation of marshalled hash keys!"
         end
+        data[k] = serialize_helper(v)
       end
       data
     end
+    def serialize_helper(v)
+      if v.respond_to?(:is_poxreference?)
+        # References to other PEROBS objects (POXReference) are stored as
+        # POReference in the database.
+        return POReference.new(v.id)
+      else
+        # Outside of the PEROBS library all PEROBS::ObjectBase derived
+        # objects should not be used directly. The library only exposes them
+        # via POXReference proxy objects.
+        if v.is_a?(ObjectBase)
+          PEROBS.log.fatal 'A PEROBS::ObjectBase object escaped! ' +
+            "It is stored in a PEROBS::Hash. " +
+            'Have you used self() instead of myself() to ' +
+            "get the reference of this PEROBS object?\n" +
+            v.inspect
+        end
+        # All other objects are serialized by their native methods.
+        return v
+      end
+    end
   end
 end

data/lib/perobs/IDListPageFile.rb CHANGED Viewed

@@ -54,7 +54,8 @@ module PEROBS
       @file_name = File.join(dir, name + '.cache')
       @page_size = page_size
       open
-      @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
+      @pages = PersistentObjectCache.new(max_in_memory, max_in_memory,
+                                         IDListPage, self)
       @page_counter = 0
     end

data/lib/perobs/IDListPageRecord.rb CHANGED Viewed

@@ -65,7 +65,7 @@ module PEROBS
     end
     # Insert an ID into the page.
-    # @param ID [Integer] The ID to store
+    # @param id [Integer] The ID to store
     def insert(id)
       unless @min_id <= id && id <= @max_id
         raise ArgumentError, "IDs for this page must be between #{@min_id} " +

data/lib/perobs/Log.rb CHANGED Viewed

@@ -42,6 +42,11 @@ module PEROBS
   # are caused by user error rather than program logic errors.
   class UsageError < StandardError ; end
+  # This is the Exception type that is raised when starting a transaction
+  # fails because a transaction from another thread is still in progress.
+  class TransactionInOtherThread < StandardError ; end
   # The ILogger class is a singleton that provides a common logging mechanism
   # to all objects. It exposes essentially the same interface as the Logger
   # class, just as a singleton and extends fatal to raise an FatalError

data/lib/perobs/ObjectBase.rb CHANGED Viewed

@@ -102,6 +102,13 @@ module PEROBS
       end
     end
+    # To allow POXReference objects to be used as Hash keys we need to
+    # implement this function. Conveniently, we can just use the PEROBS object
+    # ID since that is unique.
+    def hash
+      @id
+    end
     # Shortcut to access the _id() method of the referenced object.
     def _id
       @id

data/lib/perobs/SpaceTree.rb CHANGED Viewed

@@ -54,7 +54,7 @@ module PEROBS
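       # Benchmark runs showed a cache size of 128 to be a good compromise
       # between read and write performance trade-offs and memory consumption.
       # NOTE(review): the cache below is created with 256 entries, not 128 --
       # confirm which value the benchmark results actually support.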
-      @cache = PersistentObjectCache.new(256, -1, SpaceTreeNode, self)
+      @cache = PersistentObjectCache.new(256, 256, SpaceTreeNode, self)
     end
     # Open the SpaceTree file.