RubyGems - perobs - Versions diffs - 4.1.0 → 4.2.0 - Mend

perobs 4.1.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +5 -5
data/lib/perobs/BTree.rb +33 -13
data/lib/perobs/BTreeBlob.rb +3 -2
data/lib/perobs/BTreeDB.rb +4 -3
data/lib/perobs/BTreeNode.rb +107 -78
data/lib/perobs/BTreeNodeLink.rb +10 -0
data/lib/perobs/BigArray.rb +19 -1
data/lib/perobs/BigArrayNode.rb +13 -9
data/lib/perobs/BigHash.rb +8 -24
data/lib/perobs/BigTree.rb +14 -1
data/lib/perobs/BigTreeNode.rb +2 -2
data/lib/perobs/Cache.rb +31 -6
data/lib/perobs/EquiBlobsFile.rb +12 -1
data/lib/perobs/FlatFile.rb +197 -45
data/lib/perobs/FlatFileBlobHeader.rb +20 -5
data/lib/perobs/FlatFileDB.rb +8 -4
data/lib/perobs/FuzzyStringMatcher.rb +192 -0
data/lib/perobs/Hash.rb +4 -0
data/lib/perobs/IDListPageFile.rb +1 -2
data/lib/perobs/ObjectBase.rb +1 -1
data/lib/perobs/PersistentObjectCache.rb +7 -4
data/lib/perobs/SpaceManager.rb +273 -0
data/lib/perobs/SpaceTree.rb +1 -1
data/lib/perobs/Store.rb +67 -25
data/lib/perobs/version.rb +1 -1
data/perobs.gemspec +2 -2
data/test/BTree_spec.rb +1 -0
data/test/BigArray_spec.rb +53 -6
data/test/BigHash_spec.rb +8 -0
data/test/FlatFileDB_spec.rb +108 -3
data/test/FuzzyStringMatcher_spec.rb +171 -0
data/test/LegacyDBs/LegacyDB.rb +4 -0
data/test/SpaceManager_spec.rb +176 -0
data/test/Store_spec.rb +2 -5
metadata +12 -6

data/lib/perobs/FlatFileBlobHeader.rb CHANGED

@@ -115,6 +115,10 @@ module PEROBS
             PEROBS.log.fatal "Cannot read blob header " +
               "#{id ? "for ID #{id} " : ''}at address #{addr}"
           else
+            if corruption_start
+              PEROBS.log.error "Corruption found at end of blob file at " +
+                "address #{addr}"
+            end
             # We have reached the end of the file.
             return nil
           end
@@ -122,10 +126,15 @@ module PEROBS
         # Did we get the full header?
         if buf_with_crc.length != LENGTH
-          PEROBS.log.error "Incomplete FlatFileBlobHeader: Only " +
+          msg = "Incomplete FlatFileBlobHeader: Only " +
             "#{buf_with_crc.length} " +
             "bytes of #{LENGTH} could be read "
           "#{id ? "for ID #{id} " : ''}at address #{addr}"
+          if errors_are_fatal
+            PEROBS.log.fatal msg
+          else
+            PEROBS.log.error msg
+          end
           return nil
         end
@@ -148,10 +157,16 @@ module PEROBS
               "#{'%08x' % crc}."
           else
             if corruption_start.nil?
-              PEROBS.log.error "FlatFile corruption found. The FlatFile " +
-                "Header CRC mismatch at address #{addr}. Header CRC is " +
-                "#{'%08x' % read_crc} but should be #{'%08x' % crc}. Trying " +
-                "to find the next header."
+              if errors_are_fatal
+                PEROBS.log.fatal "FlatFile corruption found. The FlatFile " +
+                  "Header CRC mismatch at address #{addr}. Header CRC is " +
+                  "#{'%08x' % read_crc} but should be #{'%08x' % crc}."
+              else
+                PEROBS.log.error "FlatFile corruption found. The FlatFile " +
+                  "Header CRC mismatch at address #{addr}. Header CRC is " +
+                  "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
+                  "Trying to find the next header."
+              end
               corruption_start = addr
             end
             # The blob file is corrupted. There is no valid header at the

data/lib/perobs/FlatFileDB.rb CHANGED

@@ -2,7 +2,7 @@
 #
 # = FlatFileDB.rb -- Persistent Ruby Object Store
 #
-# Copyright (c) 2015, 2016, 2017, 2018
+# Copyright (c) 2015, 2016, 2017, 2018, 2019
 # by Chris Schlaeger <chris@taskjuggler.org>
 #
 # MIT License
@@ -161,8 +161,8 @@ module PEROBS
     # Permanently delete all objects that have not been marked. Those are
     # orphaned and are no longer referenced by any actively used object.
     # @return [Integer] Number of the removed objects from the DB.
-    def delete_unmarked_objects
-      @flat_file.delete_unmarked_objects
+    def delete_unmarked_objects(&block)
+      @flat_file.delete_unmarked_objects(&block)
     end
     # Mark an object.
@@ -184,7 +184,11 @@ module PEROBS
     #        repaired.
     # @return number of errors found
     def check_db(repair = false)
-      @flat_file.check(repair)
+      if repair
+        @flat_file.repair
+      else
+        @flat_file.check
+      end
     end
     # Check if the stored object is syntactically correct.

data/lib/perobs/FuzzyStringMatcher.rb ADDED

@@ -0,0 +1,192 @@
+# encoding: UTF-8
+#
+# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
+#
+# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+require 'perobs/Log'
+require 'perobs/ObjectBase'
+module PEROBS
+  # The fuzzy string matcher can be used to perform a fuzzy string search
+  # against a known set of strings. The dictionary of known strings does not
+  # store the actual strings but references to arbitrary objects. These could
+  # be the string, but can be something else related to the learned strings.
+  # To use this class a list of strings with their references must be learned.
+  # Once the dictionary has been established, fuzzy matches can be done.
+  class FuzzyStringMatcher
+    # Create a new FuzzyStringMatcher.
+    # @param store [PEROBS::Store] place to store the dictionary
+    # @param name [String] Unique name of the string matcher
+    # @param case_sensitive [Boolean] True if case matters for matching
+    # @param n [Integer] Determines what kind of n-gramm is used to store the
+    #        references in the dictionary. It also determines the minimum word
+    #        length that can be used for fuzzy matches.
+    def initialize(store, name, case_sensitive = false, n = 4)
+      @store = store
+      @dict_name = "FuzzyStringMatcher::#{name}"
+      if n < 2 || n > 10
+        raise ArgumentError, 'n must be between 2 and 10'
+      end
+      @case_sensitive = case_sensitive
+      @n = n
+      clear unless (@dict = @store[@dict_name])
+    end
+    # Wipe the dictionary.
+    def clear
+      @store[@dict_name] = @dict = @store.new(BigHash)
+    end
+    # Add a string with its reference to the dictionary.
+    # @param string [String] The string to store
+    # @param reference [Object] Any object that is associated with the string
+    def learn(string, reference = string)
+      reference = string if reference.nil?
+      unless @case_sensitive
+        string = string.downcase
+      end
+      # Enclose string in 'start of text' and 'end of text' ASCII values.
+      string = "\002" + string + "\003"
+      each_n_gramm(string) do |n_gramm|
+        unless (ng_list = @dict[n_gramm])
+          @dict[n_gramm] = ng_list = @store.new(Hash)
+        end
+        if ng_list.include?(reference)
+          ng_list[reference] += 1
+        else
+          ng_list[reference] = 0
+        end
+      end
+      nil
+    end
+    # Find the references who's string best matches the given string.
+    # @param string [String] string to search for
+    # @param min_score [Float] Value 0.01 and 1.0 that specifies how strict
+    #        the matching should be done. The larger the value the more closer
+    #        the given string needs to be.
+    # @param max_count [Integer] The maximum number of matches that should be
+    #        returned.
+    # @return [Array] The result is an Array of Arrays. The nested Arrays only
+    #         have 2 entries. The reference and a Float value between 0 and
+    #         1.0 that describes how good the match is. The matches are sorted
+    #         in descending order by the match score.
+    def best_matches(string, min_score = 0.5, max_count = 100)
+      unless @case_sensitive
+        string = string.downcase
+      end
+      # Enclose string in 'start of text' and 'end of text' ASCII values.
+      string = "\002" + string + "\003"
+      matches = {}
+      # This will be the best possible score for a perfect match.
+      best_possible_score = 0
+      each_n_gramm(string) do |n_gramm|
+        best_possible_score += 1
+        if (ng_list = @dict[n_gramm])
+          ng_list.each do |reference, count|
+            if matches.include?(reference)
+              matches[reference] += 1
+            else
+              # We use internally a 10 times larger list so that we don't
+              # throw away good matches too early. If the max_count value is
+              # chosen too small there is a risk of not finding the best
+              # matches!
+              if matches.size > 10 * max_count
+                matches = discard_worst_match(matches)
+              end
+              matches[reference] = 1
+            end
+          end
+        end
+      end
+      return [] if matches.empty?
+      # Sort in the order of occurance count downwards.
+      match_list = matches.to_a.sort do |a, b|
+        b[1] <=> a[1]
+      end
+      # Set occurance counters to scores relative to the best possible score.
+      match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
+      # Delete all matches that occured less than half as often than the
+      # top match.
+      match_list.delete_if { |a| a[1] < min_score }
+      match_list[0..max_count]
+    end
+    # Returns some internal stats about the dictionary.
+    def stats
+      s = {}
+      s['dictionary_size'] = @dict.size
+      max = total = 0
+      @dict.each do |n_gramm, ng_list|
+        size = ng_list.length
+        max = size if size > max
+        total += size
+      end
+      s['max_list_size'] = max
+      s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
+      s
+    end
+    private
+    def each_n_gramm(string, &block)
+      return if string.length < @n
+      0.upto(string.length - @n) do |i|
+        n_gramm = string[i, @n]
+        yield(n_gramm)
+      end
+    end
+    def discard_worst_match(matches)
+      # Sort in the order of occurance count downwards.
+      match_list = matches.to_a.sort do |a, b|
+        b[1] <=> a[1]
+      end
+      # Discard the lowest half of the matches
+      match_list = match_list[0..match_list.length / 2]
+      match_list.to_h
+    end
+  end
+end

data/lib/perobs/Hash.rb CHANGED

@@ -124,6 +124,10 @@ module PEROBS
     # Proxy for assignment method.
     def []=(key, value)
+      unless key.is_a?(String)
+        raise ArgumentError, "PEROBS::Hash[] key must be a String but is a " +
+          "#{key.class}"
+      end
       _check_assignment_value(value)
       @store.cache.cache_write(self)
       @data[key] = value

data/lib/perobs/IDListPageFile.rb CHANGED

@@ -54,8 +54,7 @@ module PEROBS
       @file_name = File.join(dir, name + '.cache')
       @page_size = page_size
       open
-      @pages = PersistentObjectCache.new(max_in_memory, max_in_memory / 2,
-                                         IDListPage, self)
+      @pages = PersistentObjectCache.new(max_in_memory, -1, IDListPage, self)
       @page_counter = 0
     end

data/lib/perobs/ObjectBase.rb CHANGED

@@ -250,7 +250,7 @@ module PEROBS
     def _restore(level)
       # Find the most recently stored state of this object. This could be on
       # any previous stash level or in the regular object DB. If the object
-      # was created during the transaction, there is not previous state to
+      # was created during the transaction, there is no previous state to
       # restore to.
       data = nil
       if @_stash_map

data/lib/perobs/PersistentObjectCache.rb CHANGED

@@ -44,7 +44,8 @@ module PEROBS
     # cache objects.
     # @param size [Integer] Minimum number of objects to be cached at a time
     # @param flush_delay [Integer] Determines how often non-forced flushes are
-    #        ignored in a row before the flush is really done.
+    #        ignored in a row before the flush is really done. If flush_delay
+    #        is smaller than 0 non-forced flushes will always be ignored.
     # @param klass [Class] The class of the objects to be cached. Objects must
     #        provide a uid() method that returns a unique ID for every object.
     # @param collection [] The object collection the objects belong to. It
@@ -71,8 +72,7 @@ module PEROBS
       if modified
         @modified_entries[object.uid] = object
       else
-        index = object.uid % @size
-        @unmodified_entries[index] = object
+        @unmodified_entries[object.uid % @size] = object
       end
       nil
@@ -111,9 +111,12 @@ module PEROBS
     # all modified objects will be written.
     # @param now [Boolean]
     def flush(now = false)
-      if now || (@flush_counter -= 1) <= 0
+      if now || (@flush_delay >= 0 && (@flush_counter -= 1) <= 0)
         @modified_entries.each do |id, object|
           object.save
+          # Add the object to the unmodified object cache. We might still need
+          # it again soon.
+          @unmodified_entries[object.uid % @size] = object
         end
         @modified_entries = ::Hash.new
         @flush_counter = @flush_delay

data/lib/perobs/SpaceManager.rb ADDED

@@ -0,0 +1,273 @@
+# encoding: UTF-8
+#
+# = SpaceManager.rb -- Persistent Ruby Object Store
+#
+# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+require 'perobs/BTree'
+require 'perobs/EquiBlobsFile'
+require 'perobs/FlatFile'
+require 'perobs/FlatFileBlobHeader'
+module PEROBS
+  # The SpaceManager is used to keep a list of all the empty spaces in a
+  # FlatFileDB file. An empty space is described by its starting address and
+  # its length in bytes. The SpaceManager keeps a list of all the spaces and
+  # can find the best fit space when a new blob needs to be added to the
+  # FlatFileDB.
+  #
+  # The SpaceManager uses two files to store the list. The first is a file
+  # with the actual addresses. This is a set of linked address lists. Each
+  # list holds the addresses for spaces that have exactly the same size. The
+  # second file is a BTree file that serves as the index. It is used to map
+  # the length of a space to the address of the linked list for that
+  # particular length. The linked list consists of elements that only hold 2
+  # items. The actual address in the FlatFileDB and the address of the next
+  # entry in the linked list in the list file.
+  class SpaceManager
+    attr_reader :added_spaces, :recycled_spaces, :failed_requests
+    # Create a new SpaceManager. The files are not opened yet; call open()
+    # before using it.
+    # @param db_dir [String] passed on to the BTree index ('space_index') and
+    #        the EquiBlobsFile list ('space_list') as their directory
+    # @param progressmeter passed on to the index and the list
+    # @param btree_order [Integer] order that is passed to the BTree index
+    def initialize(db_dir, progressmeter, btree_order = 65)
+      @db_dir = db_dir
+      @progressmeter = progressmeter
+      @index = BTree.new(@db_dir, 'space_index', btree_order, @progressmeter)
+      # The space list contains blobs that have each 2 entries. The address of
+      # the space in the FlatFile and the address of the next blob in the
+      # space list file that is an entry for the same space size. An address
+      # of 0 marks the end of the list. Both entries are packed as 8 byte
+      # values ('QQ').
+      @list = EquiBlobsFile.new(@db_dir, 'space_list', @progressmeter, 2 * 8, 1)
+    end
+    # Open the index and the list file and reset the statistics counters.
+    def open
+      @index.open
+      @list.open
+      reset_stats
+    end
+    # Log the usage statistics and close both files. Does nothing if the
+    # index is not open.
+    def close
+      return unless @index.is_open?
+      PEROBS.log.info "SpaceManager has currently #{@list.total_entries} " +
+        "used blobs and #{@list.total_spaces} unused blobs in list " +
+        "EquiBlobsFile"
+      PEROBS.log.info "#{@added_spaces} were added, #{@recycled_spaces} " +
+        "spaces were recycled and #{@failed_requests} requests failed"
+      @list.close
+      @index.close
+    end
+    # @return [Boolean] true if the index file is open
+    def is_open?
+      @index.is_open?
+    end
+    # Sync the list and the index file.
+    def sync
+      @list.sync
+      @index.sync
+    end
+    # Register a new space.
+    # @param address [Integer] address of the space in the FlatFile
+    # @param length [Integer] size of the space in bytes
+    def add_space(address, length)
+      # The new entry becomes the head of the list for this length. If there
+      # are already entries for this length, the current head becomes the
+      # successor of the new entry. Otherwise 0 marks the end of the list.
+      next_entry_addr = @index.get(length) || 0
+      @index.insert(length, insert_space_in_list(address, next_entry_addr))
+      @added_spaces += 1
+    end
+    # Check if a space with the given address and length is registered.
+    # @param address [Integer] address of the space in the FlatFile
+    # @param length [Integer] size of the space in bytes
+    # @return [Boolean] true if such a space is known
+    def has_space?(address, length)
+      list_entry_addr = @index.get(length) || 0
+      # Walk the linked list of all spaces with this length.
+      while list_entry_addr > 0
+        blob = @list.retrieve_blob(list_entry_addr)
+        space_address, next_entry_addr = blob.unpack('QQ')
+        return true if space_address == address
+        list_entry_addr = next_entry_addr
+      end
+      false
+    end
+    # Take a space of exactly the requested length out of the list.
+    # @param length [Integer] requested size in bytes
+    # @return [Array, nil] the address and the length of the space or nil if
+    #         no space of that length is available
+    def get_space(length)
+      # We use a simple exact fit strategy. All attempts to use a more
+      # elaborate scheme were actually less efficient. Non-exact matches
+      # generate new spaces for the remainder and fragment the blob file with
+      # lots of unusable small spaces. Most applications seem to have
+      # clustered their blob sizes around a number of popular sizes. So exact
+      # match is very efficient to implement and results in the highest
+      # probability that a space will be reused soon.
+      unless (list_entry_addr = @index.get(length))
+        @failed_requests += 1
+        return nil
+      end
+      blob = @list.retrieve_blob(list_entry_addr)
+      space_address, next_entry_addr = blob.unpack('QQ')
+      @list.delete_blob(list_entry_addr)
+      if next_entry_addr > 0
+        # Update the index entry for the length to point to the
+        # following space list entry.
+        @index.insert(length, next_entry_addr)
+      else
+        # The space list for this length is empty. Remove the entry
+        # from the index.
+        @index.remove(length)
+      end
+      @recycled_spaces += 1
+      # We return the length to remain compatible with the old SpaceTree
+      # API.
+      [ space_address, length ]
+    end
+    # Clear the list and the index and reset the statistics counters.
+    def clear
+      @list.clear
+      @index.clear
+      reset_stats
+    end
+    # Erase the list and the index file.
+    def erase
+      @list.erase
+      @index.erase
+    end
+    # Check the consistency of the index and the space lists and log some
+    # statistics about the known spaces.
+    # @param flat_file [FlatFile, nil] If given, every space is additionally
+    #        verified with flat_file.has_space?
+    # @return [Boolean] true if no error was found, false otherwise
+    def check(flat_file = nil)
+      sync
+      return false unless @index.check
+      return false unless @list.check
+      smallest_space = nil
+      largest_space = nil
+      total_space_bytes = 0
+      space_distribution = ::Hash.new(0)
+      @index.each do |length, list_entry_addr|
+        if list_entry_addr <= 0
+          PEROBS.log.error "list_entry_addr (#{list_entry_addr}) " +
+            "must be positive"
+          return false
+        end
+        # Detect smallest and largest space
+        if smallest_space.nil? || length < smallest_space
+          smallest_space = length
+        end
+        if largest_space.nil? || length > largest_space
+          largest_space = length
+        end
+        # All list entry addresses seen so far for this length, in the order
+        # they were visited. Every visited address is recorded so that a
+        # cycle is detected no matter which entry it leads back to.
+        known_addresses = { list_entry_addr => true }
+        entries = 0
+        while list_entry_addr > 0
+          entries += 1
+          unless (blob = @list.retrieve_blob(list_entry_addr))
+            PEROBS.log.error "SpaceManager points to non-existing " +
+              "space list entry at address #{list_entry_addr}"
+            return false
+          end
+          space_address, next_entry_addr = blob.unpack('QQ')
+          if known_addresses.key?(next_entry_addr)
+            PEROBS.log.error "Space list is cyclic: " +
+              "#{known_addresses.keys + [ next_entry_addr ]}"
+            return false
+          end
+          known_addresses[next_entry_addr] = true
+          if flat_file &&
+              !flat_file.has_space?(space_address, length)
+            PEROBS.log.error "SpaceManager has space at offset " +
+              "#{space_address} of size #{length} that isn't " +
+              "available in the FlatFile."
+            return false
+          end
+          list_entry_addr = next_entry_addr
+        end
+        total_space_bytes += length * entries
+        space_distribution[msb(length)] += entries
+      end
+      # One "from-to:count; " element per power of 2 size bucket.
+      distribution = space_distribution.map do |bits, count|
+        "#{2 ** (bits - 1)}-#{2 ** bits - 1}:#{count}; "
+      end.join
+      PEROBS.log.info "SpaceManager stats: smallest: #{smallest_space}; " +
+        "largest: #{largest_space}; total bytes: #{total_space_bytes}; " +
+        "distribution: #{distribution}"
+      true
+    end
+    # Convert the managed spaces into an Array.
+    # @return [Array] Array of [ address, length ] Arrays, sorted by address
+    def to_a
+      spaces = []
+      @index.each do |length, list_entry_addr|
+        while list_entry_addr > 0
+          blob = @list.retrieve_blob(list_entry_addr)
+          space_address, next_entry_addr = blob.unpack('QQ')
+          spaces << [ space_address, length ]
+          list_entry_addr = next_entry_addr
+        end
+      end
+      spaces.sort_by { |space_address, _length| space_address }
+    end
+    private
+    # Store a new entry in the space list file.
+    # @param space_address [Integer] address of the space in the FlatFile
+    # @param next_entry_addr [Integer] address of the next list entry for the
+    #        same space size or 0 if this is the last entry
+    # @return [Integer] address of the new entry in the space list file
+    def insert_space_in_list(space_address, next_entry_addr)
+      blob = [ space_address, next_entry_addr ].pack('QQ')
+      @list.store_blob(blob_addr = @list.free_address, blob)
+      blob_addr
+    end
+    # Determine the number of bits that are needed to represent the value.
+    # @param i [Integer]
+    # @return [Integer] 0 for 0, 63 for negative values
+    def msb(i)
+      i < 0 ? 63 : i.bit_length
+    end
+    # Set all statistics counters back to 0.
+    def reset_stats
+      @added_spaces = 0
+      @recycled_spaces = 0
+      @failed_requests = 0
+    end
+  end
+end