RubyGems - perobs - Versions diffs - 4.0.0 → 4.4.0 - Mend

perobs 4.0.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

checksums.yaml +5 -5
data/README.md +27 -16
data/lib/perobs/Array.rb +66 -19
data/lib/perobs/BTree.rb +106 -15
data/lib/perobs/BTreeBlob.rb +4 -3
data/lib/perobs/BTreeDB.rb +5 -4
data/lib/perobs/BTreeNode.rb +482 -156
data/lib/perobs/BTreeNodeLink.rb +10 -0
data/lib/perobs/BigArray.rb +285 -0
data/lib/perobs/BigArrayNode.rb +1002 -0
data/lib/perobs/BigHash.rb +246 -0
data/lib/perobs/BigTree.rb +197 -0
data/lib/perobs/BigTreeNode.rb +873 -0
data/lib/perobs/Cache.rb +48 -10
data/lib/perobs/ConsoleProgressMeter.rb +61 -0
data/lib/perobs/DataBase.rb +4 -3
data/lib/perobs/DynamoDB.rb +57 -15
data/lib/perobs/EquiBlobsFile.rb +155 -50
data/lib/perobs/FNV_Hash_1a_64.rb +54 -0
data/lib/perobs/FlatFile.rb +519 -227
data/lib/perobs/FlatFileBlobHeader.rb +113 -54
data/lib/perobs/FlatFileDB.rb +49 -23
data/lib/perobs/FuzzyStringMatcher.rb +175 -0
data/lib/perobs/Hash.rb +127 -33
data/lib/perobs/IDList.rb +144 -0
data/lib/perobs/IDListPage.rb +107 -0
data/lib/perobs/IDListPageFile.rb +180 -0
data/lib/perobs/IDListPageRecord.rb +142 -0
data/lib/perobs/Object.rb +18 -15
data/lib/perobs/ObjectBase.rb +46 -5
data/lib/perobs/PersistentObjectCache.rb +57 -68
data/lib/perobs/PersistentObjectCacheLine.rb +24 -12
data/lib/perobs/ProgressMeter.rb +97 -0
data/lib/perobs/SpaceManager.rb +273 -0
data/lib/perobs/SpaceTree.rb +21 -12
data/lib/perobs/SpaceTreeNode.rb +53 -61
data/lib/perobs/Store.rb +264 -145
data/lib/perobs/version.rb +1 -1
data/lib/perobs.rb +2 -0
data/perobs.gemspec +4 -4
data/test/Array_spec.rb +15 -6
data/test/BTree_spec.rb +6 -2
data/test/BigArray_spec.rb +261 -0
data/test/BigHash_spec.rb +152 -0
data/test/BigTreeNode_spec.rb +153 -0
data/test/BigTree_spec.rb +259 -0
data/test/EquiBlobsFile_spec.rb +105 -1
data/test/FNV_Hash_1a_64_spec.rb +59 -0
data/test/FlatFileDB_spec.rb +198 -14
data/test/FuzzyStringMatcher_spec.rb +261 -0
data/test/Hash_spec.rb +13 -3
data/test/IDList_spec.rb +77 -0
data/test/LegacyDBs/LegacyDB.rb +155 -0
data/test/LegacyDBs/version_3/class_map.json +1 -0
data/test/LegacyDBs/version_3/config.json +1 -0
data/test/LegacyDBs/version_3/database.blobs +0 -0
data/test/LegacyDBs/version_3/database_spaces.blobs +0 -0
data/test/LegacyDBs/version_3/index.blobs +0 -0
data/test/LegacyDBs/version_3/version +1 -0
data/test/LockFile_spec.rb +9 -6
data/test/SpaceManager_spec.rb +176 -0
data/test/SpaceTree_spec.rb +4 -1
data/test/Store_spec.rb +305 -203
data/test/spec_helper.rb +9 -4
metadata +57 -16
data/lib/perobs/BTreeNodeCache.rb +0 -109
data/lib/perobs/TreeDB.rb +0 -277

data/lib/perobs/FlatFileBlobHeader.rb CHANGED Viewed

@@ -48,12 +48,13 @@ module PEROBS
     # The 'pack()' format of the header.
     FORMAT = 'CQQL'
     # The length of the header in bytes.
-    LENGTH = 21
+    LENGTH = 25
     VALID_FLAG_BIT = 0
     COMPRESSED_FLAG_BIT = 2
     OUTDATED_FLAG_BIT = 3
     attr_reader :addr, :flags, :length, :id, :crc
+    attr_accessor :corruption_start
     # Create a new FlatFileBlobHeader with the given flags, length, id and crc.
     # @param file [File] the FlatFile that contains the header
@@ -69,50 +70,120 @@ module PEROBS
       @length = length
       @id = id
       @crc = crc
+      # This is only set if the header is preceded by a corrupted blob.
+      @corruption_start = nil
     end
     # Read the header from the given File.
     # @param file [File]
-    # @return FlatFileBlobHeader
-    def FlatFileBlobHeader::read(file)
-      begin
-        addr = file.pos
-        buf = file.read(LENGTH)
-      rescue IOError => e
-        PEROBS.log.error "Cannot read blob header in flat file DB: #{e.message}"
-        return nil
+    # @param addr [Integer] address in the file to start reading. If no
+    #        address is specified use the current position in the file.
+    # @param id [Integer] Optional ID that the header should have. If no id is
+    #        specified there is no check against the actual ID done.
+    # @return FlatFileBlobHeader or nil if there are no more blobs to read in
+    #         the file.
+    def FlatFileBlobHeader::read(file, addr = nil, id = nil)
+      # If an address was specified we expect the read to always succeed. If
+      # no address is specified and we can't read the header we generate an
+      # error message but it is not fatal.
+      errors_are_fatal = !addr.nil?
+      mode = :searching_next_header
+      addr = file.pos unless addr
+      buf = nil
+      corruption_start = nil
+      loop do
+        buf_with_crc = nil
+        begin
+          file.seek(addr)
+          buf_with_crc = file.read(LENGTH)
+        rescue IOError => e
+          if errors_are_fatal
+            PEROBS.log.fatal "Cannot read blob header in flat file DB at " +
+              "address #{addr}: #{e.message}"
+          else
+            PEROBS.log.error "Cannot read blob header in flat file DB: " +
+              e.message
+            return nil
+          end
+        end
+        # Did we read anything?
+        if buf_with_crc.nil?
+          if errors_are_fatal
+            PEROBS.log.fatal "Cannot read blob header " +
+              "#{id ? "for ID #{id} " : ''}at address #{addr}"
+          else
+            if corruption_start
+              PEROBS.log.error "Corruption found at end of blob file at " +
+                "address #{addr}"
+            end
+            # We have reached the end of the file.
+            return nil
+          end
+        end
+        # Did we get the full header?
+        if buf_with_crc.length != LENGTH
+          # Every part of the message must be joined with '+'. A string
+          # literal on a line of its own is a separate statement and would
+          # silently be dropped from the message.
+          msg = "Incomplete FlatFileBlobHeader: Only " +
+            "#{buf_with_crc.length} " +
+            "bytes of #{LENGTH} could be read " +
+            "#{id ? "for ID #{id} " : ''}at address #{addr}"
+          if errors_are_fatal
+            PEROBS.log.fatal msg
+          else
+            PEROBS.log.error msg
+          end
+          return nil
+        end
+        # Check the CRC of the header. The last 4 bytes that were read hold
+        # the stored CRC, the bytes before them are the packed header fields.
+        buf = buf_with_crc[0..-5]
+        crc = buf_with_crc[-4..-1].unpack('L')[0]
+        if (read_crc = Zlib.crc32(buf, 0)) == crc
+          # We have found a valid header.
+          if corruption_start
+            PEROBS.log.error "FlatFile corruption ends at #{addr}. " +
+              "#{addr - corruption_start} bytes skipped. Some data may " +
+              "not be recoverable."
+          end
+          break
+        else
+          if errors_are_fatal
+            PEROBS.log.fatal "FlatFile Header CRC mismatch at address " +
+              "#{addr}. Header CRC is #{'%08x' % read_crc} but should be " +
+              "#{'%08x' % crc}."
+          else
+            if corruption_start.nil?
+              # errors_are_fatal is always false in this branch, so the start
+              # of a corrupted region is reported as an error only. It is
+              # reported once; corruption_start remembers where it began.
+              PEROBS.log.error "FlatFile corruption found. The FlatFile " +
+                "Header CRC mismatch at address #{addr}. Header CRC is " +
+                "#{'%08x' % read_crc} but should be #{'%08x' % crc}. " +
+                "Trying to find the next header."
+              corruption_start = addr
+            end
+            # The blob file is corrupted. There is no valid header at the
+            # current position in the file. We now try to find the next valid
+            # header by iterating over the remainder of the file advancing one
+            # byte with each step until we hit the end of the file or find the
+            # next valid header.
+            addr += 1
+          end
+        end
       end
-      return nil unless buf
-      if buf.length != LENGTH
-        PEROBS.log.error "Incomplete FlatFileBlobHeader: Only #{buf.length} " +
-          "bytes of #{LENGTH} could be read"
-        return nil
+      header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
+      if corruption_start
+        header.corruption_start = corruption_start
       end
-      FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
-    end
-    # Read the header from the given File.
-    # @param file [File]
-    # @param addr [Integer] address in the file to start reading
-    # @param id [Integer] Optional ID that the header should have
-    # @return FlatFileBlobHeader
-    def FlatFileBlobHeader::read_at(file, addr, id = nil)
-      buf = nil
-      begin
-        file.seek(addr)
-        buf = file.read(LENGTH)
-      rescue IOError => e
-        PEROBS.log.fatal "Cannot read blob in flat file DB: #{e.message}"
-      end
-      if buf.nil? || buf.length != LENGTH
-        PEROBS.log.fatal "Cannot read blob header " +
-          "#{id ? "for ID #{id} " : ''}at address " +
-          "#{addr}"
-      end
-      header = FlatFileBlobHeader.new(file, addr, *buf.unpack(FORMAT))
       if id && header.id != id
         PEROBS.log.fatal "Mismatch between FlatFile index and blob file " +
           "found. FlatFile has entry with ID #{header.id} at address " +
@@ -123,11 +194,12 @@ module PEROBS
     end
     # Write the header to a given File.
-    # @param file [File]
     def write
       begin
+        buf = [ @flags, @length, @id, @crc].pack(FORMAT)
+        crc = Zlib.crc32(buf, 0)
         @file.seek(@addr)
-        @file.write([ @flags, @length, @id, @crc].pack(FORMAT))
+        @file.write(buf + [ crc ].pack('L'))
       rescue IOError => e
         PEROBS.log.fatal "Cannot write blob header into flat file DB: " +
           e.message
@@ -135,11 +207,9 @@ module PEROBS
     end
     # Reset all the flags bit to 0. This marks the blob as invalid.
-    # @param file [File] The file handle of the blob file.
-    # @param addr [Integer] The address of the header
     def clear_flags
       @flags = 0
-      write_flags
+      write
     end
     # Return true if the header is for a non-empty blob.
@@ -156,7 +226,7 @@ module PEROBS
     # transaction has been completed.
     def set_outdated_flag
       set_flag(OUTDATED_FLAG_BIT)
-      write_flags
+      write
     end
     # Return true if the blob contains outdated data.
@@ -166,17 +236,6 @@ module PEROBS
     private
-    def write_flags
-      begin
-        @file.seek(@addr)
-        @file.write([ @flags ].pack('C'))
-        @file.flush
-      rescue IOError => e
-        PEROBS.log.fatal "Writing flags of FlatFileBlobHeader with ID #{@id} " +
-          "failed: #{e.message}"
-      end
-    end
     def bit_set?(n)
       mask = 1 << n
       @flags & mask == mask

data/lib/perobs/FlatFileDB.rb CHANGED Viewed

@@ -2,7 +2,8 @@
 #
 # = FlatFileDB.rb -- Persistent Ruby Object Store
 #
-# Copyright (c) 2015, 2016 by Chris Schlaeger <chris@taskjuggler.org>
+# Copyright (c) 2015, 2016, 2017, 2018, 2019
+# by Chris Schlaeger <chris@taskjuggler.org>
 #
 # MIT License
 #
@@ -41,7 +42,7 @@ module PEROBS
     # This version number increases whenever the on-disk format changes in a
     # way that requires conversion actions after an update.
-    VERSION = 2
+    VERSION = 4
     attr_reader :max_blob_size
@@ -50,13 +51,17 @@ module PEROBS
     # @param options [Hash] options to customize the behavior. Currently only
     #        the following options are supported:
     #        :serializer    : Can be :marshal, :json, :yaml
+    #        :progressmeter : Reference to a ProgressMeter object
+    #        :log           : IO that should be used for logging
+    #        :log_level     : Minimum Logger level to log
     def initialize(db_name, options = {})
-      super(options[:serializer] || :json)
+      super(options)
       @db_dir = db_name
       # Create the database directory if it doesn't exist yet.
       ensure_dir_exists(@db_dir)
-      PEROBS.log.open(File.join(@db_dir, 'log'))
+      PEROBS.log.level = options[:log_level] if options[:log_level]
+      PEROBS.log.open(options[:log] || File.join(@db_dir, 'log'))
       check_version_and_upgrade
       # Read the existing DB config.
@@ -68,7 +73,7 @@ module PEROBS
     # Open the FlatFileDB for transactions.
     def open
-      @flat_file = FlatFile.new(@db_dir)
+      @flat_file = FlatFile.new(@db_dir, @progressmeter)
       @flat_file.open
       PEROBS.log.info "FlatFile '#{@db_dir}' opened"
     end
@@ -143,8 +148,9 @@ module PEROBS
       end
     end
-    def search_object(id)
-      @flat_file.search_object(id)
+    # @return [Integer] Number of objects stored in the DB.
+    def item_counter
+      @flat_file.item_counter
     end
     # This method must be called to initiate the marking process.
@@ -154,9 +160,9 @@ module PEROBS
     # Permanently delete all objects that have not been marked. Those are
     # orphaned and are no longer referenced by any actively used object.
-    # @return [Array] List of IDs that have been removed from the DB.
-    def delete_unmarked_objects
-      @flat_file.delete_unmarked_objects
+    # @return [Integer] Number of the removed objects from the DB.
+    def delete_unmarked_objects(&block)
+      @flat_file.delete_unmarked_objects(&block)
     end
     # Mark an object.
@@ -178,7 +184,11 @@ module PEROBS
     #        repaired.
     # @return number of errors found
     def check_db(repair = false)
-      @flat_file.check(repair)
+      if repair
+        @flat_file.repair
+      else
+        @flat_file.check
+      end
     end
     # Check if the stored object is syntactically correct.
@@ -226,7 +236,8 @@ module PEROBS
                            "'#{version_file}': " + e.message
         end
       else
-        # Early versions of PEROBS did not have a version file.
+        # The DB is brand new.
+        version = VERSION
         write_version_file(version_file)
       end
@@ -234,25 +245,40 @@ module PEROBS
         PEROBS.log.fatal "Cannot downgrade the FlatFile database from " +
                          "version #{version} to version #{VERSION}"
       end
-      if version == 1
-        # Version 1 had no support for data compression. Make sure all entries
-        # are compressed to save space.
-        open
-        @flat_file.refresh
-        close
+      if version < 3
+        PEROBS.log.fatal "The upgrade of this version of the PEROBS database " +
+          "is not supported by this version of PEROBS. Please try an earlier " +
+          "version of PEROBS to upgrade the database before using this version."
       end
-      # After a successful upgrade change the version number in the DB as
-      # well.
-      if version < VERSION
+      # Version upgrades must be done one version number at a time. If the
+      # existing DB is multiple versions older than what the current PEROBS
+      # version expects then multiple upgrade runs will be needed.
+      while version < VERSION
+        if version == 3
+          PEROBS.log.warn "Updating FlatFileDB #{@db_dir} from version 3 to " +
+            "version 4 ..."
+          # Version 4 adds checksums for blob file headers. We have to convert
+          # the blob file to include the checksums.
+          FlatFile.insert_header_checksums(@db_dir)
+          open
+          @flat_file.regenerate_index_and_spaces
+          close
+        end
+        # After a successful upgrade change the version number in the DB as
+        # well.
         write_version_file(version_file)
         PEROBS.log.warn "Update of FlatFileDB '#{@db_dir}' from version " +
-          "#{version} to version #{VERSION} completed"
+          "#{version} to version #{version + 1} completed"
+        # Update version variable to new version.
+        version += 1
       end
     end
     def write_version_file(version_file)
       begin
         RobustFile.write(version_file, VERSION)
       rescue IOError => e

data/lib/perobs/FuzzyStringMatcher.rb ADDED Viewed

@@ -0,0 +1,175 @@
+# encoding: UTF-8
+#
+# = FuzzyStringMatcher.rb -- Persistent Ruby Object Store
+#
+# Copyright (c) 2020 by Chris Schlaeger <chris@taskjuggler.org>
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+require 'perobs/Log'
+require 'perobs/Object'
+module PEROBS
+  # The fuzzy string matcher can be used to perform a fuzzy string search
+  # against a known set of strings. The dictionary of known strings does not
+  # store the actual strings but references to String or PEROBS objects.
+  # Once the dictionary has been established, fuzzy matches can be done. Since
+  # the actual input strings are not directly stored, you cannot remove or
+  # modify already stored strings. To remove strings, you have to clear the
+  # matcher and add the strings again that you want to keep.
+  class FuzzyStringMatcher < PEROBS::Object
+    attr_persist :case_sensitive, :n, :dict
+    # Create a new FuzzyStringMatcher.
+    # @param p [PEROBS::Store] place to store the dictionary
+    # @param case_sensitive [Boolean] True if case matters for matching
+    # @param n [Integer] Determines what kind of n-gramm is used to store the
+    #        references in the dictionary. It also determines the minimum word
+    #        length that can be used for fuzzy matches. Values between 2 and
+    #        10 are supported. The default is 4.
+    # @raise [ArgumentError] if n is smaller than 2 or larger than 10
+    def initialize(p, case_sensitive = false, n = 4)
+      super(p)
+      if n < 2 || n > 10
+        raise ArgumentError, 'n must be between 2 and 10'
+      end
+      self.case_sensitive = case_sensitive
+      self.n = n
+      clear unless @dict
+    end
+    # Wipe the dictionary by replacing it with a new, empty BigHash.
+    def clear
+      self.dict = @store.new(BigHash)
+    end
+    # Add a string with its reference to the dictionary. The string is
+    # enclosed in start/end markers and split into n-gramms. Strings that are
+    # still shorter than n characters with the markers produce no n-gramms
+    # and are not added.
+    # @param string [String] The string to store
+    # @param reference [Object] Any object that is associated with the string.
+    #        If nil, the string itself is used as reference.
+    # @return [nil]
+    def learn(string, reference = string)
+      reference = string if reference.nil?
+      unless @case_sensitive
+        string = string.downcase
+      end
+      # Enclose string in 'start of text' and 'end of text' ASCII values.
+      string = "\002" + string + "\003"
+      each_n_gramm(string) do |n_gramm|
+        unless (ng_list = @dict[n_gramm])
+          @dict[n_gramm] = ng_list = @store.new(Hash)
+        end
+        # We use the Hash as a Set. The value doesn't matter.
+        ng_list[reference] = true unless ng_list.include?(reference)
+      end
+      nil
+    end
+    # Find the references whose string best matches the given string.
+    # @param string [String] string to search for
+    # @param min_score [Float] Value between 0.01 and 1.0 that specifies how
+    #        strict the matching should be done. The larger the value the
+    #        closer the given string needs to be.
+    # @param max_count [Integer] The maximum number of matches that should be
+    #        returned. A value of 0 or less returns no matches.
+    # @return [Array] The result is an Array of Arrays. The nested Arrays only
+    #         have 2 entries. The reference and a Float value between 0 and
+    #         1.0 that describes how good the match is. The matches are sorted
+    #         in descending order by the match score.
+    def best_matches(string, min_score = 0.5, max_count = 100)
+      unless @case_sensitive
+        string = string.downcase
+      end
+      # Enclose string in 'start of text' and 'end of text' ASCII values.
+      string = "\002" + string + "\003"
+      # Maps each reference to the number of n-gramms it shares with string.
+      matches = {}
+      each_n_gramm(string) do |n_gramm|
+        if (ng_list = @dict[n_gramm])
+          ng_list.each do |reference, _|
+            if matches.include?(reference)
+              matches[reference] += 1
+            else
+              matches[reference] = 1
+            end
+          end
+        end
+      end
+      return [] if matches.empty?
+      match_list = matches.to_a
+      # Set occurrence counters to scores relative to the best possible score.
+      # The best possible score is the number of n-gramms in the search
+      # string, which is what a perfect match would reach.
+      best_possible_score = string.length - @n + 1
+      match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
+      # Delete all matches that don't have the required minimum match score.
+      match_list.delete_if { |a| a[1] < min_score }
+      # Sort the list best to worst match
+      match_list.sort! do |a, b|
+        b[1] <=> a[1]
+      end
+      # Return the top max_count matches. A Range like 0..max_count - 1 must
+      # not be used here since it turns into 0..-1 for a max_count of 0 and
+      # would then select the whole list.
+      max_count > 0 ? match_list.first(max_count) : []
+    end
+    # Returns some internal stats about the dictionary.
+    # @return [Hash] with the String keys 'dictionary_size' (number of
+    #         n-gramms), 'max_list_size' (largest reference list) and
+    #         'avg_list_size' (average reference list size)
+    def stats
+      s = {}
+      s['dictionary_size'] = @dict.size
+      max = total = 0
+      @dict.each do |n_gramm, ng_list|
+        size = ng_list.length
+        max = size if size > max
+        total += size
+      end
+      s['max_list_size'] = max
+      s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0
+      s
+    end
+    private
+    # Yield every substring of length n of the given string, from left to
+    # right. Nothing is yielded if the string is shorter than n.
+    def each_n_gramm(string, &block)
+      return if string.length < @n
+      0.upto(string.length - @n) do |i|
+        n_gramm = string[i, @n]
+        yield(n_gramm)
+      end
+    end
+  end
+end