perobs 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 9dd54b9f62dc6b5cc7129d25b9ba87e2d7aa3775
- data.tar.gz: 1431e7ec23c7bf2c18fa65b7c3e14b33bc696b2b
+ metadata.gz: 059ab4702a14c6ff6328881c528eda77d321de51
+ data.tar.gz: 5a8944222bf1feccb2eda749f05d963598e3b945
  SHA512:
- metadata.gz: dbf7166adf28acabef48594bb80721512f0b30156f66965e7077f6b4089e429c5d951b621c0c3322bc6c2a0b269e47042994dd9449d75cb00858e0d2a23bbbbc
- data.tar.gz: 73ae5cbfd48a5bc3a53194961398ec6a0ff57a4ac4ed606ba0e3ab1922fcba89cbc72cb90327e2256e2c9709a2a2707bdea8a8bd9124ff973531b800ffc2f304
+ metadata.gz: eeab29c68225efd8efbfb6a94b14c708fac38e16c6c1a399b55941e11e7dab83ecea925382180b803a7a152b71509eef30c50e40e568d5c9900c0dea1eec1d7f
+ data.tar.gz: 993b3ff327426b2797e4696a43ad0a0483d45a22638241fe34409b8fe4bbdddbe5a27adb0f06b088ae5f45638515fe11aa0b4f1baf6079afbfde4d7c06188ea5
data/.gitignore CHANGED
@@ -1,12 +1,10 @@
- /.bundle/
- /.yardoc
- /Gemfile.lock
- /_yardoc/
- /coverage/
- /doc/
- /pkg/
- /spec/reports/
- /tmp/
+ .bundle/
+ .yardoc
+ Gemfile.lock
+ _yardoc/
+ coverage/
+ doc/
+ pkg/
  *.bundle
  *.so
  *.o
data/README.md CHANGED
@@ -100,10 +100,6 @@ Or install it yourself as:

  $ gem install perobs

- ## Usage
-
- TODO: Write usage instructions here
-
  ## Contributing

  1. Fork it ( https://github.com/scrapper/perobs/fork )
data/lib/perobs/Array.rb CHANGED
@@ -161,6 +161,12 @@ module PEROBS
  @data = data
  end

+ # Textual dump for debugging purposes
+ # @return [String]
+ def inspect
+ "[\n" + @data.map { |v| " #{v.inspect}" }.join(",\n") + "\n]\n"
+ end
+
  private

  def _serialize
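
The inspect method added above simply renders the wrapped @data member one element per line. As a hedged illustration (plain Ruby, not part of the gem), the same expression applied to an ordinary Array standing in for the internal @data member produces:

# Sketch: the string built by the inspect method above.
data = [1, 'two', nil]
puts "[\n" + data.map { |v| " #{v.inspect}" }.join(",\n") + "\n]\n"
# Prints:
# [
#  1,
#  "two",
#  nil
# ]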
data/lib/perobs/BTreeBlob.rb ADDED
@@ -0,0 +1,327 @@
+ # encoding: UTF-8
+ #
+ # = BTreeBlob.rb -- Persistent Ruby Object Store
+ #
+ # Copyright (c) 2015 by Chris Schlaeger <chris@taskjuggler.org>
+ #
+ # MIT License
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+ module PEROBS
+
+ # This class manages the usage of the data blobs in the corresponding
+ # HashedBlobsDB object.
+ class BTreeBlob
+
+ # For performance reasons we use an Array for the entries instead of a
+ # Hash. These constants specify the Array index for the corresponding
+ # value.
+ ID = 0
+ # Number of bytes
+ BYTES = 1
+ # Start Address
+ START = 2
+ # Mark/Unmarked flag
+ MARKED = 3
+
+ # Create a new BTreeBlob object.
+ # @param dir [String] Fully qualified directory name
+ # @param btreedb [BTreeDB] Reference to the DB that owns this blob
+ def initialize(dir, btreedb)
+ @dir = dir
+ @btreedb = btreedb
+
+ @index_file_name = File.join(dir, 'index')
+ @blobs_file_name = File.join(dir, 'data')
+ read_index
+ end
+
+ # Write the given bytes with the given ID into the DB.
+ # @param id [Fixnum or Bignum] ID
+ # @param raw [String] sequence of bytes
+ def write_object(id, raw)
+ if @entries.length > @btreedb.max_blob_size
+ # The blob has reached the maximum size. Replace the blob with a BTree
+ # node directory and distribute the blob entires into the sub-blobs of
+ # the new BTree node.
+ split_blob
+ # Insert the passed object into the newly created BTree node.
+ @btreedb.put_raw_object(raw, id)
+ else
+ bytes = raw.bytesize
+ start_address = reserve_bytes(id, bytes)
+ if write_to_blobs_file(raw, start_address) != bytes
+ raise RuntimeError, 'Object length does not match written bytes'
+ end
+ write_index
+ end
+ end
+
+ # Read the entry for the given ID and return it as bytes.
+ # @param id [Fixnum or Bignum] ID
+ # @return [String] sequence of bytes or nil if ID is unknown
+ def read_object(id)
+ return nil unless (bytes_and_start = find(id))
+ read_from_blobs_file(*bytes_and_start)
+ end
+
+
+ # Find the data for the object with given id.
+ # @param id [Fixnum or Bignum] Object ID
+ # @return [Array] Returns an Array with two Fixnum entries. The first is
+ # the number of bytes and the second is the starting offset in the
+ # blob storage file.
+ def find(id)
+ if (entry = @entries_by_id[id])
+ return [ entry[BYTES], entry[START] ]
+ end
+
+ nil
+ end
+
+ # Clear the mark on all entries in the index.
+ def clear_marks
+ @entries.each { |e| e[MARKED] = 0 }
+ write_index
+ end
+
+ # Set a mark on the entry with the given ID.
+ # @param id [Fixnum or Bignum] ID of the entry
+ def mark(id)
+ found = false
+ @entries.each do |entry|
+ if entry[ID] == id
+ entry[MARKED] = 1
+ found = true
+ break
+ end
+ end
+
+ unless found
+ raise ArgumentError,
+ "Cannot find an entry for ID #{'%016X' % id} to mark"
+ end
+
+ write_index
+ end
+
+ # Check if the entry for a given ID is marked.
+ # @param id [Fixnum or Bignum] ID of the entry
+ # @return [TrueClass or FalseClass] true if marked, false otherwise
+ def is_marked?(id)
+ @entries.each do |entry|
+ return entry[MARKED] != 0 if entry[ID] == id
+ end
+
+ raise ArgumentError,
+ "Cannot find an entry for ID #{'%016X' % id} to check"
+ end
+
+ # Remove all entries from the index that have not been marked.
+ def delete_unmarked_entries
+ # First remove the entry from the hash table.
+ @entries_by_id.delete_if { |id, e| e[MARKED] == 0 }
+ # Then delete the entry itself.
+ @entries.delete_if { |e| e[MARKED] == 0 }
+ write_index
+ end
+
+ # Run a basic consistency check.
+ # @param repair [TrueClass/FalseClass] Not used right now
+ # @return [TrueClass/FalseClass] Always true right now
+ def check(repair = false)
+ # Determine size of the data blobs file.
+ data_file_size = File.exists?(@blobs_file_name) ?
+ File.size(@blobs_file_name) : 0
+
+ next_start = 0
+ prev_entry = nil
+ @entries.each do |entry|
+ # Entries should never overlap
+ if prev_entry && next_start > entry[START]
+ raise RuntimeError,
+ "#{@dir}: Index entries are overlapping\n" +
+ "ID: #{'%016X' % prev_entry[ID]} " +
+ "Start: #{prev_entry[START]} " +
+ "Bytes: #{prev_entry[BYTES]}\n" +
+ "ID: #{'%016X' % entry[ID]} Start: #{entry[START]} " +
+ "Bytes: #{entry[BYTES]}"
+ end
+ next_start = entry[START] + entry[BYTES]
+
+ # Entries must fit within the data file
+ if next_start > data_file_size
+ raise RuntimeError,
+ "#{@dir}: Entry for ID #{'%016X' % entry[ID]} " +
+ "goes beyond 'data' file " +
+ "size (#{data_file_size})\n" +
+ "ID: #{'%016X' % entry[ID]} Start: #{entry[START]} " +
+ "Bytes: #{entry[BYTES]}"
+ end
+
+ prev_entry = entry
+ end
+
+ true
+ end
+
+ private
+
+ # Write a string of bytes into the file at the given address.
+ # @param raw [String] bytes to write
+ # @param address [Fixnum] offset in the file
+ # @return [Fixnum] number of bytes written
+ def write_to_blobs_file(raw, address)
+ begin
+ File.write(@blobs_file_name, raw, address)
+ rescue => e
+ raise IOError,
+ "Cannot write blobs file #{@blobs_file_name}: #{e.message}"
+ end
+ end
+
+ # Read _bytes_ bytes from the file starting at offset _address_.
+ # @param bytes [Fixnum] number of bytes to read
+ # @param address [Fixnum] offset in the file
+ def read_from_blobs_file(bytes, address)
+ begin
+ File.read(@blobs_file_name, bytes, address)
+ rescue => e
+ raise IOError,
+ "Cannot read blobs file #{@blobs_file_name}: #{e.message}"
+ end
+ end
+
+ # Reserve the bytes needed for the specified number of bytes with the
+ # given ID.
+ # @param id [Fixnum or Bignum] ID of the entry
+ # @param bytes [Fixnum] number of bytes for this entry
+ # @return [Fixnum] the start address of the reserved blob
+ def reserve_bytes(id, bytes)
+ # index of first blob after the last seen entry
+ end_of_last_entry = 0
+ # blob index of best fit segment
+ best_fit_start = nil
+ # best fir segment size in bytes
+ best_fit_bytes = nil
+ # Index where to insert the new entry. Append by default.
+ best_fit_index = -1
+ # If there is already an entry for an object with the _id_, we mark it
+ # for deletion.
+ entry_to_delete = nil
+
+ @entries.each.with_index do |entry, i|
+ if entry[ID] == id
+ # We've found an old entry for this ID. Mark it for deletion.
+ entry_to_delete = entry
+ next
+ end
+
+ gap = entry[START] - end_of_last_entry
+ if gap >= bytes &&
+ (best_fit_bytes.nil? || gap < best_fit_bytes)
+ # We've found a segment that fits the requested bytes and fits
+ # better than any previous find.
+ best_fit_start = end_of_last_entry
+ best_fit_bytes = gap
+ # The old entry gets deleted before the new one gets inserted. We
+ # need to correct the index appropriately.
+ best_fit_index = i - (entry_to_delete ? 1 : 0)
+ end
+ end_of_last_entry = entry[START] + entry[BYTES]
+ end
+
+ # Delete the old entry if requested.
+ @entries.delete(entry_to_delete) if entry_to_delete
+
+ # Create a new entry and insert it. The order must match the above
+ # defined constants!
+ entry = [ id, bytes, best_fit_start || end_of_last_entry, 0 ]
+ @entries.insert(best_fit_index, entry)
+ @entries_by_id[id] = entry
+
+ entry[START]
+ end
+
+ def read_index
+ # The entries are stored in two data structures to provide the fastest
+ # access mechanism for each situation. The Array @entries stores them in
+ # a plan Array. @entries_by_id stores them hashed by their ID.
+ @entries = []
+ @entries_by_id = {}
+ if File.exists?(@index_file_name)
+ begin
+ File.open(@index_file_name, 'rb') do |f|
+ # The index is a binary format. Each entry has exactly 25 bytes.
+ # Bytes
+ # 0 - 7 : 64 bits, little endian : ID
+ # 8 - 15 : 64 bits, little endian : Entry length in bytes
+ # 16 - 23 : 64 bits, little endian : Start address in data file
+ # 24 : 8 bits : 0 if unmarked, 1 if marked
+ while (bytes = f.read(25))
+ @entries << (e = bytes.unpack('QQQC'))
+ @entries_by_id[e[ID]] = e
+ end
+ end
+ rescue => e
+ raise RuntimeError,
+ "BTreeBlob file #{@index_file_name} corrupted: #{e.message}"
+ end
+ end
+ end
+
+ def write_index
+ begin
+ File.open(@index_file_name, 'wb') do |f|
+ # See read_index for data format documentation.
+ @entries.each do |entry|
+ f.write(entry.pack('QQQC'))
+ end
+ end
+ rescue => e
+ raise RuntimeError,
+ "Cannot write BTreeBlob index file #{@index_file_name}: " +
+ e.message
+ end
+ end
+
+ def split_blob
+ # Rename the index file to hide the blob file from the DB.
+ File.rename(@index_file_name, @index_file_name + '.bak')
+
+ # Read all entries from the blob and re-store them into the DB. We've
+ # already created the new BTree node, so these entries will be
+ # distributed into new leaf blobs of this new node.
+ @entries.each do |entry|
+ raw = read_from_blobs_file(entry[BYTES], entry[START])
+ @btreedb.put_raw_object(raw, entry[ID])
+ end
+
+ # Once the entries are re-stored, we can delete the old blob files.
+ File.delete(@index_file_name + '.bak')
+ File.delete(@blobs_file_name)
+ end
+
+ end
+
+ end
+
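
The index format documented in read_index above is three 64-bit words plus a one-byte mark flag, 25 bytes per record. A minimal standalone sketch (my own illustration, not gem code) that packs and unpacks one record with the same 'QQQC' template write_index and read_index use:

# Sketch: encode one index record [ID, BYTES, START, MARKED] the way
# BTreeBlob#write_index does, then decode it again as read_index would.
entry = [ 0xCAFE, 128, 4096, 1 ]       # ID, byte count, start address, mark
record = entry.pack('QQQC')
puts record.bytesize                   # => 25
id, bytes, start, marked = record.unpack('QQQC')
puts format('ID: %016X bytes: %d start: %d marked: %d', id, bytes, start, marked)
# => ID: 000000000000CAFE bytes: 128 start: 4096 marked: 1

Note that Ruby's 'Q' pack directive uses native byte order; on the little-endian machines the in-code comments assume, that matches the documented layout.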
data/lib/perobs/BTreeDB.rb ADDED
@@ -0,0 +1,252 @@
+ # encoding: UTF-8
+ #
+ # = BTreeBlob.rb -- Persistent Ruby Object Store
+ #
+ # Copyright (c) 2015 by Chris Schlaeger <chris@taskjuggler.org>
+ #
+ # MIT License
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ require 'perobs/DataBase'
+ require 'perobs/BTreeBlob'
+
+ module PEROBS
+
+ # This class implements a BTree database using filesystem directories as
+ # nodes and blob files as leafs. The BTree grows with the number of stored
+ # entries. Each leaf node blob can hold a fixed number of entries. If more
+ # entries need to be stored, the blob is replaced by a node with multiple
+ # new leafs that store the entries of the previous node. The leafs are
+ # implemented by the BTreeBlob class.
+ class BTreeDB < DataBase
+
+ attr_reader :max_blob_size
+
+ # Create a new BTreeDB object.
+ # @param db_name [String] name of the DB directory
+ # @param options [Hash] options to customize the behavior. Currently only
+ # the following options are supported:
+ # :serializer : Can be :marshal, :json, :yaml
+ # :dir_bits : The number of bits to use for the BTree nodes.
+ # The value must be between 4 and 14. The larger
+ # the number the more back-end directories are
+ # being used. The default is 12 which results in
+ # 4096 directories per node.
+ # :max_blob_size : The maximum number of entries in the BTree leaf
+ # nodes. The insert/find/delete time grows
+ # linearly with the size.
+ def initialize(db_name, options = {})
+ super(options[:serializer] || :json)
+
+ @db_dir = db_name
+ # Create the database directory if it doesn't exist yet.
+ ensure_dir_exists(@db_dir)
+
+ # Read the existing DB config.
+ @config = get_hash('config')
+ check_option('serializer')
+
+ # Check and set @dir_bits, the number of bits used for each tree level.
+ @dir_bits = options[:dir_bits] || 12
+ if @dir_bits < 4 || @dir_bits > 14
+ raise ArgumentError,
+ "dir_bits option (#{@dir_bits}) must be between 4 and 12"
+ end
+ check_option('dir_bits')
+
+ @max_blob_size = options[:max_blob_size] || 32
+ if @max_blob_size < 4 || @max_blob_size > 128
+ raise ArgumentError,
+ "max_blob_size option (#{@max_blob_size}) must be between 4 and 128"
+ end
+ check_option('max_blob_size')
+
+ put_hash('config', @config)
+
+ # This format string is used to create the directory name.
+ @dir_format_string = "%0#{(@dir_bits / 4) +
+ (@dir_bits % 4 == 0 ? 0 : 1)}X"
+ # Bit mask to extract the dir_bits LSBs.
+ @dir_mask = 2 ** @dir_bits - 1
+ end
+
+ # Return true if the object with given ID exists
+ # @param id [Fixnum or Bignum]
+ def include?(id)
+ (blob = find_blob(id)) && blob.find(id)
+ end
+
+ # Store a simple Hash as a JSON encoded file into the DB directory.
+ # @param name [String] Name of the hash. Will be used as file name.
+ # @param hash [Hash] A Hash that maps String objects to strings or
+ # numbers.
+ def put_hash(name, hash)
+ file_name = File.join(@db_dir, name + '.json')
+ begin
+ File.write(file_name, hash.to_json)
+ rescue => e
+ raise RuntimeError,
+ "Cannot write hash file '#{file_name}': #{e.message}"
+ end
+ end
+
+ # Load the Hash with the given name.
+ # @param name [String] Name of the hash.
+ # @return [Hash] A Hash that maps String objects to strings or numbers.
+ def get_hash(name)
+ file_name = File.join(@db_dir, name + '.json')
+ return ::Hash.new unless File.exists?(file_name)
+
+ begin
+ json = File.read(file_name)
+ rescue => e
+ raise RuntimeError,
+ "Cannot read hash file '#{file_name}': #{e.message}"
+ end
+ JSON.parse(json, :create_additions => true)
+ end
+
+ # Store the given object into the cluster files.
+ # @param obj [Hash] Object as defined by PEROBS::ObjectBase
+ def put_object(obj, id)
+ find_blob(id, true).write_object(id, serialize(obj))
+ end
+
+ # Load the given object from the filesystem.
+ # @param id [Fixnum or Bignum] object ID
+ # @return [Hash] Object as defined by PEROBS::ObjectBase or nil if ID does
+ # not exist
+ def get_object(id)
+ return nil unless (blob = find_blob(id)) && (obj = blob.read_object(id))
+ deserialize(obj)
+ end
+
+ # This method must be called to initiate the marking process.
+ def clear_marks
+ each_blob { |blob| blob.clear_marks }
+ end
+
+ # Permanently delete all objects that have not been marked. Those are
+ # orphaned and are no longer referenced by any actively used object.
+ def delete_unmarked_objects
+ each_blob { |blob| blob.delete_unmarked_entries }
+ end
+
+ # Mark an object.
+ # @param id [Fixnum or Bignum] ID of the object to mark
+ def mark(id)
+ (blob = find_blob(id)) && blob.mark(id)
+ end
+
+ # Check if the object is marked.
+ # @param id [Fixnum or Bignum] ID of the object to check
+ def is_marked?(id)
+ (blob = find_blob(id)) && blob.is_marked?(id)
+ end
+
+ # Basic consistency check.
+ # @param repair [TrueClass/FalseClass] True if found errors should be
+ # repaired.
+ def check_db(repair = false)
+ each_blob { |blob| blob.check(repair) }
+ end
+
+ # Check if the stored object is syntactically correct.
+ # @param id [Fixnum/Bignum] Object ID
+ # @param repair [TrueClass/FalseClass] True if an repair attempt should be
+ # made.
+ # @return [TrueClass/FalseClass] True if the object is OK, otherwise
+ # false.
+ def check(id, repair)
+ begin
+ get_object(id)
+ rescue => e
+ $stderr.puts "Cannot read object with ID #{id}: #{e.message}"
+ return false
+ end
+
+ true
+ end
+
+ # Store the given serialized object into the cluster files. This method is
+ # for internal use only!
+ # @param raw [String] Serialized Object as defined by PEROBS::ObjectBase
+ # @param id [Fixnum or Bignum] Object ID
+ def put_raw_object(raw, id)
+ find_blob(id, true).write_object(id, raw)
+ end
+
+ private
+
+ def find_blob(id, create_missing_blob = false)
+ dir_name = @db_dir
+ loop do
+ dir_bits = id & @dir_mask
+ dir_name = File.join(dir_name, @dir_format_string % dir_bits)
+
+ if Dir.exists?(dir_name)
+ if File.exists?(File.join(dir_name, 'index'))
+ # The directory is a blob directory and not a BTree node dir.
+ return BTreeBlob.new(dir_name, self)
+ end
+ else
+ if create_missing_blob
+ # Create the new blob directory.
+ Dir.mkdir(dir_name)
+ # And initialize the blob DB.
+ return BTreeBlob.new(dir_name, self)
+ else
+ return nil
+ end
+ end
+
+ # Discard the least significant @dir_bits bits and start over again
+ # with the directory that matches the @dir_bits LSBs of the new ID.
+ id = id >> @dir_bits
+ end
+ end
+
+ def each_blob(&block)
+ each_blob_r(@db_dir, &block)
+ end
+
+ def each_blob_r(dir, &block)
+ Dir.glob(File.join(dir, '*')) do |dir_name|
+ if is_blob_dir?(dir_name)
+ block.call(BTreeBlob.new(dir_name, self))
+ else
+ each_blob_r(dir_name, &block)
+ end
+ end
+ end
+
+ def is_blob_dir?(dir_name)
+ # A blob directory contains an 'index' and 'data' file. This is in
+ # contrast to BTree node directories that only contain other
+ # directories.
+ index_file = File.join(dir_name, 'index')
+ File.exists?(index_file)
+ end
+
+ end
+
+ end
+
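
To make the directory addressing in find_blob above more tangible: the object ID is consumed dir_bits at a time, least-significant chunk first, and each chunk names one directory level. A rough standalone sketch (my own illustration, not gem code) of that mapping for the default of 12 dir_bits:

# Sketch: derive nested BTree directory names from an object ID using the
# same mask / format-string construction that BTreeDB#initialize sets up.
dir_bits = 12
dir_mask = 2 ** dir_bits - 1
format_string = "%0#{(dir_bits / 4) + (dir_bits % 4 == 0 ? 0 : 1)}X"

id = 0x1A2B3C4D
path = 'db_dir'
3.times do
  path = File.join(path, format_string % (id & dir_mask))
  id >>= dir_bits
end
puts path   # => db_dir/C4D/2B3/01A

The real find_blob stops descending as soon as it reaches a directory that already contains an 'index' file (a leaf blob), so lookups only go as deep as split_blob has grown the tree.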