RubyGems - purecdb - Versions diffs - 0.1.0 → 1.0.0 - Mend

purecdb 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/README.md CHANGED

@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
 To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open .
-To read a 32 bit (standard) CDB file:
+To read a CDB file (auto-detecting standard 32-bit or extended 64-bit):
 ```ruby
     PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
     end
 ```
-To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Reader#open,
-or let the reader auto-detect the format.
+To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
+the second argument to PureCDB::Reader#open.
 See PureCDB::Reader#new for additional usage.

data/lib/purecdb.rb CHANGED

@@ -3,6 +3,15 @@ require "purecdb/base"
 require "purecdb/reader"
 require "purecdb/writer"
-# :nodoc:
 module PureCDB
+  # Convenience alternative to PureCDB::Reader.new(target, *options, &block)
+  def self.reader(target,*options, &block)
+    PureCDB::Reader.new(target, *options, &block)
+  end
+  # Convenience alternative to PureCDB::Writer.new(target, *options, &block)
+  def self.writer(target,*options, &block)
+    PureCDB::Writer.new(target, *options, &block)
+  end
 end

data/lib/purecdb/base.rb CHANGED

@@ -4,6 +4,9 @@ module PureCDB
   # Base class with shared functionality for PureCDB::Reader and
   # PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
   # details. You should not need to use this directly.
+  #
+  # Changing the constants defined here is likely to break all
+  # kinds of stuff.
   #
   class Base
     # The CDB format contains 256 separate hashes by default.
@@ -17,9 +20,6 @@ module PureCDB
     # 64 bit format this is multiplied by 2.
     DEFAULT_HASHPTR_SIZE = 4
-    # Number of bytes that will be buffered
-    BUFFER_SIZE = 4096
     # Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
     # rather than a 32-bit CDB file.
     CDB64_MAGIC = "cdb64:01"
@@ -53,12 +53,15 @@ module PureCDB
     def set_mode mode
       @mode = mode
       @num_hashes = DEFAULT_NUM_HASHES
       if @mode == 64
         @length_size  = DEFAULT_LENGTH_SIZE  * 2
         @hashptr_size = DEFAULT_HASHPTR_SIZE * 2
+        @format = "Q<"
       else
         @length_size  = DEFAULT_LENGTH_SIZE
         @hashptr_size = DEFAULT_HASHPTR_SIZE
+        @format = "V"
       end
     end
@@ -74,9 +77,6 @@ module PureCDB
         end
       end
-      # Used to speed up 64bit pack/unpack
-      @little_endian = [123456789].pack("L") == [123456789].pack("V")
       if mode == :detect
         @mode = :detect
       else
@@ -112,29 +112,12 @@ module PureCDB
     private
-    # Due to Array#pack's lack of a little/big endian specific 64 bit operator
     def ary_pack(ary)
-      if @mode == 32
-        ary.pack("V*")
-      elsif @little_endian
-        ary.pack("Q*")
-      else
-        ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
-      end
+      ary.pack("#{@format}*")
     end
-    # Due to String#unpack's lack of a little/big endian specific 64 bit operator
     def ary_unpack(data, num)
-      if @mode == 32
-        data.unpack("V#{num}")
-      elsif @little_endian
-        data.unpack("Q#{num}")
-      else
-        ret = []
-        data = data.unpack("V#{num*2}")
-        data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
-        ret
-      end
+      data.unpack("#{@format}#{num}")
     end
   end

data/lib/purecdb/reader.rb CHANGED

@@ -1,7 +1,7 @@
 module PureCDB
   #
-  # Read 32 bit or 54 bit CDB file CDB files.
+  # Read 32 bit or 64 bit CDB files.
   #
   class Reader < Base
     include Enumerable
@@ -41,7 +41,6 @@ module PureCDB
         raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
       end
-      # FIXME: It seems like there are bugs triggered if mmap fails
       @m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
       read_hashes
@@ -55,12 +54,15 @@ module PureCDB
     end
     #
-    # Shortcut for PureCDB::Reader.new(target,options) ..
+    # Alternative to PureCDB::Reader.new(target,options) ..
     #
     def self.open(target, *options, &block)
       Reader.new(target, *options, &block)
     end
+    #
+    # Close the CDB file
+    #
     def close
       @io.close if @io
       @m.unmap if @m
@@ -68,6 +70,11 @@ module PureCDB
       @io = nil
     end
+    # Iterate over all key/value pairs in the order they occur in the file.
+    # This is *not* sorted or insertion order.
+    #
+    # +each+ will yield each key,value pair separately even when a key is
+    # duplicate.
     def each
       pos = hash_size
       hoff0 = @hashes[0]
@@ -78,16 +85,7 @@ module PureCDB
       end
     end
-    def read_entry(pos)
-      keylen, datalen = read_header(pos)
-      return nil,nil if !keylen
-      pos += hashref_size
-      rkey = read(pos .. pos + keylen - 1)
-      pos += keylen
-      value = read(pos .. pos + datalen - 1)
-      return rkey, value
-    end
+    # Returns all values for +key+ in an array
     def values(key)
       h = hash(key)
@@ -121,6 +119,17 @@ module PureCDB
     private
+    def read_entry(pos)
+      keylen, datalen = read_header(pos)
+      return nil,nil if !keylen
+      pos += hashref_size
+      rkey = read(pos .. pos + keylen - 1)
+      pos += keylen
+      value = read(pos .. pos + datalen - 1)
+      return rkey, value
+    end
     # Warning: This will be very slow if not mmap'd
     def read r
       @m = nil

data/lib/purecdb/version.rb CHANGED

@@ -1,4 +1,3 @@
-#
 module PureCDB
-  VERSION = "0.1.0"
+  VERSION = "1.0.0"
 end

data/lib/purecdb/writer.rb CHANGED

@@ -1,15 +1,58 @@
 module PureCDB
+  #
+  # Write 32 or 64 bit CDB files
+  #
+  # == Memory considerations
+  #
+  # While the entry is written to the target object immediately on calling #store, the actual
+  # hash tables can not be written until the full dataset is ready. You must therefore be able
+  # to hold the hash of each key (including duplicates) and the position in the file the full
+  # entry is stored at in memory while building the CDB file.
+  #
+  # It would be possible to write this to a temporary file at the cost of performance, but the
+  # current implementation does not do this.
+  #
+  # As a compromise, the current implementation stores the hashes and positions as a BER encoded
+  # string per hash bucket until it is ready to write it to disk.
+  #
   class Writer < Base
-    # This just needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
-    # the more frequently the reader may have to engage in potentially lengthy (worst case scanning all the
-    # records) probing to find the right entry
-    def hash_fill_factor
-      0.7
-    end
+    # How full any given hash table is allowed to get, as a float between 0 and 1.
+    #
+    # Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
+    # the more frequently the reader may have to engage in potentially lengthy (worst case
+    # scanning all the records) probing to find the right entry
+    attr_accessor :hash_fill_factor
+    # Open a CDB file for writing, or preparing an IO like object for writing.
+    #
+    # :call-seq:
+    #   w = PureCDB::Writer.new(target)
+    #   w = PureCDB::Writer.new(target, *options)
+    #   PureCDB::Writer.new(target)  {|w| ... }
+    #   PureCDB::Writer.new(target, *options) {|w| ... }
+    #
+    # If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
+    # you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
+    # is 32.
+    #
+    # If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
+    # Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
+    # and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
+    # #syswrite to #seek and #write respectively, and these must be present.
+    #
+    # (+IO+ and +StringIO+ both satisfy these requirements)
+    #
+    # If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
+    # afterwards.
+    #
+    # **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
+    # when you are done.
+    #
     def initialize target, *options
       super *options
+      @hash_fill_factor = 0.7
       set_mode(32) if @mode == :detect
       if target.is_a?(String)
@@ -34,36 +77,38 @@ module PureCDB
       end
     end
+    # Write out the hashes and hash pointers, and close the target if it responds to #close
+    #
     def close
       write_hashes
       write_hashptrs
       @io.close if @io.respond_to?(:close)
     end
-   # For compatibility w/cdb / CDBMaker
-   def store key,value
-     add key,value
-   end
-   def add key,value
-     # In an attempt to save memory, we pack the hash data we gather into
-     # strings of BER compressed integers...
-     h = hash(key)
-     hi = (h % num_hashes)
-     @hashes[hi] ||= ""
-     header = build_header(key.length, value.length)
-     @io.syswrite(header+key+value)
-     size = header.size + key.size + value.size
-     @hashes[hi] += [h,@pos].pack("ww") # BER compressed
-     @pos += size
-   end
-   def self.open target, *options, &block
-     Writer.new(target, *options, &block)
-   end
+    # Store 'value' under 'key'.
+    #
+    # Multiple values can be stored for the same key by calling #store multiple times
+    # with the same key value.
+    def store key,value
+      # In an attempt to save memory, we pack the hash data we gather into
+      # strings of BER compressed integers...
+      h = hash(key)
+      hi = (h % num_hashes)
+      @hashes[hi] ||= ""
+      header = build_header(key.length, value.length)
+      @io.syswrite(header+key+value)
+      size = header.size + key.size + value.size
+      @hashes[hi] += [h,@pos].pack("ww") # BER compressed
+      @pos += size
+    end
+    #
+    # Alternative to PureCDB::Writer.new(target,options) ..
+    #
+    def self.open target, *options, &block
+      Writer.new(target, *options, &block)
+    end
    private
     def write_hashes
@@ -88,7 +133,7 @@ module PureCDB
             while ary[off*2] != 0
               off = (off + 1) % len
             end
-        free_slots -= 1
+            free_slots -= 1
             ary[off*2] = entry[0]
             ary[off*2+1] = entry[1]
           end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: purecdb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 1.0.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-03 00:00:00.000000000 Z
+date: 2016-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler