RubyGems - purecdb - Versions diffs - 0.1.0 → 1.0.0 - Mend

purecdb 0.1.0 → 1.0.0

Files changed (7) hide show

data/README.md CHANGED

@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
 To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open .
-To read a 32 bit (standard) CDB file:
+To read a CDB file (auto-detecting standard 32-bit or extended 64-bit):
 ```ruby
     PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
     end
 ```
-To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Reader#open,
-or let the reader auto-detect the format.
+To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
+the second argument to PureCDB::Reader#open.
 See PureCDB::Reader#new for additional usage.

data/lib/purecdb.rb CHANGED

@@ -3,6 +3,15 @@ require "purecdb/base"
 require "purecdb/reader"
 require "purecdb/writer"
-# :nodoc:
 module PureCDB
+  # Convenience alternative to PureCDB::Reader.new(target, *options, &block)
+  def self.reader(target,*options, &block)
+    PureCDB::Reader.new(target, *options, &block)
+  end
+  # Convenience alternative to PureCDB::Writer.new(target, *options, &block)
+  def self.writer(target,*options, &block)
+    PureCDB::Writer.new(target, *options, &block)
+  end
 end

data/lib/purecdb/base.rb CHANGED

@@ -4,6 +4,9 @@ module PureCDB
   # Base class with shared functionality for PureCDB::Reader and
   # PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
   # details. You should not need to use this directly.
+  #
+  # Changing the constants defined here is likely to break all
+  # kinds of stuff.
   #
   class Base
     # The CDB format contains 256 separate hashes by default.
@@ -17,9 +20,6 @@ module PureCDB
     # 64 bit format this is multiplied by 2.
     DEFAULT_HASHPTR_SIZE = 4
-    # Number of bytes that will be buffered
-    BUFFER_SIZE = 4096
     # Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
     # rather than a 32-bit CDB file.
     CDB64_MAGIC = "cdb64:01"
@@ -53,12 +53,15 @@ module PureCDB
     def set_mode mode
       @mode = mode
       @num_hashes = DEFAULT_NUM_HASHES
       if @mode == 64
         @length_size  = DEFAULT_LENGTH_SIZE  * 2
         @hashptr_size = DEFAULT_HASHPTR_SIZE * 2
+        @format = "Q<"
       else
         @length_size  = DEFAULT_LENGTH_SIZE
         @hashptr_size = DEFAULT_HASHPTR_SIZE
+        @format = "V"
       end
     end
@@ -74,9 +77,6 @@ module PureCDB
         end
       end
-      # Used to speed up 64bit pack/unpack
-      @little_endian = [123456789].pack("L") == [123456789].pack("V")
       if mode == :detect
         @mode = :detect
       else
@@ -112,29 +112,12 @@ module PureCDB
     private
-    # Due to Array#pack's lack of a little/big endian specific 64 bit operator
     def ary_pack(ary)
-      if @mode == 32
-        ary.pack("V*")
-      elsif @little_endian
-        ary.pack("Q*")
-      else
-        ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
-      end
+      ary.pack("#{@format}*")
     end
-    # Due to String#unpack's lack of a little/big endian specific 64 bit operator
     def ary_unpack(data, num)
-      if @mode == 32
-        data.unpack("V#{num}")
-      elsif @little_endian
-        data.unpack("Q#{num}")
-      else
-        ret = []
-        data = data.unpack("V#{num*2}")
-        data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
-        ret
-      end
+      data.unpack("#{@format}#{num}")
     end
   end

data/lib/purecdb/reader.rb CHANGED

@@ -1,7 +1,7 @@
 module PureCDB
   #
-  # Read 32 bit or 54 bit CDB file CDB files.
+  # Read 32 bit or 64 bit CDB files.
   #
   class Reader < Base
     include Enumerable
@@ -41,7 +41,6 @@ module PureCDB
         raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
       end
-      # FIXME: It seems like there are bugs triggered if mmap fails
       @m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
       read_hashes
@@ -55,12 +54,15 @@ module PureCDB
     end
     #
-    # Shortcut for PureCDB::Reader.new(target,options) ..
+    # Alternative to PureCDB::Reader.new(target,options) ..
     #
     def self.open(target, *options, &block)
       Reader.new(target, *options, &block)
     end
+    #
+    # Close the CDB file
+    #
     def close
       @io.close if @io
       @m.unmap if @m
@@ -68,6 +70,11 @@ module PureCDB
       @io = nil
     end
+    # Iterate over all key/value pairs in the order they occur in the file.
+    # This is *not* sorted or insertion order.
+    #
+    # +each+ will yield each key,value pair separately even when a key is
+    # duplicate.
     def each
       pos = hash_size
       hoff0 = @hashes[0]
@@ -78,16 +85,7 @@ module PureCDB
       end
     end
-    def read_entry(pos)
-      keylen, datalen = read_header(pos)
-      return nil,nil if !keylen
-      pos += hashref_size
-      rkey = read(pos .. pos + keylen - 1)
-      pos += keylen
-      value = read(pos .. pos + datalen - 1)
-      return rkey, value
-    end
+    # Returns all values for +key+ in an array
     def values(key)
       h = hash(key)
@@ -121,6 +119,17 @@ module PureCDB
     private
+    def read_entry(pos)
+      keylen, datalen = read_header(pos)
+      return nil,nil if !keylen
+      pos += hashref_size
+      rkey = read(pos .. pos + keylen - 1)
+      pos += keylen
+      value = read(pos .. pos + datalen - 1)
+      return rkey, value
+    end
     # Warning: This will be very slow if not mmap'd
     def read r
       @m = nil

data/lib/purecdb/version.rb CHANGED

@@ -1,4 +1,3 @@
-#
 module PureCDB
-  VERSION = "0.1.0"
+  VERSION = "1.0.0"
 end

data/lib/purecdb/writer.rb CHANGED

@@ -1,15 +1,58 @@
 module PureCDB
+  #
+  # Write 32 or 64 bit CDB files
+  #
+  # == Memory considerations
+  #
+  # While the entry is written to the target object immediately on calling #store, the actual
+  # hash tables can not be written until the full dataset is ready. You must therefore be able
+  # to hold the hash of each key (including duplicates) and the position in the file the full
+  # entry is stored at in memory while building the CDB file.
+  #
+  # It would be possible to write this to a temporary file at the cost of performance, but the
+  # current implementation does not do this.
+  #
+  # As a compromise, the current implementation stores the hashes and positions as a BER encoded
+  # string per hash bucket until it is ready to write it to disk.
+  #
   class Writer < Base
-    # This just needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
-    # the more frequently the reader may have to engage in potentially lengthy (worst case scanning all the
-    # records) probing to find the right entry
-    def hash_fill_factor
-      0.7
-    end
+    # How full any given hash table is allowed to get, as a float between 0 and 1.
+    #
+    # Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
+    # the more frequently the reader may have to engage in potentially lengthy (worst case
+    # scanning all the records) probing to find the right entry
+    attr_accessor :hash_fill_factor
+    # Open a CDB file for writing, or prepare an IO-like object for writing.
+    #
+    # :call-seq:
+    #   w = PureCDB::Writer.new(target)
+    #   w = PureCDB::Writer.new(target, *options)
+    #   PureCDB::Writer.new(target)  {|w| ... }
+    #   PureCDB::Writer.new(target, *options) {|w| ... }
+    #
+    # If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
+    # you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
+    # is 32.
+    #
+    # If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
+    # Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
+    # and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
+    # #syswrite to #seek and #write respectively, and these must be present.
+    #
+    # (+IO+ and +StringIO+ both satisfy these requirements)
+    #
+    # If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
+    # afterwards.
+    #
+    # **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
+    # when you are done.
+    #
     def initialize target, *options
       super *options
+      @hash_fill_factor = 0.7
       set_mode(32) if @mode == :detect
       if target.is_a?(String)
@@ -34,36 +77,38 @@ module PureCDB
       end
     end
+    # Write out the hashes and hash pointers, and close the target if it responds to #close
+    #
     def close
       write_hashes
       write_hashptrs
       @io.close if @io.respond_to?(:close)
     end
-   # For compatibility w/cdb / CDBMaker
-   def store key,value
-     add key,value
-   end
-   def add key,value
-     # In an attempt to save memory, we pack the hash data we gather into
-     # strings of BER compressed integers...
-     h = hash(key)
-     hi = (h % num_hashes)
-     @hashes[hi] ||= ""
-     header = build_header(key.length, value.length)
-     @io.syswrite(header+key+value)
-     size = header.size + key.size + value.size
-     @hashes[hi] += [h,@pos].pack("ww") # BER compressed
-     @pos += size
-   end
-   def self.open target, *options, &block
-     Writer.new(target, *options, &block)
-   end
+    # Store 'value' under 'key'.
+    #
+    # Multiple values can be stored for the same key by calling #store multiple times
+    # with the same key value.
+    def store key,value
+      # In an attempt to save memory, we pack the hash data we gather into
+      # strings of BER compressed integers...
+      h = hash(key)
+      hi = (h % num_hashes)
+      @hashes[hi] ||= ""
+      header = build_header(key.length, value.length)
+      @io.syswrite(header+key+value)
+      size = header.size + key.size + value.size
+      @hashes[hi] += [h,@pos].pack("ww") # BER compressed
+      @pos += size
+    end
+    #
+    # Alternative to PureCDB::Writer.new(target,options) ..
+    #
+    def self.open target, *options, &block
+      Writer.new(target, *options, &block)
+    end
    private
     def write_hashes
@@ -88,7 +133,7 @@ module PureCDB
             while ary[off*2] != 0
               off = (off + 1) % len
             end
-        free_slots -= 1
+            free_slots -= 1
             ary[off*2] = entry[0]
             ary[off*2+1] = entry[1]
           end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: purecdb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 1.0.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-03 00:00:00.000000000 Z
+date: 2016-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler