RubyGems - correct-horse-battery-staple - Versions diffs - 0.6.3 → 0.6.4 - Mend

correct-horse-battery-staple 0.6.3 → 0.6.4

Files changed (10) hide show

data.tar.gz.sig +0 -0
data/Manifest.txt +1 -0
data/correct-horse-battery-staple.gemspec +3 -3
data/lib/correct_horse_battery_staple.rb +1 -1
data/lib/correct_horse_battery_staple/backend/isam.rb +332 -0
data/lib/correct_horse_battery_staple/backend/isam_kd.rb +7 -3
data/lib/correct_horse_battery_staple/corpus/base.rb +17 -4
data/lib/correct_horse_battery_staple/corpus/isam.rb +2 -163
metadata +21 -20
metadata.gz.sig +0 -0

data.tar.gz.sig CHANGED

Binary file

data/Manifest.txt CHANGED

@@ -9,6 +9,7 @@ bin/chbs-mkpass
 lib/correct_horse_battery_staple.rb
 lib/correct_horse_battery_staple/assembler.rb
 lib/correct_horse_battery_staple/backend.rb
+lib/correct_horse_battery_staple/backend/isam.rb
 lib/correct_horse_battery_staple/backend/isam_kd.rb
 lib/correct_horse_battery_staple/backend/redis.rb
 lib/correct_horse_battery_staple/backend/redis/d_range.rb

data/correct-horse-battery-staple.gemspec CHANGED

@@ -2,17 +2,17 @@
 Gem::Specification.new do |s|
   s.name = "correct-horse-battery-staple"
-  s.version = "0.6.3.20120111134214"
+  s.version = "0.6.4.20120113111503"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Robert Sanders"]
   s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
-  s.date = "2012-01-11"
+  s.date = "2012-01-13"
   s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n    $ chbs generate --verbose -W 3..8 -P 30..60\n    Corpus size: 6396 candidate words of 33075 total\n    Entropy: 48 bits (2^48 = 281474976710656)\n    Years to guess at 1000 guesses/sec: 8926\n    magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec.  If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n  actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n     http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n     http://wordfrequency.info/"
   s.email = ["robert@curioussquid.com"]
   s.executables = ["chbs", "chbs-mkpass"]
   s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
-  s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", 
"spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
+  s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", 
"spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", "spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
   s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
   s.rdoc_options = ["--main", "README.txt"]
   s.require_paths = ["lib"]

data/lib/correct_horse_battery_staple.rb CHANGED

@@ -1,7 +1,7 @@
 require 'logger'
 module CorrectHorseBatteryStaple
-  VERSION = '0.6.3'
+  VERSION = '0.6.4'
   DEFAULT_CORPUS_NAME = "tvscripts"

data/lib/correct_horse_battery_staple/backend/isam.rb ADDED

@@ -0,0 +1,332 @@
+require 'bigdecimal'
+require 'json'
+require 'set'
+module CorrectHorseBatteryStaple::Backend::Isam
+  INITIAL_PRELUDE_LENGTH = 4096
+  F_PRELUDE_AT_END = 1
+  def self.included(base)
+    base.extend ClassMethods
+    base.send :include, InstanceMethods
+  end
+  module ClassMethods
+  end
+  module InstanceMethods
+    #
+    # Set the backend defaults: the length scaling factor recorded in the
+    # prelude and the page size that round_up uses as its default block size.
+    #
+    def initialize_backend_variables
+      @length_scaling_factor = 15
+      @page_size = 4096
+    end
+    def fix_stats(stats)
+      stats.each do |k,v|
+        if v.respond_to?(:nan?) && v.nan?
+          stats[k] = -1
+        end
+      end
+      stats
+    end
+    def page_size
+      @page_size || 4096
+    end
+    # many MMUs in default mode and modern highcap drives have 4k pages/blocks
+    def round_up(val, blocksize=page_size)
+      [(val.to_f/blocksize).ceil, 1].max * blocksize
+    end
+    # Serialize corpus to io: an 8-byte header (record offset and prelude
+    # length, both network byte order), the JSON prelude space-padded up to
+    # the record offset, then one fixed-size record per word.
+    #
+    # corpus must respond to reduce, stats, length and each_with_index, and
+    # yield entries with word and frequency. io must respond to rewind and
+    # write.
+    def write_corpus_to_io(corpus, io=STDOUT)
+      io.rewind
+      # includes prefix length byte
+      @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1
+      @freq_length = 4
+      @entry_length = @word_length + @freq_length
+      stats = fix_stats(corpus.stats)
+      corpus_word_count = corpus.length
+      # The four string values below are placeholders; three of them are
+      # replaced with real numbers once the layout has been computed.
+      prelude = {
+        "wlen"           => @word_length,
+        "flen"           => 4,
+        "entrylen"       => @word_length + @freq_length,
+        "sort"           => "frequency",
+        "n"              => corpus_word_count,
+        "stats"          => stats,
+        "flags"          => 0,
+        "length_scaling_factor" => (@length_scaling_factor || 15),
+        "records_length" => "0000000000",
+        "offset_records" => "0000000000",
+        "offset_index1"  => "0000000000",
+        "offset_index2"  => "0000000000"
+      }
+      # Measured while the placeholders are still in place; this is the
+      # length written into the header, and the records offset is derived
+      # from it (plus the 8 header bytes), rounded up to a page boundary.
+      prelude_json_length = prelude.to_json.length
+      prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)
+      prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
+      offset_index1 = prelude["offset_records"] +
+        round_up(records_length, page_size)
+      prelude["offset_index1"]  = offset_index1
+      # NOTE(review): offset_index1 is recorded, but nothing in this method
+      # writes padding or an index after the records - confirm intended.
+      # "A" space-pads the JSON so the header block is offset_records bytes.
+      io.write([offset_records, prelude_json_length, prelude.to_json].
+               pack("NNA#{offset_records-8}"))
+      # Record layout: 1 length byte, the word null-padded to
+      # @word_length-1 bytes, then the frequency as a 4-byte big-endian int.
+      corpus.each_with_index do |w, index|
+        io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
+      end
+    end
+    def pad(size, io)
+      io.write([].pack("x#{size}"))
+    end
+    # Forward args to io.binwrite when io responds to it, otherwise to
+    # io.write.
+    # NOTE(review): io is neither a parameter nor a local here, so it must
+    # resolve to a method on the including object - confirm one exists.
+    def binwrite(*args)
+      method = io.respond_to?(:binwrite) ? :binwrite : :write
+      io.send(method, *args)
+    end
+    def openmode
+      IO.respond_to?(:binwrite) ? "wb:ASCII-8BIT" : "w"
+    end
+    #
+    #
+    # Format of header:
+    #
+    # 0..3    -  OB - offset of body start in bytes; network byte order
+    # 4..7    -  LP - length of prelude in network byte order
+    # 8..OB-1 -  P  - JSON-encoded prelude hash and space padding
+    # OB..EOF -  array of fixed size records as described in prelude
+    #
+    # Contents of Prelude (after JSON decoding):
+    #
+    # P["wlen"]                   - length of word part of record
+    # P["flen"]                   - length of frequency part of record (always 4 bytes)
+    # P["entrylen"]               - length of total part of record
+    # P["n"]                      - number of records
+    # P["sort"]                   - field name sorted by (word or frequency)
+    # P["stats"]                  - corpus statistics
+    # P["offset_index1"]          - absolute file offset of KDTree index
+    # P["records_length"]         - length in bytes of records section, excluding padding
+    # P["length_scaling_factor"]  - what length was multiplied by in creating KDTree (usually 15)
+    #
+    # Format of record (P["entrylen"] bytes in total):
+    #
+    # 1 byte               - LW - actual length of word within field
+    # P["wlen"]-1 bytes    - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding
+    # P["flen"] (4) bytes  - frequency as network byte order long
+    #
+    # After record section, there is padding up to the next page_size boundary,
+    # and then there is a dumped KDTree which extends to EOF.
+    #
+    #
+    def precache(max = -1)
+      return if max > -1 && file_size(@file) > max
+      @file.seek 0
+      @file = StringIO.new @file.read, "r"
+    end
+    def file_size(file)
+      (file.respond_to?(:size) ? file.size : file.stat.size)
+    end
+    def prelude
+      @prelude || parse_prelude
+    end
+    def parse_prelude
+      @file.seek 0
+      prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
+      # byte offset of first record from beginning of file
+      # total length of JSON string (without padding)
+      (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")
+      # read more if our initial read didn't slurp in the entire prelude
+      if @prelude_len > prelude_buf.length
+        prelude_buf += @file.read(@prelude_len - prelude_buf.length)
+      end
+      @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
+      # includes prefix length byte
+      @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")
+      # as network byte order int
+      @frequency_length = @prelude["flen"]     || 4
+      # total length of record
+      @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
+      @offset_index1    = @prelude["offset_index1"]
+      @offset_index2    = @prelude["offset_index2"]
+      @entry_count      = @prelude["n"] || raise(ArgumentError, "Number of records not included!")
+      @records_length   = @prelude["records_length"] || (@entry_length * @entry_count)
+      @length_scaling_factor = @prelude["length_scaling_factor"] || 10
+      load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
+      @prelude
+    end
+    #
+    # Show some information about the backing file: its size, the record
+    # field widths, the length of the records section, and every prelude
+    # entry except "stats". Reads @prelude directly rather than through the
+    # prelude accessor, so the prelude must already have been parsed.
+    #
+    def inspect
+      super + "\n" + <<INSPECT
+File size: #{file_size(@file)}
+Word length: #{@word_length}
+Frequency bytes: #{@frequency_length}
+Total record bytes: #{@records_length}
+Prelude:
+#{@prelude.map {|k,v| k=="stats" ? "" : "  #{k}: #{v}\n" }.join("") }
+INSPECT
+    end
+    ## parsing
+    #
+    # Parse a record into an array of [word, frequency] IFF the word
+    # fits into the length_range or length_range is nil
+    #
+    def parse_record_into_array(string, index, length_range = nil)
+      chunk = nth_chunk(index, string)
+      raise "No chunk for index #{index}" unless chunk
+      actual_word_length = chunk.unpack("C")[0]
+      if !length_range || length_range.include?(actual_word_length)
+        # returns [word, frequency]
+        chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
+      else
+        nil
+      end
+    end
+    #
+    # Parse a record into a Word object, which can be provided or will otherwise
+    # be constructed as needed fourth arg is a length range which can act as a
+    # filter; if not satisfied, nil will be returned
+    #
+    def parse_record(string, index=0,
+                     word=CorrectHorseBatteryStaple::Word.new(:word => ""),
+                     length_range = nil)
+      bare = parse_record_into_array(string, index, length_range)
+      return nil unless bare
+      word.word = bare[0]
+      word.frequency = bare[1]
+      word
+    end
+    # Unpack the leading length byte of a record chunk.
+    # NOTE(review): this returns the one-element Array produced by
+    # String#unpack (e.g. [3]), not a bare Integer; parse_record_into_array
+    # does its own unpack("C")[0] instead of calling this.
+    def word_length(chunk_string)
+      chunk_string.unpack("C")
+    end
+    # return a string representing the nth_record
+    def nth_chunk(n, string)
+      string[@entry_length * n, @entry_length]
+    end
+    def pos_of_nth_word_in_file(n)
+      pos = @record_offset + (n * @entry_length)
+    end
+    def get_word_by_idx(n)
+      chunk = nth_chunk(n, records_string)
+      parse_record(chunk).tap do |w|
+        w.index      = n
+        w.percentile = [(n-0.5)/size,0].max * 100
+      end
+    end
+    ## some core Enumerable building blocks
+    def each(&block)
+      string = records_string
+      max_index = size - 1
+      index = 0
+      while index < max_index
+        word = parse_record(string, index)
+        word.index = index
+        word.percentile = [(index-0.5)/size,0].max * 100
+        yield word
+        index += 1
+      end
+    end
+    def size
+      @entry_count ||= records_size / @entry_length
+    end
+    ## our Corpus Enumerablish abstract methods
+    # we presume that the ISAM file has been sorted
+    def sorted_entries
+      @sorted_entries ||= entries
+    end
+    ## file I/O
+    # Length in bytes of the records section, as set by parse_prelude from
+    # the prelude's "records_length" (or entry length * record count when
+    # that key is absent).
+    def records_size
+      @records_length
+    end
+    def file_string
+      @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
+    end
+    def file_range_read(file_range = nil)
+      file_range ||= 0...file_size(@file)
+      pos = @file.tell
+      @file.seek(file_range.first)
+      @file.read(range_count(file_range))
+    ensure
+      @file.seek(pos)
+    end
+    # memoize :file_range_read
+    # returns a string representing the record-holding portion of the file
+    def records_string
+      @records_string ||=
+        record_range_read(0 ... records_size)
+    end
+    def record_range_read(record_range = nil)
+      record_range ||= 0...records_size
+      file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset))
+    end
+    # memoize :record_range_read
+    def record_percentile_range_read(percentile_range)
+      record_range = record_range_for_percentile(percentile_range)
+      record_range_read(record_range)
+    end
+    ## rather than using a StatisticalArray, we do direct indexing into the file/string
+    def percentile_index(percentile, round=true)
+      r = percentile.to_f/100 * count + 0.5
+      round ? r.round : r
+    end
+    def record_range_for_percentile(range)
+      range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
+      (percentile_index(range.begin, false).floor * @entry_length ...
+       percentile_index(range.end,   false).ceil * @entry_length)
+    end
+  end
+end

data/lib/correct_horse_battery_staple/backend/isam_kd.rb CHANGED

@@ -292,7 +292,7 @@ INSPECT
       chunk = nth_chunk(n, records_string)
       parse_record(chunk).tap do |w|
         w.index      = n
-        w.percentile = (n-0.5)/size * 100
+        w.percentile = [(n-0.5)/size,0].max * 100
       end
     end
@@ -303,7 +303,10 @@ INSPECT
       max_index = size - 1
       index = 0
       while index < max_index
-        yield parse_record(string, index)
+        word = parse_record(string, index)
+        word.index = index
+        word.percentile = [(index-0.5)/size,0].max * 100
+        yield word
         index += 1
       end
     end
@@ -332,7 +335,8 @@ INSPECT
       result = []
       found_indexes = []
       iterations = 0
-      while (result.size < count && iterations < 1000)
+      max_iterations = [1000, 4 * count].max
+      while (result.size < count && iterations < max_iterations)
         len = random_in_range(options[:word_length])
         pct = random_in_range(options[:percentile])
         word_idx = @kdtree.nearest(len2coord(len), pct)

data/lib/correct_horse_battery_staple/corpus/base.rb CHANGED

@@ -48,6 +48,15 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
   end
+  def count_by_options(options = {})
+    if options.empty?
+      count
+    else
+      count &filter_for_options(options)
+    end
+  end
+  memoize :count_by_options
   def sorted_entries
     entries.sort
   end
@@ -136,8 +145,12 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
   end
   memoize :frequencies
-  def entropy_per_word
-    Math.log(count) / Math.log(2)
+  def entropy_per_word(options = {})
+    Math.log(count_by_options(options)) / Math.log(2)
+  end
+  def entropy_per_word_by_filter(&filter)
+    Math.log(filter ? count(&filter) : size) / Math.log(2)
   end
   # filtering
@@ -279,8 +292,8 @@ INSPECT
     filters.empty? ? nil : compose_filters(filters)
   end
-  memoize :filter_for_options
+  # memoize :filter_for_options
+  public :filter_for_options
 end
 # Random.srand(SecureRandom.random_number)

data/lib/correct_horse_battery_staple/corpus/isam.rb CHANGED

@@ -28,9 +28,10 @@ require 'set'
 #
 class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
+  include CorrectHorseBatteryStaple::Backend::Isam
   include CorrectHorseBatteryStaple::Memoize
-  INITIAL_PRELUDE_LENGTH = 512
+  INITIAL_PRELUDE_LENGTH = 4096
   def initialize(filename, stats = nil)
     super
@@ -39,122 +40,11 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
     parse_prelude
   end
-  def precache(max = -1)
-    return if max > -1 && file_size(@file) > max
-    @file.seek 0
-    @file = StringIO.new @file.read, "r"
-  end
-  def file_size(file)
-    (file.respond_to?(:size) ? file.size : file.stat.size)
-  end
-  def prelude
-    @prelude || parse_prelude
-  end
-  def parse_prelude
-    @file.seek 0
-    prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
-    # byte offset of first record from beginning of file
-    # total length of JSON string (without padding)
-    (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")
-    # read more if our initial read didn't slurp in the entire prelude
-    if @prelude_len > prelude_buf.length
-      prelude_buf += @file.read(@prelude_len - prelude_buf.length)
-    end
-    @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
-    # includes prefix length byte
-    @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")
-    # as network byte order int
-    @frequency_length = @prelude["flen"]     || 4
-    # total length of record
-    @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
-    load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
-    @prelude
-  end
   # factory-ish constructor
   def self.read(filename)
     self.new filename
   end
-  ## parsing
-  #
-  # Parse a record into an array of [word, frequency] IFF the word
-  # fits into the length_range or length_range is nil
-  #
-  def parse_record_into_array(string, index, length_range = nil)
-    chunk = nth_chunk(index, string)
-    raise "No chunk for index #{index}" unless chunk
-    actual_word_length = chunk.unpack("C")[0]
-    if !length_range || length_range.include?(actual_word_length)
-      # returns [word, frequency]
-      chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
-    else
-      nil
-    end
-  end
-  #
-  # Parse a record into a Word object, which can be provided or will otherwise
-  # be constructed as needed fourth arg is a length range which can act as a
-  # filter; if not satisfied, nil will be returned
-  #
-  def parse_record(string, index=0,
-                   word=CorrectHorseBatteryStaple::Word.new(:word => ""),
-                   length_range = nil)
-    bare = parse_record_into_array(string, index, length_range)
-    return nil unless bare
-    word.word = bare[0]
-    word.frequency = bare[1]
-    word
-  end
-  def word_length(chunk_string)
-    chunk_string.unpack("C")
-  end
-  # return a string representing the nth_record
-  def nth_chunk(n, string)
-    string[@entry_length * n, @entry_length]
-  end
-  ## some core Enumerable building blocks
-  def each(&block)
-    string = records_string
-    max_index = size - 1
-    index = 0
-    while index < max_index
-      yield parse_record(string, index)
-      index += 1
-    end
-  end
-  def size
-    @size ||= records_size / @entry_length
-  end
-  ## our Corpus Enumerablish abstract methods
-  # we presume that the ISAM file has been sorted
-  def sorted_entries
-    @sorted_entries ||= entries
-  end
   ## optimized pick - does NOT support :filter, though
   def pick(count, options = {})
     # incompat check
@@ -203,55 +93,4 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
     result
   end
-  ## file I/O
-  def records_size
-    @records_size ||= (file_size(@file) - @record_offset)
-  end
-  def file_string
-    @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
-  end
-  def file_range_read(file_range = nil)
-    file_range ||= 0...file_size(@file)
-    pos = @file.tell
-    @file.seek(file_range.first)
-    @file.read(range_count(file_range))
-  ensure
-    @file.seek(pos)
-  end
-  memoize :file_range_read
-  # returns a string representing the record-holding portion of the file
-  def records_string
-    @records_string ||=
-      record_range_read(0 ... records_size)
-  end
-  def record_range_read(record_range = nil)
-    record_range ||= 0...records_size
-    file_range_read((record_range.first + @record_offset)...(record_range.first + range_count(record_range) + @record_offset))
-  end
-  # memoize :record_range_read
-  def record_percentile_range_read(percentile_range)
-    record_range = record_range_for_percentile(percentile_range)
-    record_range_read(record_range)
-  end
-  ## rather than using a StatisticalArray, we do direct indexing into the file/string
-  def percentile_index(percentile, round=true)
-    r = percentile.to_f/100 * count + 0.5
-    round ? r.round : r
-  end
-  def record_range_for_percentile(range)
-    range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
-    (percentile_index(range.begin, false).floor * @entry_length ...
-     percentile_index(range.end,   false).ceil * @entry_length)
-  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: correct-horse-battery-staple
 version: !ruby/object:Gem::Version
-  version: 0.6.3
+  version: 0.6.4
   prerelease:
 platform: ruby
 authors:
@@ -50,11 +50,11 @@ cert_chain:
   -----END CERTIFICATE-----
 '
-date: 2012-01-11 00:00:00.000000000 Z
+date: 2012-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: commander
-  requirement: &70319122299080 !ruby/object:Gem::Requirement
+  requirement: &70214898076980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -62,10 +62,10 @@ dependencies:
         version: '4.0'
   type: :runtime
   prerelease: false
-  version_requirements: *70319122299080
+  version_requirements: *70214898076980
 - !ruby/object:Gem::Dependency
   name: fastercsv
-  requirement: &70319122297940 !ruby/object:Gem::Requirement
+  requirement: &70214898075860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -73,10 +73,10 @@ dependencies:
         version: 1.5.3
   type: :runtime
   prerelease: false
-  version_requirements: *70319122297940
+  version_requirements: *70214898075860
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &70319122297020 !ruby/object:Gem::Requirement
+  requirement: &70214898075000 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -84,10 +84,10 @@ dependencies:
         version: 1.6.0
   type: :runtime
   prerelease: false
-  version_requirements: *70319122297020
+  version_requirements: *70214898075000
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70319122296340 !ruby/object:Gem::Requirement
+  requirement: &70214898074260 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -95,10 +95,10 @@ dependencies:
         version: 2.2.2
   type: :runtime
   prerelease: false
-  version_requirements: *70319122296340
+  version_requirements: *70214898074260
 - !ruby/object:Gem::Dependency
   name: hiredis
-  requirement: &70319122295780 !ruby/object:Gem::Requirement
+  requirement: &70214898073780 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -106,10 +106,10 @@ dependencies:
         version: 0.4.0
   type: :runtime
   prerelease: false
-  version_requirements: *70319122295780
+  version_requirements: *70214898073780
 - !ruby/object:Gem::Dependency
   name: tupalo-kdtree
-  requirement: &70319122295280 !ruby/object:Gem::Requirement
+  requirement: &70214898073340 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -117,10 +117,10 @@ dependencies:
         version: 0.2.3
   type: :runtime
   prerelease: false
-  version_requirements: *70319122295280
+  version_requirements: *70214898073340
 - !ruby/object:Gem::Dependency
   name: sqlite3
-  requirement: &70319122294840 !ruby/object:Gem::Requirement
+  requirement: &70214898072840 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -128,10 +128,10 @@ dependencies:
         version: 1.3.0
   type: :runtime
   prerelease: false
-  version_requirements: *70319122294840
+  version_requirements: *70214898072840
 - !ruby/object:Gem::Dependency
   name: rubyforge
-  requirement: &70319122294400 !ruby/object:Gem::Requirement
+  requirement: &70214898072400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -139,10 +139,10 @@ dependencies:
         version: 2.0.4
   type: :development
   prerelease: false
-  version_requirements: *70319122294400
+  version_requirements: *70214898072400
 - !ruby/object:Gem::Dependency
   name: hoe
-  requirement: &70319122293960 !ruby/object:Gem::Requirement
+  requirement: &70214898071960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -150,7 +150,7 @@ dependencies:
         version: '2.12'
   type: :development
   prerelease: false
-  version_requirements: *70319122293960
+  version_requirements: *70214898071960
 description: ! "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies
   in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely
   alien words.\n\n    $ chbs generate --verbose -W 3..8 -P 30..60\n    Corpus size:
@@ -187,6 +187,7 @@ files:
 - lib/correct_horse_battery_staple.rb
 - lib/correct_horse_battery_staple/assembler.rb
 - lib/correct_horse_battery_staple/backend.rb
+- lib/correct_horse_battery_staple/backend/isam.rb
 - lib/correct_horse_battery_staple/backend/isam_kd.rb
 - lib/correct_horse_battery_staple/backend/redis.rb
 - lib/correct_horse_battery_staple/backend/redis/d_range.rb

metadata.gz.sig CHANGED

Binary file