RubyGems - correct-horse-battery-staple - Versions diffs - 0.6.1 - Mend

correct-horse-battery-staple 0.6.1

Files changed (61) hide show

data.tar.gz.sig +1 -1
data/.gemtest +0 -0
data/Gemfile +53 -0
data/Gemfile.lock +109 -0
data/History.txt +6 -0
data/Manifest.txt +57 -0
data/README.txt +115 -0
data/Rakefile +47 -0
data/bin/chbs +234 -0
data/bin/chbs-mkpass +16 -0
data/correct-horse-battery-staple.gemspec +59 -0
data/lib/correct_horse_battery_staple.rb +117 -0
data/lib/correct_horse_battery_staple/assembler.rb +45 -0
data/lib/correct_horse_battery_staple/backend.rb +6 -0
data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
data/lib/correct_horse_battery_staple/corpus.rb +33 -0
data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
data/lib/correct_horse_battery_staple/generator.rb +40 -0
data/lib/correct_horse_battery_staple/memoize.rb +25 -0
data/lib/correct_horse_battery_staple/parser.rb +5 -0
data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
data/lib/correct_horse_battery_staple/stats.rb +22 -0
data/lib/correct_horse_battery_staple/word.rb +90 -0
data/lib/correct_horse_battery_staple/writer.rb +29 -0
data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
data/script/generate_all +34 -0
data/script/load_redis +17 -0
data/script/perftest +74 -0
data/spec/corpus/serialized_spec.rb +62 -0
data/spec/corpus_spec.rb +50 -0
data/spec/correct_horse_battery_staple_spec.rb +73 -0
data/spec/fixtures/100.json +101 -0
data/spec/fixtures/corpus1.csv +101 -0
data/spec/fixtures/corpus100.json +101 -0
data/spec/fixtures/wiktionary1000.htm +648 -0
data/spec/range_parser_spec.rb +54 -0
data/spec/spec_helper.rb +20 -0
data/spec/statistical_array_spec.rb +52 -0
data/spec/support/spec_pry.rb +1 -0
data/spec/word_spec.rb +95 -0
metadata +264 -0
metadata.gz.sig +1 -0

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+require 'correct_horse_battery_staple'
+# Usage: chbs-mkpass [corpus] [word-count] [format]
+#   corpus     - corpus name passed to load_corpus (default "tvscripts")
+#   word-count - number of words in the password (default 4)
+#   format     - corpus format; falls back to ENV['corpus_format'], then "isam"
+corpus_name, requested_count, requested_format = ARGV[0], ARGV[1], ARGV[2]
+corpus = CorrectHorseBatteryStaple.load_corpus(corpus_name || "tvscripts",
+                                               requested_format || ENV['corpus_format'] || "isam")
+word_count = (requested_count || 4).to_i
+# Fixed filters: words of 3-9 characters from the 30th-80th frequency percentile.
+password = CorrectHorseBatteryStaple::Generator.new(corpus).
+  make(word_count, :word_length => 3..9, :percentile => 30..80)
+puts password

data/correct-horse-battery-staple.gemspec ADDED

@@ -0,0 +1,59 @@
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "correct-horse-battery-staple"
+  s.version = "0.6.1.20120109223855"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Robert Sanders"]
+  s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
+  s.date = "2012-01-10"
+  s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n    $ chbs generate --verbose -W 3..8 -P 30..60\n    Corpus size: 6396 candidate words of 33075 total\n    Entropy: 48 bits (2^48 = 281474976710656)\n    Years to guess at 1000 guesses/sec: 8926\n    magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec.  If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n  actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n     http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n     http://wordfrequency.info/"
+  s.email = ["robert@curioussquid.com"]
+  s.executables = ["chbs", "chbs-mkpass"]
+  s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
+  s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", 
"spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
+  s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
+  s.rdoc_options = ["--main", "README.txt"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = "correct-horse-battery-staple"
+  s.rubygems_version = "1.8.10"
+  s.signing_key = "/Users/robertsanders/.gem/gem-private_key.pem"
+  s.summary = "Generate a 4 word password from words of size 3-8 characters, with frequencies in the 30th-60th percentile"
+  # Dependency declarations. Runtime and development dependencies are told
+  # apart only when the spec responds to specification_version and RubyGems is
+  # 1.2.0 or newer; otherwise every gem is registered with plain add_dependency.
+  runtime_gems = [
+    [%q<commander>,     [">= 4.0"]],
+    [%q<fastercsv>,     [">= 1.5.3"]],
+    [%q<json>,          [">= 1.6.0"]],
+    [%q<redis>,         [">= 2.2.2"]],
+    [%q<hiredis>,       [">= 0.4.0"]],
+    [%q<tupalo-kdtree>, [">= 0.2.3"]],
+    [%q<sqlite3>,       [">= 1.3.0"]]
+  ]
+  development_gems = [
+    [%q<rubyforge>, [">= 2.0.4"]],
+    [%q<hoe>,       ["~> 2.12"]]
+  ]
+  versioned_spec = s.respond_to?(:specification_version)
+  s.specification_version = 3 if versioned_spec
+  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
+    runtime_gems.each     { |gem_name, requirement| s.add_runtime_dependency(gem_name, requirement) }
+    development_gems.each { |gem_name, requirement| s.add_development_dependency(gem_name, requirement) }
+  else
+    (runtime_gems + development_gems).each { |gem_name, requirement| s.add_dependency(gem_name, requirement) }
+  end
+end

data/lib/correct_horse_battery_staple.rb ADDED

@@ -0,0 +1,117 @@
+require 'logger'
+module CorrectHorseBatteryStaple
+  # Gem version string.
+  VERSION = '0.6.1'
+  # Corpus name that default_corpus loads.
+  DEFAULT_CORPUS_NAME = "tvscripts"
+  # File extensions tried, in this order, when searching for a corpus file.
+  SUPPORTED_FORMATS = %w[isam isamkd sqlite json csv marshal]
+  class << self
+    # Module-wide logger (assigned just below).
+    attr_accessor :logger
+    # Directory searched for corpus files.
+    attr_accessor :corpus_directory
+  end
+  # Log to standard error by default.
+  self.logger = Logger.new(STDERR)
+  # Loads the corpus named by DEFAULT_CORPUS_NAME, letting load_corpus pick
+  # the format.
+  def self.default_corpus
+    load_corpus(DEFAULT_CORPUS_NAME)
+  end
+  # Directories searched for corpus files: currently just corpus_directory,
+  # wrapped in a one-element array.
+  def self.corpus_search_directories
+    Array[corpus_directory]
+  end
+  # Lists the corpora found in the search directories.
+  #
+  # options[:with_paths] - when truthy, return expanded file paths; otherwise
+  #                        return file names with the extension removed.
+  #
+  # Returns a sorted Array of Strings without duplicates.
+  def self.corpus_list(options = {})
+    pattern = "*.{#{SUPPORTED_FORMATS.join(',')}}"
+    found = []
+    corpus_search_directories.each do |dir|
+      Dir.glob(File.join(dir, pattern)).each do |file|
+        found << (options[:with_paths] ? File.expand_path(file) : File.basename(file, File.extname(file)))
+      end
+    end
+    found.sort.uniq
+  end
+  # Finds the file for +corpus_name+.
+  #
+  # Formats are tried in the order given; within a format, search directories
+  # are tried in order. Returns the first existing path, or nil if none exists.
+  def self.find_corpus(corpus_name, formats = SUPPORTED_FORMATS)
+    formats.each do |ext|
+      basename = "#{corpus_name}.#{ext}"
+      hit = corpus_search_directories.
+        map { |dir| File.join(dir, basename) }.
+        detect { |candidate| File.exist?(candidate) }
+      return hit if hit
+    end
+    nil
+  end
+  # Loads a corpus by name, path or URL-like specifier.
+  #
+  # corpus_name - a name containing ':' is handed straight to Corpus.read;
+  #               a name containing '.' or '?' is used as a file path as-is;
+  #               anything else is looked up with find_corpus.
+  # formats     - a format or list of formats to search (default: all
+  #               SUPPORTED_FORMATS).
+  #
+  # Raises ArgumentError when no existing file can be resolved.
+  def self.load_corpus(corpus_name, formats = nil)
+    return CorrectHorseBatteryStaple::Corpus.read(corpus_name) if corpus_name.include?(':')
+    candidates = Array(formats || SUPPORTED_FORMATS)
+    path = if corpus_name.match(/[.?]/)
+             corpus_name
+           else
+             find_corpus(corpus_name, candidates)
+           end
+    raise ArgumentError, "Cannot find corpus #{corpus_name}" unless path && File.exist?(path)
+    CorrectHorseBatteryStaple::Corpus.read(path)
+  end
+  # Convenience entry point: builds a Generator over the default corpus and
+  # returns what its #make produces for +length+ words.
+  def self.generate(length = 4)
+    corpus = default_corpus
+    CorrectHorseBatteryStaple::Generator.new(corpus).make(length)
+  end
+  protected
+  # Helpers mixed into the library's classes: logger access and random
+  # selection.
+  module Common
+    # Random choices here end up in generated passwords, so they come from the
+    # cryptographically secure generator rather than the default PRNG.
+    require 'securerandom'
+
+    # Returns the module-wide logger.
+    def logger
+      CorrectHorseBatteryStaple.logger
+    end
+
+    # Returns a random number below +max+.
+    #
+    # max - a positive Integer gives an Integer in 0...max; any other positive
+    #       number gives a Float in 0...max (default 1.0).
+    #
+    # Raises ArgumentError when +max+ is not positive, as Random.rand does.
+    def random_number(max=1.0)
+      raise ArgumentError, "invalid argument - #{max}" unless max > 0
+      if max.is_a?(Integer)
+        SecureRandom.random_number(max)
+      else
+        # The zero-argument form returns a Float in 0...1; scale it to 0...max.
+        SecureRandom.random_number * max
+      end
+    end
+
+    # Returns the start of +range+ plus a random offset below range_count(range).
+    # NOTE(review): range_count is not defined in this module as shown; it is
+    # presumably supplied elsewhere — confirm.
+    def random_in_range(range)
+      range.first + random_number(range_count(range))
+    end
+
+    # Picks +count+ elements of +array+ at random, with replacement.
+    def array_sample(array, count)
+      l = array.length
+      array.values_at(* count.times.map { random_number(l) })
+    end
+
+    # Same selection as array_sample, kept as a separate entry point.
+    def set_sample(array, count)
+      array_sample(array, count)
+    end
+  end
+  # File helpers.
+  module Util
+    # Opens +filename+ with the binary variant of +mode+.
+    #
+    # Uses File.open rather than Kernel#open so a name starting with "|" is
+    # never run as a command. A block, if given, is passed through: it
+    # receives the file, which is closed when the block returns, and the
+    # block's value is returned. Without a block the open File is returned.
+    #
+    # rest - extra arguments forwarded to File.open.
+    def self.open_binary(filename, mode = "r", *rest, &block)
+      File.open(filename, openmode(mode), *rest, &block)
+    end
+
+    # Returns +mode+ with a binary flag and ASCII-8BIT encoding appended when
+    # IO responds to binwrite; otherwise returns +mode+ unchanged.
+    def self.openmode(mode)
+      IO.respond_to?(:binwrite) ? "#{mode}b:ASCII-8BIT" : mode
+    end
+  end
+  autoload :Word,               'correct_horse_battery_staple/word'
+  autoload :Stats,              'correct_horse_battery_staple/stats'
+  autoload :Generator,          'correct_horse_battery_staple/generator'
+  autoload :Corpus,             'correct_horse_battery_staple/corpus'
+  autoload :Parser,             'correct_horse_battery_staple/parser'
+  autoload :StatisticalArray,   'correct_horse_battery_staple/statistical_array'
+  autoload :RangeParser,        'correct_horse_battery_staple/range_parser'
+  autoload :Writer,             'correct_horse_battery_staple/writer'
+  autoload :Backend,            'correct_horse_battery_staple/backend'
+  autoload :Memoize,            'correct_horse_battery_staple/memoize'
+  self.corpus_directory = File.join(File.dirname(__FILE__), "../corpus")
+end

data/lib/correct_horse_battery_staple/assembler.rb ADDED

@@ -0,0 +1,45 @@
+require 'bigdecimal'
+require 'json'
+# Builds a word list from one or more parsed sources and exposes it as a
+# serialized corpus.
+class CorrectHorseBatteryStaple::Assembler
+  include CorrectHorseBatteryStaple::Common
+  # Word structs produced by #read (nil until #read has been called).
+  attr_accessor :words
+  # A word is kept only when its first character is one of these.
+  VALID_INITIAL_CHARS = ([*'a'..'z']).map {|ls| ls[0]}
+
+  # parser - object responding to #parse; defaults to the :wiktionary regex
+  #          parser.
+  def initialize(parser = nil)
+    @parser = (parser || CorrectHorseBatteryStaple::Parser::Regex.new(:wiktionary))
+  end
+
+  # Parses every source in +urls+, keeps words with a valid initial
+  # character, removes duplicates, sorts, and stores the result in #words.
+  # An empty +urls+ list yields an empty word list.
+  #
+  # Returns self so calls can be chained.
+  def read(urls)
+    parsed = urls.map { |url| @parser.parse open(url) }
+    # reduce(:+) returns nil for an empty list, so handle "no sources" here.
+    combined = parsed.empty? ? [] : parsed.reduce(:+)
+    # Round-trip through a Hash keyed by word to weed out duplicates; a later
+    # entry for the same word replaces the earlier one.
+    unique = {}
+    combined.each do |wstruct|
+      next unless VALID_INITIAL_CHARS.include?(wstruct.word[0])
+      unique[wstruct.word] = wstruct
+    end
+    self.words = unique.values.sort
+    self
+  end
+
+  # Shuffles #words in place. Returns self.
+  def randomize
+    self.words.shuffle!
+    self
+  end
+
+  # Truncates #words in place to at most +count+ entries. Returns self.
+  def limit(count)
+    self.words.slice!(count..-1) if self.words.length > count
+    self
+  end
+
+  # Returns a Serialized corpus built from #words, recalculated once and then
+  # memoized for later calls.
+  def corpus
+    @corpus ||= CorrectHorseBatteryStaple::Corpus::Serialized.new(self.words).tap do |corpus|
+      corpus.recalculate
+    end
+  end
+end

data/lib/correct_horse_battery_staple/backend.rb ADDED

@@ -0,0 +1,6 @@
+# Namespace for storage backends. Each constant is registered with autoload,
+# so its file is required on first reference.
+class CorrectHorseBatteryStaple::Backend
+  # NOTE(review): backend/isam.rb and backend/sqlite.rb are not in this
+  # release's file list, so referencing Isam or Sqlite would presumably raise
+  # LoadError — confirm whether these two entries are stale.
+  autoload :Isam,     "correct_horse_battery_staple/backend/isam"
+  autoload :IsamKD,   "correct_horse_battery_staple/backend/isam_kd"
+  autoload :Sqlite,   "correct_horse_battery_staple/backend/sqlite"
+  autoload :Redis,    "correct_horse_battery_staple/backend/redis"
+end

data/lib/correct_horse_battery_staple/backend/isam_kd.rb ADDED

@@ -0,0 +1,410 @@
+require 'bigdecimal'
+require 'json'
+require 'set'
+require 'kdtree'
+module CorrectHorseBatteryStaple::Backend::IsamKD
+  INITIAL_PRELUDE_LENGTH = 4096
+  F_PRELUDE_AT_END = 1
+  # Mixin hook: extends +base+ with ClassMethods and includes
+  # InstanceMethods into it.
+  def self.included(base)
+    base.extend(ClassMethods)
+    base.send(:include, InstanceMethods)
+  end
+  # Class-level half of the mixin; currently empty.
+  module ClassMethods; end
+  module InstanceMethods
+    #
+    # Sets the writer defaults: the factor word lengths are multiplied by on
+    # the K-D tree's length axis, and the page size used for padding.
+    #
+    def initialize_backend_variables
+      @length_scaling_factor = 15
+      @page_size = 4096
+    end
+    # Replaces every NaN value in +stats+ with -1, in place (presumably so the
+    # stats can be JSON-encoded into the prelude). Values that do not respond
+    # to nan? are left alone.
+    #
+    # Returns the same, mutated, +stats+ object.
+    def fix_stats(stats)
+      stats.each do |key, value|
+        next unless value.respond_to?(:nan?) && value.nan?
+        stats[key] = -1
+      end
+      stats
+    end
+    # Page size in bytes used for alignment; 4096 unless @page_size is set.
+    def page_size
+      return @page_size if @page_size
+      4096
+    end
+    # Rounds +val+ up to a whole number of blocks and returns the size in
+    # bytes; the result is never less than one block, even for 0.
+    # many MMUs in default mode and modern highcap drives have 4k pages/blocks
+    def round_up(val, blocksize=page_size)
+      blocks = (val.to_f / blocksize).ceil
+      blocks = 1 if blocks < 1
+      blocks * blocksize
+    end
+    # Writes +corpus+ to +io+ as: an 8-byte header plus JSON prelude padded to
+    # a page boundary, one fixed-size record per word, padding to the next
+    # page boundary, then the persisted K-D tree.
+    #
+    # corpus - enumerable of words responding to word and frequency; must also
+    #          respond to stats and length.
+    # io     - destination; it is rewound first, so it must support rewind.
+    def write_corpus_to_io(corpus, io=STDOUT)
+      io.rewind
+      # includes prefix length byte
+      @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1
+      @freq_length = 4
+      @entry_length = @word_length + @freq_length
+      stats = fix_stats(corpus.stats)
+      corpus_word_count = corpus.length
+      # The four offset/length entries start as ten-character placeholder
+      # strings so the prelude's JSON length can be measured before the real
+      # values are known.
+      prelude = {
+        "wlen"           => @word_length,
+        "flen"           => 4,
+        "entrylen"       => @word_length + @freq_length,
+        "sort"           => "frequency",
+        "n"              => corpus_word_count,
+        "stats"          => stats,
+        "flags"          => 0,
+        "length_scaling_factor" => (@length_scaling_factor || 15),
+        "records_length" => "0000000000",
+        "offset_records" => "0000000000",
+        "offset_index1"  => "0000000000",
+        "offset_index2"  => "0000000000"
+      }
+      # Measured with the placeholders still in place; this is the length
+      # written to the header.
+      prelude_json_length = prelude.to_json.length
+      # Records start at the first page boundary after header (8 bytes) + prelude.
+      prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)
+      prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
+      # The index starts at the first page boundary after the records.
+      offset_index1 = prelude["offset_records"] +
+        round_up(records_length, page_size)
+      prelude["offset_index1"]  = offset_index1
+      # Two 32-bit big-endian integers, then the JSON space-padded ("A") so the
+      # whole header is exactly offset_records bytes long.
+      io.write([offset_records, prelude_json_length, prelude.to_json].
+               pack("NNA#{offset_records-8}"))
+      # One record per word: length byte, word padded to wlen-1 bytes,
+      # frequency as a 32-bit big-endian integer.
+      corpus.each_with_index do |w, index|
+        io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
+      end
+      # Fill the gap between the end of the records and the index offset.
+      pad(offset_index1 - (offset_records + records_length), io)
+      write_kdtree(corpus, io)
+    end
+    # Writes +size+ NUL bytes to +io+.
+    def pad(size, io)
+      filler = Array.new(size, 0).pack("C*")
+      io.write(filler)
+    end
+    # Builds a K-D tree with one point per word — [scaled word length,
+    # percentile, position of the word in corpus.entries] — and persists it
+    # to +io+.
+    def write_kdtree(corpus, io)
+      points = corpus.entries.each_with_index.map do |w, position|
+        [len2coord(w.word.length.to_f), w.percentile.to_f, position]
+      end
+      KDTree.new(points).persist(io)
+    end
+    # make the search space more square by increasing the length of
+    # the "word length" axis
+    #
+    # Returns +len+ multiplied by @length_scaling_factor. When the factor is
+    # unset the fallback is 15, the same value write_corpus_to_io records in
+    # the prelude, so a tree built here matches what readers are told.
+    def len2coord(len)
+      len * (@length_scaling_factor || 15)
+    end
+    # Forwards +args+ to io.binwrite when io responds to it, otherwise to
+    # io.write.
+    # NOTE(review): +io+ is not defined in this module as shown; presumably an
+    # accessor on the including class — confirm.
+    def binwrite(*args)
+      writer = io.respond_to?(:binwrite) ? :binwrite : :write
+      io.send(writer, *args)
+    end
+    # File mode for writing: binary with ASCII-8BIT encoding when IO responds
+    # to binwrite, plain "w" otherwise.
+    def openmode
+      return "w" unless IO.respond_to?(:binwrite)
+      "wb:ASCII-8BIT"
+    end
+    #
+    #
+    # Format of header:
+    #
+    # 0..3    -  OB - offset of body start in bytes; network byte order
+    # 4..7    -  LP - length of prelude in network byte order
+    # 8..OB-1 -  P  - JSON-encoded prelude hash and space padding
+    # OB..EOF -  array of fixed size records as described in prelude
+    #
+    # Contents of Prelude (after JSON decoding):
+    #
+    # P["wlen"]                   - length of word part of record
+    # P["flen"]                   - length of frequency part of record (always 4 bytes)
+    # P["entrylen"]               - length of total part of record
+    # P["n"]                      - number of records
+    # P["sort"]                   - field name sorted by (word or frequency)
+    # P["stats"]                  - corpus statistics
+    # P["offset_index1"]          - absolute file offset of KDTree index
+    # P["records_length"]         - length in bytes of records section, excluding padding
+    # P["length_scaling_factor"]  - what length was multiplied by in creating KDTree (usually 15)
+    #
+    # Format of record:
+    #
+    # 1 byte               - LW - actual length of word within field
+    # P["wlen"]-1 bytes    - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding
+    # P["flen"] (4) bytes  - frequency as network byte order long
+    #
+    # After record section, there is padding up to the next page_size boundary,
+    # and then there is a dumped KDTree which extends to EOF.
+    #
+    #
+    # Reads the whole file into memory and swaps @file for a read-only
+    # StringIO over its contents.
+    #
+    # max - when greater than -1, files larger than this many bytes are left
+    #       alone and nil is returned (default -1: always cache).
+    def precache(max = -1)
+      limited = max > -1
+      return if limited && file_size(@file) > max
+      @file.seek 0
+      contents = @file.read
+      @file = StringIO.new(contents, "r")
+    end
+    # Size of +file+ in bytes: file.size when available, else file.stat.size.
+    def file_size(file)
+      return file.size if file.respond_to?(:size)
+      file.stat.size
+    end
+    # The decoded prelude hash; parsed from the file on first use.
+    def prelude
+      return @prelude if @prelude
+      parse_prelude
+    end
+    # Reads the file header and JSON prelude, caches the layout values in
+    # instance variables, and returns the decoded prelude hash.
+    #
+    # Raises ArgumentError when a required prelude key (wlen, entrylen,
+    # offset_index1, records_length, n) is missing.
+    def parse_prelude
+      @file.seek 0
+      prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
+      # byte offset of first record from beginning of file
+      # total length of JSON string (without padding)
+      (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")
+      # The JSON starts after the two 4-byte header fields, so the buffer must
+      # hold @prelude_len + 8 bytes; read the shortfall if the first read
+      # did not cover it.
+      needed = @prelude_len + 8
+      if needed > prelude_buf.length
+        prelude_buf += @file.read(needed - prelude_buf.length).to_s
+      end
+      @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
+      # includes prefix length byte
+      @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")
+      # as network byte order int
+      @frequency_length = @prelude["flen"]     || 4
+      # total length of record
+      @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
+      @offset_index1    = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!")
+      @records_length   = @prelude["records_length"] || raise(ArgumentError, "No records length!")
+      @entry_count      = @prelude["n"] || raise(ArgumentError, "Number of records not included!")
+      # Files whose prelude omits the factor fall back to 10.
+      @length_scaling_factor = @prelude["length_scaling_factor"] || 10
+      load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
+      @prelude
+    end
+    #
+    # Show some information about the file layout: sizes, offsets, the first
+    # four bytes of the K-D tree index, and every prelude entry except "stats".
+    # NOTE(review): reads @prelude and the offset ivars directly, so it
+    # presumably needs the prelude to have been parsed first — confirm.
+    #
+    def inspect
+      super + "\n" + <<INSPECT
+File size: #{file_size(@file)}
+Word length: #{@word_length}
+Frequency bytes: #{@frequency_length}
+Total record bytes: #{@records_length}
+Offset of K-D Tree index: #{@offset_index1}
+Total K-D Tree index bytes: #{file_size(@file) - @offset_index1}
+K-D Tree Signature: #{file_range_read(@offset_index1..(@offset_index1+3))}
+Prelude:
+#{@prelude.map {|k,v| k=="stats" ? "" : "  #{k}: #{v}\n" }.join("") }
+INSPECT
+    end
+    # Positions @file at the index offset and builds a KDTree from it.
+    def load_kdtree
+      index_start = @offset_index1
+      @file.seek(index_start)
+      KDTree.new(@file)
+    end
+    ## parsing
+    #
+    # Decodes record number +index+ of +string+ into [word, frequency].
+    #
+    # length_range - optional filter on the stored word length; a record whose
+    #                length falls outside it yields nil.
+    #
+    # Raises RuntimeError when +string+ holds no record at +index+.
+    #
+    def parse_record_into_array(string, index, length_range = nil)
+      chunk = nth_chunk(index, string)
+      raise "No chunk for index #{index}" unless chunk
+      # first byte of the record is the real word length
+      actual_word_length = chunk.unpack("C")[0]
+      return nil if length_range && !length_range.include?(actual_word_length)
+      # skip the length byte, take the word, jump to the frequency field
+      chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
+    end
+    #
+    # Decodes record +index+ of +string+ into a Word.
+    #
+    # word         - object to fill in; a new Word is built when omitted.
+    # length_range - optional word-length filter; when the record does not
+    #                satisfy it, nil is returned instead of the word.
+    #
+    def parse_record(string, index=0,
+                     word=CorrectHorseBatteryStaple::Word.new(:word => ""),
+                     length_range = nil)
+      fields = parse_record_into_array(string, index, length_range)
+      return nil unless fields
+      word.word, word.frequency = fields[0], fields[1]
+      word
+    end
+    # Unpacks the leading length byte of +chunk_string+.
+    # NOTE(review): returns the one-element Array from unpack, not the Integer
+    # itself (parse_record_into_array takes [0]) — confirm what callers expect.
+    def word_length(chunk_string)
+      chunk_string.unpack("C")
+    end
+    # Returns the @entry_length-byte slice of +string+ holding record +n+,
+    # or nil when the slice starts beyond the end of +string+.
+    def nth_chunk(n, string)
+      start = n * @entry_length
+      string[start, @entry_length]
+    end
+    # Absolute file offset of record +n+.
+    def pos_of_nth_word_in_file(n)
+      @record_offset + (n * @entry_length)
+    end
+    #
+    # this version is much slower than the other - 1.5x total runtime
+    # slower in some cases.
+    #
+    # def get_word_by_idx_direct(n)
+    #   @file.seek(pos_of_nth_word_in_file(n))
+    #   chunk = @file.read(@entry_length)
+    #   parse_record(chunk)
+    # end
+    # Returns the Word stored as record +n+, with its index set to +n+ and its
+    # percentile set to (n - 0.5) / size * 100.
+    def get_word_by_idx(n)
+      word = parse_record(nth_chunk(n, records_string))
+      word.index      = n
+      word.percentile = (n-0.5)/size * 100
+      word
+    end
+    ## some core Enumerable building blocks
+    # Yields a Word for every record, in file order, including the last one.
+    # Without a block, returns an Enumerator over the same words.
+    def each(&block)
+      return enum_for(:each) unless block_given?
+      string = records_string
+      # size.times covers indexes 0..size-1; stopping at size-2 would drop
+      # the final record.
+      size.times do |index|
+        yield parse_record(string, index)
+      end
+    end
+    # Number of records; always the same value as #size.
+    def count
+      size
+    end
+    # Number of records: @entry_count when already known, otherwise the record
+    # section's byte length divided by the record length (then cached).
+    def size
+      @entry_count = records_size / @entry_length unless @entry_count
+      @entry_count
+    end
+    ## our Corpus Enumerablish abstract methods
+    # we presume that the ISAM file has been sorted, so the entries are
+    # returned in stored order; the list is built once and cached.
+    def sorted_entries
+      @sorted_entries = entries unless @sorted_entries
+      @sorted_entries
+    end
+    ## optimized pick - does NOT support :filter, though
+    #
+    # Picks +count+ distinct words by repeatedly choosing a random length and
+    # percentile inside the requested ranges and asking the K-D tree for the
+    # nearest word.
+    #
+    # options[:percentile]  - percentile range to draw from (default 0..100)
+    # options[:word_length] - word length range to draw from (default 0..20)
+    #
+    # Raises NotImplementedError when options[:filter] is given, and
+    # RuntimeError when fewer than +count+ words were found.
+    def pick(count, options = {})
+      # incompat check
+      raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]
+      options = {:percentile  => 0..100,
+                 :word_length => 0..20}.merge(options)
+      result = []
+      found_indexes = []
+      iterations = 0
+      # Bounded at 1000 attempts so unsatisfiable criteria cannot loop forever.
+      while (result.size < count && iterations < 1000)
+        len = random_in_range(options[:word_length])
+        pct = random_in_range(options[:percentile])
+        word_idx = @kdtree.nearest(len2coord(len), pct)
+        # Each index is examined at most once, so a word is never picked twice.
+        unless found_indexes.include?(word_idx)
+          found_indexes << word_idx
+          word = get_word_by_idx(word_idx)
+          # The nearest point can lie outside the requested length range.
+          if options[:word_length].include?(word.word.length)
+            result << word
+          else
+            STDERR.puts "non-qualifying word: #{word.word.length}"
+          end
+        end
+        iterations += 1
+      end
+      # validate that we succeeded
+      raise "Cannot find #{count} words matching criteria" if result.length < count
+      result
+    end
+    ## file I/O
+    # Length in bytes of the record section, as stored in @records_length.
+    def records_size
+      @records_length
+    end
+    # Entire file contents as a String: the backing string when @file is a
+    # StringIO, otherwise a full-range read of the file.
+    def file_string
+      if @file.is_a?(StringIO)
+        @file.string
+      else
+        file_range_read(nil)
+      end
+    end
+    # Reads the bytes of @file covered by +file_range+ (the whole file when
+    # omitted) and puts the read position back where it was afterwards.
+    def file_range_read(file_range = nil)
+      span = file_range || (0...file_size(@file))
+      pos = @file.tell
+      @file.seek(span.first)
+      @file.read(range_count(span))
+    ensure
+      # always restore the caller's position, even when the read fails
+      @file.seek(pos)
+    end
+    # memoize :file_range_read
+    # returns a string representing the record-holding portion of the file;
+    # read once and cached
+    def records_string
+      @records_string = record_range_read(0 ... records_size) unless @records_string
+      @records_string
+    end
+    # Reads the bytes covered by +record_range+, a byte range relative to the
+    # start of the record section (the whole section when omitted).
+    def record_range_read(record_range = nil)
+      record_range ||= 0...records_size
+      # Translate to absolute file offsets. The end is start + length, so
+      # ranges that do not begin at 0 still cover the right bytes.
+      start = record_range.first + @record_offset
+      file_range_read(start...(start + range_count(record_range)))
+    end
+    # memoize :record_range_read
+    # Reads the record bytes that cover +percentile_range+.
+    def record_percentile_range_read(percentile_range)
+      record_range_read(record_range_for_percentile(percentile_range))
+    end
+    ## rather than using a StatisticalArray, we do direct indexing into the file/string
+    #
+    # Converts +percentile+ into a record position: percentile/100 * count + 0.5.
+    #
+    # round - when true (default) the position is rounded to an Integer;
+    #         when false the fractional value is returned.
+    def percentile_index(percentile, round=true)
+      position = percentile.to_f/100 * count + 0.5
+      return position unless round
+      position.round
+    end
+    # Byte range, relative to the start of the record section, of the records
+    # covering a percentile range. A single number is widened to +/- 0.5
+    # around it first. The end of the byte range is exclusive.
+    def record_range_for_percentile(range)
+      range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
+      first_record = percentile_index(range.begin, false).floor
+      end_record   = percentile_index(range.end,   false).ceil
+      (first_record * @entry_length)...(end_record * @entry_length)
+    end
+  end
+end