RubyGems - correct-horse-battery-staple - Versions diffs - 0.6.1 - Mend

correct-horse-battery-staple 0.6.1

Files changed (61) hide show

data.tar.gz.sig +1 -1
data/.gemtest +0 -0
data/Gemfile +53 -0
data/Gemfile.lock +109 -0
data/History.txt +6 -0
data/Manifest.txt +57 -0
data/README.txt +115 -0
data/Rakefile +47 -0
data/bin/chbs +234 -0
data/bin/chbs-mkpass +16 -0
data/correct-horse-battery-staple.gemspec +59 -0
data/lib/correct_horse_battery_staple.rb +117 -0
data/lib/correct_horse_battery_staple/assembler.rb +45 -0
data/lib/correct_horse_battery_staple/backend.rb +6 -0
data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
data/lib/correct_horse_battery_staple/corpus.rb +33 -0
data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
data/lib/correct_horse_battery_staple/generator.rb +40 -0
data/lib/correct_horse_battery_staple/memoize.rb +25 -0
data/lib/correct_horse_battery_staple/parser.rb +5 -0
data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
data/lib/correct_horse_battery_staple/stats.rb +22 -0
data/lib/correct_horse_battery_staple/word.rb +90 -0
data/lib/correct_horse_battery_staple/writer.rb +29 -0
data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
data/script/generate_all +34 -0
data/script/load_redis +17 -0
data/script/perftest +74 -0
data/spec/corpus/serialized_spec.rb +62 -0
data/spec/corpus_spec.rb +50 -0
data/spec/correct_horse_battery_staple_spec.rb +73 -0
data/spec/fixtures/100.json +101 -0
data/spec/fixtures/corpus1.csv +101 -0
data/spec/fixtures/corpus100.json +101 -0
data/spec/fixtures/wiktionary1000.htm +648 -0
data/spec/range_parser_spec.rb +54 -0
data/spec/spec_helper.rb +20 -0
data/spec/statistical_array_spec.rb +52 -0
data/spec/support/spec_pry.rb +1 -0
data/spec/word_spec.rb +95 -0
metadata +264 -0
metadata.gz.sig +1 -0

data/lib/correct_horse_battery_staple/corpus/isam.rb ADDED

@@ -0,0 +1,258 @@
+require 'bigdecimal'
+require 'json'
+require 'set'
+#
+#
+# Format of header:
+#
+# 0..3    -  OB - offset of body start in bytes; network byte order
+# 4..7    -  LP - length of prelude in network byte order
+# 8..OB-1 -  P  - JSON-encoded prelude hash and space padding
+# OB..EOF -  array of fixed size records as described in prelude
+#
+# Contents of Prelude (after JSON decoding):
+#
+# P["wlen"]     - length of word part of record
+# P["flen"]     - length of frequency part of record (always 4 bytes)
+# P["entrylen"] - length of total part of record
+# P["n"]        - number of records
+# P["sort"]     - field name sorted by (word or frequency)
+# P["stats"]    - corpus statistics
+#
+# Format of record:
+#
+# 1 byte                  - LW - actual length of word within field
+# P["wlen"]-1 bytes       - LW bytes of word (W) + padding; the length byte counts toward P["wlen"]
+# P["flen"] (4) bytes     - frequency as network byte order long, starting at offset P["wlen"]
+#
+class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
+  include CorrectHorseBatteryStaple::Memoize
+  INITIAL_PRELUDE_LENGTH = 512
+  def initialize(filename, stats = nil)
+    super
+    @filename = filename
+    @file = CorrectHorseBatteryStaple::Util.open_binary(filename, "r")
+    parse_prelude
+  end
+  def precache(max = -1)
+    return if max > -1 && file_size(@file) > max
+    @file.seek 0
+    @file = StringIO.new @file.read, "r"
+  end
+  def file_size(file)
+    (file.respond_to?(:size) ? file.size : file.stat.size)
+  end
+  def prelude
+    @prelude || parse_prelude
+  end
+  def parse_prelude
+    @file.seek 0
+    prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
+    # byte offset of first record from beginning of file
+    # total length of JSON string (without padding)
+    (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")
+    # read more if our initial read didn't slurp in the entire prelude
+    if @prelude_len > prelude_buf.length
+      prelude_buf += @file.read(@prelude_len - prelude_buf.length)
+    end
+    @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
+    # includes prefix length byte
+    @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")
+    # as network byte order int
+    @frequency_length = @prelude["flen"]     || 4
+    # total length of record
+    @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
+    load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
+    @prelude
+  end
+  # factory-ish constructor
+  def self.read(filename)
+    self.new filename
+  end
+  ## parsing
+  #
+  # Parse a record into an array of [word, frequency] IFF the word
+  # fits into the length_range or length_range is nil
+  #
+  def parse_record_into_array(string, index, length_range = nil)
+    chunk = nth_chunk(index, string)
+    raise "No chunk for index #{index}" unless chunk
+    actual_word_length = chunk.unpack("C")[0]
+    if !length_range || length_range.include?(actual_word_length)
+      # returns [word, frequency]
+      chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
+    else
+      nil
+    end
+  end
+  #
+  # Parse a record into a Word object, which can be provided or will otherwise
+  # be constructed as needed fourth arg is a length range which can act as a
+  # filter; if not satisfied, nil will be returned
+  #
+  def parse_record(string, index=0,
+                   word=CorrectHorseBatteryStaple::Word.new(:word => ""),
+                   length_range = nil)
+    bare = parse_record_into_array(string, index, length_range)
+    return nil unless bare
+    word.word = bare[0]
+    word.frequency = bare[1]
+    word
+  end
+  def word_length(chunk_string)
+    chunk_string.unpack("C")
+  end
+  # return a string representing the nth_record
+  def nth_chunk(n, string)
+    string[@entry_length * n, @entry_length]
+  end
+  ## some core Enumerable building blocks
+  def each(&block)
+    string = records_string
+    max_index = size - 1
+    index = 0
+    while index < max_index
+      yield parse_record(string, index)
+      index += 1
+    end
+  end
+  def count; size; end
+  def size
+    @size ||= records_size / @entry_length
+  end
+  ## our Corpus Enumerablish abstract methods
+  # we presume that the ISAM file has been sorted
+  def sorted_entries
+    @sorted_entries ||= entries
+  end
+  ## optimized pick - does NOT support :filter, though
+  def pick(count, options = {})
+    # incompat check
+    raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]
+    # options parsing
+    string         = record_percentile_range_read(options[:percentile] || (0..100))
+    range_size     = string.length / @entry_length
+    max_iterations = [options[:max_iterations] || 1000, count*10].max
+    if range_size < count
+      raise ArgumentError, "Percentile range contains fewer words than requested count"
+    end
+    # the real work
+    result         = _pick(string, count, options[:word_length], max_iterations)
+    # validate that we succeeded
+    raise "Cannot find #{count} words matching criteria" if result.length < count
+    result
+  end
+  def _pick(string, count, length_range, max_iterations)
+    result = []
+    iterations = 0
+    # don't bother reading already read words
+    skip_cache = Set.new
+    range_size = string.length / @entry_length
+    # don't cons!
+    entry = CorrectHorseBatteryStaple::Word.new :word => ""
+    while result.length < count && iterations < max_iterations
+      i = random_number(range_size)
+      unless skip_cache.include? i
+        pr = parse_record(string, i, entry, length_range)
+        if pr
+          result << pr.dup
+        else
+          skip_cache << i
+        end
+      end
+      iterations += 1
+    end
+    result
+  end
+  ## file I/O
+  def records_size
+    @records_size ||= (file_size(@file) - @record_offset)
+  end
+  def file_string
+    @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
+  end
+  def file_range_read(file_range = nil)
+    file_range ||= 0...file_size(@file)
+    pos = @file.tell
+    @file.seek(file_range.first)
+    @file.read(range_count(file_range))
+  ensure
+    @file.seek(pos)
+  end
+  memoize :file_range_read
+  # returns a string representing the record-holding portion of the file
+  def records_string
+    @records_string ||=
+      record_range_read(0 ... records_size)
+  end
+  def record_range_read(record_range = nil)
+    record_range ||= 0...records_size
+    file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset))
+  end
+  # memoize :record_range_read
+  def record_percentile_range_read(percentile_range)
+    record_range = record_range_for_percentile(percentile_range)
+    record_range_read(record_range)
+  end
+  ## rather than using a StatisticalArray, we do direct indexing into the file/string
+  def percentile_index(percentile, round=true)
+    r = percentile.to_f/100 * count + 0.5
+    round ? r.round : r
+  end
+  def record_range_for_percentile(range)
+    range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
+    (percentile_index(range.begin, false).floor * @entry_length ...
+     percentile_index(range.end,   false).ceil * @entry_length)
+  end
+end

data/lib/correct_horse_battery_staple/corpus/isam_kd.rb ADDED

@@ -0,0 +1,60 @@
+require 'bigdecimal'
+require 'json'
+require 'set'
+#
+#
+# Format of header:
+#
+# 0..3    -  OB - offset of body start in bytes; network byte order
+# 4..7    -  LP - length of prelude in network byte order
+# 8..OB-1 -  P  - JSON-encoded prelude hash and space padding
+# OB..EOF -  array of fixed size records as described in prelude
+#
+# Contents of Prelude (after JSON decoding):
+#
+# P["wlen"]     - length of word part of record
+# P["flen"]     - length of frequency part of record (always 4 bytes)
+# P["entrylen"] - length of total part of record
+# P["n"]        - number of records
+# P["sort"]     - field name sorted by (word or frequency)
+# P["stats"]    - corpus statistics
+#
+# Format of record:
+#
+# 2 bytes              - LW - actual length of word within field
+# P["wlen"] bytes      - LW bytes of word (W) + P["wlen"]-LW bytes of padding
+# P["flen"] (4) bytes  - frequency as network byte order long
+#
+class CorrectHorseBatteryStaple::Corpus::IsamKD < CorrectHorseBatteryStaple::Corpus::Base
+  include CorrectHorseBatteryStaple::Memoize
+  include CorrectHorseBatteryStaple::Backend::IsamKD
+  def initialize(filename, stats = nil)
+    super
+    @filename = filename
+    @file = CorrectHorseBatteryStaple::Util.open_binary(filename, "r")
+    parse_prelude
+    load_index
+  end
+  def precache(max = -1)
+    return if max > -1 && file_size(@file) > max
+    @file.seek 0
+    @file = StringIO.new @file.read, "r"
+  end
+  def file_size(file)
+    (file.respond_to?(:size) ? file.size : file.stat.size)
+  end
+  def prelude
+    @prelude ||= parse_prelude
+  end
+  def load_index
+    @kdtree ||= load_kdtree
+  end
+end

data/lib/correct_horse_battery_staple/corpus/redis.rb ADDED

@@ -0,0 +1,188 @@
+require 'bigdecimal'
+require 'hiredis'
+require 'redis'
+require 'set'
+class CorrectHorseBatteryStaple::Corpus::Redis < CorrectHorseBatteryStaple::Corpus::Base
+  include CorrectHorseBatteryStaple::Backend::Redis
+  MAX_ITERATIONS = 1000
+  attr_accessor :dest
+  attr_accessor :options
+  def initialize(dest)
+    super
+    self.dest    = dest
+    self.options = {}
+    parse_uri(dest)
+    load_stats
+  end
+  def self.read(file)
+    self.new file
+  end
+  ## some core Enumerable building blocks
+  def each(&block)
+    entries.each &block
+  end
+  def count
+    @count ||= db.zcard(@words_key)
+  end
+  def size
+    stats[:size] || count
+  end
+  ## our own collection operations
+  def entries
+    table
+  end
+  def sorted_entries
+    entries.sort
+  end
+  def pick(count, options = {})
+    # incompat check
+    raise NotImplementedError, "Redis does not support :filter option" if options[:filter]
+    strategy = options.delete(:strategy) || ENV['pick_strategy'] || "drange"
+    send("pick_#{strategy}", count, options)
+  end
+  ## optimized pick implementations - they do NOT support :filter, though
+  def pick_standard(count, options = {})
+    percentile_range = options[:percentile]
+    length_range     = options[:word_length]
+    if percentile_range && percentile_range.begin == 0 && percentile_range.end == 100
+      percentile_range = nil
+    end
+    if (!percentile_range && !length_range)
+      get_words_for_ids(pick_random_words(count))
+    else
+      sets = []
+      sets << get_word_ids_in_zset(@percentile_key, percentile_range) if percentile_range
+      sets << get_word_ids_in_zset(@lenprod_key, length_range)         if length_range
+      candidates = (sets.length == 1 ? sets[0] : intersection(*sets))
+      get_words_for_ids(array_sample(candidates, count))
+    end
+  end
+  def pick_drange(count, options = {})
+    percentile_range = options[:percentile]
+    length_range     = options[:word_length]
+    if percentile_range && range_cover?(percentile_range, 0..100)
+      percentile_range = nil
+    end
+    corpus_length_range = self.corpus_length_range
+    if !length_range || range_cover?(length_range, corpus_length_range)
+      length_range = nil
+    end
+    if (!percentile_range && !length_range)
+      get_words_for_ids(pick_random_words(count))
+    else
+      dspace = discontiguous_range_map(@lenprod_key, length_range, percentile_range)
+      max = dspace.count
+      ids = count.times.map do
+        dspace.pick_nth(random_number(max))
+      end
+      # STDERR.puts "ids from decimal are #{ids.inspect}"
+      get_words_for_ids(ids)
+    end
+  end
+  def zcount(key, min, max)
+    db.zcount(key, min, max)
+  end
+  memoize :zcount
+  def discontiguous_range_map(key, outer_range, inner_range, divisor=100)
+    CorrectHorseBatteryStaple::Backend::Redis::DRange.new(@db, key, outer_range,
+                                                          inner_range, divisor)
+  end
+  memoize :discontiguous_range_map
+  # XXX - does not handle exclusive endpoints
+  def range_cover?(outer, inner)
+    outer.cover?(inner.begin) && outer.cover?(inner.end)
+  end
+  # TODO: make this use actual data from stored stats
+  def corpus_length_range
+    3..18
+  end
+  def pick_random_words(count)
+    count.times.map do
+      idx = random_number(size)-1
+      db.zrange(@words_key, idx, idx)[0]
+    end
+  end
+  def intersection(*sets)
+    sets.reduce {|a,b|  a & b }
+  end
+  def get_word_ids_in_zset(key, range)
+    db.zrangebyscore(key, range.begin, range.end)
+  end
+  memoize :get_word_ids_in_zset
+  def get_words_for_ids(ids)
+    ids.map {|id| CorrectHorseBatteryStaple::Word.new(:word => get_word_by_id(id)) }
+  end
+  def close
+    super
+  end
+  protected
+  def table
+    percentiles = db.zrangebyscore(@percentile_key, -1, 101, :withscores => true)
+    frequencies = db.zrangebyscore(@frequency_key, -1, 99999999, :withscores => true)
+    phash = {}
+    fhash = {}
+    (0...percentiles.length / 2).each do |index|
+      base = index * 2
+      phash[percentiles[base]] = percentiles[base+1]
+    end
+    (0...frequencies.length / 2).each do |index|
+      base = index * 2
+      fhash[frequencies[base]] = frequencies[base+1]
+    end
+    count = phash.length
+    index = 0
+    phash.keys.map do |w|
+      word_from_hash :word => w, :percentile => phash[w].to_f, :index => (index+=1),
+                     :rank => count-index+1, :frequency => fhash[w].to_f
+    end
+  end
+  def word_from_hash(hash)
+    CorrectHorseBatteryStaple::Word.new(hash)
+  end
+end