correct-horse-battery-staple 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +1 -1
- data/.gemtest +0 -0
- data/Gemfile +53 -0
- data/Gemfile.lock +109 -0
- data/History.txt +6 -0
- data/Manifest.txt +57 -0
- data/README.txt +115 -0
- data/Rakefile +47 -0
- data/bin/chbs +234 -0
- data/bin/chbs-mkpass +16 -0
- data/correct-horse-battery-staple.gemspec +59 -0
- data/lib/correct_horse_battery_staple.rb +117 -0
- data/lib/correct_horse_battery_staple/assembler.rb +45 -0
- data/lib/correct_horse_battery_staple/backend.rb +6 -0
- data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
- data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
- data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
- data/lib/correct_horse_battery_staple/corpus.rb +33 -0
- data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
- data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
- data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
- data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
- data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
- data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
- data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
- data/lib/correct_horse_battery_staple/generator.rb +40 -0
- data/lib/correct_horse_battery_staple/memoize.rb +25 -0
- data/lib/correct_horse_battery_staple/parser.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
- data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
- data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
- data/lib/correct_horse_battery_staple/stats.rb +22 -0
- data/lib/correct_horse_battery_staple/word.rb +90 -0
- data/lib/correct_horse_battery_staple/writer.rb +29 -0
- data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
- data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
- data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
- data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
- data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
- data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
- data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
- data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
- data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
- data/script/generate_all +34 -0
- data/script/load_redis +17 -0
- data/script/perftest +74 -0
- data/spec/corpus/serialized_spec.rb +62 -0
- data/spec/corpus_spec.rb +50 -0
- data/spec/correct_horse_battery_staple_spec.rb +73 -0
- data/spec/fixtures/100.json +101 -0
- data/spec/fixtures/corpus1.csv +101 -0
- data/spec/fixtures/corpus100.json +101 -0
- data/spec/fixtures/wiktionary1000.htm +648 -0
- data/spec/range_parser_spec.rb +54 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/statistical_array_spec.rb +52 -0
- data/spec/support/spec_pry.rb +1 -0
- data/spec/word_spec.rb +95 -0
- metadata +264 -0
- metadata.gz.sig +1 -0
data/bin/chbs-mkpass
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'correct_horse_battery_staple'
|
4
|
+
|
5
|
+
# Positional arguments: corpus name, number of words, corpus format.
corpus_name, requested_words, requested_format = ARGV[0], ARGV[1], ARGV[2]

# The format falls back to the corpus_format environment variable, then "isam".
format = requested_format || ENV['corpus_format'] || "isam"
corpus = CorrectHorseBatteryStaple.load_corpus(corpus_name || "tvscripts", format)

number_of_words = (requested_words || 4).to_i

# Restrict candidates to word lengths 3..9 and percentiles 30..80.
generation_options = {
  :word_length => 3..9,
  :percentile  => 30..80
}

generator = CorrectHorseBatteryStaple::Generator.new(corpus)
puts generator.make(number_of_words, generation_options)
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = "correct-horse-battery-staple"
|
5
|
+
s.version = "0.6.1.20120109223855"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Robert Sanders"]
|
9
|
+
s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
|
10
|
+
s.date = "2012-01-10"
|
11
|
+
s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size: 6396 candidate words of 33075 total\n Entropy: 48 bits (2^48 = 281474976710656)\n Years to guess at 1000 guesses/sec: 8926\n magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec. If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n http://wordfrequency.info/"
|
12
|
+
s.email = ["robert@curioussquid.com"]
|
13
|
+
s.executables = ["chbs", "chbs-mkpass"]
|
14
|
+
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
|
15
|
+
s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", 
"spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
|
16
|
+
s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
|
17
|
+
s.rdoc_options = ["--main", "README.txt"]
|
18
|
+
s.require_paths = ["lib"]
|
19
|
+
s.rubyforge_project = "correct-horse-battery-staple"
|
20
|
+
s.rubygems_version = "1.8.10"
|
21
|
+
s.signing_key = "/Users/robertsanders/.gem/gem-private_key.pem"
|
22
|
+
s.summary = "Generate a 4 word password from words of size 3-8 characters, with frequencies in the 30th-60th percentile"
|
23
|
+
|
24
|
+
if s.respond_to? :specification_version then
|
25
|
+
s.specification_version = 3
|
26
|
+
|
27
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
28
|
+
s.add_runtime_dependency(%q<commander>, [">= 4.0"])
|
29
|
+
s.add_runtime_dependency(%q<fastercsv>, [">= 1.5.3"])
|
30
|
+
s.add_runtime_dependency(%q<json>, [">= 1.6.0"])
|
31
|
+
s.add_runtime_dependency(%q<redis>, [">= 2.2.2"])
|
32
|
+
s.add_runtime_dependency(%q<hiredis>, [">= 0.4.0"])
|
33
|
+
s.add_runtime_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
|
34
|
+
s.add_runtime_dependency(%q<sqlite3>, [">= 1.3.0"])
|
35
|
+
s.add_development_dependency(%q<rubyforge>, [">= 2.0.4"])
|
36
|
+
s.add_development_dependency(%q<hoe>, ["~> 2.12"])
|
37
|
+
else
|
38
|
+
s.add_dependency(%q<commander>, [">= 4.0"])
|
39
|
+
s.add_dependency(%q<fastercsv>, [">= 1.5.3"])
|
40
|
+
s.add_dependency(%q<json>, [">= 1.6.0"])
|
41
|
+
s.add_dependency(%q<redis>, [">= 2.2.2"])
|
42
|
+
s.add_dependency(%q<hiredis>, [">= 0.4.0"])
|
43
|
+
s.add_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
|
44
|
+
s.add_dependency(%q<sqlite3>, [">= 1.3.0"])
|
45
|
+
s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
|
46
|
+
s.add_dependency(%q<hoe>, ["~> 2.12"])
|
47
|
+
end
|
48
|
+
else
|
49
|
+
s.add_dependency(%q<commander>, [">= 4.0"])
|
50
|
+
s.add_dependency(%q<fastercsv>, [">= 1.5.3"])
|
51
|
+
s.add_dependency(%q<json>, [">= 1.6.0"])
|
52
|
+
s.add_dependency(%q<redis>, [">= 2.2.2"])
|
53
|
+
s.add_dependency(%q<hiredis>, [">= 0.4.0"])
|
54
|
+
s.add_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
|
55
|
+
s.add_dependency(%q<sqlite3>, [">= 1.3.0"])
|
56
|
+
s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
|
57
|
+
s.add_dependency(%q<hoe>, ["~> 2.12"])
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module CorrectHorseBatteryStaple
|
4
|
+
VERSION = '0.6.1'
|
5
|
+
|
6
|
+
DEFAULT_CORPUS_NAME = "tvscripts"
|
7
|
+
|
8
|
+
SUPPORTED_FORMATS = %w[isam isamkd sqlite json csv marshal]
|
9
|
+
|
10
|
+
class << self
|
11
|
+
attr_accessor :logger
|
12
|
+
attr_accessor :corpus_directory
|
13
|
+
end
|
14
|
+
self.logger = Logger.new(STDERR)
|
15
|
+
|
16
|
+
# Load the corpus named by DEFAULT_CORPUS_NAME through load_corpus.
def self.default_corpus
  load_corpus(DEFAULT_CORPUS_NAME)
end
|
19
|
+
|
20
|
+
# Directories searched for corpus files; currently just corpus_directory.
def self.corpus_search_directories
  directories = []
  directories << corpus_directory
  directories
end
|
23
|
+
|
24
|
+
# List the corpora found in the search directories.
#
# options[:with_paths] - when truthy, return expanded file paths; otherwise
#                        return file names without directory or extension.
#
# Returns a sorted Array of unique Strings.
def self.corpus_list(options = {})
  found = corpus_search_directories.flat_map do |dir|
    Dir[File.join(dir, "*.{#{SUPPORTED_FORMATS.join(',')}}")].map do |file|
      if options[:with_paths]
        File.expand_path(file)
      else
        File.basename(file, File.extname(file))
      end
    end
  end
  found.sort.uniq
end
|
30
|
+
|
31
|
+
# Locate the file for corpus_name.
#
# Formats are tried in the order given; for each format every search
# directory is checked in order. Returns the first existing path, or nil.
def self.find_corpus(corpus_name, formats = SUPPORTED_FORMATS)
  formats.each do |fmt|
    match = corpus_search_directories.
      map  { |dir| File.join(dir, "#{corpus_name}.#{fmt}") }.
      find { |candidate| File.exist?(candidate) }
    return match if match
  end
  nil
end
|
41
|
+
|
42
|
+
# Load a corpus by name, path or URL-like specifier.
#
# corpus_name - a name containing ':' is handed straight to Corpus.read;
#               a name containing '.' or '?' is treated as a file name;
#               anything else is looked up with find_corpus.
# formats     - format or list of formats to try (defaults to SUPPORTED_FORMATS).
#
# Raises ArgumentError when no existing file can be resolved.
def self.load_corpus(corpus_name, formats = nil)
  return CorrectHorseBatteryStaple::Corpus.read(corpus_name) if corpus_name.include?(':')

  formats = Array(formats || SUPPORTED_FORMATS)
  filename =
    if corpus_name.match(/[.?]/)
      corpus_name
    else
      find_corpus(corpus_name, formats)
    end

  if !filename || !File.exist?(filename)
    raise ArgumentError, "Cannot find corpus #{corpus_name}"
  end

  CorrectHorseBatteryStaple::Corpus.read(filename)
end
|
56
|
+
|
57
|
+
# Build a Generator over the default corpus and ask it to make `length` words.
def self.generate(length = 4)
  generator = CorrectHorseBatteryStaple::Generator.new(default_corpus)
  generator.make(length)
end
|
60
|
+
|
61
|
+
protected
|
62
|
+
|
63
|
+
|
64
|
+
# Helpers shared by classes that mix this module in: logger access and
# random-sampling utilities.
module Common
  # Returns the logger configured on CorrectHorseBatteryStaple.
  def logger
    CorrectHorseBatteryStaple.logger
  end

  # random_number(max) returns a random number below max: an Integer when
  # max is an Integer, a Float when max is a Float (default 1.0).
  #
  # NOTE(review): Random is a general-purpose PRNG, not a CSPRNG. Consider
  # whether SecureRandom should be preferred for password generation.
  if Object.const_defined?("Random")
    def random_number(max=1.0)
      Random.rand(max)
    end
  else
    # SecureRandom is not loaded by default and nothing visible in this file
    # requires it, so load it here before defining the fallback.
    require 'securerandom'

    def random_number(max=1.0)
      SecureRandom.random_number(max)
    end
  end

  # Returns range.first plus a random offset below range_count(range).
  # range_count is not defined in this module; presumably it is supplied by
  # the including class -- TODO confirm.
  def random_in_range(range)
    range.first + random_number(range_count(range))
  end

  # Returns `count` elements of `array` chosen at random positions.
  # Positions are drawn independently, so elements may repeat.
  def array_sample(array, count)
    upper = array.length
    array.values_at(*Array.new(count) { random_number(upper) })
  end

  # Same sampling as array_sample (elements may repeat); kept as a separate
  # entry point for existing callers.
  def set_sample(array, count)
    array_sample(array, count)
  end
end
|
93
|
+
|
94
|
+
# Small file helpers.
module Util
  # Open `filename` in binary mode and return the File.
  #
  # filename - path of the file to open
  # mode     - access mode such as "r" or "w"; made binary by openmode
  # rest     - extra arguments forwarded to File.open (e.g. permissions)
  #
  # File.open is used instead of Kernel#open so that a name beginning with
  # "|" is treated as a path rather than a command to run.
  def self.open_binary(filename, mode = "r", *rest)
    File.open(filename, openmode(mode), *rest)
  end

  # Turn an access mode into its binary, ASCII-8BIT form where the running
  # Ruby supports it (detected via IO.binwrite); otherwise return it as is.
  # A mode that already contains "b" does not get a second one.
  def self.openmode(mode)
    return mode unless IO.respond_to?(:binwrite)
    flags = mode.to_s
    flags = "#{flags}b" unless flags.include?("b")
    "#{flags}:ASCII-8BIT"
  end
end
|
103
|
+
|
104
|
+
autoload :Word, 'correct_horse_battery_staple/word'
|
105
|
+
autoload :Stats, 'correct_horse_battery_staple/stats'
|
106
|
+
autoload :Generator, 'correct_horse_battery_staple/generator'
|
107
|
+
autoload :Corpus, 'correct_horse_battery_staple/corpus'
|
108
|
+
autoload :Parser, 'correct_horse_battery_staple/parser'
|
109
|
+
autoload :StatisticalArray, 'correct_horse_battery_staple/statistical_array'
|
110
|
+
autoload :RangeParser, 'correct_horse_battery_staple/range_parser'
|
111
|
+
autoload :Writer, 'correct_horse_battery_staple/writer'
|
112
|
+
autoload :Backend, 'correct_horse_battery_staple/backend'
|
113
|
+
autoload :Memoize, 'correct_horse_battery_staple/memoize'
|
114
|
+
|
115
|
+
self.corpus_directory = File.join(File.dirname(__FILE__), "../corpus")
|
116
|
+
end
|
117
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# Builds a word list from one or more sources using a parser, and exposes it
# as a Serialized corpus.
class CorrectHorseBatteryStaple::Assembler
  include CorrectHorseBatteryStaple::Common

  attr_accessor :words

  # First characters a word must start with to be kept by #read.
  VALID_INITIAL_CHARS = ([*'a'..'z']).map {|ls| ls[0]}

  # parser - object responding to #parse; defaults to the :wiktionary
  #          Regex parser.
  def initialize(parser = nil)
    @parser = (parser || CorrectHorseBatteryStaple::Parser::Regex.new(:wiktionary))
  end

  # Parse every url, keep words starting with a-z, drop duplicate words
  # (the last occurrence wins) and store the sorted result in #words.
  #
  # An empty url list yields an empty word list instead of failing on nil.
  #
  # NOTE(review): the handle returned by open(url) is not closed here.
  #
  # Returns self.
  def read(urls)
    parsed = urls.map { |url| @parser.parse open(url) }.reduce(:+) || []

    # round-trip through a Hash keyed by word to weed out dupes
    unique = {}
    parsed.each do |wstruct|
      next unless VALID_INITIAL_CHARS.include?(wstruct.word[0])
      unique[wstruct.word] = wstruct
    end

    self.words = unique.values.sort
    self
  end

  # Shuffle #words in place. Returns self.
  def randomize
    self.words.shuffle!
    self
  end

  # Truncate #words in place to at most `count` entries. Returns self.
  def limit(count)
    self.words.slice!(count..-1) if self.words.length > count
    self
  end

  # Serialized corpus built from #words and recalculated; memoized, so it
  # reflects the words present on the first call.
  def corpus
    @corpus ||= CorrectHorseBatteryStaple::Corpus::Serialized.new(self.words).tap do |corpus|
      corpus.recalculate
    end
  end

end
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# Namespace for storage backends; each constant is autoloaded from its file
# on first reference.
class CorrectHorseBatteryStaple::Backend
  [
    [:Isam,   "correct_horse_battery_staple/backend/isam"],
    [:IsamKD, "correct_horse_battery_staple/backend/isam_kd"],
    [:Sqlite, "correct_horse_battery_staple/backend/sqlite"],
    [:Redis,  "correct_horse_battery_staple/backend/redis"]
  ].each do |const_name, path|
    autoload const_name, path
  end
end
|
@@ -0,0 +1,410 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
require 'set'
|
4
|
+
require 'kdtree'
|
5
|
+
|
6
|
+
module CorrectHorseBatteryStaple::Backend::IsamKD
|
7
|
+
INITIAL_PRELUDE_LENGTH = 4096
|
8
|
+
|
9
|
+
F_PRELUDE_AT_END = 1
|
10
|
+
|
11
|
+
# Hook run when this module is included: extends the includer with
# ClassMethods and mixes InstanceMethods into it.
def self.included(base)
  base.extend(ClassMethods)
  base.class_eval { include InstanceMethods }
end
|
15
|
+
|
16
|
+
module ClassMethods
|
17
|
+
end
|
18
|
+
|
19
|
+
module InstanceMethods
|
20
|
+
#
|
21
|
+
#
|
22
|
+
#
|
23
|
+
# Set the backend defaults: word lengths are scaled by 15 for the K-D tree
# and sections are aligned to 4096-byte pages.
def initialize_backend_variables
  instance_variable_set(:@length_scaling_factor, 15)
  @page_size = 4096
end
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
# Replace every NaN value in `stats` with -1, modifying the hash in place.
# Returns the same hash.
def fix_stats(stats)
  nan_keys = stats.keys.select do |key|
    value = stats[key]
    value.respond_to?(:nan?) && value.nan?
  end
  nan_keys.each { |key| stats[key] = -1 }
  stats
end
|
38
|
+
|
39
|
+
# Alignment unit in bytes; 4096 unless @page_size has been set.
def page_size
  return 4096 unless @page_size
  @page_size
end
|
42
|
+
|
43
|
+
# many MMUs in default mode and modern highcap drives have 4k pages/blocks
|
44
|
+
# Round `val` up to a whole number of blocks, never fewer than one block.
# Returns the size in the same unit as blocksize.
def round_up(val, blocksize=page_size)
  blocks = (val.to_f/blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end
|
47
|
+
|
48
|
+
# Serialize `corpus` to `io`: header and JSON prelude, fixed-size records,
# padding to the next page boundary, then the K-D tree index.
#
# corpus - enumerable of entries responding to #word and #frequency; must
#          also respond to #stats and #length
# io     - seekable, writable IO (rewound before writing)
#
# Word lengths are measured in bytes, because records are packed and read
# back byte-wise; a character count would truncate multibyte words.
#
# Returns whatever write_kdtree returns.
def write_corpus_to_io(corpus, io=STDOUT)
  io.rewind

  # The prelude advertises this factor and len2coord uses it when building
  # the tree, so pin it once to keep the two in agreement.
  @length_scaling_factor ||= 15

  # includes prefix length byte
  longest = corpus.reduce(0) { |max, entry| [max, entry.word.bytesize].max }
  @word_length  = longest + 1
  @freq_length  = 4
  @entry_length = @word_length + @freq_length

  stats = fix_stats(corpus.stats)
  corpus_word_count = corpus.length

  # Offsets are not known yet. The fixed-width placeholders reserve at least
  # as much room in the JSON as the final integers will need.
  prelude = {
    "wlen"                  => @word_length,
    "flen"                  => @freq_length,
    "entrylen"              => @entry_length,
    "sort"                  => "frequency",
    "n"                     => corpus_word_count,
    "stats"                 => stats,
    "flags"                 => 0,
    "length_scaling_factor" => @length_scaling_factor,
    "records_length"        => "0000000000",
    "offset_records"        => "0000000000",
    "offset_index1"         => "0000000000",
    "offset_index2"         => "0000000000"
  }

  # The length stored in the header is that of the placeholder version; the
  # final JSON is no longer than this and is space-padded on write.
  prelude_json_length = prelude.to_json.bytesize
  offset_records = round_up(prelude_json_length+8.0)
  records_length = corpus_word_count * @entry_length
  offset_index1  = offset_records + round_up(records_length, page_size)

  prelude["offset_records"] = offset_records
  prelude["records_length"] = records_length
  prelude["offset_index1"]  = offset_index1

  io.write([offset_records, prelude_json_length, prelude.to_json].
           pack("NNA#{offset_records-8}"))

  # record = 1 length byte, word padded to @word_length-1 bytes, 4-byte frequency
  record_format = "Ca#{@word_length-1}N"
  corpus.each do |w|
    io.write([w.word.bytesize, w.word, w.frequency].pack(record_format))
  end

  pad(offset_index1 - (offset_records + records_length), io)
  write_kdtree(corpus, io)
end
|
93
|
+
|
94
|
+
# Write `size` NUL bytes to `io`.
def pad(size, io)
  filler = [].pack("x#{size}")
  io.write(filler)
end
|
97
|
+
|
98
|
+
# Build a KDTree over the corpus and persist it to `io`.
#
# Each entry becomes a point [scaled word length, percentile, position],
# where position is the entry's zero-based index in corpus.entries.
def write_kdtree(corpus, io)
  points = corpus.entries.each_with_index.map do |entry, position|
    [len2coord(entry.word.length.to_f), entry.percentile.to_f, position]
  end

  KDTree.new(points).persist(io)
end
|
111
|
+
|
112
|
+
# make the search space more square by increasing the length of
|
113
|
+
# the "word length" axis
|
114
|
+
# Scale a word length onto the K-D tree's length axis.
#
# Uses @length_scaling_factor when set. The fallback is 15, the same value
# write_corpus_to_io records in the prelude when the factor is unset, so a
# tree built without explicit initialization matches its own prelude.
def len2coord(len)
  len * (@length_scaling_factor || 15)
end
|
117
|
+
|
118
|
+
# Forward args to io.binwrite when io responds to it, otherwise to io.write.
# `io` is not defined in this module; presumably the including class
# provides it -- TODO confirm.
def binwrite(*args)
  selector = :write
  selector = :binwrite if io.respond_to?(:binwrite)
  io.send(selector, *args)
end
|
122
|
+
|
123
|
+
# Mode string for writing the file: binary with ASCII-8BIT encoding when
# IO.binwrite exists, plain "w" otherwise.
def openmode
  return "w" unless IO.respond_to?(:binwrite)
  "wb:ASCII-8BIT"
end
|
126
|
+
|
127
|
+
|
128
|
+
#
|
129
|
+
#
|
130
|
+
# Format of header:
|
131
|
+
#
|
132
|
+
# 0..3 - OB - offset of body start in bytes; network byte order
|
133
|
+
# 4..7 - LP - length of prelude in network byte order
|
134
|
+
# 8..OB-1 - P - JSON-encoded prelude hash and space padding
|
135
|
+
# OB..EOF - array of fixed size records as described in prelude
|
136
|
+
#
|
137
|
+
# Contents of Prelude (after JSON decoding):
|
138
|
+
#
|
139
|
+
# P["wlen"] - length of word part of record
|
140
|
+
# P["flen"] - length of frequency part of record (always 4 bytes)
|
141
|
+
# P["entrylen"] - length of total part of record
|
142
|
+
# P["n"] - number of records
|
143
|
+
# P["sort"] - field name sorted by (word or frequency)
|
144
|
+
# P["stats"] - corpus statistics
|
145
|
+
# P["offset_index1"] - absolute file offset of KDTree index
|
146
|
+
# P["records_length"] - length in bytes of records section, excluding padding
|
147
|
+
# P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)
|
148
|
+
#
|
149
|
+
# Format of record:
|
150
|
+
#
|
151
|
+
# 1 byte - LW - actual length of word within field
|
152
|
+
# P["wlen"]-1 bytes - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding
|
153
|
+
# P["flen"] (4) bytes - frequency as network byte order long
|
154
|
+
#
|
155
|
+
# After record section, there is padding up to the next page_size boundary,
|
156
|
+
# and then there is a dumped KDTree which extends to EOF.
|
157
|
+
#
|
158
|
+
#
|
159
|
+
|
160
|
+
# Replace @file with an in-memory StringIO holding its entire contents.
#
# max - when greater than -1, skip caching if the file is larger than this
#       many bytes.
def precache(max = -1)
  too_large = max > -1 && file_size(@file) > max
  return if too_large

  @file.seek(0)
  contents = @file.read
  @file = StringIO.new(contents, "r")
end
|
165
|
+
|
166
|
+
# Size of `file` in bytes, via #size when available, else via #stat.
def file_size(file)
  return file.size if file.respond_to?(:size)
  file.stat.size
end
|
169
|
+
|
170
|
+
# The decoded prelude hash, parsed from the file on first use.
def prelude
  return @prelude if @prelude
  parse_prelude
end
|
173
|
+
|
174
|
+
# Read and decode the file header and JSON prelude from @file, populating
# the instance variables that describe the record layout.
#
# Returns the prelude Hash.
# Raises ArgumentError if the file is too short to hold a header or if a
# required prelude field is missing.
def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
  if prelude_buf.nil? || prelude_buf.length < 8
    raise ArgumentError, "File is too short to contain a prelude header!"
  end

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len) = prelude_buf.unpack("NN")

  # The JSON starts after the 8-byte header, so the whole prelude spans
  # 8 + @prelude_len bytes; read more if the initial read fell short.
  needed = 8 + @prelude_len
  if needed > prelude_buf.length
    prelude_buf += @file.read(needed - prelude_buf.length).to_s
  end

  @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}

  # includes prefix length byte
  @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"] || 4

  # total length of record
  @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  @offset_index1 = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!")

  @records_length = @prelude["records_length"] || raise(ArgumentError, "No records length!")

  @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!")

  # preludes that do not record a factor are read with 10
  @length_scaling_factor = @prelude["length_scaling_factor"] || 10

  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end
|
210
|
+
|
211
|
+
#
|
212
|
+
# Show some information about the backing file, its record layout and its prelude
|
213
|
+
#
|
214
|
+
# Default inspect output followed by a summary of the file layout and the
# prelude entries (the "stats" entry is omitted).
def inspect
  base = super
  [base, <<INSPECT].join("\n")
File size: #{file_size(@file)}
Word length: #{@word_length}
Frequency bytes: #{@frequency_length}
Total record bytes: #{@records_length}
Offset of K-D Tree index: #{@offset_index1}
Total K-D Tree index bytes: #{file_size(@file) - @offset_index1}
K-D Tree Signature: #{file_range_read(@offset_index1..(@offset_index1+3))}

Prelude:
#{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") }
INSPECT
end
|
228
|
+
|
229
|
+
# Position @file at the start of the index section and build a KDTree
# from it.
def load_kdtree
  index_io = @file
  index_io.seek(@offset_index1)
  KDTree.new(index_io)
end
|
233
|
+
|
234
|
+
|
235
|
+
## parsing
|
236
|
+
|
237
|
+
#
|
238
|
+
# Parse a record into an array of [word, frequency] IFF the word
|
239
|
+
# fits into the length_range or length_range is nil
|
240
|
+
#
|
241
|
+
# Decode record number `index` of `string` into [word, frequency].
#
# length_range - optional filter; when given and the stored word length is
#                outside it, nil is returned instead.
#
# Raises RuntimeError when there is no record at `index`.
def parse_record_into_array(string, index, length_range = nil)
  chunk = nth_chunk(index, string)
  raise "No chunk for index #{index}" if !chunk

  actual_word_length = chunk.unpack("C").first
  return nil if length_range && !length_range.include?(actual_word_length)

  # skip the length byte, take the word, jump to the frequency field
  chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
end
|
252
|
+
|
253
|
+
#
|
254
|
+
# Parse a record into a Word object, which can be provided or will otherwise
|
255
|
+
# be constructed as needed. The fourth arg is a length range which can act as a
|
256
|
+
# filter; if not satisfied, nil will be returned
|
257
|
+
#
|
258
|
+
# Decode record `index` of `string` into a Word.
#
# word         - object to fill in (a fresh Word by default)
# length_range - optional length filter passed to parse_record_into_array
#
# Returns the filled-in word, or nil when the record was filtered out.
def parse_record(string, index=0,
                 word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  fields = parse_record_into_array(string, index, length_range)
  return nil if !fields

  word.word, word.frequency = fields[0], fields[1]
  word
end
|
267
|
+
|
268
|
+
# Decode the leading length byte of a record chunk.
# Returns a one-element Array holding that byte's value.
def word_length(chunk_string)
  length_prefix = chunk_string.unpack("C")
  length_prefix
end
|
271
|
+
|
272
|
+
# return a string representing the nth_record
|
273
|
+
# Slice the nth fixed-size record (of @entry_length characters) out of `string`.
def nth_chunk(n, string)
  start = n * @entry_length
  string.slice(start, @entry_length)
end
|
276
|
+
|
277
|
+
# Absolute file offset of the nth record.
def pos_of_nth_word_in_file(n)
  bytes_before = n * @entry_length
  @record_offset + bytes_before
end
|
280
|
+
|
281
|
+
#
|
282
|
+
# this version is much slower than the other - 1.5x total runtime
|
283
|
+
# slower in some cases.
|
284
|
+
#
|
285
|
+
# def get_word_by_idx_direct(n)
|
286
|
+
# @file.seek(pos_of_nth_word_in_file(n))
|
287
|
+
# chunk = @file.read(@entry_length)
|
288
|
+
# parse_record(chunk)
|
289
|
+
# end
|
290
|
+
|
291
|
+
# Fetch record n from the cached records string as a Word, setting its
# index to n and its percentile to (n - 0.5) / size * 100.
def get_word_by_idx(n)
  word = parse_record(nth_chunk(n, records_string))
  word.index = n
  word.percentile = (n-0.5)/size * 100
  word
end
|
298
|
+
|
299
|
+
## some core Enumerable building blocks
|
300
|
+
|
301
|
+
# Yield every record, in file order, as a parsed Word.
#
# All `size` records are visited; the previous loop stopped one short and
# never yielded the last record. Without a block an Enumerator is returned.
#
# Returns nil after iterating.
def each(&block)
  return enum_for(:each) unless block_given?

  string = records_string
  size.times do |index|
    yield parse_record(string, index)
  end
  nil
end
|
310
|
+
|
311
|
+
# Number of records; same as size.
def count
  size
end
|
312
|
+
# Number of records: @entry_count when already known, otherwise derived from
# the records section length and cached.
def size
  return @entry_count if @entry_count
  @entry_count = records_size / @entry_length
end
|
315
|
+
|
316
|
+
|
317
|
+
## our Corpus Enumerablish abstract methods
|
318
|
+
|
319
|
+
# we presume that the ISAM file has been sorted
|
320
|
+
# Entries in file order, memoized; the file is presumed to be sorted already.
def sorted_entries
  return @sorted_entries if @sorted_entries
  @sorted_entries = entries
end
|
323
|
+
|
324
|
+
|
325
|
+
## optimized pick - does NOT support :filter, though
|
326
|
+
# Pick `count` distinct words using the K-D tree.
#
# options[:percentile]  - percentile range to sample from (default 0..100)
# options[:word_length] - allowed word lengths (default 0..20)
# options[:filter]      - not supported; raises NotImplementedError
#
# Up to 1000 random (length, percentile) probes are made; each probe takes
# the nearest indexed word and keeps it if it is new and its length is in
# range.
#
# Raises RuntimeError when fewer than `count` words were found.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

  defaults = {:percentile => 0..100,
              :word_length => 0..20}
  options = defaults.merge(options)

  result = []
  found_indexes = []
  1000.times do
    break unless result.size < count

    len = random_in_range(options[:word_length])
    pct = random_in_range(options[:percentile])
    word_idx = @kdtree.nearest(len2coord(len), pct)
    next if found_indexes.include?(word_idx)

    found_indexes << word_idx
    word = get_word_by_idx(word_idx)
    if options[:word_length].include?(word.word.length)
      result << word
    else
      STDERR.puts "non-qualifying word: #{word.word.length}"
    end
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
|
357
|
+
|
358
|
+
|
359
|
+
|
360
|
+
## file I/O
|
361
|
+
|
362
|
+
# Length in bytes of the records section, as read from the prelude.
def records_size
  @records_length
end
|
365
|
+
|
366
|
+
# Whole file contents as a String: the backing string when @file is a
# StringIO, otherwise a full read through file_range_read.
def file_string
  return @file.string if @file.is_a?(StringIO)
  file_range_read(nil)
end
|
369
|
+
|
370
|
+
# Read the bytes covered by `file_range` (default: the whole file) from
# @file, leaving the file position where it was.
#
# The position is restored only after it has been captured, so an error
# raised earlier propagates unchanged instead of being masked by a failing
# seek(nil).
def file_range_read(file_range = nil)
  file_range ||= 0...file_size(@file)
  saved_pos = @file.tell
  begin
    @file.seek(file_range.first)
    @file.read(range_count(file_range))
  ensure
    @file.seek(saved_pos)
  end
end
|
378
|
+
# memoize :file_range_read
|
379
|
+
|
380
|
+
# returns a string representing the record-holding portion of the file
|
381
|
+
# The record-holding portion of the file as a String, read once and cached.
def records_string
  return @records_string if @records_string
  @records_string = record_range_read(0 ... records_size)
end
|
385
|
+
|
386
|
+
# Read a byte range of the records section.
#
# record_range - byte range relative to the start of the records section
#                (default: the entire section)
#
# The range is shifted by @record_offset to get file positions. The end is
# start + length; the earlier length + offset was only right for ranges
# starting at 0.
def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  start  = record_range.first + @record_offset
  finish = start + range_count(record_range)
  file_range_read(start...finish)
end
|
390
|
+
# memoize :record_range_read
|
391
|
+
|
392
|
+
# Read the records whose percentile falls in `percentile_range`.
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
|
396
|
+
|
397
|
+
|
398
|
+
## rather than using a StatisticalArray, we do direct indexing into the file/string
|
399
|
+
# Record position corresponding to `percentile`:
# percentile / 100 * count + 0.5, rounded to an Integer unless `round` is false.
def percentile_index(percentile, round=true)
  position = percentile.to_f/100 * count + 0.5
  return position unless round
  position.round
end
|
403
|
+
|
404
|
+
# Byte range, relative to the records section, covering a percentile range.
# A single Numeric percentile is widened to percentile +/- 0.5 first.
def record_range_for_percentile(range)
  range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)

  first_byte = percentile_index(range.begin, false).floor * @entry_length
  last_byte  = percentile_index(range.end, false).ceil * @entry_length
  first_byte...last_byte
end
|
409
|
+
end
|
410
|
+
end
|