correct-horse-battery-staple 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +1 -1
- data/.gemtest +0 -0
- data/Gemfile +53 -0
- data/Gemfile.lock +109 -0
- data/History.txt +6 -0
- data/Manifest.txt +57 -0
- data/README.txt +115 -0
- data/Rakefile +47 -0
- data/bin/chbs +234 -0
- data/bin/chbs-mkpass +16 -0
- data/correct-horse-battery-staple.gemspec +59 -0
- data/lib/correct_horse_battery_staple.rb +117 -0
- data/lib/correct_horse_battery_staple/assembler.rb +45 -0
- data/lib/correct_horse_battery_staple/backend.rb +6 -0
- data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
- data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
- data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
- data/lib/correct_horse_battery_staple/corpus.rb +33 -0
- data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
- data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
- data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
- data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
- data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
- data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
- data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
- data/lib/correct_horse_battery_staple/generator.rb +40 -0
- data/lib/correct_horse_battery_staple/memoize.rb +25 -0
- data/lib/correct_horse_battery_staple/parser.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
- data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
- data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
- data/lib/correct_horse_battery_staple/stats.rb +22 -0
- data/lib/correct_horse_battery_staple/word.rb +90 -0
- data/lib/correct_horse_battery_staple/writer.rb +29 -0
- data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
- data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
- data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
- data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
- data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
- data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
- data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
- data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
- data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
- data/script/generate_all +34 -0
- data/script/load_redis +17 -0
- data/script/perftest +74 -0
- data/spec/corpus/serialized_spec.rb +62 -0
- data/spec/corpus_spec.rb +50 -0
- data/spec/correct_horse_battery_staple_spec.rb +73 -0
- data/spec/fixtures/100.json +101 -0
- data/spec/fixtures/corpus1.csv +101 -0
- data/spec/fixtures/corpus100.json +101 -0
- data/spec/fixtures/wiktionary1000.htm +648 -0
- data/spec/range_parser_spec.rb +54 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/statistical_array_spec.rb +52 -0
- data/spec/support/spec_pry.rb +1 -0
- data/spec/word_spec.rb +95 -0
- metadata +264 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
require 'securerandom'
|
3
|
+
|
4
|
+
#
|
5
|
+
# Generate an N-word passphrase from a corpus
|
6
|
+
#
|
7
|
+
class CorrectHorseBatteryStaple::Generator
  include CorrectHorseBatteryStaple::Common
  include CorrectHorseBatteryStaple::Memoize

  # corpus is the word source used for every operation; word_length is
  # the length constraint (if any) that was supplied to the constructor.
  attr_accessor :word_length, :corpus

  # corpus      - object that answers filter, pick, count_candidates and result
  # word_length - optional object answering include? (for example 3..6);
  #               when given, the corpus is filtered to words whose
  #               length it includes
  def initialize(corpus, word_length = nil)
    @corpus = corpus
    # Store the constraint so the word_length reader reports what was
    # requested instead of always returning nil.
    @word_length = word_length
    return unless word_length

    @corpus.filter { |entry| word_length.include?(entry.word.to_s.length) }
  end

  # Picks +count+ entries from the corpus (passing +options+ through to
  # corpus.pick) and returns their downcased words joined with "-".
  def make(count = 4, options = {})
    @corpus.pick(count, options).
      map { |entry| entry.word.downcase }.
      join("-")
  end

  # Returns floor(log2(candidate count)) for the given options, where the
  # candidate count comes from corpus.count_candidates.
  # NOTE(review): +log+ is not defined in this class; presumably supplied
  # by the Common mixin -- confirm.
  def estimate_entropy(options)
    candidate_count = @corpus.count_candidates(options)
    (log(candidate_count) / log(2)).floor
  end
  # Results are cached per distinct +options+ argument.
  memoize :estimate_entropy

  # Lazily fetches, then caches, corpus.result.
  def words
    @words ||= @corpus.result
  end
end
|
36
|
+
|
37
|
+
# When this file is executed directly, print one passphrase built from the
# default corpus restricted to 3..6 letter words. The first command-line
# argument, if present, is the number of words (default 4).
if $PROGRAM_NAME == __FILE__
  generator = CorrectHorseBatteryStaple::Generator.new(CorrectHorseBatteryStaple.default_corpus, 3..6)
  requested_words = (ARGV[0] || 4).to_i
  puts generator.make(requested_words)
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Mixin that adds a class-level +memoize+ macro to the including class.
module CorrectHorseBatteryStaple::Memoize
  # Hook: extend the including class with the +memoize+ macro.
  def self.included(base)
    base.extend ClassMethods
  end

  module ClassMethods
    # Replaces the instance method +method+ with a caching wrapper.
    #
    # The original implementation stays reachable under the name
    # "_<method>_unmemoized". Results are cached per instance and per
    # argument list (the args array is the hash key). Calling the
    # memoized method with a block raises ArgumentError, since a block
    # cannot take part in the cache key.
    def memoize(method)
      old_method = "_#{method}_unmemoized".to_sym
      alias_method old_method, method
      define_method method do |*args, &block|
        @_memoize_cache ||= {}
        methcache = (@_memoize_cache[method] ||= {})
        if block
          raise ArgumentError, "You cannot call a memoized method with a block! #{method}"
        end
        # Hash#fetch with a block distinguishes "key absent" from a cached
        # nil/false and never invokes #=== on the cached value, so cached
        # Classes, Regexps, Ranges or Procs are returned as-is.
        methcache.fetch(args) { methcache[args] = send(old_method, *args) }
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
|
2
|
+
class CorrectHorseBatteryStaple::Parser
  # Regex-driven parser: scans the whole input with one regular expression
  # per supported format and turns each match into a Word.
  class Regex < Base
    # Maps a format name to a [regex, builder] pair. The builder receives
    # the array of capture groups that String#scan produces for one match
    # (zero-based, one element per capture group) and returns a Word.
    PARSERS = {
      :wiktionary => [%r{<a href="/wiki/\w+" title="(\w+)">\w+</a> = (\d+)},
                      lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[0], :frequency => match[1].to_i) }],

      # rank lemma PoS freq dispersion
      # 7 to t 6332195 0.98
      #
      # Captures: 0 = rank, 1 = lemma, 2 = freq, 3 = dispersion
      # (the PoS column is matched but not captured).
      :wordfrequency => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)\s+([0-9.]+)},
                          lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
                                                                               :rank => match[0].to_f,
                                                                               :frequency => match[2].to_f,
                                                                               :dispersion => match[3].to_f)
                          }],

      # using tabs between columns
      # freq word PoS # texts
      # ----- ----- ----- -----
      # 22995878 the at 169011
      # 11239776 and cc 168844
      #
      # Captures: 0 = freq, 1 = word, 2 = # texts.
      :coca => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)},
                 lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
                                                                      :frequency => match[0].to_i,
                                                                      :texts => match[2].to_i)
                 }],

      # <tr>
      # <td>25</td>
      # <td><a href="/wiki/be" title="be">be</a></td>
      # <td>191823</td>
      # </tr>
      #
      # Captures: 0 = rank, 1 = word, 2 = frequency.
      :tvscripts => [
                     Regexp.new('<tr>.*?<td>(\d+)</td>.*?<td>.*?title="(\w+)".*?</td>.*?<td>(\d+)</td>.*?</tr>', Regexp::MULTILINE),
                     lambda {|match| CorrectHorseBatteryStaple::Word.new(
                                                                          :rank => match[0].to_i,
                                                                          :word => match[1],
                                                                          :frequency => match[2].to_i
                                                                          ) }
                    ]
    }

    # type - name of one of the PARSERS keys (String or Symbol).
    def initialize(type = :wiktionary)
      @parser_type = type.to_sym
    end

    # Reads all of +file+ (anything answering #read) and returns an Array
    # with one Word per regex match.
    #
    # Raises ArgumentError when the configured type is not in PARSERS.
    def parse(file)
      raise ArgumentError, "unknown regex parser type #{@parser_type}" unless PARSERS.has_key?(@parser_type)
      (regex, lexer) = PARSERS[@parser_type]

      file.read.scan(regex).map do |match|
        lexer.call(match)
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# doesn't handle X...Y
|
2
|
+
|
3
|
+
# Parses strings such as "3-6", "3..6", "2.5 .. 10" or "4" into a Range.
class CorrectHorseBatteryStaple::RangeParser
  # One (optionally negative) number. The alternatives are ordered from
  # most to least specific: the regexes below are unanchored, so a bare
  # "[0-9]+" tried first would stop at the integer part of "1.5".
  # The "(?!\.)" lookahead keeps "1." from swallowing the first dot of a
  # ".." separator.
  NUM       = '-?(?:[0-9]+\.[0-9]+|\.[0-9]+|[0-9]+\.(?!\.)|[0-9]+)'
  SPACE     = " *"
  SEPARATOR = "(-|\\.\\.)"
  # Groups: 1 = first number, 2 = separator, 3 = second number.
  REGEX_PAIR   = Regexp.new("(#{NUM})#{SPACE}#{SEPARATOR}#{SPACE}(#{NUM})")
  # Group 1 = the number, without surrounding spaces.
  REGEX_SINGLE = Regexp.new("#{SPACE}(#{NUM})#{SPACE}")

  # Returns a Range for the first pair ("A-B" / "A..B") found in +string+,
  # or a single-value Range (n..n) when only one number is present.
  # Returns nil when +string+ contains no number at all.
  def parse(string)
    match = string.match(REGEX_PAIR)
    if match
      return Range.new(parse_number(match[1]), parse_number(match[3]))
    end

    match = string.match(REGEX_SINGLE)
    if match
      # Use the captured number rather than the whole match, which may
      # carry surrounding spaces.
      num = parse_number(match[1])
      return Range.new(num, num)
    end

    nil
  end

  protected

  # Float when the text contains a decimal point, Integer otherwise.
  def parse_number(str)
    str.include?(".") ? str.to_f : str.to_i
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module CorrectHorseBatteryStaple

  # Thin wrapper around an array-like object that adds a few statistical
  # helpers. Any message the wrapper does not define itself is forwarded
  # to the wrapped object.
  class StatisticalArray
    # array  - the collection to wrap
    # sorted - pass true when +array+ is already in ascending order, so
    #          sort! and sort can skip the work
    def initialize(array, sorted=false)
      @obj = array
      @sorted = sorted
    end

    # Returns +array+ itself when it is already a StatisticalArray,
    # otherwise wraps it in a new one.
    def self.cast(array, sorted=false)
      if array.is_a?(CorrectHorseBatteryStaple::StatisticalArray)
        array
      else
        CorrectHorseBatteryStaple::StatisticalArray.new(array, sorted)
      end
    end

    # Replaces the wrapped collection with a sorted copy (unless it is
    # already flagged as sorted) and returns self.
    def sort!
      @obj = @obj.sort unless @sorted
      @sorted = true
      self
    end

    # Replaces the wrapped collection with a copy ordered by the block's
    # result, flags it as sorted, and returns self.
    def sort_by!(&block)
      @obj = @obj.sort_by(&block)
      @sorted = true
      self
    end

    # Forward everything else to the wrapped collection.
    def method_missing(name, *args, &block)
      @obj.__send__(name, *args, &block)
    end

    # Keep respond_to? truthful about the forwarded methods.
    def respond_to_missing?(name, include_private = false)
      @obj.respond_to?(name, include_private) || super
    end

    # Arithmetic mean as a Float (NaN for an empty collection).
    def mean
      inject(0) { |acc, x| acc + x } / size.to_f
    end
    alias :average :mean

    # Sum of all elements; nil for an empty collection.
    def sum
      reduce(:+)
    end

    # Without a block, and when already flagged as sorted, returns self.
    # Otherwise returns whatever the wrapped collection's #sort returns.
    def sort(&block)
      return self if @sorted && !block

      @obj.sort(&block)
    end

    # Sample standard deviation (divides by size - 1) around +m+.
    def standard_deviation(m = mean)
      squared_error = inject(0) { |acc, x| acc + (x - m) ** 2 }
      # Force float division: with Integer data and an Integer +m+ the
      # quotient would otherwise be truncated.
      Math.sqrt(squared_error / (size - 1).to_f)
    end

    # Returns [mean, standard_deviation] computing the mean only once.
    def mean_and_standard_deviation
      m = mean
      [m, standard_deviation(m)]
    end

    # Position for +percentile+ (0..100), computed as
    # percentile/100 * length + 0.5; rounded to an Integer unless
    # +round+ is false.
    def percentile_index(percentile, round=true)
      r = percentile.to_f/100 * length + 0.5
      round ? r.round : r
    end

    # Sorts the collection, then returns the Integer index range covering
    # the given percentile range. A bare number p is treated as the range
    # (p - 0.5)..(p + 0.5).
    def index_range_for_percentile(range)
      range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
      sort!

      (percentile_index(range.begin, false).floor ..
       percentile_index(range.end, false).ceil)
    end

    # Elements whose index falls inside index_range_for_percentile(range).
    def select_percentile(range)
      slice(index_range_for_percentile(range))
    end

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# Open-ended bag of statistics with Hash and JSON round-tripping.
class CorrectHorseBatteryStaple::Stats < OpenStruct
  # Returns the attributes as a Hash (OpenStruct's marshal form).
  def to_hash
    marshal_dump
  end

  # Builds a Stats from +hash+. Going through the constructor (rather
  # than marshal_load) lets OpenStruct normalise String keys -- such as
  # those produced by JSON.parse -- into attribute names.
  def self.from_hash(hash)
    new(hash)
  end

  # Serialises the attributes as a JSON object. Accepts the optional
  # generator arguments that JSON passes when this object is nested
  # inside another structure being serialised.
  def to_json(*args)
    to_hash.to_json(*args)
  end

  # Inverse of to_json: parses +json+ and builds a Stats from the result.
  def self.from_json(json)
    from_hash JSON.parse(json)
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
class CorrectHorseBatteryStaple::Word
  # text of word
  attr_accessor :word

  # frequency is the total count of the word in corpus
  attr_accessor :frequency

  # rank is the word position when sorted by frequency in entire corpus
  # index is the index of the word in this (sub)corpus
  attr_accessor :rank, :index

  # dispersion is Juilland dispersion, the % of texts containing the
  # word.
  attr_accessor :dispersion

  # texts is the # of texts containing the word. this is not available
  # for many frequency lists.
  attr_accessor :texts

  ## statistical measure of word position in sorted frequency list

  # in which percentile does the word appear. this can be calculated
  # from the array of words so is somewhat redundant here
  attr_accessor :percentile

  # this word's frequency's distance from mean frequency in stddevs;
  # signed.
  attr_accessor :distance

  # probability is the chance of any given word in a text composed
  # of the sum of (word*frequency) in the corpus being this word.
  attr_accessor :probability

  # distance_probability is the distance of this word's probability
  # from the mean in stddev
  attr_accessor :distance_probability

  include Comparable

  # value_map - a Hash of attribute name (String or Symbol) => value, or
  #             another Word to copy from. Must carry a word.
  #
  # Raises ArgumentError when no word is supplied, RuntimeError when
  # value_map is neither a Hash nor a Word.
  def initialize(value_map = {})
    # Resolve the input type first so that unsupported inputs get the
    # descriptive error below instead of failing while being indexed.
    attrs =
      case value_map
      when Hash then value_map
      when CorrectHorseBatteryStaple::Word then value_map.to_hash
      else raise "Can't initialize Word from #{value_map.inspect}"
      end
    raise ArgumentError, "Must supply at least :word" unless attrs[:word] || attrs["word"]

    # phasing this out
    self.index = -1

    update_from_hash(attrs)
  end

  # Words order by frequency. Returns nil (the Comparable convention for
  # "not comparable") when +other+ has no frequency, so that e.g.
  # word == "text" is false rather than an error.
  def <=>(other)
    return nil unless other.respond_to?(:frequency)

    frequency <=> other.frequency
  end

  # JSON form of to_hash; extra arguments are passed on to Hash#to_json.
  def to_json(*args)
    to_hash.to_json(*args)
  end

  # The word text itself.
  def to_s
    self.word
  end

  def inspect
    "CHBS::Word(#{self.to_hash.inspect})"
  end

  # Hash of every instance variable that has been set, keyed by the
  # variable name without the leading "@".
  def to_hash
    instance_variables.each_with_object({}) do |ivar, hash|
      hash[ivar.to_s[1..-1]] = instance_variable_get(ivar)
    end
  end

  # Assigns every key/value pair through the matching writer, skipping
  # the key "wstruct". Returns self.
  def update_from_hash(hash)
    hash.each do |key, val|
      self[key] = val unless key.to_s == "wstruct"
    end
    self
  end

  # Hash-style attribute read. public_send restricts this to the public
  # interface, so a key can never reach private methods such as exit.
  def [](attr)
    public_send(attr.to_s)
  end

  # Hash-style attribute write through the public "<attr>=" writer.
  def []=(attr, value)
    public_send("#{attr}=", value)
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Entry point for serialising a corpus: picks the writer class matching a
# format name and drives it.
class CorrectHorseBatteryStaple::Writer
  class << self
    # Instantiates the writer for +fformat+ on +dest+.
    #
    # When +fformat+ is nil/false the format is asked of
    # Corpus.format_for(dest). The writer class is the constant under
    # Writer named after the format, lower-cased then capitalised
    # (e.g. "json" resolves to Json).
    #
    # Raises ArgumentError when no format can be determined.
    def make_writer(dest, fformat, options = {})
      fformat ||= CorrectHorseBatteryStaple::Corpus.format_for(dest)
      unless fformat && !fformat.empty?
        raise ArgumentError, "Cannot determine file format for #{dest}"
      end

      writer_class = const_get(fformat.downcase.capitalize)
      writer_class.new(dest, options)
    end

    # Writes +corpus+ to +dest+ in +fformat+ and always closes the writer
    # afterwards, even when writing raises.
    def write(corpus, dest, fformat, options = {})
      writer = make_writer(dest, fformat, options)
      writer.write_corpus(corpus)
    ensure
      writer.close if writer
    end
  end

  autoload :Base, "correct_horse_battery_staple/writer/base"
  autoload :File, "correct_horse_battery_staple/writer/file"
  autoload :Json, "correct_horse_battery_staple/writer/json"
  autoload :Csv, "correct_horse_battery_staple/writer/csv"
  autoload :Isam, "correct_horse_battery_staple/writer/isam"
  autoload :Isamkd, "correct_horse_battery_staple/writer/isam_kd"
  autoload :Marshal, "correct_horse_battery_staple/writer/marshal"
  autoload :Sqlite, "correct_horse_battery_staple/writer/sqlite"
  autoload :Redis, "correct_horse_battery_staple/writer/redis"
  # autoload :KDTree, "correct_horse_battery_staple/writer/kdtree"
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#
|
2
|
+
# Base class for all writers
|
3
|
+
#
|
4
|
+
|
5
|
+
class CorrectHorseBatteryStaple::Writer::Base < CorrectHorseBatteryStaple::Writer
  include CorrectHorseBatteryStaple::Common

  # dest is the output target handed to the constructor; options is the
  # accompanying options hash.
  attr_accessor :dest, :options

  # Stores both arguments through their writers, then gives the object a
  # chance to set up backend state if it publicly answers
  # initialize_backend_variables.
  def initialize(dest, options = {})
    self.dest    = dest
    self.options = options
    return unless respond_to?(:initialize_backend_variables)

    initialize_backend_variables
  end

  # Concrete writers must override this; the base version only raises
  # NotImplementedError naming the offending class.
  def write_corpus(corpus)
    message = "#{self.class.name} is not a complete implementation"
    raise NotImplementedError, message
  end

  # Nothing to release at this level.
  def close
    nil
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Writes a corpus as comma-separated text: one header line followed by
# one line per word.
class CorrectHorseBatteryStaple::Writer::Csv < CorrectHorseBatteryStaple::Writer::File

  def initialize(dest, options={})
    super(dest, options)
  end

  # Emits the header, then a row per corpus entry. Attributes that are
  # nil are written as -1; the word is wrapped in double quotes.
  def write_corpus(corpus)
    puts "index,rank,word,frequency,percentile,distance,probability,distance_probability"

    row_template = "%d,%d,\"%s\",%d,%.4f,%.6f,%.8f,%.8f\n"
    corpus.each_with_index do |entry, position|
      columns = [
        position || -1,
        entry.rank || -1,
        entry.word,
        entry.frequency || -1,
        entry.percentile || -1,
        entry.distance || -1,
        entry.probability || -1,
        entry.distance_probability || -1
      ]
      puts(row_template % columns)
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#
|
2
|
+
# base class for file-based stores
|
3
|
+
#
|
4
|
+
#
|
5
|
+
class CorrectHorseBatteryStaple::Writer::File < CorrectHorseBatteryStaple::Writer::Base
  # The stream all output helpers write to.
  attr_accessor :io

  # dest may be:
  #   * an object answering #write -- used as-is and never closed here
  #   * "/dev/stdout" or "-"       -- STDOUT, never closed here
  #   * anything else              -- treated as a path, opened with
  #                                   +openmode+ and closed by #close
  def initialize(dest, options = {})
    super

    @do_close = false
    if dest.respond_to?(:write)
      self.io = dest
    elsif ["/dev/stdout", "-"].include?(dest)
      self.io = STDOUT
    else
      # ::File.open, not Kernel#open: Kernel#open would run a destination
      # beginning with "|" as a shell command. The leading "::" is
      # required because a bare File here names this writer class.
      self.io = ::File.open(dest, openmode)
      @do_close = true
    end
  end

  # Closes the stream if this writer opened it (best effort: errors from
  # the close itself are ignored). In every case the writer forgets its
  # stream afterwards.
  def close
    return unless @do_close

    begin
      io.close
    rescue StandardError
      nil
    end
  ensure
    self.io = nil
    @do_close = false
  end

  protected

  # Mode string used when opening a path; subclasses may override.
  def openmode
    "w"
  end

  # The helpers below forward to the same-named method on the stream
  # (<< forwards to write).

  def <<(string)
    self.io.write string
  end

  def print(string)
    self.io.print string
  end

  def puts(string)
    self.io.puts string
  end

  def write(string)
    self.io.write string
  end

end
|