correct-horse-battery-staple 0.6.3 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/Manifest.txt +1 -0
- data/correct-horse-battery-staple.gemspec +3 -3
- data/lib/correct_horse_battery_staple.rb +1 -1
- data/lib/correct_horse_battery_staple/backend/isam.rb +332 -0
- data/lib/correct_horse_battery_staple/backend/isam_kd.rb +7 -3
- data/lib/correct_horse_battery_staple/corpus/base.rb +17 -4
- data/lib/correct_horse_battery_staple/corpus/isam.rb +2 -163
- metadata +21 -20
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/Manifest.txt
CHANGED
@@ -9,6 +9,7 @@ bin/chbs-mkpass
|
|
9
9
|
lib/correct_horse_battery_staple.rb
|
10
10
|
lib/correct_horse_battery_staple/assembler.rb
|
11
11
|
lib/correct_horse_battery_staple/backend.rb
|
12
|
+
lib/correct_horse_battery_staple/backend/isam.rb
|
12
13
|
lib/correct_horse_battery_staple/backend/isam_kd.rb
|
13
14
|
lib/correct_horse_battery_staple/backend/redis.rb
|
14
15
|
lib/correct_horse_battery_staple/backend/redis/d_range.rb
|
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "correct-horse-battery-staple"
|
5
|
-
s.version = "0.6.
|
5
|
+
s.version = "0.6.4.20120113111503"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Robert Sanders"]
|
9
9
|
s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
|
10
|
-
s.date = "2012-01-
|
10
|
+
s.date = "2012-01-13"
|
11
11
|
s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size: 6396 candidate words of 33075 total\n Entropy: 48 bits (2^48 = 281474976710656)\n Years to guess at 1000 guesses/sec: 8926\n magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec. If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n http://wordfrequency.info/"
|
12
12
|
s.email = ["robert@curioussquid.com"]
|
13
13
|
s.executables = ["chbs", "chbs-mkpass"]
|
14
14
|
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
|
15
|
-
s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", "spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
|
15
|
+
s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", "spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
|
16
16
|
s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
|
17
17
|
s.rdoc_options = ["--main", "README.txt"]
|
18
18
|
s.require_paths = ["lib"]
|
@@ -0,0 +1,332 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module CorrectHorseBatteryStaple::Backend::Isam
|
6
|
+
INITIAL_PRELUDE_LENGTH = 4096
|
7
|
+
|
8
|
+
F_PRELUDE_AT_END = 1
|
9
|
+
|
10
|
+
def self.included(base)
|
11
|
+
base.extend ClassMethods
|
12
|
+
base.send :include, InstanceMethods
|
13
|
+
end
|
14
|
+
|
15
|
+
module ClassMethods
|
16
|
+
end
|
17
|
+
|
18
|
+
module InstanceMethods
|
19
|
+
#
|
20
|
+
#
|
21
|
+
#
|
22
|
+
def initialize_backend_variables
|
23
|
+
@length_scaling_factor = 15
|
24
|
+
@page_size = 4096
|
25
|
+
end
|
26
|
+
|
27
|
+
def fix_stats(stats)
|
28
|
+
stats.each do |k,v|
|
29
|
+
if v.respond_to?(:nan?) && v.nan?
|
30
|
+
stats[k] = -1
|
31
|
+
end
|
32
|
+
end
|
33
|
+
stats
|
34
|
+
end
|
35
|
+
|
36
|
+
def page_size
|
37
|
+
@page_size || 4096
|
38
|
+
end
|
39
|
+
|
40
|
+
# many MMUs in default mode and modern highcap drives have 4k pages/blocks
|
41
|
+
def round_up(val, blocksize=page_size)
|
42
|
+
[(val.to_f/blocksize).ceil, 1].max * blocksize
|
43
|
+
end
|
44
|
+
|
45
|
+
def write_corpus_to_io(corpus, io=STDOUT)
|
46
|
+
io.rewind
|
47
|
+
|
48
|
+
# includes prefix length byte
|
49
|
+
@word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1
|
50
|
+
@freq_length = 4
|
51
|
+
@entry_length = @word_length + @freq_length
|
52
|
+
|
53
|
+
stats = fix_stats(corpus.stats)
|
54
|
+
corpus_word_count = corpus.length
|
55
|
+
|
56
|
+
prelude = {
|
57
|
+
"wlen" => @word_length,
|
58
|
+
"flen" => 4,
|
59
|
+
"entrylen" => @word_length + @freq_length,
|
60
|
+
"sort" => "frequency",
|
61
|
+
"n" => corpus_word_count,
|
62
|
+
"stats" => stats,
|
63
|
+
"flags" => 0,
|
64
|
+
"length_scaling_factor" => (@length_scaling_factor || 15),
|
65
|
+
"records_length" => "0000000000",
|
66
|
+
"offset_records" => "0000000000",
|
67
|
+
"offset_index1" => "0000000000",
|
68
|
+
"offset_index2" => "0000000000"
|
69
|
+
}
|
70
|
+
|
71
|
+
prelude_json_length = prelude.to_json.length
|
72
|
+
prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)
|
73
|
+
|
74
|
+
prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
|
75
|
+
offset_index1 = prelude["offset_records"] +
|
76
|
+
round_up(records_length, page_size)
|
77
|
+
|
78
|
+
prelude["offset_index1"] = offset_index1
|
79
|
+
|
80
|
+
io.write([offset_records, prelude_json_length, prelude.to_json].
|
81
|
+
pack("NNA#{offset_records-8}"))
|
82
|
+
|
83
|
+
corpus.each_with_index do |w, index|
|
84
|
+
io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def pad(size, io)
|
89
|
+
io.write([].pack("x#{size}"))
|
90
|
+
end
|
91
|
+
|
92
|
+
def binwrite(*args)
|
93
|
+
method = io.respond_to?(:binwrite) ? :binwrite : :write
|
94
|
+
io.send(method, *args)
|
95
|
+
end
|
96
|
+
|
97
|
+
def openmode
|
98
|
+
IO.respond_to?(:binwrite) ? "wb:ASCII-8BIT" : "w"
|
99
|
+
end
|
100
|
+
|
101
|
+
|
102
|
+
#
|
103
|
+
#
|
104
|
+
# Format of header:
|
105
|
+
#
|
106
|
+
# 0..3 - OB - offset of body start in bytes; network byte order
|
107
|
+
# 4..7 - LP - length of prelude in network byte order
|
108
|
+
# 8..OB-1 - P - JSON-encoded prelude hash and space padding
|
109
|
+
# OB..EOF - array of fixed size records as described in prelude
|
110
|
+
#
|
111
|
+
# Contents of Prelude (after JSON decoding):
|
112
|
+
#
|
113
|
+
# P["wlen"] - length of word part of record
|
114
|
+
# P["flen"] - length of frequency part of record (always 4 bytes)
|
115
|
+
# P["entrylen"] - length of total part of record
|
116
|
+
# P["n"] - number of records
|
117
|
+
# P["sort"] - field name sorted by (word or frequency)
|
118
|
+
# P["stats"] - corpus statistics
|
119
|
+
# P["offset_index1"] - absolute file offset of KDTree index
|
120
|
+
# P["records_length"] - length in bytes of records section, excluding padding
|
121
|
+
# P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)
|
122
|
+
#
|
123
|
+
# Format of record:
|
124
|
+
#
|
125
|
+
# 2 bytes - LW - actual length of word within field
|
126
|
+
# P["wlen"] bytes - LW bytes of word (W) + P["wlen"]-LW bytes of padding
|
127
|
+
# P["flen"] (4) bytes - frequency as network byte order long
|
128
|
+
#
|
129
|
+
# After record section, there is padding up to the next page_size boundary,
|
130
|
+
# and then there is a dumped KDTree which extends to EOF.
|
131
|
+
#
|
132
|
+
#
|
133
|
+
|
134
|
+
def precache(max = -1)
|
135
|
+
return if max > -1 && file_size(@file) > max
|
136
|
+
@file.seek 0
|
137
|
+
@file = StringIO.new @file.read, "r"
|
138
|
+
end
|
139
|
+
|
140
|
+
def file_size(file)
|
141
|
+
(file.respond_to?(:size) ? file.size : file.stat.size)
|
142
|
+
end
|
143
|
+
|
144
|
+
def prelude
|
145
|
+
@prelude || parse_prelude
|
146
|
+
end
|
147
|
+
|
148
|
+
def parse_prelude
|
149
|
+
@file.seek 0
|
150
|
+
prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
|
151
|
+
|
152
|
+
# byte offset of first record from beginning of file
|
153
|
+
# total length of JSON string (without padding)
|
154
|
+
(@record_offset, @prelude_len) = prelude_buf.unpack("NN")
|
155
|
+
|
156
|
+
# read more if our initial read didn't slurp in the entire prelude
|
157
|
+
if @prelude_len > prelude_buf.length
|
158
|
+
prelude_buf += @file.read(@prelude_len - prelude_buf.length)
|
159
|
+
end
|
160
|
+
|
161
|
+
@prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
|
162
|
+
|
163
|
+
# includes prefix length byte
|
164
|
+
@word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")
|
165
|
+
|
166
|
+
# as network byte order int
|
167
|
+
@frequency_length = @prelude["flen"] || 4
|
168
|
+
|
169
|
+
# total length of record
|
170
|
+
@entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
|
171
|
+
|
172
|
+
@offset_index1 = @prelude["offset_index1"]
|
173
|
+
@offset_index2 = @prelude["offset_index2"]
|
174
|
+
|
175
|
+
@entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!")
|
176
|
+
|
177
|
+
@records_length = @prelude["records_length"] || (@entry_length * @entry_count)
|
178
|
+
|
179
|
+
@length_scaling_factor = @prelude["length_scaling_factor"] || 10
|
180
|
+
|
181
|
+
load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
|
182
|
+
|
183
|
+
@prelude
|
184
|
+
end
|
185
|
+
|
186
|
+
#
|
187
|
+
# Show some information about
|
188
|
+
#
|
189
|
+
def inspect
|
190
|
+
super + "\n" + <<INSPECT
|
191
|
+
File size: #{file_size(@file)}
|
192
|
+
Word length: #{@word_length}
|
193
|
+
Frequency bytes: #{@frequency_length}
|
194
|
+
Total record bytes: #{@records_length}
|
195
|
+
|
196
|
+
Prelude:
|
197
|
+
#{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") }
|
198
|
+
INSPECT
|
199
|
+
end
|
200
|
+
|
201
|
+
## parsing
|
202
|
+
|
203
|
+
#
|
204
|
+
# Parse a record into an array of [word, frequency] IFF the word
|
205
|
+
# fits into the length_range or length_range is nil
|
206
|
+
#
|
207
|
+
def parse_record_into_array(string, index, length_range = nil)
|
208
|
+
chunk = nth_chunk(index, string)
|
209
|
+
raise "No chunk for index #{index}" unless chunk
|
210
|
+
actual_word_length = chunk.unpack("C")[0]
|
211
|
+
if !length_range || length_range.include?(actual_word_length)
|
212
|
+
# returns [word, frequency]
|
213
|
+
chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
|
214
|
+
else
|
215
|
+
nil
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
#
|
220
|
+
# Parse a record into a Word object, which can be provided or will otherwise
|
221
|
+
# be constructed as needed fourth arg is a length range which can act as a
|
222
|
+
# filter; if not satisfied, nil will be returned
|
223
|
+
#
|
224
|
+
def parse_record(string, index=0,
|
225
|
+
word=CorrectHorseBatteryStaple::Word.new(:word => ""),
|
226
|
+
length_range = nil)
|
227
|
+
bare = parse_record_into_array(string, index, length_range)
|
228
|
+
return nil unless bare
|
229
|
+
word.word = bare[0]
|
230
|
+
word.frequency = bare[1]
|
231
|
+
word
|
232
|
+
end
|
233
|
+
|
234
|
+
def word_length(chunk_string)
|
235
|
+
chunk_string.unpack("C")
|
236
|
+
end
|
237
|
+
|
238
|
+
# return a string representing the nth_record
|
239
|
+
def nth_chunk(n, string)
|
240
|
+
string[@entry_length * n, @entry_length]
|
241
|
+
end
|
242
|
+
|
243
|
+
def pos_of_nth_word_in_file(n)
|
244
|
+
pos = @record_offset + (n * @entry_length)
|
245
|
+
end
|
246
|
+
|
247
|
+
def get_word_by_idx(n)
|
248
|
+
chunk = nth_chunk(n, records_string)
|
249
|
+
parse_record(chunk).tap do |w|
|
250
|
+
w.index = n
|
251
|
+
w.percentile = [(n-0.5)/size,0].max * 100
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
## some core Enumerable building blocks
|
256
|
+
|
257
|
+
def each(&block)
|
258
|
+
string = records_string
|
259
|
+
max_index = size - 1
|
260
|
+
index = 0
|
261
|
+
while index < max_index
|
262
|
+
word = parse_record(string, index)
|
263
|
+
word.index = index
|
264
|
+
word.percentile = [(index-0.5)/size,0].max * 100
|
265
|
+
yield word
|
266
|
+
index += 1
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
def size
|
271
|
+
@entry_count ||= records_size / @entry_length
|
272
|
+
end
|
273
|
+
|
274
|
+
|
275
|
+
## our Corpus Enumerablish abstract methods
|
276
|
+
|
277
|
+
# we presume that the ISAM file has been sorted
|
278
|
+
def sorted_entries
|
279
|
+
@sorted_entries ||= entries
|
280
|
+
end
|
281
|
+
|
282
|
+
## file I/O
|
283
|
+
|
284
|
+
def records_size
|
285
|
+
@records_length
|
286
|
+
end
|
287
|
+
|
288
|
+
def file_string
|
289
|
+
@file.is_a?(StringIO) ? @file.string : file_range_read(nil)
|
290
|
+
end
|
291
|
+
|
292
|
+
def file_range_read(file_range = nil)
|
293
|
+
file_range ||= 0...file_size(@file)
|
294
|
+
pos = @file.tell
|
295
|
+
@file.seek(file_range.first)
|
296
|
+
@file.read(range_count(file_range))
|
297
|
+
ensure
|
298
|
+
@file.seek(pos)
|
299
|
+
end
|
300
|
+
# memoize :file_range_read
|
301
|
+
|
302
|
+
# returns a string representing the record-holding portion of the file
|
303
|
+
def records_string
|
304
|
+
@records_string ||=
|
305
|
+
record_range_read(0 ... records_size)
|
306
|
+
end
|
307
|
+
|
308
|
+
def record_range_read(record_range = nil)
|
309
|
+
record_range ||= 0...records_size
|
310
|
+
file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset))
|
311
|
+
end
|
312
|
+
# memoize :record_range_read
|
313
|
+
|
314
|
+
def record_percentile_range_read(percentile_range)
|
315
|
+
record_range = record_range_for_percentile(percentile_range)
|
316
|
+
record_range_read(record_range)
|
317
|
+
end
|
318
|
+
|
319
|
+
|
320
|
+
## rather than using a StatisticalArray, we do direct indexing into the file/string
|
321
|
+
def percentile_index(percentile, round=true)
|
322
|
+
r = percentile.to_f/100 * count + 0.5
|
323
|
+
round ? r.round : r
|
324
|
+
end
|
325
|
+
|
326
|
+
def record_range_for_percentile(range)
|
327
|
+
range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
|
328
|
+
(percentile_index(range.begin, false).floor * @entry_length ...
|
329
|
+
percentile_index(range.end, false).ceil * @entry_length)
|
330
|
+
end
|
331
|
+
end
|
332
|
+
end
|
@@ -292,7 +292,7 @@ INSPECT
|
|
292
292
|
chunk = nth_chunk(n, records_string)
|
293
293
|
parse_record(chunk).tap do |w|
|
294
294
|
w.index = n
|
295
|
-
w.percentile = (n-0.5)/size * 100
|
295
|
+
w.percentile = [(n-0.5)/size,0].max * 100
|
296
296
|
end
|
297
297
|
end
|
298
298
|
|
@@ -303,7 +303,10 @@ INSPECT
|
|
303
303
|
max_index = size - 1
|
304
304
|
index = 0
|
305
305
|
while index < max_index
|
306
|
-
|
306
|
+
word = parse_record(string, index)
|
307
|
+
word.index = index
|
308
|
+
word.percentile = [(index-0.5)/size,0].max * 100
|
309
|
+
yield word
|
307
310
|
index += 1
|
308
311
|
end
|
309
312
|
end
|
@@ -332,7 +335,8 @@ INSPECT
|
|
332
335
|
result = []
|
333
336
|
found_indexes = []
|
334
337
|
iterations = 0
|
335
|
-
|
338
|
+
max_iterations = [1000, 4 * count].max
|
339
|
+
while (result.size < count && iterations < max_iterations)
|
336
340
|
len = random_in_range(options[:word_length])
|
337
341
|
pct = random_in_range(options[:percentile])
|
338
342
|
word_idx = @kdtree.nearest(len2coord(len), pct)
|
@@ -48,6 +48,15 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
|
|
48
48
|
end
|
49
49
|
|
50
50
|
|
51
|
+
def count_by_options(options = {})
|
52
|
+
if options.empty?
|
53
|
+
count
|
54
|
+
else
|
55
|
+
count &filter_for_options(options)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
memoize :count_by_options
|
59
|
+
|
51
60
|
def sorted_entries
|
52
61
|
entries.sort
|
53
62
|
end
|
@@ -136,8 +145,12 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
|
|
136
145
|
end
|
137
146
|
memoize :frequencies
|
138
147
|
|
139
|
-
def entropy_per_word
|
140
|
-
Math.log(
|
148
|
+
def entropy_per_word(options = {})
|
149
|
+
Math.log(count_by_options(options)) / Math.log(2)
|
150
|
+
end
|
151
|
+
|
152
|
+
def entropy_per_word_by_filter(&filter)
|
153
|
+
Math.log(filter ? count(&filter) : size) / Math.log(2)
|
141
154
|
end
|
142
155
|
|
143
156
|
# filtering
|
@@ -279,8 +292,8 @@ INSPECT
|
|
279
292
|
|
280
293
|
filters.empty? ? nil : compose_filters(filters)
|
281
294
|
end
|
282
|
-
memoize :filter_for_options
|
283
|
-
|
295
|
+
# memoize :filter_for_options
|
296
|
+
public :filter_for_options
|
284
297
|
end
|
285
298
|
|
286
299
|
# Random.srand(SecureRandom.random_number)
|
@@ -28,9 +28,10 @@ require 'set'
|
|
28
28
|
#
|
29
29
|
|
30
30
|
class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
|
31
|
+
include CorrectHorseBatteryStaple::Backend::Isam
|
31
32
|
include CorrectHorseBatteryStaple::Memoize
|
32
33
|
|
33
|
-
INITIAL_PRELUDE_LENGTH =
|
34
|
+
INITIAL_PRELUDE_LENGTH = 4096
|
34
35
|
|
35
36
|
def initialize(filename, stats = nil)
|
36
37
|
super
|
@@ -39,122 +40,11 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
|
|
39
40
|
parse_prelude
|
40
41
|
end
|
41
42
|
|
42
|
-
def precache(max = -1)
|
43
|
-
return if max > -1 && file_size(@file) > max
|
44
|
-
@file.seek 0
|
45
|
-
@file = StringIO.new @file.read, "r"
|
46
|
-
end
|
47
|
-
|
48
|
-
def file_size(file)
|
49
|
-
(file.respond_to?(:size) ? file.size : file.stat.size)
|
50
|
-
end
|
51
|
-
|
52
|
-
def prelude
|
53
|
-
@prelude || parse_prelude
|
54
|
-
end
|
55
|
-
|
56
|
-
def parse_prelude
|
57
|
-
@file.seek 0
|
58
|
-
prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
|
59
|
-
|
60
|
-
# byte offset of first record from beginning of file
|
61
|
-
# total length of JSON string (without padding)
|
62
|
-
(@record_offset, @prelude_len) = prelude_buf.unpack("NN")
|
63
|
-
|
64
|
-
# read more if our initial read didn't slurp in the entire prelude
|
65
|
-
if @prelude_len > prelude_buf.length
|
66
|
-
prelude_buf += @file.read(@prelude_len - prelude_buf.length)
|
67
|
-
end
|
68
|
-
|
69
|
-
@prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
|
70
|
-
|
71
|
-
# includes prefix length byte
|
72
|
-
@word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")
|
73
|
-
|
74
|
-
# as network byte order int
|
75
|
-
@frequency_length = @prelude["flen"] || 4
|
76
|
-
|
77
|
-
# total length of record
|
78
|
-
@entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
|
79
|
-
|
80
|
-
load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
|
81
|
-
|
82
|
-
@prelude
|
83
|
-
end
|
84
|
-
|
85
43
|
# factory-ish constructor
|
86
44
|
def self.read(filename)
|
87
45
|
self.new filename
|
88
46
|
end
|
89
47
|
|
90
|
-
|
91
|
-
## parsing
|
92
|
-
|
93
|
-
#
|
94
|
-
# Parse a record into an array of [word, frequency] IFF the word
|
95
|
-
# fits into the length_range or length_range is nil
|
96
|
-
#
|
97
|
-
def parse_record_into_array(string, index, length_range = nil)
|
98
|
-
chunk = nth_chunk(index, string)
|
99
|
-
raise "No chunk for index #{index}" unless chunk
|
100
|
-
actual_word_length = chunk.unpack("C")[0]
|
101
|
-
if !length_range || length_range.include?(actual_word_length)
|
102
|
-
# returns [word, frequency]
|
103
|
-
chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
|
104
|
-
else
|
105
|
-
nil
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
#
|
110
|
-
# Parse a record into a Word object, which can be provided or will otherwise
|
111
|
-
# be constructed as needed fourth arg is a length range which can act as a
|
112
|
-
# filter; if not satisfied, nil will be returned
|
113
|
-
#
|
114
|
-
def parse_record(string, index=0,
|
115
|
-
word=CorrectHorseBatteryStaple::Word.new(:word => ""),
|
116
|
-
length_range = nil)
|
117
|
-
bare = parse_record_into_array(string, index, length_range)
|
118
|
-
return nil unless bare
|
119
|
-
word.word = bare[0]
|
120
|
-
word.frequency = bare[1]
|
121
|
-
word
|
122
|
-
end
|
123
|
-
|
124
|
-
def word_length(chunk_string)
|
125
|
-
chunk_string.unpack("C")
|
126
|
-
end
|
127
|
-
|
128
|
-
# return a string representing the nth_record
|
129
|
-
def nth_chunk(n, string)
|
130
|
-
string[@entry_length * n, @entry_length]
|
131
|
-
end
|
132
|
-
|
133
|
-
## some core Enumerable building blocks
|
134
|
-
|
135
|
-
def each(&block)
|
136
|
-
string = records_string
|
137
|
-
max_index = size - 1
|
138
|
-
index = 0
|
139
|
-
while index < max_index
|
140
|
-
yield parse_record(string, index)
|
141
|
-
index += 1
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
def size
|
146
|
-
@size ||= records_size / @entry_length
|
147
|
-
end
|
148
|
-
|
149
|
-
|
150
|
-
## our Corpus Enumerablish abstract methods
|
151
|
-
|
152
|
-
# we presume that the ISAM file has been sorted
|
153
|
-
def sorted_entries
|
154
|
-
@sorted_entries ||= entries
|
155
|
-
end
|
156
|
-
|
157
|
-
|
158
48
|
## optimized pick - does NOT support :filter, though
|
159
49
|
def pick(count, options = {})
|
160
50
|
# incompat check
|
@@ -203,55 +93,4 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
|
|
203
93
|
result
|
204
94
|
end
|
205
95
|
|
206
|
-
|
207
|
-
## file I/O
|
208
|
-
|
209
|
-
def records_size
|
210
|
-
@records_size ||= (file_size(@file) - @record_offset)
|
211
|
-
end
|
212
|
-
|
213
|
-
def file_string
|
214
|
-
@file.is_a?(StringIO) ? @file.string : file_range_read(nil)
|
215
|
-
end
|
216
|
-
|
217
|
-
def file_range_read(file_range = nil)
|
218
|
-
file_range ||= 0...file_size(@file)
|
219
|
-
pos = @file.tell
|
220
|
-
@file.seek(file_range.first)
|
221
|
-
@file.read(range_count(file_range))
|
222
|
-
ensure
|
223
|
-
@file.seek(pos)
|
224
|
-
end
|
225
|
-
memoize :file_range_read
|
226
|
-
|
227
|
-
# returns a string representing the record-holding portion of the file
|
228
|
-
def records_string
|
229
|
-
@records_string ||=
|
230
|
-
record_range_read(0 ... records_size)
|
231
|
-
end
|
232
|
-
|
233
|
-
def record_range_read(record_range = nil)
|
234
|
-
record_range ||= 0...records_size
|
235
|
-
file_range_read((record_range.first + @record_offset)...(record_range.first + range_count(record_range) + @record_offset))
|
236
|
-
end
|
237
|
-
# memoize :record_range_read
|
238
|
-
|
239
|
-
def record_percentile_range_read(percentile_range)
|
240
|
-
record_range = record_range_for_percentile(percentile_range)
|
241
|
-
record_range_read(record_range)
|
242
|
-
end
|
243
|
-
|
244
|
-
|
245
|
-
## rather than using a StatisticalArray, we do direct indexing into the file/string
|
246
|
-
def percentile_index(percentile, round=true)
|
247
|
-
r = percentile.to_f/100 * count + 0.5
|
248
|
-
round ? r.round : r
|
249
|
-
end
|
250
|
-
|
251
|
-
def record_range_for_percentile(range)
|
252
|
-
range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
|
253
|
-
(percentile_index(range.begin, false).floor * @entry_length ...
|
254
|
-
percentile_index(range.end, false).ceil * @entry_length)
|
255
|
-
end
|
256
|
-
|
257
96
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: correct-horse-battery-staple
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -50,11 +50,11 @@ cert_chain:
|
|
50
50
|
-----END CERTIFICATE-----
|
51
51
|
|
52
52
|
'
|
53
|
-
date: 2012-01-
|
53
|
+
date: 2012-01-13 00:00:00.000000000 Z
|
54
54
|
dependencies:
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: commander
|
57
|
-
requirement: &
|
57
|
+
requirement: &70214898076980 !ruby/object:Gem::Requirement
|
58
58
|
none: false
|
59
59
|
requirements:
|
60
60
|
- - ! '>='
|
@@ -62,10 +62,10 @@ dependencies:
|
|
62
62
|
version: '4.0'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
|
-
version_requirements: *
|
65
|
+
version_requirements: *70214898076980
|
66
66
|
- !ruby/object:Gem::Dependency
|
67
67
|
name: fastercsv
|
68
|
-
requirement: &
|
68
|
+
requirement: &70214898075860 !ruby/object:Gem::Requirement
|
69
69
|
none: false
|
70
70
|
requirements:
|
71
71
|
- - ! '>='
|
@@ -73,10 +73,10 @@ dependencies:
|
|
73
73
|
version: 1.5.3
|
74
74
|
type: :runtime
|
75
75
|
prerelease: false
|
76
|
-
version_requirements: *
|
76
|
+
version_requirements: *70214898075860
|
77
77
|
- !ruby/object:Gem::Dependency
|
78
78
|
name: json
|
79
|
-
requirement: &
|
79
|
+
requirement: &70214898075000 !ruby/object:Gem::Requirement
|
80
80
|
none: false
|
81
81
|
requirements:
|
82
82
|
- - ! '>='
|
@@ -84,10 +84,10 @@ dependencies:
|
|
84
84
|
version: 1.6.0
|
85
85
|
type: :runtime
|
86
86
|
prerelease: false
|
87
|
-
version_requirements: *
|
87
|
+
version_requirements: *70214898075000
|
88
88
|
- !ruby/object:Gem::Dependency
|
89
89
|
name: redis
|
90
|
-
requirement: &
|
90
|
+
requirement: &70214898074260 !ruby/object:Gem::Requirement
|
91
91
|
none: false
|
92
92
|
requirements:
|
93
93
|
- - ! '>='
|
@@ -95,10 +95,10 @@ dependencies:
|
|
95
95
|
version: 2.2.2
|
96
96
|
type: :runtime
|
97
97
|
prerelease: false
|
98
|
-
version_requirements: *
|
98
|
+
version_requirements: *70214898074260
|
99
99
|
- !ruby/object:Gem::Dependency
|
100
100
|
name: hiredis
|
101
|
-
requirement: &
|
101
|
+
requirement: &70214898073780 !ruby/object:Gem::Requirement
|
102
102
|
none: false
|
103
103
|
requirements:
|
104
104
|
- - ! '>='
|
@@ -106,10 +106,10 @@ dependencies:
|
|
106
106
|
version: 0.4.0
|
107
107
|
type: :runtime
|
108
108
|
prerelease: false
|
109
|
-
version_requirements: *
|
109
|
+
version_requirements: *70214898073780
|
110
110
|
- !ruby/object:Gem::Dependency
|
111
111
|
name: tupalo-kdtree
|
112
|
-
requirement: &
|
112
|
+
requirement: &70214898073340 !ruby/object:Gem::Requirement
|
113
113
|
none: false
|
114
114
|
requirements:
|
115
115
|
- - ! '>='
|
@@ -117,10 +117,10 @@ dependencies:
|
|
117
117
|
version: 0.2.3
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
|
-
version_requirements: *
|
120
|
+
version_requirements: *70214898073340
|
121
121
|
- !ruby/object:Gem::Dependency
|
122
122
|
name: sqlite3
|
123
|
-
requirement: &
|
123
|
+
requirement: &70214898072840 !ruby/object:Gem::Requirement
|
124
124
|
none: false
|
125
125
|
requirements:
|
126
126
|
- - ! '>='
|
@@ -128,10 +128,10 @@ dependencies:
|
|
128
128
|
version: 1.3.0
|
129
129
|
type: :runtime
|
130
130
|
prerelease: false
|
131
|
-
version_requirements: *
|
131
|
+
version_requirements: *70214898072840
|
132
132
|
- !ruby/object:Gem::Dependency
|
133
133
|
name: rubyforge
|
134
|
-
requirement: &
|
134
|
+
requirement: &70214898072400 !ruby/object:Gem::Requirement
|
135
135
|
none: false
|
136
136
|
requirements:
|
137
137
|
- - ! '>='
|
@@ -139,10 +139,10 @@ dependencies:
|
|
139
139
|
version: 2.0.4
|
140
140
|
type: :development
|
141
141
|
prerelease: false
|
142
|
-
version_requirements: *
|
142
|
+
version_requirements: *70214898072400
|
143
143
|
- !ruby/object:Gem::Dependency
|
144
144
|
name: hoe
|
145
|
-
requirement: &
|
145
|
+
requirement: &70214898071960 !ruby/object:Gem::Requirement
|
146
146
|
none: false
|
147
147
|
requirements:
|
148
148
|
- - ~>
|
@@ -150,7 +150,7 @@ dependencies:
|
|
150
150
|
version: '2.12'
|
151
151
|
type: :development
|
152
152
|
prerelease: false
|
153
|
-
version_requirements: *
|
153
|
+
version_requirements: *70214898071960
|
154
154
|
description: ! "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies
|
155
155
|
in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely
|
156
156
|
alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size:
|
@@ -187,6 +187,7 @@ files:
|
|
187
187
|
- lib/correct_horse_battery_staple.rb
|
188
188
|
- lib/correct_horse_battery_staple/assembler.rb
|
189
189
|
- lib/correct_horse_battery_staple/backend.rb
|
190
|
+
- lib/correct_horse_battery_staple/backend/isam.rb
|
190
191
|
- lib/correct_horse_battery_staple/backend/isam_kd.rb
|
191
192
|
- lib/correct_horse_battery_staple/backend/redis.rb
|
192
193
|
- lib/correct_horse_battery_staple/backend/redis/d_range.rb
|
metadata.gz.sig
CHANGED
Binary file
|