correct-horse-battery-staple 0.6.3 → 0.6.4

Sign up to get free protection for your applications and to get access to all the features.
data.tar.gz.sig CHANGED
Binary file
@@ -9,6 +9,7 @@ bin/chbs-mkpass
9
9
  lib/correct_horse_battery_staple.rb
10
10
  lib/correct_horse_battery_staple/assembler.rb
11
11
  lib/correct_horse_battery_staple/backend.rb
12
+ lib/correct_horse_battery_staple/backend/isam.rb
12
13
  lib/correct_horse_battery_staple/backend/isam_kd.rb
13
14
  lib/correct_horse_battery_staple/backend/redis.rb
14
15
  lib/correct_horse_battery_staple/backend/redis/d_range.rb
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "correct-horse-battery-staple"
5
- s.version = "0.6.3.20120111134214"
5
+ s.version = "0.6.4.20120113111503"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Robert Sanders"]
9
9
  s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
10
- s.date = "2012-01-11"
10
+ s.date = "2012-01-13"
11
11
  s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size: 6396 candidate words of 33075 total\n Entropy: 48 bits (2^48 = 281474976710656)\n Years to guess at 1000 guesses/sec: 8926\n magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec. If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n http://wordfrequency.info/"
12
12
  s.email = ["robert@curioussquid.com"]
13
13
  s.executables = ["chbs", "chbs-mkpass"]
14
14
  s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
15
- s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", "spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
15
+ s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", "spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
16
16
  s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
17
17
  s.rdoc_options = ["--main", "README.txt"]
18
18
  s.require_paths = ["lib"]
@@ -1,7 +1,7 @@
1
1
  require 'logger'
2
2
 
3
3
  module CorrectHorseBatteryStaple
4
- VERSION = '0.6.3'
4
+ VERSION = '0.6.4'
5
5
 
6
6
  DEFAULT_CORPUS_NAME = "tvscripts"
7
7
 
@@ -0,0 +1,332 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+ require 'set'
4
+
5
+ module CorrectHorseBatteryStaple::Backend::Isam
6
+ INITIAL_PRELUDE_LENGTH = 4096
7
+
8
+ F_PRELUDE_AT_END = 1
9
+
10
+ def self.included(base)
11
+ base.extend ClassMethods
12
+ base.send :include, InstanceMethods
13
+ end
14
+
15
+ module ClassMethods
16
+ end
17
+
18
+ module InstanceMethods
19
+ #
20
+ #
21
+ #
22
+ def initialize_backend_variables
23
+ @length_scaling_factor = 15
24
+ @page_size = 4096
25
+ end
26
+
27
+ def fix_stats(stats)
28
+ stats.each do |k,v|
29
+ if v.respond_to?(:nan?) && v.nan?
30
+ stats[k] = -1
31
+ end
32
+ end
33
+ stats
34
+ end
35
+
36
+ def page_size
37
+ @page_size || 4096
38
+ end
39
+
40
+ # many MMUs in default mode and modern highcap drives have 4k pages/blocks
41
+ def round_up(val, blocksize=page_size)
42
+ [(val.to_f/blocksize).ceil, 1].max * blocksize
43
+ end
44
+
45
+ def write_corpus_to_io(corpus, io=STDOUT)
46
+ io.rewind
47
+
48
+ # includes prefix length byte
49
+ @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1
50
+ @freq_length = 4
51
+ @entry_length = @word_length + @freq_length
52
+
53
+ stats = fix_stats(corpus.stats)
54
+ corpus_word_count = corpus.length
55
+
56
+ prelude = {
57
+ "wlen" => @word_length,
58
+ "flen" => 4,
59
+ "entrylen" => @word_length + @freq_length,
60
+ "sort" => "frequency",
61
+ "n" => corpus_word_count,
62
+ "stats" => stats,
63
+ "flags" => 0,
64
+ "length_scaling_factor" => (@length_scaling_factor || 15),
65
+ "records_length" => "0000000000",
66
+ "offset_records" => "0000000000",
67
+ "offset_index1" => "0000000000",
68
+ "offset_index2" => "0000000000"
69
+ }
70
+
71
+ prelude_json_length = prelude.to_json.length
72
+ prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)
73
+
74
+ prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
75
+ offset_index1 = prelude["offset_records"] +
76
+ round_up(records_length, page_size)
77
+
78
+ prelude["offset_index1"] = offset_index1
79
+
80
+ io.write([offset_records, prelude_json_length, prelude.to_json].
81
+ pack("NNA#{offset_records-8}"))
82
+
83
+ corpus.each_with_index do |w, index|
84
+ io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
85
+ end
86
+ end
87
+
88
+ def pad(size, io)
89
+ io.write([].pack("x#{size}"))
90
+ end
91
+
92
+ def binwrite(*args)
93
+ method = io.respond_to?(:binwrite) ? :binwrite : :write
94
+ io.send(method, *args)
95
+ end
96
+
97
+ def openmode
98
+ IO.respond_to?(:binwrite) ? "wb:ASCII-8BIT" : "w"
99
+ end
100
+
101
+
102
+ #
103
+ #
104
+ # Format of header:
105
+ #
106
+ # 0..3 - OB - offset of body start in bytes; network byte order
107
+ # 4..7 - LP - length of prelude in network byte order
108
+ # 8..OB-1 - P - JSON-encoded prelude hash and space padding
109
+ # OB..EOF - array of fixed size records as described in prelude
110
+ #
111
+ # Contents of Prelude (after JSON decoding):
112
+ #
113
+ # P["wlen"] - length of word part of record
114
+ # P["flen"] - length of frequency part of record (always 4 bytes)
115
+ # P["entrylen"] - length of total part of record
116
+ # P["n"] - number of records
117
+ # P["sort"] - field name sorted by (word or frequency)
118
+ # P["stats"] - corpus statistics
119
+ # P["offset_index1"] - absolute file offset of KDTree index
120
+ # P["records_length"] - length in bytes of records section, excluding padding
121
+ # P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)
122
+ #
123
+ # Format of record:
124
+ #
125
+ # 2 bytes - LW - actual length of word within field
126
+ # P["wlen"] bytes - LW bytes of word (W) + P["wlen"]-LW bytes of padding
127
+ # P["flen"] (4) bytes - frequency as network byte order long
128
+ #
129
+ # After record section, there is padding up to the next page_size boundary,
130
+ # and then there is a dumped KDTree which extends to EOF.
131
+ #
132
+ #
133
+
134
+ def precache(max = -1)
135
+ return if max > -1 && file_size(@file) > max
136
+ @file.seek 0
137
+ @file = StringIO.new @file.read, "r"
138
+ end
139
+
140
+ def file_size(file)
141
+ (file.respond_to?(:size) ? file.size : file.stat.size)
142
+ end
143
+
144
+ def prelude
145
+ @prelude || parse_prelude
146
+ end
147
+
148
+ def parse_prelude
149
+ @file.seek 0
150
+ prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
151
+
152
+ # byte offset of first record from beginning of file
153
+ # total length of JSON string (without padding)
154
+ (@record_offset, @prelude_len) = prelude_buf.unpack("NN")
155
+
156
+ # read more if our initial read didn't slurp in the entire prelude
157
+ if @prelude_len > prelude_buf.length
158
+ prelude_buf += @file.read(@prelude_len - prelude_buf.length)
159
+ end
160
+
161
+ @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
162
+
163
+ # includes prefix length byte
164
+ @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")
165
+
166
+ # as network byte order int
167
+ @frequency_length = @prelude["flen"] || 4
168
+
169
+ # total length of record
170
+ @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
171
+
172
+ @offset_index1 = @prelude["offset_index1"]
173
+ @offset_index2 = @prelude["offset_index2"]
174
+
175
+ @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!")
176
+
177
+ @records_length = @prelude["records_length"] || (@entry_length * @entry_count)
178
+
179
+ @length_scaling_factor = @prelude["length_scaling_factor"] || 10
180
+
181
+ load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
182
+
183
+ @prelude
184
+ end
185
+
186
+ #
187
+ # Show some information about
188
+ #
189
+ def inspect
190
+ super + "\n" + <<INSPECT
191
+ File size: #{file_size(@file)}
192
+ Word length: #{@word_length}
193
+ Frequency bytes: #{@frequency_length}
194
+ Total record bytes: #{@records_length}
195
+
196
+ Prelude:
197
+ #{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") }
198
+ INSPECT
199
+ end
200
+
201
+ ## parsing
202
+
203
+ #
204
+ # Parse a record into an array of [word, frequency] IFF the word
205
+ # fits into the length_range or length_range is nil
206
+ #
207
+ def parse_record_into_array(string, index, length_range = nil)
208
+ chunk = nth_chunk(index, string)
209
+ raise "No chunk for index #{index}" unless chunk
210
+ actual_word_length = chunk.unpack("C")[0]
211
+ if !length_range || length_range.include?(actual_word_length)
212
+ # returns [word, frequency]
213
+ chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
214
+ else
215
+ nil
216
+ end
217
+ end
218
+
219
+ #
220
+ # Parse a record into a Word object, which can be provided or will otherwise
221
+ # be constructed as needed fourth arg is a length range which can act as a
222
+ # filter; if not satisfied, nil will be returned
223
+ #
224
+ def parse_record(string, index=0,
225
+ word=CorrectHorseBatteryStaple::Word.new(:word => ""),
226
+ length_range = nil)
227
+ bare = parse_record_into_array(string, index, length_range)
228
+ return nil unless bare
229
+ word.word = bare[0]
230
+ word.frequency = bare[1]
231
+ word
232
+ end
233
+
234
+ def word_length(chunk_string)
235
+ chunk_string.unpack("C")
236
+ end
237
+
238
+ # return a string representing the nth_record
239
+ def nth_chunk(n, string)
240
+ string[@entry_length * n, @entry_length]
241
+ end
242
+
243
+ def pos_of_nth_word_in_file(n)
244
+ pos = @record_offset + (n * @entry_length)
245
+ end
246
+
247
+ def get_word_by_idx(n)
248
+ chunk = nth_chunk(n, records_string)
249
+ parse_record(chunk).tap do |w|
250
+ w.index = n
251
+ w.percentile = [(n-0.5)/size,0].max * 100
252
+ end
253
+ end
254
+
255
+ ## some core Enumerable building blocks
256
+
257
+ def each(&block)
258
+ string = records_string
259
+ max_index = size - 1
260
+ index = 0
261
+ while index < max_index
262
+ word = parse_record(string, index)
263
+ word.index = index
264
+ word.percentile = [(index-0.5)/size,0].max * 100
265
+ yield word
266
+ index += 1
267
+ end
268
+ end
269
+
270
+ def size
271
+ @entry_count ||= records_size / @entry_length
272
+ end
273
+
274
+
275
+ ## our Corpus Enumerablish abstract methods
276
+
277
+ # we presume that the ISAM file has been sorted
278
+ def sorted_entries
279
+ @sorted_entries ||= entries
280
+ end
281
+
282
+ ## file I/O
283
+
284
+ def records_size
285
+ @records_length
286
+ end
287
+
288
+ def file_string
289
+ @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
290
+ end
291
+
292
+ def file_range_read(file_range = nil)
293
+ file_range ||= 0...file_size(@file)
294
+ pos = @file.tell
295
+ @file.seek(file_range.first)
296
+ @file.read(range_count(file_range))
297
+ ensure
298
+ @file.seek(pos)
299
+ end
300
+ # memoize :file_range_read
301
+
302
+ # returns a string representing the record-holding portion of the file
303
+ def records_string
304
+ @records_string ||=
305
+ record_range_read(0 ... records_size)
306
+ end
307
+
308
+ def record_range_read(record_range = nil)
309
+ record_range ||= 0...records_size
310
+ file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset))
311
+ end
312
+ # memoize :record_range_read
313
+
314
+ def record_percentile_range_read(percentile_range)
315
+ record_range = record_range_for_percentile(percentile_range)
316
+ record_range_read(record_range)
317
+ end
318
+
319
+
320
+ ## rather than using a StatisticalArray, we do direct indexing into the file/string
321
+ def percentile_index(percentile, round=true)
322
+ r = percentile.to_f/100 * count + 0.5
323
+ round ? r.round : r
324
+ end
325
+
326
+ def record_range_for_percentile(range)
327
+ range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
328
+ (percentile_index(range.begin, false).floor * @entry_length ...
329
+ percentile_index(range.end, false).ceil * @entry_length)
330
+ end
331
+ end
332
+ end
@@ -292,7 +292,7 @@ INSPECT
292
292
  chunk = nth_chunk(n, records_string)
293
293
  parse_record(chunk).tap do |w|
294
294
  w.index = n
295
- w.percentile = (n-0.5)/size * 100
295
+ w.percentile = [(n-0.5)/size,0].max * 100
296
296
  end
297
297
  end
298
298
 
@@ -303,7 +303,10 @@ INSPECT
303
303
  max_index = size - 1
304
304
  index = 0
305
305
  while index < max_index
306
- yield parse_record(string, index)
306
+ word = parse_record(string, index)
307
+ word.index = index
308
+ word.percentile = [(index-0.5)/size,0].max * 100
309
+ yield word
307
310
  index += 1
308
311
  end
309
312
  end
@@ -332,7 +335,8 @@ INSPECT
332
335
  result = []
333
336
  found_indexes = []
334
337
  iterations = 0
335
- while (result.size < count && iterations < 1000)
338
+ max_iterations = [1000, 4 * count].max
339
+ while (result.size < count && iterations < max_iterations)
336
340
  len = random_in_range(options[:word_length])
337
341
  pct = random_in_range(options[:percentile])
338
342
  word_idx = @kdtree.nearest(len2coord(len), pct)
@@ -48,6 +48,15 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
48
48
  end
49
49
 
50
50
 
51
+ def count_by_options(options = {})
52
+ if options.empty?
53
+ count
54
+ else
55
+ count &filter_for_options(options)
56
+ end
57
+ end
58
+ memoize :count_by_options
59
+
51
60
  def sorted_entries
52
61
  entries.sort
53
62
  end
@@ -136,8 +145,12 @@ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpu
136
145
  end
137
146
  memoize :frequencies
138
147
 
139
- def entropy_per_word
140
- Math.log(count) / Math.log(2)
148
+ def entropy_per_word(options = {})
149
+ Math.log(count_by_options(options)) / Math.log(2)
150
+ end
151
+
152
+ def entropy_per_word_by_filter(&filter)
153
+ Math.log(filter ? count(&filter) : size) / Math.log(2)
141
154
  end
142
155
 
143
156
  # filtering
@@ -279,8 +292,8 @@ INSPECT
279
292
 
280
293
  filters.empty? ? nil : compose_filters(filters)
281
294
  end
282
- memoize :filter_for_options
283
-
295
+ # memoize :filter_for_options
296
+ public :filter_for_options
284
297
  end
285
298
 
286
299
  # Random.srand(SecureRandom.random_number)
@@ -28,9 +28,10 @@ require 'set'
28
28
  #
29
29
 
30
30
  class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
31
+ include CorrectHorseBatteryStaple::Backend::Isam
31
32
  include CorrectHorseBatteryStaple::Memoize
32
33
 
33
- INITIAL_PRELUDE_LENGTH = 512
34
+ INITIAL_PRELUDE_LENGTH = 4096
34
35
 
35
36
  def initialize(filename, stats = nil)
36
37
  super
@@ -39,122 +40,11 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
39
40
  parse_prelude
40
41
  end
41
42
 
42
- def precache(max = -1)
43
- return if max > -1 && file_size(@file) > max
44
- @file.seek 0
45
- @file = StringIO.new @file.read, "r"
46
- end
47
-
48
- def file_size(file)
49
- (file.respond_to?(:size) ? file.size : file.stat.size)
50
- end
51
-
52
- def prelude
53
- @prelude || parse_prelude
54
- end
55
-
56
- def parse_prelude
57
- @file.seek 0
58
- prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
59
-
60
- # byte offset of first record from beginning of file
61
- # total length of JSON string (without padding)
62
- (@record_offset, @prelude_len) = prelude_buf.unpack("NN")
63
-
64
- # read more if our initial read didn't slurp in the entire prelude
65
- if @prelude_len > prelude_buf.length
66
- prelude_buf += @file.read(@prelude_len - prelude_buf.length)
67
- end
68
-
69
- @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}
70
-
71
- # includes prefix length byte
72
- @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")
73
-
74
- # as network byte order int
75
- @frequency_length = @prelude["flen"] || 4
76
-
77
- # total length of record
78
- @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")
79
-
80
- load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]
81
-
82
- @prelude
83
- end
84
-
85
43
  # factory-ish constructor
86
44
  def self.read(filename)
87
45
  self.new filename
88
46
  end
89
47
 
90
-
91
- ## parsing
92
-
93
- #
94
- # Parse a record into an array of [word, frequency] IFF the word
95
- # fits into the length_range or length_range is nil
96
- #
97
- def parse_record_into_array(string, index, length_range = nil)
98
- chunk = nth_chunk(index, string)
99
- raise "No chunk for index #{index}" unless chunk
100
- actual_word_length = chunk.unpack("C")[0]
101
- if !length_range || length_range.include?(actual_word_length)
102
- # returns [word, frequency]
103
- chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
104
- else
105
- nil
106
- end
107
- end
108
-
109
- #
110
- # Parse a record into a Word object, which can be provided or will otherwise
111
- # be constructed as needed fourth arg is a length range which can act as a
112
- # filter; if not satisfied, nil will be returned
113
- #
114
- def parse_record(string, index=0,
115
- word=CorrectHorseBatteryStaple::Word.new(:word => ""),
116
- length_range = nil)
117
- bare = parse_record_into_array(string, index, length_range)
118
- return nil unless bare
119
- word.word = bare[0]
120
- word.frequency = bare[1]
121
- word
122
- end
123
-
124
- def word_length(chunk_string)
125
- chunk_string.unpack("C")
126
- end
127
-
128
- # return a string representing the nth_record
129
- def nth_chunk(n, string)
130
- string[@entry_length * n, @entry_length]
131
- end
132
-
133
- ## some core Enumerable building blocks
134
-
135
- def each(&block)
136
- string = records_string
137
- max_index = size - 1
138
- index = 0
139
- while index < max_index
140
- yield parse_record(string, index)
141
- index += 1
142
- end
143
- end
144
-
145
- def size
146
- @size ||= records_size / @entry_length
147
- end
148
-
149
-
150
- ## our Corpus Enumerablish abstract methods
151
-
152
- # we presume that the ISAM file has been sorted
153
- def sorted_entries
154
- @sorted_entries ||= entries
155
- end
156
-
157
-
158
48
  ## optimized pick - does NOT support :filter, though
159
49
  def pick(count, options = {})
160
50
  # incompat check
@@ -203,55 +93,4 @@ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpu
203
93
  result
204
94
  end
205
95
 
206
-
207
- ## file I/O
208
-
209
- def records_size
210
- @records_size ||= (file_size(@file) - @record_offset)
211
- end
212
-
213
- def file_string
214
- @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
215
- end
216
-
217
- def file_range_read(file_range = nil)
218
- file_range ||= 0...file_size(@file)
219
- pos = @file.tell
220
- @file.seek(file_range.first)
221
- @file.read(range_count(file_range))
222
- ensure
223
- @file.seek(pos)
224
- end
225
- memoize :file_range_read
226
-
227
- # returns a string representing the record-holding portion of the file
228
- def records_string
229
- @records_string ||=
230
- record_range_read(0 ... records_size)
231
- end
232
-
233
- def record_range_read(record_range = nil)
234
- record_range ||= 0...records_size
235
- file_range_read((record_range.first + @record_offset)...(record_range.first + range_count(record_range) + @record_offset))
236
- end
237
- # memoize :record_range_read
238
-
239
- def record_percentile_range_read(percentile_range)
240
- record_range = record_range_for_percentile(percentile_range)
241
- record_range_read(record_range)
242
- end
243
-
244
-
245
- ## rather than using a StatisticalArray, we do direct indexing into the file/string
246
- def percentile_index(percentile, round=true)
247
- r = percentile.to_f/100 * count + 0.5
248
- round ? r.round : r
249
- end
250
-
251
- def record_range_for_percentile(range)
252
- range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
253
- (percentile_index(range.begin, false).floor * @entry_length ...
254
- percentile_index(range.end, false).ceil * @entry_length)
255
- end
256
-
257
96
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: correct-horse-battery-staple
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.3
4
+ version: 0.6.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -50,11 +50,11 @@ cert_chain:
50
50
  -----END CERTIFICATE-----
51
51
 
52
52
  '
53
- date: 2012-01-11 00:00:00.000000000 Z
53
+ date: 2012-01-13 00:00:00.000000000 Z
54
54
  dependencies:
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: commander
57
- requirement: &70319122299080 !ruby/object:Gem::Requirement
57
+ requirement: &70214898076980 !ruby/object:Gem::Requirement
58
58
  none: false
59
59
  requirements:
60
60
  - - ! '>='
@@ -62,10 +62,10 @@ dependencies:
62
62
  version: '4.0'
63
63
  type: :runtime
64
64
  prerelease: false
65
- version_requirements: *70319122299080
65
+ version_requirements: *70214898076980
66
66
  - !ruby/object:Gem::Dependency
67
67
  name: fastercsv
68
- requirement: &70319122297940 !ruby/object:Gem::Requirement
68
+ requirement: &70214898075860 !ruby/object:Gem::Requirement
69
69
  none: false
70
70
  requirements:
71
71
  - - ! '>='
@@ -73,10 +73,10 @@ dependencies:
73
73
  version: 1.5.3
74
74
  type: :runtime
75
75
  prerelease: false
76
- version_requirements: *70319122297940
76
+ version_requirements: *70214898075860
77
77
  - !ruby/object:Gem::Dependency
78
78
  name: json
79
- requirement: &70319122297020 !ruby/object:Gem::Requirement
79
+ requirement: &70214898075000 !ruby/object:Gem::Requirement
80
80
  none: false
81
81
  requirements:
82
82
  - - ! '>='
@@ -84,10 +84,10 @@ dependencies:
84
84
  version: 1.6.0
85
85
  type: :runtime
86
86
  prerelease: false
87
- version_requirements: *70319122297020
87
+ version_requirements: *70214898075000
88
88
  - !ruby/object:Gem::Dependency
89
89
  name: redis
90
- requirement: &70319122296340 !ruby/object:Gem::Requirement
90
+ requirement: &70214898074260 !ruby/object:Gem::Requirement
91
91
  none: false
92
92
  requirements:
93
93
  - - ! '>='
@@ -95,10 +95,10 @@ dependencies:
95
95
  version: 2.2.2
96
96
  type: :runtime
97
97
  prerelease: false
98
- version_requirements: *70319122296340
98
+ version_requirements: *70214898074260
99
99
  - !ruby/object:Gem::Dependency
100
100
  name: hiredis
101
- requirement: &70319122295780 !ruby/object:Gem::Requirement
101
+ requirement: &70214898073780 !ruby/object:Gem::Requirement
102
102
  none: false
103
103
  requirements:
104
104
  - - ! '>='
@@ -106,10 +106,10 @@ dependencies:
106
106
  version: 0.4.0
107
107
  type: :runtime
108
108
  prerelease: false
109
- version_requirements: *70319122295780
109
+ version_requirements: *70214898073780
110
110
  - !ruby/object:Gem::Dependency
111
111
  name: tupalo-kdtree
112
- requirement: &70319122295280 !ruby/object:Gem::Requirement
112
+ requirement: &70214898073340 !ruby/object:Gem::Requirement
113
113
  none: false
114
114
  requirements:
115
115
  - - ! '>='
@@ -117,10 +117,10 @@ dependencies:
117
117
  version: 0.2.3
118
118
  type: :runtime
119
119
  prerelease: false
120
- version_requirements: *70319122295280
120
+ version_requirements: *70214898073340
121
121
  - !ruby/object:Gem::Dependency
122
122
  name: sqlite3
123
- requirement: &70319122294840 !ruby/object:Gem::Requirement
123
+ requirement: &70214898072840 !ruby/object:Gem::Requirement
124
124
  none: false
125
125
  requirements:
126
126
  - - ! '>='
@@ -128,10 +128,10 @@ dependencies:
128
128
  version: 1.3.0
129
129
  type: :runtime
130
130
  prerelease: false
131
- version_requirements: *70319122294840
131
+ version_requirements: *70214898072840
132
132
  - !ruby/object:Gem::Dependency
133
133
  name: rubyforge
134
- requirement: &70319122294400 !ruby/object:Gem::Requirement
134
+ requirement: &70214898072400 !ruby/object:Gem::Requirement
135
135
  none: false
136
136
  requirements:
137
137
  - - ! '>='
@@ -139,10 +139,10 @@ dependencies:
139
139
  version: 2.0.4
140
140
  type: :development
141
141
  prerelease: false
142
- version_requirements: *70319122294400
142
+ version_requirements: *70214898072400
143
143
  - !ruby/object:Gem::Dependency
144
144
  name: hoe
145
- requirement: &70319122293960 !ruby/object:Gem::Requirement
145
+ requirement: &70214898071960 !ruby/object:Gem::Requirement
146
146
  none: false
147
147
  requirements:
148
148
  - - ~>
@@ -150,7 +150,7 @@ dependencies:
150
150
  version: '2.12'
151
151
  type: :development
152
152
  prerelease: false
153
- version_requirements: *70319122293960
153
+ version_requirements: *70214898071960
154
154
  description: ! "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies
155
155
  in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely
156
156
  alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size:
@@ -187,6 +187,7 @@ files:
187
187
  - lib/correct_horse_battery_staple.rb
188
188
  - lib/correct_horse_battery_staple/assembler.rb
189
189
  - lib/correct_horse_battery_staple/backend.rb
190
+ - lib/correct_horse_battery_staple/backend/isam.rb
190
191
  - lib/correct_horse_battery_staple/backend/isam_kd.rb
191
192
  - lib/correct_horse_battery_staple/backend/redis.rb
192
193
  - lib/correct_horse_battery_staple/backend/redis/d_range.rb
metadata.gz.sig CHANGED
Binary file