correct-horse-battery-staple 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data.tar.gz.sig +1 -1
  2. data/.gemtest +0 -0
  3. data/Gemfile +53 -0
  4. data/Gemfile.lock +109 -0
  5. data/History.txt +6 -0
  6. data/Manifest.txt +57 -0
  7. data/README.txt +115 -0
  8. data/Rakefile +47 -0
  9. data/bin/chbs +234 -0
  10. data/bin/chbs-mkpass +16 -0
  11. data/correct-horse-battery-staple.gemspec +59 -0
  12. data/lib/correct_horse_battery_staple.rb +117 -0
  13. data/lib/correct_horse_battery_staple/assembler.rb +45 -0
  14. data/lib/correct_horse_battery_staple/backend.rb +6 -0
  15. data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
  16. data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
  17. data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
  18. data/lib/correct_horse_battery_staple/corpus.rb +33 -0
  19. data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
  20. data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
  21. data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
  22. data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
  23. data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
  24. data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
  25. data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
  26. data/lib/correct_horse_battery_staple/generator.rb +40 -0
  27. data/lib/correct_horse_battery_staple/memoize.rb +25 -0
  28. data/lib/correct_horse_battery_staple/parser.rb +5 -0
  29. data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
  30. data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
  31. data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
  32. data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
  33. data/lib/correct_horse_battery_staple/stats.rb +22 -0
  34. data/lib/correct_horse_battery_staple/word.rb +90 -0
  35. data/lib/correct_horse_battery_staple/writer.rb +29 -0
  36. data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
  37. data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
  38. data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
  39. data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
  40. data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
  41. data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
  42. data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
  43. data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
  44. data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
  45. data/script/generate_all +34 -0
  46. data/script/load_redis +17 -0
  47. data/script/perftest +74 -0
  48. data/spec/corpus/serialized_spec.rb +62 -0
  49. data/spec/corpus_spec.rb +50 -0
  50. data/spec/correct_horse_battery_staple_spec.rb +73 -0
  51. data/spec/fixtures/100.json +101 -0
  52. data/spec/fixtures/corpus1.csv +101 -0
  53. data/spec/fixtures/corpus100.json +101 -0
  54. data/spec/fixtures/wiktionary1000.htm +648 -0
  55. data/spec/range_parser_spec.rb +54 -0
  56. data/spec/spec_helper.rb +20 -0
  57. data/spec/statistical_array_spec.rb +52 -0
  58. data/spec/support/spec_pry.rb +1 -0
  59. data/spec/word_spec.rb +95 -0
  60. metadata +264 -0
  61. metadata.gz.sig +1 -0
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
#
# chbs-mkpass: print one generated passphrase to stdout.
#
# Usage: chbs-mkpass [CORPUS] [WORD_COUNT] [FORMAT]
#   CORPUS     - corpus name passed to load_corpus (default "tvscripts")
#   WORD_COUNT - number of words to generate (default 4)
#   FORMAT     - corpus format; falls back to ENV['corpus_format'], then "isam"

require 'correct_horse_battery_staple'

corpus_name     = ARGV[0] || "tvscripts"
requested_words = ARGV[1] || 4
corpus_format   = ARGV[2] || ENV['corpus_format'] || "isam"

corpus = CorrectHorseBatteryStaple.load_corpus(corpus_name, corpus_format)

# Fixed selection criteria for this helper script.
allowed_lengths     = 3..9
allowed_percentiles = 30..80

passphrase_source = CorrectHorseBatteryStaple::Generator.new(corpus)

puts passphrase_source.make(requested_words.to_i,
                            :word_length => allowed_lengths,
                            :percentile  => allowed_percentiles)
@@ -0,0 +1,59 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "correct-horse-battery-staple"
5
+ s.version = "0.6.1.20120109223855"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["Robert Sanders"]
9
+ s.cert_chain = ["/Users/robertsanders/.gem/gem-public_cert.pem"]
10
+ s.date = "2012-01-10"
11
+ s.description = "Generate a 4 word password from words of size 3-8 characters, with\nfrequencies in the 30th-60th percentile. This range gives a nice set\nof uncommon but not completely alien words.\n\n $ chbs generate --verbose -W 3..8 -P 30..60\n Corpus size: 6396 candidate words of 33075 total\n Entropy: 48 bits (2^48 = 281474976710656)\n Years to guess at 1000 guesses/sec: 8926\n magnate-thermal-sandbank-augur\n\nWith the --verbose flag, the utility will calculate a time-to-guess\nbased on a completely arbitrary 1000 guesses/sec. If you'd like a\nmore secure password, either relax the various filtering rules (-W and\n-P), add more words to the password, or use a larger corpus.\n\nBy default we use the American TV Shows & Scripts corpus taken from\nWiktionary.\n\nOthers provided:\n\n* Project Gutenberg 2005 corpus taken from Wiktionary.\n* 1 of every 7 of the top 60000 lemmas from wordfrequency.info (6900\n actual lemmas after processing)\n\nSee http://xkcd.com/936/ for the genesis of the idea.\n\nData sources:\n\n http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists\n http://wordfrequency.info/"
12
+ s.email = ["robert@curioussquid.com"]
13
+ s.executables = ["chbs", "chbs-mkpass"]
14
+ s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
15
+ s.files = ["Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "README.txt", "Rakefile", "bin/chbs", "bin/chbs-mkpass", "lib/correct_horse_battery_staple.rb", "lib/correct_horse_battery_staple/assembler.rb", "lib/correct_horse_battery_staple/backend.rb", "lib/correct_horse_battery_staple/backend/isam_kd.rb", "lib/correct_horse_battery_staple/backend/redis.rb", "lib/correct_horse_battery_staple/backend/redis/d_range.rb", "lib/correct_horse_battery_staple/corpus.rb", "lib/correct_horse_battery_staple/corpus/base.rb", "lib/correct_horse_battery_staple/corpus/isam.rb", "lib/correct_horse_battery_staple/corpus/isam_kd.rb", "lib/correct_horse_battery_staple/corpus/redis.rb", "lib/correct_horse_battery_staple/corpus/redis2.rb", "lib/correct_horse_battery_staple/corpus/serialized.rb", "lib/correct_horse_battery_staple/corpus/sqlite.rb", "lib/correct_horse_battery_staple/generator.rb", "lib/correct_horse_battery_staple/memoize.rb", "lib/correct_horse_battery_staple/parser.rb", "lib/correct_horse_battery_staple/parser/base.rb", "lib/correct_horse_battery_staple/parser/regex.rb", "lib/correct_horse_battery_staple/range_parser.rb", "lib/correct_horse_battery_staple/statistical_array.rb", "lib/correct_horse_battery_staple/stats.rb", "lib/correct_horse_battery_staple/word.rb", "lib/correct_horse_battery_staple/writer.rb", "lib/correct_horse_battery_staple/writer/base.rb", "lib/correct_horse_battery_staple/writer/csv.rb", "lib/correct_horse_battery_staple/writer/file.rb", "lib/correct_horse_battery_staple/writer/isam.rb", "lib/correct_horse_battery_staple/writer/isam_kd.rb", "lib/correct_horse_battery_staple/writer/json.rb", "lib/correct_horse_battery_staple/writer/marshal.rb", "lib/correct_horse_battery_staple/writer/redis.rb", "lib/correct_horse_battery_staple/writer/sqlite.rb", "script/generate_all", "script/load_redis", "script/perftest", "spec/corpus/serialized_spec.rb", "spec/corpus_spec.rb", "spec/correct_horse_battery_staple_spec.rb", "spec/fixtures/100.json", 
"spec/fixtures/corpus1.csv", "spec/fixtures/corpus100.json", "spec/fixtures/wiktionary1000.htm", "spec/range_parser_spec.rb", "spec/spec_helper.rb", "spec/statistical_array_spec.rb", "spec/support/spec_pry.rb", "spec/word_spec.rb", "correct-horse-battery-staple.gemspec", ".gemtest"]
16
+ s.homepage = "http://github.com/rsanders/correct-horse-battery-staple"
17
+ s.rdoc_options = ["--main", "README.txt"]
18
+ s.require_paths = ["lib"]
19
+ s.rubyforge_project = "correct-horse-battery-staple"
20
+ s.rubygems_version = "1.8.10"
21
+ s.signing_key = "/Users/robertsanders/.gem/gem-private_key.pem"
22
+ s.summary = "Generate a 4 word password from words of size 3-8 characters, with frequencies in the 30th-60th percentile"
23
+
24
+ if s.respond_to? :specification_version then
25
+ s.specification_version = 3
26
+
27
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
28
+ s.add_runtime_dependency(%q<commander>, [">= 4.0"])
29
+ s.add_runtime_dependency(%q<fastercsv>, [">= 1.5.3"])
30
+ s.add_runtime_dependency(%q<json>, [">= 1.6.0"])
31
+ s.add_runtime_dependency(%q<redis>, [">= 2.2.2"])
32
+ s.add_runtime_dependency(%q<hiredis>, [">= 0.4.0"])
33
+ s.add_runtime_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
34
+ s.add_runtime_dependency(%q<sqlite3>, [">= 1.3.0"])
35
+ s.add_development_dependency(%q<rubyforge>, [">= 2.0.4"])
36
+ s.add_development_dependency(%q<hoe>, ["~> 2.12"])
37
+ else
38
+ s.add_dependency(%q<commander>, [">= 4.0"])
39
+ s.add_dependency(%q<fastercsv>, [">= 1.5.3"])
40
+ s.add_dependency(%q<json>, [">= 1.6.0"])
41
+ s.add_dependency(%q<redis>, [">= 2.2.2"])
42
+ s.add_dependency(%q<hiredis>, [">= 0.4.0"])
43
+ s.add_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
44
+ s.add_dependency(%q<sqlite3>, [">= 1.3.0"])
45
+ s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
46
+ s.add_dependency(%q<hoe>, ["~> 2.12"])
47
+ end
48
+ else
49
+ s.add_dependency(%q<commander>, [">= 4.0"])
50
+ s.add_dependency(%q<fastercsv>, [">= 1.5.3"])
51
+ s.add_dependency(%q<json>, [">= 1.6.0"])
52
+ s.add_dependency(%q<redis>, [">= 2.2.2"])
53
+ s.add_dependency(%q<hiredis>, [">= 0.4.0"])
54
+ s.add_dependency(%q<tupalo-kdtree>, [">= 0.2.3"])
55
+ s.add_dependency(%q<sqlite3>, [">= 1.3.0"])
56
+ s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
57
+ s.add_dependency(%q<hoe>, ["~> 2.12"])
58
+ end
59
+ end
@@ -0,0 +1,117 @@
1
+ require 'logger'
2
+
3
# Top-level namespace for the passphrase generator: version/format constants,
# corpus discovery and loading helpers, shared mixins (Common, Util) and the
# autoload table for the rest of the library.
module CorrectHorseBatteryStaple
  # Backs Common#random_number. Required here so the mixin works regardless of
  # what the host program has already loaded.
  require 'securerandom'

  VERSION = '0.6.1'

  DEFAULT_CORPUS_NAME = "tvscripts"

  # File extensions recognised as corpus files, in lookup priority order
  # (find_corpus tries them first to last).
  SUPPORTED_FORMATS = %w[isam isamkd sqlite json csv marshal]

  class << self
    attr_accessor :logger
    attr_accessor :corpus_directory
  end
  self.logger = Logger.new(STDERR)

  # Loads the corpus named by DEFAULT_CORPUS_NAME.
  def self.default_corpus
    self.load_corpus DEFAULT_CORPUS_NAME
  end

  # Directories searched for corpus files (currently just corpus_directory).
  def self.corpus_search_directories
    [self.corpus_directory]
  end

  # Lists corpora found in the search directories.
  #
  # options[:with_paths] - when truthy return absolute file paths, otherwise
  #                        base names without extension.
  #
  # Returns a sorted Array of unique Strings.
  def self.corpus_list(options = {})
    self.corpus_search_directories.map do |dir|
      Dir[File.join(dir, "*.{#{SUPPORTED_FORMATS.join(',')}}")].
        map {|file| options[:with_paths] ? File.expand_path(file) : File.basename(file, File.extname(file)) }
    end.flatten.sort.uniq
  end

  # Finds the first existing "<corpus_name>.<format>" file, trying formats in
  # the order given and, for each format, every search directory.
  #
  # Returns the path String, or nil when nothing matches.
  def self.find_corpus(corpus_name, formats = SUPPORTED_FORMATS)
    formats.each do |fmt|
      fname = "#{corpus_name}.#{fmt}"
      self.corpus_search_directories.each do |dir|
        path = File.join(dir, fname)
        return path if File.exist?(path)
      end
    end
    nil
  end

  # Loads a corpus by name, path or URL-like spec.
  #
  # corpus_name - a name containing ':' is handed straight to Corpus.read;
  #               a name containing '.' or '?' is treated as a file path;
  #               anything else is resolved through find_corpus.
  # formats     - a format String or Array of them; nil means SUPPORTED_FORMATS.
  #
  # Raises ArgumentError when no matching file exists.
  def self.load_corpus(corpus_name, formats = nil)
    if corpus_name.include?(':')
      return CorrectHorseBatteryStaple::Corpus.read corpus_name
    end

    formats = Array(formats || SUPPORTED_FORMATS)
    filename = corpus_name.match(/[.?]/) ? corpus_name :
      self.find_corpus(corpus_name, formats)
    unless (filename && File.exist?(filename))
      raise ArgumentError, "Cannot find corpus #{corpus_name}"
    end

    CorrectHorseBatteryStaple::Corpus.read filename
  end

  # Generates a passphrase of +length+ words from the default corpus.
  def self.generate(length = 4)
    CorrectHorseBatteryStaple::Generator.new(self.default_corpus).make(length)
  end

  # Helpers mixed into corpus/generator classes.
  module Common
    def logger
      CorrectHorseBatteryStaple.logger
    end

    # Returns a random number below +max+ (Integer for an Integer max, Float
    # for a Float max). Always drawn from SecureRandom: these numbers select
    # password words, so a predictable PRNG would weaken every generated
    # passphrase.
    def random_number(max=1.0)
      SecureRandom.random_number(max)
    end

    # Picks a random value starting at range.first.
    # NOTE(review): range_count is not defined in this module; it is assumed
    # to be supplied by the including class - confirm.
    def random_in_range(range)
      range.first + random_number(range_count(range))
    end

    # Draws +count+ elements (with replacement) from +array+, which must
    # respond to #length and #values_at. An empty collection yields [].
    def array_sample(array, count)
      l = array.length
      return [] if l == 0
      array.values_at(* count.times.map { random_number(l) })
    end

    # Same contract as array_sample; kept as a separate entry point for callers.
    def set_sample(array, count)
      array_sample(array, count)
    end
  end

  # File helpers for binary corpus formats.
  module Util
    # Opens +filename+ in binary mode. A block, when given, is forwarded so
    # the file is closed automatically and the block's value is returned.
    # File.open is used rather than Kernel#open so a name beginning with "|"
    # can never be run as a command.
    def self.open_binary(filename, mode = "r", *rest, &block)
      File.open(filename, openmode(mode), *rest, &block)
    end

    # Converts a plain access mode ("r", "w", ...) into its binary,
    # ASCII-8BIT form on Rubies that have IO.binwrite. Modes that already
    # carry a "b" flag are not given a second one, and modes with an explicit
    # encoding or a non-String mode are returned untouched.
    def self.openmode(mode)
      return mode unless mode.is_a?(String) && IO.respond_to?(:binwrite)
      return mode if mode.include?(":")
      mode.include?("b") ? "#{mode}:ASCII-8BIT" : "#{mode}b:ASCII-8BIT"
    end
  end

  autoload :Word, 'correct_horse_battery_staple/word'
  autoload :Stats, 'correct_horse_battery_staple/stats'
  autoload :Generator, 'correct_horse_battery_staple/generator'
  autoload :Corpus, 'correct_horse_battery_staple/corpus'
  autoload :Parser, 'correct_horse_battery_staple/parser'
  autoload :StatisticalArray, 'correct_horse_battery_staple/statistical_array'
  autoload :RangeParser, 'correct_horse_battery_staple/range_parser'
  autoload :Writer, 'correct_horse_battery_staple/writer'
  autoload :Backend, 'correct_horse_battery_staple/backend'
  autoload :Memoize, 'correct_horse_battery_staple/memoize'

  self.corpus_directory = File.join(File.dirname(__FILE__), "../corpus")
end
117
+
@@ -0,0 +1,45 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+
4
# Builds a word list from one or more frequency-list sources and turns it into
# a serialized corpus.
class CorrectHorseBatteryStaple::Assembler
  include CorrectHorseBatteryStaple::Common

  attr_accessor :words

  # Words are kept only when their first character is one of these.
  VALID_INITIAL_CHARS = ([*'a'..'z']).map {|ls| ls[0]}

  # parser - object responding to #parse; defaults to the :wiktionary regex
  #          parser.
  def initialize(parser = nil)
    @parser = (parser || CorrectHorseBatteryStaple::Parser::Regex.new(:wiktionary))
  end

  # Parses every source in +urls+, keeps words with a valid initial character,
  # removes duplicates (the last occurrence of a word wins) and stores the
  # sorted result in #words. An empty +urls+ list produces an empty word list.
  #
  # Returns self so calls can be chained.
  def read(urls)
    # NOTE(review): open(url) is Kernel#open; fetching real URLs presumably
    # relies on open-uri being loaded elsewhere - confirm, and never pass
    # untrusted strings here.
    parsed = urls.map { |url| @parser.parse open(url) }

    # reduce(:+) on an empty list yields nil, so supply the empty case.
    combined = parsed.empty? ? [] : parsed.reduce(:+)

    # a round-trip through a Hash weeds out dupes
    unique = {}
    combined.each do |wstruct|
      next unless VALID_INITIAL_CHARS.include?(wstruct.word[0])
      unique[wstruct.word] = wstruct
    end

    self.words = unique.values.sort
    self
  end

  # Shuffles #words in place. Returns self.
  def randomize
    self.words.shuffle!
    self
  end

  # Truncates #words to at most +count+ entries. Returns self.
  def limit(count)
    self.words.slice!(count..-1) if self.words.length > count
    self
  end

  # Lazily builds (and memoizes) a Serialized corpus from #words, with its
  # statistics recalculated.
  def corpus
    @corpus ||= CorrectHorseBatteryStaple::Corpus::Serialized.new(self.words).tap do |corpus|
      corpus.recalculate
    end
  end

end
@@ -0,0 +1,6 @@
1
# Namespace for storage backends; each one is loaded lazily on first reference.
class CorrectHorseBatteryStaple::Backend
  # NOTE(review): the gem's file list includes backend/isam_kd.rb and
  # backend/redis.rb but no backend/isam.rb or backend/sqlite.rb, so the Isam
  # and Sqlite entries presumably fail to load when referenced - confirm.
  {
    :Isam   => "correct_horse_battery_staple/backend/isam",
    :IsamKD => "correct_horse_battery_staple/backend/isam_kd",
    :Sqlite => "correct_horse_battery_staple/backend/sqlite",
    :Redis  => "correct_horse_battery_staple/backend/redis"
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
@@ -0,0 +1,410 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+ require 'set'
4
+ require 'kdtree'
5
+
6
+ module CorrectHorseBatteryStaple::Backend::IsamKD
7
+ INITIAL_PRELUDE_LENGTH = 4096
8
+
9
+ F_PRELUDE_AT_END = 1
10
+
11
# Mixin hook: extends the including class with ClassMethods and mixes
# InstanceMethods into it.
def self.included(base)
  base.extend(ClassMethods)
  base.send(:include, InstanceMethods)
end
15
+
16
# Class-level extension point applied by the included hook; it currently
# defines no methods.
module ClassMethods
end
18
+
19
+ module InstanceMethods
20
+ #
21
+ #
22
+ #
23
# Seeds the backend's tunables: the factor applied to word lengths when
# building K-D tree coordinates, and the page size used for alignment.
def initialize_backend_variables
  @length_scaling_factor = 15
  @page_size             = 4096
end
27
+
28
+
29
+
30
# Replaces every NaN value in +stats+ with -1, in place, so the hash can be
# serialized. Values that do not respond to #nan? are left alone.
#
# Returns the same +stats+ object.
def fix_stats(stats)
  stats.each do |name, value|
    next unless value.respond_to?(:nan?) && value.nan?
    stats[name] = -1
  end
  stats
end
38
+
39
# Alignment unit in bytes; falls back to 4096 when none has been configured.
def page_size
  return @page_size if @page_size
  4096
end
42
+
43
+ # many MMUs in default mode and modern highcap drives have 4k pages/blocks
44
# Rounds +val+ up to a whole number of blocks and returns the size in bytes.
# The result is never smaller than one block, even for zero or negative +val+.
def round_up(val, blocksize=page_size)
  blocks = (val.to_f / blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end
47
+
48
# Serializes +corpus+ to +io+ in the ISAM-KD layout: an 8-byte header
# (record offset, prelude length), a space-padded JSON prelude, fixed-size
# records, zero padding up to the next page boundary, then the K-D tree.
#
# corpus - enumerable of words responding to #word and #frequency; must also
#          respond to #stats, #length and #entries.
# io     - seekable output stream (it is rewound first).
def write_corpus_to_io(corpus, io=STDOUT)
  io.rewind

  # Field widths are measured in BYTES: pack("a") and the length prefix both
  # count bytes, so a character count would truncate multibyte words.
  # The width includes the one-byte length prefix.
  @word_length = corpus.reduce(0) { |widest, e| widest > e.word.bytesize ? widest : e.word.bytesize } + 1
  @freq_length = 4
  @entry_length = @word_length + @freq_length

  stats = fix_stats(corpus.stats)
  corpus_word_count = corpus.length

  # The ten-digit strings are placeholders; they reserve room so the prelude
  # length measured below is an upper bound once real offsets are filled in.
  prelude = {
    "wlen" => @word_length,
    "flen" => 4,
    "entrylen" => @word_length + @freq_length,
    "sort" => "frequency",
    "n" => corpus_word_count,
    "stats" => stats,
    "flags" => 0,
    "length_scaling_factor" => (@length_scaling_factor || 15),
    "records_length" => "0000000000",
    "offset_records" => "0000000000",
    "offset_index1" => "0000000000",
    "offset_index2" => "0000000000"
  }

  prelude_json_length = prelude.to_json.length
  prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)

  prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
  offset_index1 = prelude["offset_records"] +
    round_up(records_length, page_size)

  prelude["offset_index1"] = offset_index1

  # "A" pads the JSON with spaces up to the start of the records section.
  io.write([offset_records, prelude_json_length, prelude.to_json].
           pack("NNA#{offset_records-8}"))

  corpus.each do |w|
    io.write([w.word.bytesize, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
  end

  pad(offset_index1 - (offset_records + records_length), io)
  write_kdtree(corpus, io)
end
93
+
94
# Writes +size+ NUL bytes to +io+.
def pad(size, io)
  filler = [].pack("x#{size}")
  io.write(filler)
end
97
+
98
# Builds a K-D tree over (scaled word length, percentile) and persists it to
# +io+. Each point carries the word's position in corpus.entries as its id.
def write_kdtree(corpus, io)
  points = corpus.entries.each_with_index.map do |w, position|
    [len2coord(w.word.length.to_f), w.percentile.to_f, position]
  end

  KDTree.new(points).persist(io)
end
111
+
112
+ # make the search space more square by increasing the length of
113
+ # the "word length" axis
114
# Scales a word length onto the K-D tree's "length" axis so the search space
# is closer to square. When no factor has been configured this falls back to
# 15, the same default the writer records in the prelude's
# "length_scaling_factor" - the two must agree or lookups against a freshly
# written file would use different coordinates than the tree was built with.
def len2coord(len)
  len * (@length_scaling_factor || 15)
end
117
+
118
# Forwards +args+ to io.binwrite when the stream has it, otherwise to io.write.
# NOTE(review): +io+ is not a parameter or local here; it is presumably a
# method provided by the including class - confirm.
def binwrite(*args)
  if io.respond_to?(:binwrite)
    io.send(:binwrite, *args)
  else
    io.send(:write, *args)
  end
end
122
+
123
# Mode string for opening an output file: binary with ASCII-8BIT encoding on
# Rubies that have IO.binwrite, plain "w" otherwise.
def openmode
  if IO.respond_to?(:binwrite)
    "wb:ASCII-8BIT"
  else
    "w"
  end
end
126
+
127
+
128
+ #
129
+ #
130
+ # Format of header:
131
+ #
132
+ # 0..3 - OB - offset of body start in bytes; network byte order
133
+ # 4..7 - LP - length of prelude in network byte order
134
+ # 8..OB-1 - P - JSON-encoded prelude hash and space padding
135
+ # OB..EOF - array of fixed size records as described in prelude
136
+ #
137
+ # Contents of Prelude (after JSON decoding):
138
+ #
139
+ # P["wlen"] - length of word part of record
140
+ # P["flen"] - length of frequency part of record (always 4 bytes)
141
+ # P["entrylen"] - length of total part of record
142
+ # P["n"] - number of records
143
+ # P["sort"] - field name sorted by (word or frequency)
144
+ # P["stats"] - corpus statistics
145
+ # P["offset_index1"] - absolute file offset of KDTree index
146
+ # P["records_length"] - length in bytes of records section, excluding padding
147
+ # P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)
148
+ #
149
+ # Format of record:
150
+ #
151
+ # 1 byte - LW - actual length of word within field
152
+ # P["wlen"]-1 bytes - LW bytes of word (W) + P["wlen"]-1-LW bytes of NUL padding
153
+ # P["flen"] (4) bytes - frequency as network byte order long
154
+ #
155
+ # After record section, there is padding up to the next page_size boundary,
156
+ # and then there is a dumped KDTree which extends to EOF.
157
+ #
158
+ #
159
+
160
# Slurps the whole file into an in-memory StringIO and uses that from now on.
#
# max - when non-negative, files larger than this many bytes are left alone.
def precache(max = -1)
  too_large = max > -1 && file_size(@file) > max
  return if too_large

  @file.seek(0)
  contents = @file.read
  @file = StringIO.new(contents, "r")
end
165
+
166
# Size of +file+ in bytes: uses #size when the object has it, otherwise
# stat.size.
def file_size(file)
  if file.respond_to?(:size)
    file.size
  else
    file.stat.size
  end
end
169
+
170
# The decoded prelude hash, parsed from the file on first use.
def prelude
  return @prelude if @prelude
  parse_prelude
end
173
+
174
# Reads and decodes the file header, caching the layout values it describes.
#
# Sets @record_offset, @prelude_len, @prelude, @word_length,
# @frequency_length, @entry_length, @offset_index1, @records_length,
# @entry_count and @length_scaling_factor, and loads corpus statistics when
# the prelude carries them.
#
# Returns the prelude Hash.
# Raises ArgumentError when the file is too short for a header or a required
# prelude field is missing.
def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
  if prelude_buf.nil? || prelude_buf.length < 8
    raise ArgumentError, "File is too short to contain a header!"
  end

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len) = prelude_buf.unpack("NN")

  # The JSON starts after the 8 header bytes, so the whole prelude occupies
  # 8 + @prelude_len bytes; read more if the initial read fell short of that.
  needed = 8 + @prelude_len
  if needed > prelude_buf.length
    prelude_buf += @file.read(needed - prelude_buf.length).to_s
  end

  @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}

  # includes prefix length byte
  @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"] || 4

  # total length of record
  @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  @offset_index1 = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!")

  @records_length = @prelude["records_length"] || raise(ArgumentError, "No records length!")

  @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!")

  # files without the key fall back to a factor of 10
  @length_scaling_factor = @prelude["length_scaling_factor"] || 10

  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end
210
+
211
+ #
212
+ # Show some information about the file layout: sizes, offsets and prelude fields
213
+ #
214
# Default inspect output followed by a summary of the file: its size, field
# widths, section offsets, the first four bytes of the K-D tree section, and
# every prelude entry except "stats".
def inspect
  total_bytes   = file_size(@file)
  signature     = file_range_read(@offset_index1..(@offset_index1+3))
  prelude_lines = @prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("")

  super + "\n" + <<INSPECT
File size: #{total_bytes}
Word length: #{@word_length}
Frequency bytes: #{@frequency_length}
Total record bytes: #{@records_length}
Offset of K-D Tree index: #{@offset_index1}
Total K-D Tree index bytes: #{total_bytes - @offset_index1}
K-D Tree Signature: #{signature}

Prelude:
#{prelude_lines}
INSPECT
end
228
+
229
# Positions the file at the start of the index section and builds a KDTree
# from the stream.
def load_kdtree
  @file.seek(@offset_index1)
  KDTree.new(@file)
end
233
+
234
+
235
+ ## parsing
236
+
237
+ #
238
+ # Parse a record into an array of [word, frequency] IFF the word
239
+ # fits into the length_range or length_range is nil
240
+ #
241
# Decodes record number +index+ of +string+ into [word, frequency].
#
# length_range - optional filter; when given and the stored word length is
#                outside it, nil is returned instead.
#
# Raises RuntimeError when there is no record at +index+.
def parse_record_into_array(string, index, length_range = nil)
  record = nth_chunk(index, string)
  raise "No chunk for index #{index}" unless record

  stored_length = record.unpack("C")[0]
  return nil if length_range && !length_range.include?(stored_length)

  # skip the length byte, take the word, jump to the frequency field
  record.unpack("xa#{stored_length}@#{@word_length}N")
end
252
+
253
+ #
254
+ # Parse a record into a Word object, which can be provided or will otherwise
255
+ # be constructed as needed fourth arg is a length range which can act as a
256
+ # filter; if not satisfied, nil will be returned
257
+ #
258
# Decodes record +index+ of +string+ into a Word.
#
# word         - object to populate; a fresh Word is built when omitted.
# length_range - optional word-length filter; nil is returned when the record
#                does not satisfy it.
def parse_record(string, index=0,
                 word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  fields = parse_record_into_array(string, index, length_range)
  return nil unless fields

  word.word, word.frequency = fields[0], fields[1]
  word
end
267
+
268
# Returns the stored word length (the record's leading byte) as an Integer.
# String#unpack always returns an Array, so its single element is extracted.
def word_length(chunk_string)
  chunk_string.unpack("C").first
end
271
+
272
+ # return a string representing the nth_record
273
# Returns the raw bytes of the n-th fixed-size record in +string+
# (nil when the start offset lies beyond the end of the string).
def nth_chunk(n, string)
  start = @entry_length * n
  string[start, @entry_length]
end
276
+
277
# Absolute file offset of the n-th record.
def pos_of_nth_word_in_file(n)
  @record_offset + (n * @entry_length)
end
280
+
281
+ #
282
+ # this version is much slower than the other - 1.5x total runtime
283
+ # slower in some cases.
284
+ #
285
+ # def get_word_by_idx_direct(n)
286
+ # @file.seek(pos_of_nth_word_in_file(n))
287
+ # chunk = @file.read(@entry_length)
288
+ # parse_record(chunk)
289
+ # end
290
+
291
# Builds the Word stored at record index +n+, filling in its index and a
# percentile derived from its position among all records.
def get_word_by_idx(n)
  record = nth_chunk(n, records_string)
  word = parse_record(record)
  word.index = n
  word.percentile = (n-0.5)/size * 100
  word
end
298
+
299
+ ## some core Enumerable building blocks
300
+
301
# Yields every record in file order as a parsed Word - including the last
# one: indexes run from 0 through size - 1.
#
# Returns an Enumerator when called without a block, nil otherwise.
def each(&block)
  return enum_for(:each) unless block_given?

  string = records_string
  size.times do |index|
    yield parse_record(string, index)
  end
  nil
end
310
+
311
# Number of records; derived from the records section length when the
# prelude did not supply it.
def size
  @entry_count ||= records_size / @entry_length
end

# Alias-style wrapper: same value as #size.
def count
  size
end
315
+
316
+
317
+ ## our Corpus Enumerablish abstract methods
318
+
319
+ # we presume that the ISAM file has been sorted
320
# Entries in stored order, memoized; the file is presumed already sorted.
def sorted_entries
  return @sorted_entries if @sorted_entries
  @sorted_entries = entries
end
323
+
324
+
325
+ ## optimized pick - does NOT support :filter, though
326
# Picks +count+ distinct words via nearest-neighbour lookups in the K-D tree.
#
# options[:percentile]  - Range of acceptable percentiles (default 0..100)
# options[:word_length] - Range of acceptable word lengths (default 0..20)
# options[:filter]      - not supported
#
# Gives up after 1000 lookups.
#
# Raises NotImplementedError when :filter is given, RuntimeError when fewer
# than +count+ qualifying words were found.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

  criteria = {:percentile => 0..100,
              :word_length => 0..20}.merge(options)
  length_range     = criteria[:word_length]
  percentile_range = criteria[:percentile]

  picked   = []
  seen     = []
  attempts = 0
  until picked.size >= count || attempts >= 1000
    attempts += 1

    target_len = random_in_range(length_range)
    target_pct = random_in_range(percentile_range)
    candidate_idx = @kdtree.nearest(len2coord(target_len), target_pct)

    # each record is considered at most once
    next if seen.include?(candidate_idx)
    seen << candidate_idx

    candidate = get_word_by_idx(candidate_idx)
    if length_range.include?(candidate.word.length)
      picked << candidate
    else
      STDERR.puts "non-qualifying word: #{candidate.word.length}"
    end
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if picked.length < count

  picked
end
357
+
358
+
359
+
360
+ ## file I/O
361
+
362
# Length in bytes of the records section, as read from the prelude.
def records_size
  @records_length
end
365
+
366
# Entire file contents as a String: the backing string when the file is an
# in-memory StringIO, otherwise a full read of the file.
def file_string
  if @file.is_a?(StringIO)
    @file.string
  else
    file_range_read(nil)
  end
end
369
+
370
# Reads the bytes covered by +file_range+ (the whole file when nil) and puts
# the file position back where it was before the call.
def file_range_read(file_range = nil)
  file_range ||= 0...file_size(@file)
  pos = @file.tell

  @file.seek(file_range.first)
  byte_count = range_count(file_range)
  @file.read(byte_count)
ensure
  # restore the caller's position whether or not the read succeeded
  @file.seek(pos)
end
378
+ # memoize :file_range_read
379
+
380
+ # returns a string representing the record-holding portion of the file
381
# The record-holding portion of the file as one String, read once and cached.
def records_string
  return @records_string if @records_string
  @records_string = record_range_read(0 ... records_size)
end
385
+
386
# Reads a byte range expressed relative to the start of the records section
# (the whole section when nil) by translating it to absolute file offsets.
#
# The end of the file range is start + length; using the length alone would
# only be right for ranges that begin at 0.
# NOTE(review): assumes range_count returns the number of positions covered
# by the range - confirm against its definition.
def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  file_start = record_range.first + @record_offset
  file_range_read(file_start...(file_start + range_count(record_range)))
end
390
+ # memoize :record_range_read
391
+
392
# Reads the raw record bytes covering the given percentile range.
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
396
+
397
+
398
+ ## rather than using a StatisticalArray, we do direct indexing into the file/string
399
# Maps a percentile (0-100) to a record position: percentile/100 * count + 0.5.
#
# round - when truthy the position is rounded to an Integer, otherwise the
#         fractional value is returned.
def percentile_index(percentile, round=true)
  position = percentile.to_f/100 * count + 0.5
  return position unless round
  position.round
end
403
+
404
# Converts a percentile range into a byte range within the records section.
# A single Numeric percentile is widened to +/- 0.5 first. The start is
# rounded down and the end rounded up to whole records.
def record_range_for_percentile(range)
  range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)

  first_byte = percentile_index(range.begin, false).floor * @entry_length
  end_byte   = percentile_index(range.end, false).ceil * @entry_length
  first_byte...end_byte
end
409
+ end
410
+ end