correct-horse-battery-staple 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data.tar.gz.sig +1 -1
  2. data/.gemtest +0 -0
  3. data/Gemfile +53 -0
  4. data/Gemfile.lock +109 -0
  5. data/History.txt +6 -0
  6. data/Manifest.txt +57 -0
  7. data/README.txt +115 -0
  8. data/Rakefile +47 -0
  9. data/bin/chbs +234 -0
  10. data/bin/chbs-mkpass +16 -0
  11. data/correct-horse-battery-staple.gemspec +59 -0
  12. data/lib/correct_horse_battery_staple.rb +117 -0
  13. data/lib/correct_horse_battery_staple/assembler.rb +45 -0
  14. data/lib/correct_horse_battery_staple/backend.rb +6 -0
  15. data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
  16. data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
  17. data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
  18. data/lib/correct_horse_battery_staple/corpus.rb +33 -0
  19. data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
  20. data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
  21. data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
  22. data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
  23. data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
  24. data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
  25. data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
  26. data/lib/correct_horse_battery_staple/generator.rb +40 -0
  27. data/lib/correct_horse_battery_staple/memoize.rb +25 -0
  28. data/lib/correct_horse_battery_staple/parser.rb +5 -0
  29. data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
  30. data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
  31. data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
  32. data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
  33. data/lib/correct_horse_battery_staple/stats.rb +22 -0
  34. data/lib/correct_horse_battery_staple/word.rb +90 -0
  35. data/lib/correct_horse_battery_staple/writer.rb +29 -0
  36. data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
  37. data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
  38. data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
  39. data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
  40. data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
  41. data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
  42. data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
  43. data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
  44. data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
  45. data/script/generate_all +34 -0
  46. data/script/load_redis +17 -0
  47. data/script/perftest +74 -0
  48. data/spec/corpus/serialized_spec.rb +62 -0
  49. data/spec/corpus_spec.rb +50 -0
  50. data/spec/correct_horse_battery_staple_spec.rb +73 -0
  51. data/spec/fixtures/100.json +101 -0
  52. data/spec/fixtures/corpus1.csv +101 -0
  53. data/spec/fixtures/corpus100.json +101 -0
  54. data/spec/fixtures/wiktionary1000.htm +648 -0
  55. data/spec/range_parser_spec.rb +54 -0
  56. data/spec/spec_helper.rb +20 -0
  57. data/spec/statistical_array_spec.rb +52 -0
  58. data/spec/support/spec_pry.rb +1 -0
  59. data/spec/word_spec.rb +95 -0
  60. metadata +264 -0
  61. metadata.gz.sig +1 -0
@@ -0,0 +1,95 @@
1
+ require 'redis'
2
+ if ! Object.const_defined?("JRUBY_VERSION")
3
+ require 'redis/connection/hiredis'
4
+ end
5
+ require 'securerandom'
6
+
7
+ module CorrectHorseBatteryStaple::Backend::Redis
8
+
9
# Mixin hook invoked with the class that includes this backend: the class is
# extended with ClassMethods and has InstanceMethods mixed into it.
def self.included(base)
  base.extend(ClassMethods)
  base.send(:include, InstanceMethods)
end

# Class-level extension point for including classes; currently empty.
module ClassMethods
end
16
+
17
+ module InstanceMethods
18
# Splits a destination spec of the form "dbname:host:port" (after removing
# every ".redis" / ".redisN" occurrence) and stores the parts in options.
# Values already present in options are kept; missing parts fall back to
# "chbs", "127.0.0.1" and 6379. Returns the resulting port.
def parse_uri(dest)
  name, address, port_number = dest.gsub(/\.redis[0-9]?/, '').split(':')

  fallbacks = [
    [:dbname, name || "chbs"],
    [:host,   address || "127.0.0.1"],
    [:port,   (port_number || 6379).to_i]
  ]
  fallbacks.each { |key, value| options[key] ||= value }

  options[:port]
end
24
+
25
# Stores one word in the Redis sorted sets.
#
# w   - object responding to #word and #percentile
# wid - word id to use; a fresh one is taken from get_new_word_id when nil
#
# Writes word -> id into @words_key, id -> percentile (floored at 0) into
# @percentile_key, and id -> (word length + percentile/100) into @lenprod_key.
def add_word(w, wid=nil)
  floored = [0, w.percentile].max
  word_id = wid.nil? ? get_new_word_id : wid
  text    = w.word

  db.zadd(@words_key, word_id, text)
  db.zadd(@percentile_key, floored, word_id)
  # db.zadd(@frequency_key, w.frequency, word_id)
  db.zadd(@lenprod_key, text.length + (floored / 100.0), word_id)
end
35
+
36
#
# Returns the next word id by incrementing the counter stored at @id_key.
# Note that this does NOT work inside a multi/exec
#
def get_new_word_id
  # Fetch the connection first: db may open the database, which is what
  # assigns @id_key.
  connection = db
  connection.incr(@id_key)
end
42
+
43
# Looks up the word stored under score +wid+ in the @words_key sorted set.
# Returns the first match, or nil when there is none. Any StandardError
# raised during the lookup is swallowed and also yields nil.
def get_word_by_id(wid)
  begin
    matches = db.zrangebyscore(@words_key, wid, wid, :limit => [0,1])
    matches[0]
  rescue StandardError
    nil
  end
end
46
+
47
# Reads the hash stored at @stats_key, converts every value with #to_f and
# passes the resulting Hash to load_stats_from_hash.
def load_stats
  numeric = {}
  db.hgetall(@stats_key).each do |field, raw|
    numeric[field] = raw.to_f
  end
  load_stats_from_hash(numeric)
end
51
+
52
# Writes +stats+ (a Hash of name => value) into the Redis hash at @stats_key
# as a flat field/value argument list.
def save_stats(stats)
  field_values = stats.to_a.flatten
  db.hmset(@stats_key, *field_values)
end
55
+
56
# Deletes every key this backend uses (the zsets, the stats hash and the id
# counter), leaving an empty database.
def create_database
  # Obtain the connection before reading the key ivars: db may open the
  # database, which is what assigns them.
  connection = db
  stale_keys = [@length_key, @percentile_key, @frequency_key, @lenprod_key,
                @stats_key, @words_key, @id_key]
  connection.del(*stale_keys)
end
60
+
61
# Opens the Redis connection once and memoizes it in @db. On first use it
# also resets the temp-key counter and computes the names of all keys this
# backend works with. Returns the connection.
def open_database
  return @db if @db

  @gensym_id      = 0
  @length_key     = make_key("length_zset")
  @percentile_key = make_key("percentile_zset")
  @frequency_key  = make_key("frequency_zset")
  # scores given by w.word.length + w.percentile/100.0
  @lenprod_key    = make_key("lenprod_zset")
  @stats_key      = make_key("stats_hash")
  @words_key      = make_key("words_zset")
  @id_key         = make_key("word_id_counter")

  @db = ::Redis.new(:host => options[:host], :port => options[:port])
end
75
+
76
# Returns the memoized Redis connection, opening the database on first use.
def db
  return @db if @db
  open_database
end
79
+
80
# Intentionally a no-op: nothing is released and @db is left untouched.
def close_database; end
82
+
83
# Builds the namespaced Redis key "chbs_<dbname>_<name>" using
# options[:dbname].
def make_key(name)
  format("chbs_%s_%s", options[:dbname], name)
end
86
+
87
# Returns a fresh temporary key name of the form
# make_key("TEMP_<pid>_<n>"), where <n> is a per-object counter.
#
# The counter lives in @gensym_id (the ivar open_database resets to 0); it is
# lazily initialised here so the method also works before the database has
# been opened. The previous code initialised a differently named ivar and then
# incremented a nil @gensym_id in that situation.
#
# NOTE(review): open_database sets @gensym_id back to 0, so names generated
# before the first open may repeat afterwards - confirm callers always open
# the database first.
def gensym_temp
  @gensym_id ||= 0
  @gensym_id += 1
  make_key("TEMP_#{Process.pid}_#{@gensym_id}")
end
91
+ end
92
+
93
+ autoload :DRange, 'correct_horse_battery_staple/backend/redis/d_range.rb'
94
+ end
95
+
@@ -0,0 +1,105 @@
1
+ #
2
+ # Represents a list of items corresponding to a square area of items
3
+ # formed by a range on axis 1 and a range on axis 2 of numbers
4
+ # associated with an item. In other words, this composes two
5
+ # different data about an item into a single score in a Redis sorted
6
+ # set, and allows that area to be treated as a single logical ordered
7
+ # list of items.
8
+ #
9
+ # This is used to construct a single score out of a word's length
10
+ # and percentile ranking. The length is the "outer" score and
11
+ # ranges (generally) from 3..18 or thereabouts in integral steps.
12
+ # Percentiles exist as fractional parts of the score added to the
13
+ # base word length. So, to address the items in a sorted set
14
+ # with the word length from 5..8 and percentile range 20..30,
15
+ # you would (in the Writer::Redis class) generate a Sorted set
16
+ # in which every word has a score with an integer and fractional
17
+ # part. The word "the" which appeared in the 95th percentile would
18
+ # have a score of 3.95.
19
+ #
20
+ # Once defined, this class allows the following operations:
21
+ #
22
+ # - counting the total # of items in the 2d bounding box
23
+ # - picking the nth item from the (virtual) sorted list
24
+ #
25
+ #
26
+
27
+ class CorrectHorseBatteryStaple::Backend::Redis::DRange
28
+ include CorrectHorseBatteryStaple::Memoize
29
# db      - Redis connection used for all queries
# key     - name of the sorted set holding the composite scores
# outer   - range of integral score parts (e.g. word lengths)
# inner   - range of values carried in the fractional part
# divisor - value the inner bounds are divided by to form the fraction
def initialize(db, key, outer, inner, divisor=100)
  @db, @key       = db, key
  @outer, @inner  = outer, inner
  @divisor        = divisor
  @counts         = {}
end
37
+
38
# Returns one [min, max, count] triple per score range yielded by
# iterate_ranges, where count is the ZCOUNT of @key between min and max.
def dump
  iterate_ranges do |min, max|
    [min, max, @db.zcount(@key, min, max)]
  end
end
44
+
45
# Total number of items across all outer buckets.
#
# Sums the cached per-bucket counts, starting from 0 so that an empty set of
# buckets yields 0 rather than nil (callers such as pick_nth do arithmetic on
# the result).
def count
  precache_counts
  @counts.values.reduce(0, :+)
end
49
+ memoize :count
50
+
51
# Returns the nth item (0-based) of the virtual list formed by concatenating
# every outer bucket's score range in order, or nil when n is out of range.
#
# Bucket +base+ holds positions pos...(pos + cib). The previous inclusive
# upper bound also matched n == pos + cib, which is the first item of the
# NEXT bucket; the lookup then used an offset one past the bucket's end and
# returned nil.
def pick_nth(n)
  precache_counts
  return nil if n < 0 || n > count-1

  pos = 0
  @outer.each do |base|
    cib = count_in_base(base)
    if n < pos + cib
      (min, max) = minmax_for_base(base)
      # n - pos is the offset of the wanted item inside this bucket
      return @db.zrangebyscore(@key, min, max,
                               :limit => [n-pos, 1])[0]
    end
    pos += cib
  end
  nil
end
69
+
70
+ protected
71
+
72
# Fetches the item count of every outer bucket in a single MULTI/EXEC round
# trip and caches them in @counts (bucket => count). Does nothing (and
# returns nil) once the counts have been cached; otherwise returns @counts.
#
# Commands are sent through the object yielded by #multi. Older redis-rb
# yields the client itself, so this is equivalent there; newer releases
# require commands to go through the yielded transaction. If nothing is
# yielded we fall back to @db.
def precache_counts
  return if @precached_counts
  counts = @db.multi do |tx|
    conn = tx || @db
    @outer.each do |base|
      (min, max) = minmax_for_base(base)
      conn.zcount(@key, min, max)
    end
  end
  #noinspection RubyHashKeysTypesInspection
  @counts = Hash[@outer.to_a.zip(counts)]
  @precached_counts = true
  @counts
end
84
+
85
# Number of items in outer bucket +b+, served from @counts when cached and
# otherwise fetched via zcount and stored.
def count_in_base(b)
  cached = @counts[b]
  return cached if cached
  @counts[b] = zcount(*minmax_for_base(b))
end
88
+
89
# Returns [min, max] composite scores for outer bucket +base+: the bucket
# value plus the inner range's begin / end divided by @divisor.
def minmax_for_base(base)
  scale = @divisor.to_f
  [@inner.begin, @inner.end].map { |bound| base + bound / scale }
end
93
+
94
# ZCOUNT on this range's sorted set (@key) between scores min and max.
def zcount(min, max)
  bounds = [min, max]
  @db.zcount(@key, *bounds)
end
97
+
98
# Yields the [min, max] score bounds of every outer bucket and returns the
# array of block results.
def iterate_ranges
  @outer.map { |base| yield(*minmax_for_base(base)) }
end
104
+
105
+ end
@@ -0,0 +1,33 @@
1
+
2
+ class CorrectHorseBatteryStaple::Corpus
3
# Reads a corpus from +filename+.
#
# clazz - corpus class to use; when omitted it is chosen from the filename's
#         format (see format_for), falling back to Corpus::Serialized.
#
# Returns whatever clazz.read(filename) returns.
def self.read(filename, clazz=nil)
  unless clazz
    clazz =
      case CorrectHorseBatteryStaple::Corpus.format_for(filename)
      # when 'kdtree' then CorrectHorseBatteryStaple::Corpus::KDTree
      when 'isam'
        CorrectHorseBatteryStaple::Corpus::Isam
      when 'kdtree', 'isamkd'
        CorrectHorseBatteryStaple::Corpus::IsamKD
      when 'sqlite'
        CorrectHorseBatteryStaple::Corpus::Sqlite
      when 'redis2'
        CorrectHorseBatteryStaple::Corpus::Redis2
      when 'redis'
        CorrectHorseBatteryStaple::Corpus::Redis
      else
        CorrectHorseBatteryStaple::Corpus::Serialized
      end
  end

  clazz.read(filename)
end
17
+
18
# Returns the lower-cased file extension of +spec+ without the leading dot,
# or +defval+ when there is no extension (including a nil spec or a name
# that merely ends in a dot).
#
# The previous version reached the no-extension case by letting
# nil.downcase raise and catching it with a bare rescue, which also hid
# unrelated errors and let an empty string through for "name.".
def self.format_for(spec, defval = nil)
  ext = File.extname(spec.to_s).sub(/\A\./, '').downcase
  ext.empty? ? defval : ext
end
23
+
24
+ autoload :Base, 'correct_horse_battery_staple/corpus/base'
25
+ autoload :Serialized, 'correct_horse_battery_staple/corpus/serialized'
26
+ autoload :Isam, 'correct_horse_battery_staple/corpus/isam'
27
+ autoload :IsamKD, 'correct_horse_battery_staple/corpus/isam_kd'
28
+ autoload :Sqlite, 'correct_horse_battery_staple/corpus/sqlite'
29
+ autoload :Redis, 'correct_horse_battery_staple/corpus/redis'
30
+ autoload :Redis2, 'correct_horse_battery_staple/corpus/redis2'
31
+ # autoload :KDTree, 'correct_horse_battery_staple/corpus/kdtree'
32
+ end
33
+
@@ -0,0 +1,278 @@
1
+ require 'bigdecimal'
2
+ # require 'securerandom'
3
+ require 'forwardable'
4
+
5
+ class CorrectHorseBatteryStaple::Corpus::Base < CorrectHorseBatteryStaple::Corpus
6
+ extend Forwardable
7
+
8
+ attr_accessor :frequency_mean, :frequency_stddev
9
+ attr_accessor :probability_mean, :probability_stddev
10
+ attr_accessor :original_size
11
+ attr_accessor :weighted_size
12
+
13
+ include CorrectHorseBatteryStaple::Common
14
+ include CorrectHorseBatteryStaple::Memoize
15
+ include Enumerable
16
+
17
# Accepts and ignores any arguments; runs initialize_backend_variables when
# the object responds to it.
def initialize(*args)
  return unless respond_to?(:initialize_backend_variables)
  initialize_backend_variables
end
20
+
21
# Builds a corpus of this class from +dest+ by passing it to .new.
def self.read(dest)
  new(dest)
end
24
+
25
+ # you MUST override this method for Enumerable to use
26
+
27
# Abstract: always raises NotImplementedError. Subclasses must override this
# for Enumerable to work.
def each(&block)
  fail NotImplementedError
end
30
+
31
+ # other methods you should implement if possible:
32
+ #
33
+ # Enumerable
34
+ # size
35
+ #
36
+ # CHBS::Corpus
37
+ # pick
38
+ # words
39
+ # frequencies
40
+ #
41
+
42
+
43
# All entries in ascending order as defined by the entries' own <=>.
def sorted_entries
  all_entries = entries
  all_entries.sort
end
46
+
47
+ # return all the candidates for a given set of options
48
# Returns all the candidate entries for a given set of options.
#
# options - see filter_for_options (:word_length, :percentile, :filter)
#
# With no options, or options that produce no filter, every entry is a
# candidate. The previous code returned +size+ (an Integer) in those cases,
# apparently copied from count_candidates, so the return type depended on
# whether a filter was supplied.
def candidates(options = {})
  return entries if !options || options.empty?
  filter = filter_for_options(options)
  return entries unless filter
  entries.select {|entry| filter.call(entry) }
end
54
+
55
# Number of entries accepted by the filter built from +options+; equals
# +size+ when there are no options or they produce no filter.
def count_candidates(options = {})
  filter = (options && !options.empty?) ? filter_for_options(options) : nil
  return size unless filter

  matched = 0
  each { |entry| matched += 1 if filter.call(entry) }
  matched
end
66
+ memoize :count_candidates
67
+
68
+
69
+
70
+ #
71
+ # this is the core password picker method. it is not especially
72
+ # efficient but it is relatively generic. If a corpus supports
73
+ # Enumerable, it will work.
74
+ #
75
#
# Core password picker: draws +count+ random entries from the sorted corpus.
# It is not especially efficient but it is relatively generic; any corpus
# that supports Enumerable can use it.
#
# count   - number of entries to return
# options - :percentile     passed to index_range_for_percentile to restrict
#                           the index range that is sampled
#           :word_length    object responding to include?; accepted lengths
#           :filter         a Proc/lambda or an Array of them
#           :max_iterations cap on random draws (default 1000)
#
# Draws are independent, so the same entry may be returned more than once.
#
# Raises ArgumentError when the index range holds fewer entries than +count+,
# and RuntimeError when +count+ matches are not found within max_iterations.
#
def pick(count, options = {})
  array = CorrectHorseBatteryStaple::StatisticalArray.new(sorted_entries)

  # Array() returns the caller's own array when :filter is already an Array;
  # copy it so appending the word-length filter does not modify the caller's
  # options.
  filters = Array(options[:filter]).dup

  range =
    if options[:percentile]
      array.index_range_for_percentile(options[:percentile])
    else
      0..array.size-1
    end
  candidate_count = range_size(range)

  if candidate_count < count
    raise ArgumentError, "Percentile range contains fewer words than requested count"
  end

  if options[:word_length]
    wl = options[:word_length]
    filters << lambda {|entry| wl.include? entry.word.length }
  end

  filter = filters.empty? ? nil : compose_filters(filters)

  max_iterations = options[:max_iterations] || 1000

  result = []
  iterations = 0
  while result.length < count && iterations < max_iterations
    entry = array[random_number(candidate_count) + range.first]
    result << entry if entry && (!filter || filter.call(entry))
    iterations += 1
  end

  raise "Cannot find #{count} words matching criteria" if result.length < count
  result
end
114
+
115
+
116
+
117
# The word string of every entry that passes the pending filters
# (see execute_filters).
def words
  execute_filters.map(&:word)
end
120
+ memoize :words
121
+
122
+ # no-op for serialized forms
123
# no-op for serialized forms; +max+ is ignored here
def precache(max=0); end
125
+
126
# A StatisticalArray holding the frequency of every entry, in entry order.
def frequencies
  values = entries.map(&:frequency)
  CorrectHorseBatteryStaple::StatisticalArray.new(values)
end
129
+ memoize :frequencies
130
+
131
# Bits of entropy contributed by one word drawn uniformly from the corpus:
# the base-2 logarithm of +count+.
#
# Uses Math.log2 rather than log(x)/log(2): the quotient of two rounded
# natural logs can be off by one ulp even for exact powers of two.
def entropy_per_word
  Math.log2(count)
end
134
+
135
+ # filtering
136
+
137
# Queues +block+ as a pending filter and returns self so calls can be
# chained.
def filter(&block)
  @filters ||= []
  @filters.push(block)
  self
end
141
+
142
# Discards all pending filters; returns the new empty list.
def reset
  @filters = Array.new
end
145
+
146
+ # create a single composed function of all the filters
147
# Folds +filters+ into one callable that accepts a value only when every
# filter does (evaluated left to right, stopping at the first rejection).
# Returns nil for a nil/empty list; a single filter is returned as-is.
def compose_filters(filters)
  return nil unless filters && !filters.empty?

  filters.inject do |combined, following|
    lambda { |value| combined.call(value) && following.call(value) }
  end
end
153
+
154
# Returns a corpus containing only the entries accepted by the pending
# filters. With no pending filters this is self; otherwise a new instance of
# the same class is built from the filtered entries and inherits
# original_size.
#
# @filters is only assigned by #filter / #reset, so it is nil on an object
# that never had a filter added; treat that the same as an empty list instead
# of calling empty? on nil.
def result
  return self if @filters.nil? || @filters.empty?

  self.class.new(execute_filters).tap do |new_corpus|
    new_corpus.original_size = self.original_size
  end
end
161
+
162
+
163
+ ## statistics
164
+
165
# Assigns each key/value pair of +hash+ through the matching "<key>=" writer,
# skipping keys this object has no writer for. Returns +hash+.
def load_stats_from_hash(hash)
  hash.each_pair do |name, value|
    writer = "#{name}=".to_sym
    next unless respond_to?(writer)
    send(writer, value)
  end
end
171
+
172
# Recomputes the corpus-wide statistics (weighted_size, probability and
# frequency mean/stddev) and then the per-entry derived fields (rank,
# distance, probability, distance_probability, percentile). Returns self.
#
# Uses Kernel#BigDecimal() to build the zero seed: BigDecimal.new is gone
# from current bigdecimal releases, while the conversion method exists in old
# and new versions alike.
def recalculate
  size = self.size
  frequencies = self.frequencies

  # corpus-wide statistics
  self.weighted_size = frequencies.reduce(BigDecimal("0"), :+)
  (self.probability_mean, self.probability_stddev) =
    CorrectHorseBatteryStaple::StatisticalArray.new(frequencies.map do |freq|
      (freq/weighted_size) * 100
    end).mean_and_standard_deviation

  (self.frequency_mean, self.frequency_stddev) = frequencies.mean_and_standard_deviation

  # per-entry statistics
  each_with_index do |entry, index|
    entry.rank                 = size - index
    entry.distance             = (entry.frequency-frequency_mean)/frequency_stddev
    entry.probability          = entry.frequency / weighted_size
    entry.distance_probability = (entry.probability - probability_mean) / probability_stddev
    # NOTE(review): with a 0-based index this is negative for the first
    # entry; left unchanged because stored percentiles depend on it - confirm
    # whether (index + 0.5) was intended.
    entry.percentile           = (index-0.5)/size * 100
  end

  self
end
203
+
204
# Summary statistics of this corpus as a Hash; weighted_size is converted
# with #to_f.
def stats
  summary = {}
  summary[:frequency_mean]     = frequency_mean
  summary[:frequency_stddev]   = frequency_stddev
  summary[:probability_mean]   = probability_mean
  summary[:probability_stddev] = probability_stddev
  summary[:size]               = count
  summary[:original_size]      = original_size
  summary[:weighted_size]      = weighted_size.to_f
  summary
end
210
+
211
# Human-readable multi-line summary: the class name, the entry count, and one
# "key: value" line for each pair returned by #stats.
# NOTE(review): whitespace inside the heredoc may have been altered by this
# diff rendering - check against the original file before editing the text.
+ def inspect
212
+ <<INSPECT
213
+ Type: #{self.class.name}
214
+ Entry count: #{count}
215
+
216
+ Stats:
217
+ #{stats.map {|k,v| " #{k}: #{v}\n" }.join("") }
218
+ INSPECT
219
+ end
220
+
221
+ alias :length :count
222
+
223
+
224
+ protected
225
+
226
+ #
227
+ # Return the number of distinct objects within the Range.
228
+ # This assumes plain vanilla ranges, though it does respect .. vs ...
229
+ #
230
+ # Why? Range#count is basically #to_a.count, which is INSANE
231
+ #
232
# Number of distinct integers covered by Range +r+, computed arithmetically
# from its endpoints (respecting .. vs ...) instead of enumerating it.
def range_count(r)
  first, last = r.first, r.last

  adjustment =
    if r.exclude_end?
      0
    elsif first > last
      -1
    else
      1
    end

  (last - first + adjustment).abs
end
237
+ alias :range_size :range_count
238
+
239
+ #
240
+ # Given a filter, return all Word objects in this Corpus that the
241
+ # filter accepts.
242
+ #
243
+ # this is an exceptionally inefficient version
244
# Returns the entries accepted by all pending filters (every entry when none
# are pending). The pending filters are always cleared afterwards via reset.
#
# this is an exceptionally inefficient version
def execute_filters
  begin
    if @filters.nil? || @filters.empty?
      entries
    else
      entries.select(&compose_filters(@filters))
    end
  ensure
    reset
  end
end
250
+
251
+ #
252
+ # Return a single lambda that will return true/false given a Word object
253
+ #
254
+ # Respects the :word_length, :percentile, and :filter options
255
+ # :word_length and :percentile should be Range objects
256
+ # :filter can be a single Proc/lambda or an array of them
257
+ #
258
# Return a single lambda that will return true/false given a Word object,
# or nil when +options+ is nil/empty or names none of the supported keys.
#
# Respects the :word_length, :percentile, and :filter options
# :word_length and :percentile should be Range objects
# :filter can be a single Proc/lambda or an array of them
#
# The :filter list is copied before the percentile / word-length lambdas are
# appended: Array() returns the caller's own array when given one, and
# appending to it would grow the caller's options on every call.
def filter_for_options(options = {})
  return nil if !options || options.empty?

  filters = Array(options[:filter]).dup

  if options[:percentile]
    p_range = options[:percentile]
    filters << lambda {|entry| p_range.include? entry.percentile }
  end

  if options[:word_length]
    wl_range = options[:word_length]
    filters << lambda {|entry| wl_range.include? entry.word.length }
  end

  filters.empty? ? nil : compose_filters(filters)
end
274
+ memoize :filter_for_options
275
+
276
+ end
277
+
278
+ # Random.srand(SecureRandom.random_number)