correct-horse-battery-staple 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data.tar.gz.sig +1 -1
  2. data/.gemtest +0 -0
  3. data/Gemfile +53 -0
  4. data/Gemfile.lock +109 -0
  5. data/History.txt +6 -0
  6. data/Manifest.txt +57 -0
  7. data/README.txt +115 -0
  8. data/Rakefile +47 -0
  9. data/bin/chbs +234 -0
  10. data/bin/chbs-mkpass +16 -0
  11. data/correct-horse-battery-staple.gemspec +59 -0
  12. data/lib/correct_horse_battery_staple.rb +117 -0
  13. data/lib/correct_horse_battery_staple/assembler.rb +45 -0
  14. data/lib/correct_horse_battery_staple/backend.rb +6 -0
  15. data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
  16. data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
  17. data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
  18. data/lib/correct_horse_battery_staple/corpus.rb +33 -0
  19. data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
  20. data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
  21. data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
  22. data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
  23. data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
  24. data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
  25. data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
  26. data/lib/correct_horse_battery_staple/generator.rb +40 -0
  27. data/lib/correct_horse_battery_staple/memoize.rb +25 -0
  28. data/lib/correct_horse_battery_staple/parser.rb +5 -0
  29. data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
  30. data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
  31. data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
  32. data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
  33. data/lib/correct_horse_battery_staple/stats.rb +22 -0
  34. data/lib/correct_horse_battery_staple/word.rb +90 -0
  35. data/lib/correct_horse_battery_staple/writer.rb +29 -0
  36. data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
  37. data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
  38. data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
  39. data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
  40. data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
  41. data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
  42. data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
  43. data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
  44. data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
  45. data/script/generate_all +34 -0
  46. data/script/load_redis +17 -0
  47. data/script/perftest +74 -0
  48. data/spec/corpus/serialized_spec.rb +62 -0
  49. data/spec/corpus_spec.rb +50 -0
  50. data/spec/correct_horse_battery_staple_spec.rb +73 -0
  51. data/spec/fixtures/100.json +101 -0
  52. data/spec/fixtures/corpus1.csv +101 -0
  53. data/spec/fixtures/corpus100.json +101 -0
  54. data/spec/fixtures/wiktionary1000.htm +648 -0
  55. data/spec/range_parser_spec.rb +54 -0
  56. data/spec/spec_helper.rb +20 -0
  57. data/spec/statistical_array_spec.rb +52 -0
  58. data/spec/support/spec_pry.rb +1 -0
  59. data/spec/word_spec.rb +95 -0
  60. metadata +264 -0
  61. metadata.gz.sig +1 -0
@@ -0,0 +1,258 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+ require 'set'
4
+
5
+ #
6
+ #
7
+ # Format of header:
8
+ #
9
+ # 0..3 - OB - offset of body start in bytes; network byte order
10
+ # 4..7 - LP - length of prelude in network byte order
11
+ # 8..OB-1 - P - JSON-encoded prelude hash and space padding
12
+ # OB..EOF - array of fixed size records as described in prelude
13
+ #
14
+ # Contents of Prelude (after JSON decoding):
15
+ #
16
+ # P["wlen"] - length of word part of record
17
+ # P["flen"] - length of frequency part of record (always 4 bytes)
18
+ # P["entrylen"] - length of total part of record
19
+ # P["n"] - number of records
20
+ # P["sort"] - field name sorted by (word or frequency)
21
+ # P["stats"] - corpus statistics
22
+ #
23
+ # Format of record:
24
+ #
25
+ # 1 byte - LW - actual length of word within field
26
+ # P["wlen"]-1 bytes - LW bytes of word (W) followed by padding; P["wlen"] counts the LW byte too
27
+ # P["flen"] (4) bytes - frequency as network byte order long
28
+ #
29
+
30
+ class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
31
+ include CorrectHorseBatteryStaple::Memoize
32
+
33
+ INITIAL_PRELUDE_LENGTH = 512
34
+
35
# Open the ISAM corpus file and decode its prelude.
#
# filename - handed to Util.open_binary, opened with mode "r"
# stats    - passed through to the superclass constructor by the bare super
def initialize(filename, stats = nil)
  super
  @filename = filename
  @file     = CorrectHorseBatteryStaple::Util.open_binary(@filename, "r")
  parse_prelude
end
41
+
42
# Slurp the whole backing file into an in-memory StringIO so later reads
# never touch the disk.
#
# max - when greater than -1, skip caching if the file is larger than
#       this many bytes
#
# Returns the new StringIO, or nil when caching was skipped.
def precache(max = -1)
  return if max > -1 && file_size(@file) > max
  @file.seek 0
  contents = @file.read
  # release the original handle; previously it was simply dropped and
  # stayed open until garbage collection
  @file.close
  @file = StringIO.new contents, "r"
end
47
+
48
# Size in bytes of +file+: uses #size when the object offers it
# and falls back to #stat.size otherwise.
def file_size(file)
  return file.size if file.respond_to?(:size)
  file.stat.size
end
51
+
52
# The decoded prelude hash; parsed on demand when not yet available.
def prelude
  return @prelude if @prelude
  parse_prelude
end
55
+
56
# Read and decode the file header.
#
# Sets @record_offset, @prelude_len, @prelude, @word_length,
# @frequency_length and @entry_length, and loads corpus stats when the
# prelude carries a "stats" entry.
#
# Returns the decoded prelude Hash.
# Raises ArgumentError when the file is too short to hold the 8-byte
# header, or when the prelude lacks "wlen" or "entrylen".
def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)
  if prelude_buf.nil? || prelude_buf.length < 8
    raise ArgumentError, "File is too short to contain an ISAM header!"
  end

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len) = prelude_buf.unpack("NN")

  # The JSON starts after the two 4-byte header fields, so the buffer has
  # to hold 8 + @prelude_len bytes; read more if the initial read fell short.
  missing = 8 + @prelude_len - prelude_buf.length
  if missing > 0
    prelude_buf += (@file.read(missing) || "")
  end

  @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}

  # includes prefix length byte
  @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"] || 4

  # total length of record
  @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end
84
+
85
# Factory-style constructor: builds a new instance for +filename+.
def self.read(filename)
  new(filename)
end
89
+
90
+
91
+ ## parsing
92
+
93
#
# Decode record number +index+ of +string+ into [word, frequency].
#
# The first byte of the record holds the actual word length. When
# +length_range+ is given and does not include that length, nil is
# returned instead. Raises when there is no chunk at +index+.
#
def parse_record_into_array(string, index, length_range = nil)
  record = nth_chunk(index, string)
  raise "No chunk for index #{index}" unless record

  letters = record.unpack("C")[0]
  return nil if length_range && !length_range.include?(letters)

  # skip the length byte, take the word, jump to the frequency field
  record.unpack("xa#{letters}@#{@word_length}N")
end
108
+
109
#
# Decode record +index+ of +string+ into a Word object. The Word to fill
# may be supplied as the third argument; otherwise a fresh one is built.
# +length_range+ acts as a filter: when the record does not satisfy it,
# nil is returned and the Word is left untouched.
#
def parse_record(string, index=0,
                 word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  fields = parse_record_into_array(string, index, length_range)
  return nil unless fields

  word.word, word.frequency = fields[0], fields[1]
  word
end
123
+
124
# Decode the leading length byte of a record chunk.
# NOTE(review): this returns the one-element Array produced by unpack,
# not a bare Integer - confirm that callers expect that.
def word_length(chunk_string)
  length_field = chunk_string.unpack("C")
  length_field
end
127
+
128
# Slice out the bytes of the nth fixed-size record from +string+.
def nth_chunk(n, string)
  offset = n * @entry_length
  string[offset, @entry_length]
end
132
+
133
+ ## some core Enumerable building blocks
134
+
135
# Yield a Word for every record in the file, in file order.
#
# The previous loop stopped at index size - 2 and therefore never yielded
# the final record. Without a block an Enumerator is returned.
def each(&block)
  return enum_for(:each) unless block_given?

  string = records_string
  size.times do |index|
    yield parse_record(string, index)
  end
  nil # same return value as the original while loop
end
144
+
145
# Number of records; same value as #size.
def count
  size
end
146
# Number of records: byte length of the record area divided by the
# per-record length. Cached after the first call.
def size
  @size = records_size / @entry_length unless @size
  @size
end
149
+
150
+
151
+ ## our Corpus Enumerablish abstract methods
152
+
153
# The ISAM file is presumed to be stored already sorted, so the plain
# entries list is used (and cached) as the sorted list.
def sorted_entries
  @sorted_entries = entries unless @sorted_entries
  @sorted_entries
end
157
+
158
+
159
## Optimized pick that samples records straight from the file contents.
## Does NOT support the :filter option.
#
# count   - number of words wanted
# options - :percentile (defaults to 0..100), :word_length,
#           :max_iterations (floor of 1000, and at least count*10)
#
# Raises NotImplementedError for :filter, ArgumentError when the percentile
# slice holds fewer records than +count+, and a RuntimeError when too few
# matching words were found.
def pick(count, options = {})
  if options[:filter]
    raise NotImplementedError, "ISAM does not support :filter option"
  end

  percentile = options[:percentile] || (0..100)
  string     = record_percentile_range_read(percentile)
  available  = string.length / @entry_length
  attempts   = [options[:max_iterations] || 1000, count*10].max

  unless available >= count
    raise ArgumentError, "Percentile range contains fewer words than requested count"
  end

  words = _pick(string, count, options[:word_length], attempts)
  raise "Cannot find #{count} words matching criteria" unless words.length >= count

  words
end
181
+
182
# Randomly sample up to +count+ words from +string+, giving up after
# +max_iterations+ draws. Record indexes that failed the +length_range+
# filter are remembered so they are not parsed again.
def _pick(string, count, length_range, max_iterations)
  picked   = []
  attempts = 0
  rejected = Set.new
  slots    = string.length / @entry_length

  # a single scratch Word is reused for parsing; hits are dup'ed
  scratch = CorrectHorseBatteryStaple::Word.new(:word => "")

  until picked.length >= count || attempts >= max_iterations
    slot = random_number(slots)
    if !rejected.include?(slot)
      hit = parse_record(string, slot, scratch, length_range)
      hit ? picked.push(hit.dup) : rejected.add(slot)
    end
    attempts += 1
  end

  picked
end
206
+
207
+
208
+ ## file I/O
209
+
210
# Byte length of the record area: file size minus the record offset.
# Cached after the first call.
def records_size
  @records_size = file_size(@file) - @record_offset unless @records_size
  @records_size
end
213
+
214
# Entire file contents as a String: taken directly from the StringIO when
# the file has been precached, otherwise read through file_range_read.
def file_string
  if @file.is_a?(StringIO)
    @file.string
  else
    file_range_read(nil)
  end
end
217
+
218
# Read the bytes covered by +file_range+ (defaults to the whole file) and
# put the file position back where it was afterwards.
#
# The position is captured before the protected section starts, so a
# failure while sizing the file or calling tell surfaces as-is. Before,
# the method-level ensure ran seek(nil) and masked the real error.
def file_range_read(file_range = nil)
  file_range ||= 0...file_size(@file)
  pos = @file.tell
  begin
    @file.seek(file_range.first)
    @file.read(range_count(file_range))
  ensure
    @file.seek(pos)
  end
end
226
+ memoize :file_range_read
227
+
228
# String holding the record portion of the file; read once, then cached.
def records_string
  return @records_string if @records_string
  @records_string = record_range_read(0 ... records_size)
end
233
+
234
# Read the bytes of +record_range+, a byte range relative to the start of
# the record area (defaults to the whole record area).
#
# The file range now ends at start + length. Previously the end was
# length + offset, which is only right when the range begins at 0.
# NOTE(review): assumes range_count returns the number of bytes spanned by
# the range - confirm against its definition.
def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  start = record_range.first + @record_offset
  file_range_read(start...(start + range_count(record_range)))
end
238
+ # memoize :record_range_read
239
+
240
# Read the record bytes that correspond to +percentile_range+.
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
244
+
245
+
246
## Rather than using a StatisticalArray, index directly into the file/string.
#
# Maps +percentile+ (0..100) to a record position: percentile/100 * count + 0.5.
# Returns the rounded Integer by default, or the raw Float when +round+ is false.
def percentile_index(percentile, round=true)
  position = percentile.to_f/100 * count + 0.5
  return position unless round
  position.round
end
251
+
252
# Translate a percentile range into an end-exclusive byte range within the
# record area. A bare number is widened to number-0.5 .. number+0.5 first.
def record_range_for_percentile(range)
  if range.is_a?(Numeric)
    range = Range.new(range - 0.5, range + 0.5)
  end

  first_record = percentile_index(range.begin, false).floor
  last_record  = percentile_index(range.end, false).ceil
  (first_record * @entry_length)...(last_record * @entry_length)
end
257
+
258
+ end
@@ -0,0 +1,60 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+ require 'set'
4
+
5
+ #
6
+ #
7
+ # Format of header:
8
+ #
9
+ # 0..3 - OB - offset of body start in bytes; network byte order
10
+ # 4..7 - LP - length of prelude in network byte order
11
+ # 8..OB-1 - P - JSON-encoded prelude hash and space padding
12
+ # OB..EOF - array of fixed size records as described in prelude
13
+ #
14
+ # Contents of Prelude (after JSON decoding):
15
+ #
16
+ # P["wlen"] - length of word part of record
17
+ # P["flen"] - length of frequency part of record (always 4 bytes)
18
+ # P["entrylen"] - length of total part of record
19
+ # P["n"] - number of records
20
+ # P["sort"] - field name sorted by (word or frequency)
21
+ # P["stats"] - corpus statistics
22
+ #
23
+ # Format of record:
24
+ #
25
+ # 2 bytes - LW - actual length of word within field
26
+ # P["wlen"] bytes - LW bytes of word (W) + P["wlen"]-LW bytes of padding
27
+ # P["flen"] (4) bytes - frequency as network byte order long
28
+ #
29
+
30
+ class CorrectHorseBatteryStaple::Corpus::IsamKD < CorrectHorseBatteryStaple::Corpus::Base
31
+ include CorrectHorseBatteryStaple::Memoize
32
+ include CorrectHorseBatteryStaple::Backend::IsamKD
33
+
34
# Open the ISAM-KD corpus file, decode its prelude and load the index.
#
# filename - handed to Util.open_binary, opened with mode "r"
# stats    - passed through to the superclass constructor by the bare super
def initialize(filename, stats = nil)
  super
  @filename = filename
  @file     = CorrectHorseBatteryStaple::Util.open_binary(@filename, "r")
  parse_prelude
  load_index
end
41
+
42
# Slurp the whole backing file into an in-memory StringIO so later reads
# never touch the disk.
#
# max - when greater than -1, skip caching if the file is larger than
#       this many bytes
#
# Returns the new StringIO, or nil when caching was skipped.
def precache(max = -1)
  return if max > -1 && file_size(@file) > max
  @file.seek 0
  contents = @file.read
  # release the original handle; previously it was simply dropped and
  # stayed open until garbage collection
  @file.close
  @file = StringIO.new contents, "r"
end
47
+
48
# Size in bytes of +file+: prefers #size and falls back to #stat.size
# for objects that do not respond to it.
def file_size(file)
  if file.respond_to?(:size)
    file.size
  else
    file.stat.size
  end
end
51
+
52
# The decoded prelude hash; parsed and stored on first use.
def prelude
  @prelude = parse_prelude unless @prelude
  @prelude
end
55
+
56
# Load the KD-tree through load_kdtree once and keep it in @kdtree.
def load_index
  return @kdtree if @kdtree
  @kdtree = load_kdtree
end
59
+
60
+ end
@@ -0,0 +1,188 @@
1
+ require 'bigdecimal'
2
+ require 'hiredis'
3
+ require 'redis'
4
+ require 'set'
5
+
6
+ class CorrectHorseBatteryStaple::Corpus::Redis < CorrectHorseBatteryStaple::Corpus::Base
7
+ include CorrectHorseBatteryStaple::Backend::Redis
8
+
9
+ MAX_ITERATIONS = 1000
10
+
11
+ attr_accessor :dest
12
+ attr_accessor :options
13
+
14
# Set up a Redis-backed corpus.
#
# dest - destination string; stored, then handed to parse_uri. It is also
#        forwarded to the superclass constructor by the bare super.
def initialize(dest)
  super
  self.dest    = dest
  self.options = {}
  parse_uri(dest)
  load_stats
end
22
+
23
# Factory-style constructor: builds a new instance for +file+.
def self.read(file)
  new(file)
end
26
+
27
+ ## some core Enumerable building blocks
28
+
29
# Iterate over every entry, passing each one to the given block.
def each(&block)
  words = entries
  words.each(&block)
end
32
+
33
# Cardinality of the words sorted set (ZCARD), cached after the first call.
def count
  return @count if @count
  @count = db.zcard(@words_key)
end
36
+
37
# Corpus size: the :size entry of stats when present, otherwise #count.
def size
  recorded = stats[:size]
  recorded ? recorded : count
end
40
+
41
+
42
+
43
+ ## our own collection operations
44
+
45
# All words of the corpus, as built by #table.
def entries
  table()
end
48
+
49
# Entries in their natural sort order (a freshly sorted array per call).
def sorted_entries
  words = entries
  words.sort
end
52
+
53
+
54
# Pick +count+ words using the selected strategy.
#
# options - :strategy names the pick_<strategy> method to dispatch to;
#           it falls back to ENV['pick_strategy'] and then "drange".
#           The remaining options are forwarded to the strategy method.
#
# The caller's hash is no longer modified: :strategy is removed from a
# copy, so a reused options hash keeps its strategy between calls.
#
# Raises NotImplementedError when :filter is given.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "Redis does not support :filter option" if options[:filter]

  options  = options.dup
  strategy = options.delete(:strategy) || ENV['pick_strategy'] || "drange"
  send("pick_#{strategy}", count, options)
end
61
+
62
+
63
+ ## optimized pick implementations - they do NOT support :filter, though
64
+
65
# Pick +count+ words by fetching the candidate ids for the requested
# :percentile and/or :word_length ranges and sampling among them.
# A percentile range of exactly 0..100 is treated as "no restriction";
# with no restrictions at all, words are picked at random.
def pick_standard(count, options = {})
  percentiles = options[:percentile]
  lengths     = options[:word_length]

  if percentiles && percentiles.begin == 0 && percentiles.end == 100
    percentiles = nil
  end

  return get_words_for_ids(pick_random_words(count)) unless percentiles || lengths

  # one id list per active restriction, percentile first
  id_lists = [[@percentile_key, percentiles], [@lenprod_key, lengths]].
    select { |_key, wanted| wanted }.
    map    { |key, wanted| get_word_ids_in_zset(key, wanted) }

  pool = id_lists.length == 1 ? id_lists[0] : intersection(*id_lists)
  get_words_for_ids(array_sample(pool, count))
end
84
+
85
+
86
+
87
# Pick +count+ words through a discontiguous range map. Restrictions that
# do not narrow anything are dropped first (see the range_cover? checks);
# when none remain, words are picked at random.
def pick_drange(count, options = {})
  percentiles = options[:percentile]
  lengths     = options[:word_length]

  percentiles = nil if percentiles && range_cover?(percentiles, 0..100)

  full_lengths = corpus_length_range
  lengths = nil if lengths && range_cover?(lengths, full_lengths)

  unless percentiles || lengths
    return get_words_for_ids(pick_random_words(count))
  end

  dspace = discontiguous_range_map(@lenprod_key, lengths, percentiles)
  span   = dspace.count
  chosen = count.times.collect { dspace.pick_nth(random_number(span)) }
  get_words_for_ids(chosen)
end
112
+
113
# ZCOUNT passthrough: asks the db for the count of +key+ between +min+ and +max+.
def zcount(key, min, max)
  connection = db
  connection.zcount(key, min, max)
end
116
+ memoize :zcount
117
+
118
# Build a DRange over +key+ from the outer and inner ranges.
# NOTE(review): this passes @db directly while sibling methods go through
# the db accessor - confirm @db is always set by the time this runs.
def discontiguous_range_map(key, outer_range, inner_range, divisor=100)
  drange_class = CorrectHorseBatteryStaple::Backend::Redis::DRange
  drange_class.new(@db, key, outer_range, inner_range, divisor)
end
122
+ memoize :discontiguous_range_map
123
+
124
# True when both endpoints of +inner+ lie within +outer+.
# XXX - does not handle exclusive endpoints
def range_cover?(outer, inner)
  [inner.begin, inner.end].all? { |edge| outer.cover?(edge) }
end
128
+
129
# Range of word lengths assumed for the corpus (fixed at 3 through 18).
# TODO: make this use actual data from stored stats
def corpus_length_range
  Range.new(3, 18)
end
133
+
134
# Draw +count+ random members of the words sorted set by rank. The rank is
# random_number(size) - 1 and is used as both ends of the ZRANGE call.
def pick_random_words(count)
  count.times.collect do
    rank = random_number(size) - 1
    members = db.zrange(@words_key, rank, rank)
    members[0]
  end
end
140
+
141
# Intersect all the given collections with &, left to right.
def intersection(*sets)
  sets.inject(:&)
end
144
+
145
# Members of sorted set +key+ whose score lies between the begin and end of +range+.
def get_word_ids_in_zset(key, range)
  low, high = range.begin, range.end
  db.zrangebyscore(key, low, high)
end
148
+ memoize :get_word_ids_in_zset
149
+
150
# Turn a list of word ids into Word objects, resolving each id with get_word_by_id.
def get_words_for_ids(ids)
  ids.map do |id|
    text = get_word_by_id(id)
    CorrectHorseBatteryStaple::Word.new(:word => text)
  end
end
153
+
154
+
155
# Close the corpus; simply defers to the superclass implementation.
def close
  super()
end
158
+
159
+ protected
160
+
161
# Build the full word list from the percentile and frequency sorted sets.
#
# Both ZRANGEBYSCORE replies are consumed as flat [member, score, ...]
# arrays. Words come out in the order of the percentile reply; :index
# counts up from 1 and :rank counts down from the number of words.
def table
  percentiles = db.zrangebyscore(@percentile_key, -1, 101, :withscores => true)
  frequencies = db.zrangebyscore(@frequency_key, -1, 99999999, :withscores => true)

  # fold a flat member/score list into a member => score hash,
  # ignoring a dangling trailing element
  to_hash = lambda do |flat|
    scores = {}
    0.step(flat.length - 2, 2) { |at| scores[flat[at]] = flat[at + 1] }
    scores
  end

  phash = to_hash.call(percentiles)
  fhash = to_hash.call(frequencies)

  total = phash.length
  phash.keys.each_with_index.map do |w, position|
    word_from_hash(:word => w, :percentile => phash[w].to_f, :index => position + 1,
                   :rank => total - position, :frequency => fhash[w].to_f)
  end
end
183
+
184
# Wrap an attribute hash in a Word object.
def word_from_hash(hash)
  word_class = CorrectHorseBatteryStaple::Word
  word_class.new(hash)
end
187
+
188
+ end