correct-horse-battery-staple 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data.tar.gz.sig +1 -1
  2. data/.gemtest +0 -0
  3. data/Gemfile +53 -0
  4. data/Gemfile.lock +109 -0
  5. data/History.txt +6 -0
  6. data/Manifest.txt +57 -0
  7. data/README.txt +115 -0
  8. data/Rakefile +47 -0
  9. data/bin/chbs +234 -0
  10. data/bin/chbs-mkpass +16 -0
  11. data/correct-horse-battery-staple.gemspec +59 -0
  12. data/lib/correct_horse_battery_staple.rb +117 -0
  13. data/lib/correct_horse_battery_staple/assembler.rb +45 -0
  14. data/lib/correct_horse_battery_staple/backend.rb +6 -0
  15. data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
  16. data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
  17. data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
  18. data/lib/correct_horse_battery_staple/corpus.rb +33 -0
  19. data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
  20. data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
  21. data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
  22. data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
  23. data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
  24. data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
  25. data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
  26. data/lib/correct_horse_battery_staple/generator.rb +40 -0
  27. data/lib/correct_horse_battery_staple/memoize.rb +25 -0
  28. data/lib/correct_horse_battery_staple/parser.rb +5 -0
  29. data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
  30. data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
  31. data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
  32. data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
  33. data/lib/correct_horse_battery_staple/stats.rb +22 -0
  34. data/lib/correct_horse_battery_staple/word.rb +90 -0
  35. data/lib/correct_horse_battery_staple/writer.rb +29 -0
  36. data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
  37. data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
  38. data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
  39. data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
  40. data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
  41. data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
  42. data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
  43. data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
  44. data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
  45. data/script/generate_all +34 -0
  46. data/script/load_redis +17 -0
  47. data/script/perftest +74 -0
  48. data/spec/corpus/serialized_spec.rb +62 -0
  49. data/spec/corpus_spec.rb +50 -0
  50. data/spec/correct_horse_battery_staple_spec.rb +73 -0
  51. data/spec/fixtures/100.json +101 -0
  52. data/spec/fixtures/corpus1.csv +101 -0
  53. data/spec/fixtures/corpus100.json +101 -0
  54. data/spec/fixtures/wiktionary1000.htm +648 -0
  55. data/spec/range_parser_spec.rb +54 -0
  56. data/spec/spec_helper.rb +20 -0
  57. data/spec/statistical_array_spec.rb +52 -0
  58. data/spec/support/spec_pry.rb +1 -0
  59. data/spec/word_spec.rb +95 -0
  60. metadata +264 -0
  61. metadata.gz.sig +1 -0
@@ -0,0 +1,88 @@
1
+ require 'bigdecimal'
2
+ require 'hiredis'
3
+ require 'redis'
4
+
5
+ class CorrectHorseBatteryStaple::Corpus::Redis2 < CorrectHorseBatteryStaple::Corpus::Redis
6
+ MAX_ITERATIONS = 1000
7
+
8
# Number of members in the sorted set stored at @percentile_key.
#
# The count is fetched from the database once and cached in @size for
# subsequent calls.
def size
  return @size if @size

  @size = db.zcard(@percentile_key)
end
11
+
12
+
13
+
14
+ ## our own collection operations
15
+ ## optimized pick implementations - they do NOT support :filter, though
16
+
17
# Pick +count+ random words, optionally constrained by range options.
#
# options[:percentile]  - Range of acceptable percentiles. A range running
#                         from 0 to 100 excludes nothing and is ignored.
# options[:word_length] - Range of acceptable word lengths.
#
# This optimized implementation does NOT support the :filter option.
#
# Without constraints the ids come from pick_random_words. With constraints a
# temporary key is built by subset_and_union, sampled with pick_sset_random,
# and always deleted again in the ensure clause.
#
# Returns whatever get_words_for_ids returns for the chosen ids.
def pick(count, options = {})
  percentile_range = options[:percentile]
  length_range     = options[:word_length]
  tempkey          = nil

  if percentile_range && percentile_range.begin == 0 && percentile_range.end == 100
    percentile_range = nil
  end

  # An earlier revision called pick_sset_random(@words_key, 4) at this point
  # and threw the result away; that only cost extra database commands, so it
  # has been dropped.

  if !percentile_range && !length_range
    return get_words_for_ids(pick_random_words(count))
  end

  sets = []
  sets << make_subset_spec(@percentile_key, percentile_range) if percentile_range

  # make_subset_spec is not used for the length constraint: according to the
  # original author, scores under @lenprod_key fall *between* integers (e.g.
  # in 18...19), so the upper exclusion must start at the next whole number
  # instead of just above range.end.
  if length_range
    sets << [@lenprod_key, ["-inf", "(#{length_range.begin}"],
             ["#{length_range.end.floor + 1}", "inf"]]
  end

  # subset_and_union returns the key of the combined temporary set
  tempkey = subset_and_union(sets)

  get_words_for_ids(pick_sset_random(tempkey, count))
ensure
  db.del tempkey if tempkey
end
50
+
51
# Build a subset spec for make_subset: the source key followed by the two
# score intervals that must be removed so that only scores inside +range+
# remain (everything below range.begin and everything above range.end, both
# bounds exclusive so the endpoints themselves are kept).
def make_subset_spec(key, range)
  below_range = ["-inf", "(#{range.begin}"]
  above_range = ["(#{range.end}", "inf"]
  [key, below_range, above_range]
end
54
+
55
# Materialize one subset spec into a temporary sorted set.
#
# spec - Array whose first element is the source key and whose remaining
#        elements are [min, max] score intervals to delete from the copy.
#        The array is NOT modified (the previous implementation consumed its
#        first element with #shift, silently mutating the caller's data).
#
# The source set is copied with ZUNIONSTORE under a key from gensym_temp,
# given a 180 second expiry, and then trimmed interval by interval.
#
# Returns the temporary key.
def make_subset(spec)
  source_key, *score_ranges = spec
  key = gensym_temp
  db.zunionstore(key, [source_key])
  db.expire(key, 180)
  score_ranges.each do |(min, max)|
    db.zremrangebyscore(key, min, max)
  end
  key
end
65
+
66
# Build one temporary subset per spec (see make_subset) and store their
# intersection under a fresh key from gensym_temp.
#
# Subset creation, the ZINTERSTORE and the deletion of the intermediate keys
# are all issued inside a single db.multi block; the 1800 second expiry on
# the result is set afterwards.
#
# Returns the key holding the combined set.
def subset_and_union(specs)
  result_key = gensym_temp
  db.multi do
    subset_keys = specs.map { |spec| make_subset(spec) }
    db.zinterstore(result_key, subset_keys)
    db.del(*subset_keys)
  end
  db.expire(result_key, 1800)
  result_key
end
78
+
79
# Draw +count+ members from the sorted set at +key+ by rank.
#
# The set's cardinality is read first; then, inside one db.multi block, a
# rank is drawn with random_number(cardinality) for each pick and fetched
# with a single-element ZRANGE. The same rank may be drawn more than once.
#
# Returns the flattened replies of the multi block.
def pick_sset_random(key, count)
  max = db.zcard(key)
  replies = db.multi do
    count.times.map do
      rank = random_number(max)
      db.zrange(key, rank, rank)
    end
  end
  replies.flatten
end
88
+ end
@@ -0,0 +1,121 @@
1
+ require 'bigdecimal'
2
+ require 'json'
3
+
4
+ class CorrectHorseBatteryStaple::Corpus::Serialized < CorrectHorseBatteryStaple::Corpus::Base
5
+ attr_reader :table
6
+
7
# CSV implementation used by read_csv: FasterCSV on Ruby 1.8, the standard
# library CSV everywhere else. Only the library that is actually selected
# gets required.
CSVLIB =
  if RUBY_VERSION.start_with? "1.8"
    require 'faster_csv'
    FasterCSV
  else
    require 'csv'
    CSV
  end
14
+
15
# Build a corpus around an in-memory word table.
#
# table - collection of words; it is sorted and then cast to a
#         CorrectHorseBatteryStaple::StatisticalArray.
# stats - optional hash of precomputed statistics; when present and
#         non-empty it is passed to load_stats_from_hash.
#
# The bare +super+ forwards both arguments to the parent initializer.
def initialize(table, stats = nil)
  super
  @table   = CorrectHorseBatteryStaple::StatisticalArray.cast(table.sort, true)
  @stats   = stats
  @filters = []

  load_stats_from_hash(stats) if stats && !stats.empty?

  # remember how many words the corpus started out with
  self.original_size = @table.size
end
27
+
28
+ ## some core Enumerable building blocks
29
+
30
# Yield every entry of the backing table to the given block.
def each(&block)
  table.each(&block)
end

# Number of entries in the backing table.
def size
  table.length
end

# All entries; this is the backing table object itself, not a copy.
def entries
  table
end

# Entries in sorted order. The table is sorted once when the corpus is
# constructed, so it can be returned as is.
def sorted_entries
  table
end
45
+
46
+ ## serialization
47
+ # reading
48
+
49
# Load a corpus from a CSV file with a header row.
#
# Each row is converted to a hash and wrapped in a
# CorrectHorseBatteryStaple::Word; no stats are passed to the constructor.
def self.read_csv(file)
  words = CSVLIB.table(file).map do |row|
    CorrectHorseBatteryStaple::Word.new(row.to_hash)
  end
  new words
end
52
+
53
# Load a corpus from a JSON file shaped like
# {"stats": {...}, "corpus": [{...}, ...]}.
#
# Every element of "corpus" becomes a CorrectHorseBatteryStaple::Word and the
# "stats" value is handed to the constructor unchanged.
#
# The file is read with File.read, which closes the handle itself; the
# previous open(file).read left the handle open until garbage collection and
# went through Kernel#open, which treats some path strings specially.
def self.read_json(file)
  json  = JSON.parse(File.read(file))
  words = json["corpus"].map { |hash| CorrectHorseBatteryStaple::Word.new(hash) }
  new(words, json["stats"])
end
58
+
59
# Load an object previously written with write_marshal.
#
# The file is opened in binary mode inside a block, so the handle is closed
# deterministically and the bytes are not altered by text-mode newline or
# encoding handling (the previous open(file).read did neither).
#
# SECURITY: Marshal.load can instantiate arbitrary objects. Only use this on
# files that come from a trusted source.
def self.read_marshal(file)
  data = File.open(file, "rb") { |io| io.read }
  Marshal.load(data)
end
62
+
63
# Load a corpus from +filename+ by dispatching to read_<format>.
#
# filename - path of the file to load.
# fformat  - format name; when omitted it is taken from the filename's
#            extension without the leading dot.
#
# Raises ArgumentError when no format was given and the filename has no
# extension.
def self.read(filename, fformat=nil)
  fformat ||= File.extname(filename)[1..-1]
  if !fformat || fformat.empty?
    raise ArgumentError, "Cannot determine file format for #{filename}"
  end
  send "read_#{fformat}", filename
end
70
+
71
+ # writing
72
+
73
# Write the table as CSV: one header line, then one line per word.
#
# io - any object responding to #puts.
#
# Missing numeric attributes (everything except rank) are written as 0. The
# word is emitted as a double-quoted field; embedded double quotes are
# doubled so that a word containing '"' no longer yields a malformed row.
def write_csv(io)
  io.puts "index,rank,word,frequency,percentile,distance,probability,distance_probability"
  @table.each_with_index do |w, index|
    escaped_word = w.word.to_s.gsub('"', '""')
    io.puts sprintf("%d,%d,\"%s\",%d,%.4f,%.6f,%.8f,%.8f\n",
                    index, w.rank, escaped_word, w.frequency || 0,
                    w.percentile || 0, w.distance || 0, w.probability || 0, w.distance_probability || 0)
  end
end
81
+
82
# Serialize stats and the whole table as one JSON document in a single
# to_json call (the complete string is built in memory before it is written).
def write_json1(io)
  document = { "stats" => stats, "corpus" => @table }
  io.write(document.to_json)
end
85
+
86
# Stream the corpus as JSON, one word object per line.
#
# Unlike write_json1 the words are serialized one at a time (via to_hash),
# so the full document never has to exist as a single string.
def write_json(io)
  io.print '{"stats": '
  io.print stats.to_json
  io.print ', "corpus": ['
  @table.each_with_index do |word, position|
    # separator goes before every word except the first
    io.puts "," unless position.zero?
    io.print(word.to_hash.to_json)
  end
  io.puts "]\n}"
end
98
+
99
# Write this object to +io+ in Ruby's Marshal format (the counterpart of
# read_marshal).
def write_marshal(io)
  payload = Marshal.dump(self)
  io.write payload
end
102
+
103
# Write the sorted entries as fixed-width records with no separators: the
# word left-justified in 40 columns followed by the frequency (0 when
# missing) right-justified in 10 columns.
#
# Words longer than 40 characters are skipped.
def write_isam(io)
  sorted_entries.each do |w|
    next if w.word.length > 40

    io.print sprintf("%-40s%10d", w.word, w.frequency || 0)
  end
end
109
+
110
# Write the corpus to +io+ by dispatching to write_<format>.
#
# Unlike the class-level read there is no filename to infer the format
# from, so +fformat+ is effectively required.
#
# Raises ArgumentError when +fformat+ is nil or empty.
def write(io, fformat=nil)
  format_missing = !fformat || fformat.empty?
  raise ArgumentError, "Cannot determine file format for output" if format_missing

  send "write_#{fformat}", io
end
114
+
115
+ protected
116
+
117
+ def method_missing(name, *args, &block)
118
+ @table.__send__(name, *args, &block)
119
+ end
120
+
121
+ end
@@ -0,0 +1,266 @@
1
+ require 'bigdecimal'
2
+ require 'sqlite3'
3
+
4
+ class CorrectHorseBatteryStaple::Corpus::Sqlite < CorrectHorseBatteryStaple::Corpus::Base
5
+ MAX_ITERATIONS = 1000
6
+
7
# Open the SQLite database at +file+ and load the corpus statistics from it.
#
# The bare +super+ forwards +file+ to the parent initializer. @statements
# collects every prepared statement so that #close can release them.
def initialize(file)
  super
  @db         = SQLite3::Database.open(file)
  @statements = []
  load_stats
end
13
+
14
# Convenience constructor: build a corpus for the database at +file+.
def self.read(file)
  new(file)
end
17
+
18
+ ## some core Enumerable building blocks
19
+
20
# Yield every entry to the given block (loads all entries on first use).
def each(&block)
  entries.each(&block)
end

# Row count of the entries table, queried once and then cached.
def size
  return @size if @size

  @size = @db.execute("select count(*) from entries").first.first
end

## our own collection operations

# Every word in the corpus, built by #table on first use and cached.
def entries
  @entries ||= table
end

# Same list as #entries; #table already orders its rows by frequency.
def sorted_entries
  entries
end

# The frequency column of every row in the entries table, cached after the
# first query.
def frequencies
  @frequencies ||= @db.execute("select frequency from entries").map(&:first)
end
43
+
44
+
45
+ ## optimized pick variants - they do NOT support :filter, though
46
+
47
# Pick +count+ random words using one of the pick_<strategy> methods.
#
# options[:strategy] selects the implementation; when absent the
# 'pick_strategy' environment variable is consulted, and "discrete" is the
# final fallback. The remaining options are passed on to the strategy.
#
# The caller's hash is left untouched: :strategy is removed from a private
# copy (previously it was deleted from the argument itself).
#
# Raises NotImplementedError when options[:filter] is given.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "SQLite does not support :filter option" if options[:filter]

  options  = options.dup
  strategy = options.delete(:strategy) || ENV['pick_strategy'] || "discrete"
  send("pick_#{strategy}", count, options)
end
54
+
55
# Pick strategy that queries the index3d table on its R (random), L (length)
# and P (percentile) bounds.
#
# A random point is drawn with random_number and a window of width 0.4 is
# placed around it; when the point is above 0.8 or below 0.2 the window is
# shifted so that it is centred on 0.8 or 0.2 respectively. At most
# max(count, 250) candidate ids are fetched, +count+ of them are sampled and
# turned into words with get_words_for_ids.
#
# Raises RuntimeError when fewer than +count+ words were found.
def pick_rtree(count, options = {})
  rnd = random_number
  offset = if rnd > 0.8
             0.8-rnd
           elsif rnd < 0.2
             0.2-rnd
           else
             0.0
           end

  wheres = ["minR >= ? and maxR <= ?"]
  params = [rnd - 0.20 + offset, rnd + 0.20 + offset]

  [[:word_length, " minL >= ? and maxL <= ? "],
   [:percentile,  " minP >= ? and maxP <= ? "]].each do |option, clause|
    range = options[option]
    next unless range

    wheres << clause
    params.push(range.first, range.last)
  end

  # wheres always holds at least the minR/maxR clause, so WHERE is unconditional
  statement = ["select id from index3d ",
               " WHERE " + wheres.join(" AND "),
               "limit ?"].join(" ")
  params << [count,250].max

  candidates = prepare(statement).execute!(*params)
  ids = array_sample(candidates, count).map {|r| r[0]}

  result = ids.empty? ? [] : get_words_for_ids(ids)

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
97
+
98
# Fetch the words for the given row ids.
#
# ids - a single id or an array of ids (nil is treated as an empty list).
#
# Returns one word per requested id, in the order requested and with
# repeats preserved. Ids that have no row in the entries table are skipped;
# previously such an id produced a nil row that crashed word_from_row, now
# the shorter result lets callers raise their own "Cannot find ..." error.
#
# Rows are indexed by id once instead of scanning the result set for every
# id. The ids are interpolated into the SQL text, so they must come from the
# database itself and never from untrusted input.
def get_words_for_ids(ids)
  ids = Array(ids)
  return [] if ids.empty?

  rows = @db.execute("select #{COLUMNS.join(", ")} from entries where id in (#{ids.join(',')})")

  rows_by_id = {}
  rows.each { |row| rows_by_id[row[0]] ||= row }

  ids.map { |id| rows_by_id[id] }.compact.map { |row| word_from_row(row) }
end
108
+
109
+
110
# Pick strategy that lets SQLite shuffle: select full rows matching the
# optional :word_length / :percentile ranges, ordered by RANDOM() and limited
# to max(count, 20) rows, then sample +count+ of them.
#
# Raises RuntimeError when fewer than +count+ rows were available.
def pick_standard(count, options = {})
  wheres = []
  params = []

  [[:word_length, " wordlength >= ? and wordlength <= ? "],
   [:percentile,  " percentile >= ? and percentile <= ? "]].each do |option, clause|
    range = options[option]
    next unless range

    wheres << clause
    params.push(range.first, range.last)
  end

  where_sql = wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")
  statement = ["select #{COLUMNS.join(", ")} from entries ",
               where_sql,
               "order by RANDOM()",
               "limit ?"].join(" ")
  params << [count, 20].max

  rows   = prepare(statement).execute!(*params)
  result = array_sample(rows, count).map { |row| word_from_row(row) }

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
136
+
137
# Variant of pick_standard that shuffles in Ruby instead of in SQL: fetch up
# to max(count, 1000) matching ids WITHOUT a random ordering (the
# "order by RANDOM()" clause is deliberately left out of the statement),
# sample +count+ of them and load the words with get_words_for_ids.
#
# Raises RuntimeError when fewer than +count+ words were found.
def pick_standard2(count, options = {})
  wheres = []
  params = []

  [[:word_length, " wordlength >= ? and wordlength <= ? "],
   [:percentile,  " percentile >= ? and percentile <= ? "]].each do |option, clause|
    range = options[option]
    next unless range

    wheres << clause
    params.push(range.first, range.last)
  end

  where_sql = wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")
  statement = ["select id from entries ",
               where_sql,
               "limit ?"].join(" ")
  params << [count, 1000].max

  id_rows = prepare(statement).execute!(*params)
  ids     = array_sample(id_rows, count).map {|r| r[0]}

  result = get_words_for_ids(ids)

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
165
+
166
+
167
+ # discrete method
168
# Pick strategy that repeatedly draws a random (percentile, word length)
# pair from the allowed ranges and asks _pick_discrete_n for one matching
# row.
#
# options[:percentile]  - Range, defaults to 0..100.
# options[:word_length] - Range, defaults to 4..12.
#
# The defaults are applied with ||. The previous `x = options[...] or default`
# parsed as `(x = options[...]) or default`, so the default was never
# assigned and a missing option left the range nil.
#
# At least 4 draws are always made; drawing stops once +count+ rows have
# been collected or MAX_ITERATIONS draws have been made.
#
# Raises RuntimeError when fewer than +count+ rows were collected.
def pick_discrete(count, options = {})
  p_range = options[:percentile] || (0..100)
  l_range = options[:word_length] || (4..12)

  result = []
  iterations = 0
  while (iterations < 4 || result.length < count) && iterations < MAX_ITERATIONS
    percentile = random_in_range(p_range)
    length = random_in_range(l_range)
    result += _pick_discrete_n(percentile, length, 1)
    iterations += 1
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  array_sample(result, count).map {|row| word_from_row(row)}
end
186
+
187
# Compile +statement+ on the database connection and remember the handle in
# @statements so that #close can release it later.
#
# The method is wrapped with `memoize` right after its definition;
# presumably that caches the handle per SQL string - confirm against the
# Memoize implementation.
def prepare(statement)
  prepared = @db.prepare(statement)
  @statements.push(prepared)
  prepared
end
memoize :prepare
193
+
194
# Fetch up to +count+ full rows that have exactly the given percentile and
# word length and whose randunit column is below a freshly drawn
# random_number.
#
# Returns the raw result rows (arrays in COLUMNS order).
def _pick_discrete_n(percentile, length, count = 1)
  sql = "select #{COLUMNS.join(", ")} from entries where " \
        " percentile = ? and wordlength = ? and randunit < ? limit ?"

  prepare(sql).execute!(percentile, length, random_number, count)
end
200
+
201
+
202
+ # discrete method
203
# Variant of pick_discrete that collects ids in batches of up to 25 per draw
# (via _pick_discrete_n_ids), de-duplicates them, samples +count+ of them
# and only then loads the words with get_words_for_ids.
#
# options[:percentile]  - Range, defaults to 0..100.
# options[:word_length] - Range, defaults to 4..12.
#
# The defaults are applied with ||. The previous `x = options[...] or default`
# parsed as `(x = options[...]) or default`, so the default was never
# assigned and a missing option left the range nil.
#
# At least 3 draws are always made; drawing stops once +count+ ids have been
# collected or MAX_ITERATIONS draws have been made.
#
# Raises RuntimeError when fewer than +count+ words were found.
def pick_discrete2(count, options = {})
  p_range = options[:percentile] || (0..100)
  l_range = options[:word_length] || (4..12)

  ids = []
  iterations = 0
  while (iterations < 3 || ids.length < count) && iterations < MAX_ITERATIONS
    percentile = random_in_range(p_range)
    length = random_in_range(l_range)
    ids = ids.concat(_pick_discrete_n_ids(percentile, length, 25)).uniq
    iterations += 1
  end

  # each collected element is a one-column result row; unwrap the id
  ids = array_sample(ids, count).map {|r| r[0] }
  result = get_words_for_ids(ids)

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count
  result
end
224
+
225
# NOTE: #prepare is already defined and memoized earlier in this class. The
# byte-identical second definition (and second `memoize :prepare`) that used
# to live here only redefined and re-wrapped the same method, so it has been
# removed.
231
+
232
# Fetch up to +count+ id rows that have exactly the given percentile and
# word length and whose randunit column is above a freshly drawn
# random_number.
#
# Returns the raw result rows (one-element arrays holding the id).
def _pick_discrete_n_ids(percentile, length, count = 1)
  sql = "select id from entries where " \
        " percentile = ? and wordlength = ? and randunit > ? limit ?"

  prepare(sql).execute!(percentile, length, random_number, count)
end
238
+
239
+
240
+
241
# Release every prepared statement recorded by #prepare, then let the parent
# class finish closing.
def close
  @statements.each(&:close)
  super
end
245
+
246
+ protected
247
+
248
# Columns selected from the entries table, in the positional order that
# word_from_row relies on. Frozen so the shared list cannot be modified by
# accident.
COLUMNS = %w[id word frequency idx rank percentile].freeze
249
+
250
# Load every row of the entries table, ordered by frequency, and convert
# each one to a word with word_from_row.
def table
  rows = @db.execute("select #{COLUMNS.join(", ")} from entries order by frequency")
  rows.map { |row| word_from_row(row) }
end
255
+
256
# Convert a result row (laid out in COLUMNS order) into a
# CorrectHorseBatteryStaple::Word. Position 0, the id, is not passed on.
def word_from_row(row)
  word, frequency, index, rank, percentile = row.values_at(1, 2, 3, 4, 5)
  CorrectHorseBatteryStaple::Word.new(:word => word, :frequency => frequency,
                                      :index => index, :rank => rank,
                                      :percentile => percentile)
end
261
+
262
# Read the name/value pairs from the stats table, convert every value to a
# Float and hand the resulting hash to load_stats_from_hash.
def load_stats
  stats = {}
  @db.execute("select name, value from stats").each do |(key, val)|
    stats[key] = val.to_f
  end
  load_stats_from_hash(stats)
end
266
+ end