correct-horse-battery-staple 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +1 -1
- data/.gemtest +0 -0
- data/Gemfile +53 -0
- data/Gemfile.lock +109 -0
- data/History.txt +6 -0
- data/Manifest.txt +57 -0
- data/README.txt +115 -0
- data/Rakefile +47 -0
- data/bin/chbs +234 -0
- data/bin/chbs-mkpass +16 -0
- data/correct-horse-battery-staple.gemspec +59 -0
- data/lib/correct_horse_battery_staple.rb +117 -0
- data/lib/correct_horse_battery_staple/assembler.rb +45 -0
- data/lib/correct_horse_battery_staple/backend.rb +6 -0
- data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
- data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
- data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
- data/lib/correct_horse_battery_staple/corpus.rb +33 -0
- data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
- data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
- data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
- data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
- data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
- data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
- data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
- data/lib/correct_horse_battery_staple/generator.rb +40 -0
- data/lib/correct_horse_battery_staple/memoize.rb +25 -0
- data/lib/correct_horse_battery_staple/parser.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
- data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
- data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
- data/lib/correct_horse_battery_staple/stats.rb +22 -0
- data/lib/correct_horse_battery_staple/word.rb +90 -0
- data/lib/correct_horse_battery_staple/writer.rb +29 -0
- data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
- data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
- data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
- data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
- data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
- data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
- data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
- data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
- data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
- data/script/generate_all +34 -0
- data/script/load_redis +17 -0
- data/script/perftest +74 -0
- data/spec/corpus/serialized_spec.rb +62 -0
- data/spec/corpus_spec.rb +50 -0
- data/spec/correct_horse_battery_staple_spec.rb +73 -0
- data/spec/fixtures/100.json +101 -0
- data/spec/fixtures/corpus1.csv +101 -0
- data/spec/fixtures/corpus100.json +101 -0
- data/spec/fixtures/wiktionary1000.htm +648 -0
- data/spec/range_parser_spec.rb +54 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/statistical_array_spec.rb +52 -0
- data/spec/support/spec_pry.rb +1 -0
- data/spec/word_spec.rb +95 -0
- metadata +264 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'hiredis'
|
3
|
+
require 'redis'
|
4
|
+
|
5
|
+
# Redis-backed corpus that answers +pick+ with server-side sorted-set
# operations rather than by walking the word list in Ruby.
#
# The class relies on helpers and state it does not define itself (+db+,
# +gensym_temp+, +random_number+, +get_words_for_ids+, +pick_random_words+,
# @percentile_key, @lenprod_key, @words_key). Presumably they are supplied by
# Corpus::Redis -- confirm there.
class CorrectHorseBatteryStaple::Corpus::Redis2 < CorrectHorseBatteryStaple::Corpus::Redis
  MAX_ITERATIONS = 1000

  # Cardinality of the percentile sorted set; fetched once, then cached.
  def size
    @size ||= db.zcard(@percentile_key)
  end

  ## our own collection operations
  ## optimized pick implementations - they do NOT support :filter, though

  # Picks +count+ random words, optionally narrowed by range options.
  #
  # count   - number of words wanted
  # options - Hash; the keys read here are:
  #           :percentile  - Range of acceptable percentiles; 0..100 is
  #                          treated the same as not passing the option
  #           :word_length - Range of acceptable word lengths
  #
  # Returns whatever +get_words_for_ids+ returns for the chosen ids. The
  # temporary result key created on the filtered path is deleted on the way
  # out, including when an exception is raised.
  def pick(count, options = {})
    percentile_range = options[:percentile]
    length_range     = options[:word_length]
    tempkey          = nil

    # A 0..100 percentile window removes nothing, so skip building a subset.
    if percentile_range && percentile_range.begin == 0 && percentile_range.end == 100
      percentile_range = nil
    end

    # NOTE(review): the result of this call is discarded. It looks like
    # leftover warm-up/debug code, but it is kept because it still issues
    # Redis commands and consumes random numbers -- confirm before removing.
    pick_sset_random(@words_key, 4)

    if !percentile_range && !length_range
      get_words_for_ids(pick_random_words(count))
    else
      specs = []
      specs << make_subset_spec(@percentile_key, percentile_range) if percentile_range

      # make_subset_spec is not reused for lengths: per the original note,
      # lenprod_key holds scores in the range 18...19 for a given length, so
      # the upper cut has to start at the next whole number.
      if length_range
        specs << [@lenprod_key,
                  ["-inf", "(#{length_range.begin}"],
                  ["#{length_range.end.floor + 1}", "inf"]]
      end

      # key of the set holding the intersection of all filtered subsets
      tempkey = subset_and_union(specs)
      # STDERR.puts "result count in #{tempkey} is #{db.zcard(tempkey)}"

      get_words_for_ids(pick_sset_random(tempkey, count))
    end
  ensure
    db.del tempkey if tempkey
  end

  # Builds a subset spec: the source key followed by the two score intervals
  # to REMOVE (everything strictly below range.begin and everything strictly
  # above range.end; the "(" prefix marks an exclusive bound).
  def make_subset_spec(key, range)
    [key, ["-inf", "(#{range.begin}"], ["(#{range.end}", "inf"]]
  end

  # Copies the spec's source sorted set into a fresh temporary key, then
  # removes every score interval listed in the spec from the copy.
  #
  # spec - [source_key, [min, max], ...]; left unmodified (the previous
  #        implementation shifted the key off the caller's array)
  #
  # Returns the temporary key, which is given a 180 second expiry.
  def make_subset(spec)
    source_key, *score_ranges = spec
    key = gensym_temp
    db.zunionstore(key, [source_key])
    db.expire(key, 180)
    score_ranges.each do |(min, max)|
      db.zremrangebyscore(key, min, max)
    end
    key
  end

  # Materialises each spec with make_subset and stores the INTERSECTION of
  # the resulting sets (ZINTERSTORE, despite the method name) under a new
  # temporary key. The per-spec subsets are deleted inside the same MULTI.
  #
  # Returns the result key, which is given an 1800 second expiry.
  def subset_and_union(specs)
    result_key = gensym_temp
    db.multi do
      keys = specs.map do |spec|
        make_subset(spec)
      end
      db.zinterstore(result_key, keys)
      db.del(*keys)
    end
    db.expire(result_key, 1800)
    result_key
  end

  # Reads +count+ members from the sorted set at +key+, each at a rank
  # obtained from random_number(cardinality). All ZRANGE calls are issued
  # inside one MULTI; the replies are flattened into a single array.
  def pick_sset_random(key, count)
    max = db.zcard(key)
    replies = db.multi do
      count.times.map do
        rank = random_number(max)
        db.zrange(key, rank, rank)
      end
    end
    replies.flatten
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# In-memory corpus backed by a sorted table of words, with readers and
# writers for the CSV, JSON, Marshal and fixed-width ("isam") formats.
#
# Methods this class does not define are forwarded to the underlying table
# (see method_missing at the bottom).
class CorrectHorseBatteryStaple::Corpus::Serialized < CorrectHorseBatteryStaple::Corpus::Base
  attr_reader :table

  # Use FasterCSV on Ruby 1.8 and the stdlib CSV everywhere else.
  if RUBY_VERSION.start_with? "1.8"
    require 'faster_csv'
    CSVLIB = FasterCSV
  else
    require 'csv'
    CSVLIB = CSV
  end

  # table - collection of words; it is sorted and cast to a StatisticalArray
  # stats - optional Hash of precomputed statistics; when present and
  #         non-empty it is handed to load_stats_from_hash
  def initialize(table, stats = nil)
    # bare super: forwards (table, stats) unchanged to the base class
    super
    @table   = CorrectHorseBatteryStaple::StatisticalArray.cast(table.sort, true)
    @stats   = stats
    @filters = []

    load_stats_from_hash(stats) if stats && !stats.empty?

    self.original_size = @table.size
  end

  ## some core Enumerable building blocks

  # Yields every word in table order.
  def each(&block)
    table.each(&block)
  end

  # Number of words currently in the table.
  def size
    table.length
  end

  # The table itself (not a copy).
  def entries
    table
  end

  # Same object as +entries+; the table is sorted once in the constructor.
  def sorted_entries
    table
  end

  ## serialization
  # reading

  # Builds a corpus from a CSV file with a header row; each row's hash is
  # passed to Word.new.
  def self.read_csv(file)
    new(CSVLIB.table(file).map { |row| CorrectHorseBatteryStaple::Word.new(row.to_hash) })
  end

  # Builds a corpus from a JSON document with "corpus" and "stats" keys.
  #
  # File.read is used instead of Kernel#open so the handle is closed promptly
  # and a path beginning with "|" is never executed as a command.
  def self.read_json(file)
    json  = JSON.parse(File.read(file))
    words = json["corpus"].map { |hash| CorrectHorseBatteryStaple::Word.new(hash) }
    new(words, json["stats"])
  end

  # Loads a corpus previously written by write_marshal.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects, so only call
  # this on files from a trusted source.
  def self.read_marshal(file)
    # Binary mode keeps the marshalled bytes intact; the block form closes
    # the file even if loading raises.
    File.open(file, "rb") { |io| Marshal.load(io.read) }
  end

  # Reads +filename+ by dispatching to read_<fformat>.
  #
  # fformat - format name; defaults to the file extension without the dot
  #
  # Raises ArgumentError when no format is given and the file name has no
  # usable extension.
  def self.read(filename, fformat=nil)
    fformat ||= File.extname(filename)[1..-1]
    raise ArgumentError, "Cannot determine file format for #{filename}" if !fformat || fformat.empty?
    send "read_#{fformat}", filename
  end

  # writing

  # Writes a header line followed by one line per word. Missing numeric
  # attributes (except rank) are written as 0.
  def write_csv(io)
    io.puts "index,rank,word,frequency,percentile,distance,probability,distance_probability"
    @table.each_with_index do |w, index|
      # Double any embedded quote so the quoted word field stays valid CSV.
      quoted_word = w.word.to_s.gsub('"', '""')
      io.puts sprintf("%d,%d,\"%s\",%d,%.4f,%.6f,%.8f,%.8f\n",
                      index, w.rank, quoted_word, w.frequency || 0,
                      w.percentile || 0, w.distance || 0, w.probability || 0, w.distance_probability || 0)
    end
  end

  # Writes the whole corpus as a single JSON object built in memory.
  def write_json1(io)
    io.write({"stats" => stats, "corpus" => @table }.to_json)
  end

  # Writes the same structure as write_json1, but streams the words one at a
  # time (one per line) instead of building the full document first.
  def write_json(io)
    io.print '{"stats": '
    io.print stats.to_json
    io.print ', "corpus": ['
    @table.each_with_index do |word, position|
      io.puts "," if position >= 1
      io.print(word.to_hash.to_json)
    end
    io.puts "]\n}"
  end

  # Dumps this corpus object with Marshal.
  def write_marshal(io)
    io.write Marshal.dump(self)
  end

  # Writes fixed-width records with no separator between them: the word
  # left-justified in 40 columns, then the frequency right-justified in 10.
  # Words longer than 40 characters are skipped.
  def write_isam(io)
    sorted_entries.each do |w|
      next if w.word.length > 40
      io.print sprintf("%-40s%10d", w.word, w.frequency || 0)
    end
  end

  # Writes the corpus to +io+ by dispatching to write_<fformat>.
  #
  # Raises ArgumentError when no format is given.
  def write(io, fformat=nil)
    raise ArgumentError, "Cannot determine file format for output" if !fformat || fformat.empty?
    send "write_#{fformat}", io
  end

  protected

  # Forwards any method this class does not define to the underlying table.
  def method_missing(name, *args, &block)
    @table.__send__(name, *args, &block)
  end

end
|
@@ -0,0 +1,266 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'sqlite3'
|
3
|
+
|
4
|
+
# Corpus stored in a SQLite database. Reads the +entries+, +stats+ and
# +index3d+ tables and offers several interchangeable strategies for picking
# random words.
#
# +random_number+, +random_in_range+, +array_sample+, +load_stats_from_hash+
# and the +memoize+ macro are not defined in this class (presumably they come
# from Corpus::Base -- confirm there).
class CorrectHorseBatteryStaple::Corpus::Sqlite < CorrectHorseBatteryStaple::Corpus::Base
  MAX_ITERATIONS = 1000

  # Columns fetched for a full row, in the order word_from_row indexes them.
  COLUMNS = %w[id word frequency idx rank percentile]

  # Opens the database at +file+ and loads the stored statistics.
  def initialize(file)
    # bare super: forwards +file+ unchanged to the base class
    super
    @db = SQLite3::Database.open file
    @statements = []
    load_stats
  end

  # Alternate constructor, matching the +read+ entry point of other corpora.
  def self.read(file)
    self.new file
  end

  ## some core Enumerable building blocks

  # Yields every word; loads (and caches) the whole table on first use.
  def each(&block)
    entries.each(&block)
  end

  # Row count of the entries table; fetched once, then cached.
  def size
    @size ||= @db.execute("select count(*) from entries").first.first
  end

  ## our own collection operations

  # All words ordered by frequency; loaded on first call and cached.
  def entries
    @entries ||= table
  end

  # Same as +entries+, which is already ordered by the query.
  def sorted_entries
    entries
  end

  # The frequency column of every entry; fetched once, then cached.
  def frequencies
    @frequencies ||= @db.execute("select frequency from entries").map {|x| x.first}
  end

  ## optimized pick variants - they do NOT support :filter, though

  # Picks +count+ random words using the selected strategy.
  #
  # options - Hash; keys read here:
  #           :filter   - not supported, raises NotImplementedError
  #           :strategy - suffix of the pick_<strategy> method to call; falls
  #                       back to ENV['pick_strategy'], then to "discrete"
  #           Remaining keys are passed through to the strategy method.
  #
  # The caller's hash is left untouched (the previous implementation deleted
  # :strategy from it).
  def pick(count, options = {})
    # incompat check
    raise NotImplementedError, "SQLite does not support :filter option" if options[:filter]

    options  = options.dup
    strategy = options.delete(:strategy) || ENV['pick_strategy'] || "discrete"
    send("pick_#{strategy}", count, options)
  end

  # Strategy that queries the +index3d+ table.
  #
  # Selects up to max(count, 250) ids whose R bounds lie in a window 0.4 wide
  # around a random number (the window is shifted to stay within 0.0..1.0,
  # assuming random_number returns a value in that range -- TODO confirm),
  # optionally constrained on the L bounds by :word_length and on the P
  # bounds by :percentile, then samples +count+ of them.
  #
  # Raises RuntimeError when fewer than +count+ words are found.
  def pick_rtree(count, options = {})
    base   = "select id from index3d "
    wheres = []
    params = []

    wheres << "minR >= ? and maxR <= ?"
    rnd    = random_number
    offset = 0.0
    if rnd > 0.8
      offset = 0.8-rnd
    elsif rnd < 0.2
      offset = 0.2-rnd
    end
    params += [rnd - 0.20 + offset, rnd + 0.20 + offset]

    if options[:word_length]
      wheres << " minL >= ? and maxL <= ? "
      params += [options[:word_length].first, options[:word_length].last]
    end
    if options[:percentile]
      wheres << " minP >= ? and maxP <= ? "
      params += [options[:percentile].first, options[:percentile].last]
    end
    statement = [base,
                 (wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")),
                 "limit ?"].join(" ")
    params += [[count,250].max]

    query = prepare(statement)
    ids   = array_sample(query.execute!(*params), count).map {|r| r[0]}

    result = (ids and !ids.empty?) ? get_words_for_ids(ids) : []

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count

    result
  end

  # Fetches the entries for +ids+ and returns them as Words in the same
  # order as +ids+ (duplicates in +ids+ yield duplicate words).
  #
  # Ids with no matching row are skipped rather than crashing on a nil row;
  # the pick_* callers then report the shortfall through their own checks.
  # The ids are interpolated into the SQL text, so they must come straight
  # from the database, as they do in every caller in this class.
  def get_words_for_ids(ids)
    ids = Array(ids)
    return [] if ids.empty?

    rows = @db.execute("select #{COLUMNS.join(", ")} from entries where id in (#{ids.join(',')})")

    # One pass to index the rows instead of scanning them once per id.
    rows_by_id = {}
    rows.each { |row| rows_by_id[row[0]] ||= row }

    ids.map { |id| rows_by_id[id] }.compact.map { |row| word_from_row(row) }
  end

  # Strategy that lets SQLite shuffle: selects max(count, 20) matching rows
  # ordered by RANDOM() and samples +count+ of them.
  #
  # Raises RuntimeError when fewer than +count+ words are found.
  def pick_standard(count, options = {})
    wheres, params = entries_range_conditions(options)
    statement = ["select #{COLUMNS.join(", ")} from entries ",
                 (wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")),
                 "order by RANDOM()",
                 "limit ?"].join(" ")
    params << [count, 20].max
    query  = prepare(statement)
    result = array_sample(query.execute!(*params), count).
      map { |row| word_from_row(row) }

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count

    result
  end

  # Strategy that selects up to max(count, 1000) matching ids WITHOUT any
  # ordering clause, samples +count+ of them in Ruby, then loads those rows.
  #
  # Raises RuntimeError when fewer than +count+ words are found.
  def pick_standard2(count, options = {})
    wheres, params = entries_range_conditions(options)
    statement = ["select id from entries ",
                 (wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")),
                 # "order by RANDOM()",
                 "limit ?"].join(" ")
    params << [count, 1000].max
    query = prepare(statement)
    ids   = array_sample(query.execute!(*params), count).
      map {|r| r[0]}

    result = get_words_for_ids(ids)

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count

    result
  end

  # discrete method
  #
  # Repeatedly draws a random percentile and word length and fetches at most
  # one matching row per draw, for at least 4 and at most MAX_ITERATIONS
  # draws, then samples +count+ of the collected rows.
  #
  # options[:percentile]  - Range to draw percentiles from (default 0..100)
  # options[:word_length] - Range to draw lengths from (default 4..12)
  #
  # Raises RuntimeError when fewer than +count+ rows were collected.
  def pick_discrete(count, options = {})
    # `||`, not `or`: `x = a or b` parses as `(x = a) or b`, which silently
    # dropped these defaults.
    p_range = options[:percentile]  || (0..100)
    l_range = options[:word_length] || (4..12)

    result     = []
    iterations = 0
    while (iterations < 4 || result.length < count) && iterations < MAX_ITERATIONS
      percentile = random_in_range(p_range)
      length     = random_in_range(l_range)
      result    += _pick_discrete_n(percentile, length, 1)
      iterations += 1
    end

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count

    array_sample(result, count).map {|row| word_from_row(row)}
  end

  # Compiles +statement+ and records it in @statements so that +close+ can
  # release it. Wrapped by +memoize+ below.
  def prepare(statement)
    res = @db.prepare(statement)
    @statements << res
    res
  end
  memoize :prepare

  # Fetches up to +count+ full rows with exactly this percentile and word
  # length whose randunit is below a fresh random number.
  def _pick_discrete_n(percentile, length, count = 1)
    statement = prepare "select #{COLUMNS.join(", ")} from entries where " +
      " percentile = ? and wordlength = ? and randunit < ? limit ?"

    statement.execute!(percentile, length, random_number, count)
  end

  # discrete method
  #
  # Like pick_discrete, but collects unique id rows (up to 25 per draw, at
  # least 3 draws), samples +count+ of them and loads the words afterwards.
  #
  # options[:percentile]  - Range to draw percentiles from (default 0..100)
  # options[:word_length] - Range to draw lengths from (default 4..12)
  #
  # Raises RuntimeError when fewer than +count+ words are found.
  def pick_discrete2(count, options = {})
    # `||`, not `or` -- see pick_discrete.
    p_range = options[:percentile]  || (0..100)
    l_range = options[:word_length] || (4..12)

    ids        = []
    iterations = 0
    while (iterations < 3 || ids.length < count) && iterations < MAX_ITERATIONS
      percentile = random_in_range(p_range)
      length     = random_in_range(l_range)
      ids = ids.concat(_pick_discrete_n_ids(percentile, length, 25)).uniq
      iterations += 1
    end

    ids    = array_sample(ids, count).map {|r| r[0] }
    result = get_words_for_ids(ids)

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count
    result
  end

  # Fetches up to +count+ id rows with exactly this percentile and word
  # length whose randunit is above a fresh random number.
  # NOTE(review): _pick_discrete_n compares with `<` where this uses `>`;
  # confirm the difference is intended.
  def _pick_discrete_n_ids(percentile, length, count = 1)
    statement = prepare "select id from entries where " +
      " percentile = ? and wordlength = ? and randunit > ? limit ?"

    statement.execute!(percentile, length, random_number, count)
  end

  # Closes every statement created through +prepare+, then defers to the
  # base class.
  # NOTE(review): @db itself is not closed here; confirm whether the base
  # class takes care of it.
  def close
    @statements.each { |x| x.close }
    super
  end

  protected

  # Builds the WHERE fragments and matching bind values for the :word_length
  # and :percentile options against the +entries+ table.
  #
  # Returns [wheres, params]; both are empty when neither option is set.
  def entries_range_conditions(options)
    wheres = []
    params = []
    if options[:word_length]
      wheres << " wordlength >= ? and wordlength <= ? "
      params += [options[:word_length].first, options[:word_length].last]
    end
    if options[:percentile]
      wheres << " percentile >= ? and percentile <= ? "
      params += [options[:percentile].first, options[:percentile].last]
    end
    [wheres, params]
  end

  # Loads every entry, ordered by frequency, as Word objects.
  def table
    @db.execute("select #{COLUMNS.join(", ")} from entries order by frequency").map do |row|
      word_from_row(row)
    end
  end

  # Converts a row selected with COLUMNS into a Word (the id at index 0 is
  # not passed on).
  def word_from_row(row)
    CorrectHorseBatteryStaple::Word.new(:word => row[1], :frequency => row[2],
                                        :index => row[3], :rank => row[4],
                                        :percentile => row[5])
  end

  # Reads the name/value pairs of the +stats+ table, converts each value to
  # a Float and passes the resulting hash to load_stats_from_hash.
  def load_stats
    rows  = @db.execute "select name, value from stats"
    stats = rows.inject({}) do |memo, (name, value)|
      memo[name] = value.to_f
      memo
    end
    load_stats_from_hash(stats)
  end
end
|