correct-horse-battery-staple 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +1 -1
- data/.gemtest +0 -0
- data/Gemfile +53 -0
- data/Gemfile.lock +109 -0
- data/History.txt +6 -0
- data/Manifest.txt +57 -0
- data/README.txt +115 -0
- data/Rakefile +47 -0
- data/bin/chbs +234 -0
- data/bin/chbs-mkpass +16 -0
- data/correct-horse-battery-staple.gemspec +59 -0
- data/lib/correct_horse_battery_staple.rb +117 -0
- data/lib/correct_horse_battery_staple/assembler.rb +45 -0
- data/lib/correct_horse_battery_staple/backend.rb +6 -0
- data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
- data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
- data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
- data/lib/correct_horse_battery_staple/corpus.rb +33 -0
- data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
- data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
- data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
- data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
- data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
- data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
- data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
- data/lib/correct_horse_battery_staple/generator.rb +40 -0
- data/lib/correct_horse_battery_staple/memoize.rb +25 -0
- data/lib/correct_horse_battery_staple/parser.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
- data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
- data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
- data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
- data/lib/correct_horse_battery_staple/stats.rb +22 -0
- data/lib/correct_horse_battery_staple/word.rb +90 -0
- data/lib/correct_horse_battery_staple/writer.rb +29 -0
- data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
- data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
- data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
- data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
- data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
- data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
- data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
- data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
- data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
- data/script/generate_all +34 -0
- data/script/load_redis +17 -0
- data/script/perftest +74 -0
- data/spec/corpus/serialized_spec.rb +62 -0
- data/spec/corpus_spec.rb +50 -0
- data/spec/correct_horse_battery_staple_spec.rb +73 -0
- data/spec/fixtures/100.json +101 -0
- data/spec/fixtures/corpus1.csv +101 -0
- data/spec/fixtures/corpus100.json +101 -0
- data/spec/fixtures/wiktionary1000.htm +648 -0
- data/spec/range_parser_spec.rb +54 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/statistical_array_spec.rb +52 -0
- data/spec/support/spec_pry.rb +1 -0
- data/spec/word_spec.rb +95 -0
- metadata +264 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,258 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Format of header:
|
8
|
+
#
|
9
|
+
# 0..3 - OB - offset of body start in bytes; network byte order
|
10
|
+
# 4..7 - LP - length of prelude in network byte order
|
11
|
+
# 8..OB-1 - P - JSON-encoded prelude hash and space padding
|
12
|
+
# OB..EOF - array of fixed size records as described in prelude
|
13
|
+
#
|
14
|
+
# Contents of Prelude (after JSON decoding):
|
15
|
+
#
|
16
|
+
# P["wlen"] - length of word part of record
|
17
|
+
# P["flen"] - length of frequency part of record (always 4 bytes)
|
18
|
+
# P["entrylen"] - length of total part of record
|
19
|
+
# P["n"] - number of records
|
20
|
+
# P["sort"] - field name sorted by (word or frequency)
|
21
|
+
# P["stats"] - corpus statistics
|
22
|
+
#
|
23
|
+
# Format of record:
|
24
|
+
#
|
25
|
+
# 1 byte - LW - actual length of word within field
|
26
|
+
# P["wlen"]-1 bytes - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding
|
27
|
+
# P["flen"] (4) bytes - frequency as network byte order long
|
28
|
+
#
|
29
|
+
|
30
|
+
#
# Read-only corpus stored as a header (two 32-bit integers plus a JSON
# prelude) followed by an array of fixed-size records. Records are parsed
# lazily, straight out of the file (or out of an in-memory copy after
# #precache has been called).
#
class CorrectHorseBatteryStaple::Corpus::Isam < CorrectHorseBatteryStaple::Corpus::Base
  include CorrectHorseBatteryStaple::Memoize

  # number of bytes read up front in the hope of capturing the whole prelude
  INITIAL_PRELUDE_LENGTH = 512

  # size of the two 32-bit header fields (record offset and prelude length)
  # that sit in front of the JSON prelude
  PRELUDE_HEADER_LENGTH = 8

  #
  # Open +filename+ for binary reading and parse its prelude.
  # +stats+ is forwarded untouched to the superclass constructor.
  #
  def initialize(filename, stats = nil)
    super
    @filename = filename
    @file     = CorrectHorseBatteryStaple::Util.open_binary(filename, "r")
    parse_prelude
  end

  #
  # Slurp the whole file into memory so later reads never hit the disk.
  # Does nothing when +max+ is non-negative and the file is larger than
  # +max+ bytes. Returns the new in-memory IO (or nil when skipped).
  #
  def precache(max = -1)
    return if max > -1 && file_size(@file) > max

    source = @file
    source.seek 0
    contents = source.read
    # release the descriptor we are about to stop referencing
    source.close unless source.is_a?(StringIO)
    @file = StringIO.new contents, "r"
  end

  # Size in bytes of +file+, which may be a File or a StringIO.
  def file_size(file)
    file.respond_to?(:size) ? file.size : file.stat.size
  end

  # The decoded prelude hash, parsed on demand.
  def prelude
    @prelude || parse_prelude
  end

  #
  # Read and decode the file header, populating @record_offset,
  # @prelude_len, @word_length, @frequency_length and @entry_length,
  # and loading corpus statistics when the prelude carries them.
  #
  # Raises ArgumentError if the header is missing or if the prelude lacks
  # "wlen" or "entrylen". Returns the prelude hash.
  #
  def parse_prelude
    @file.seek 0
    prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH) || ""
    if prelude_buf.length < PRELUDE_HEADER_LENGTH
      raise ArgumentError, "File is too short to contain an ISAM header"
    end

    # byte offset of first record from beginning of file
    # total length of JSON string (without padding)
    (@record_offset, @prelude_len) = prelude_buf.unpack("NN")

    # read more if our initial read didn't slurp in the entire prelude;
    # the JSON starts after the header fields, so those bytes count too
    needed = PRELUDE_HEADER_LENGTH + @prelude_len
    if needed > prelude_buf.length
      prelude_buf += @file.read(needed - prelude_buf.length).to_s
    end

    json = prelude_buf.unpack("@#{PRELUDE_HEADER_LENGTH}a#{@prelude_len}")[0]
    @prelude = JSON.parse(json) || {}

    # includes prefix length byte
    @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")

    # as network byte order int
    @frequency_length = @prelude["flen"] || 4

    # total length of record
    @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

    load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

    @prelude
  end

  # factory-ish constructor
  def self.read(filename)
    new filename
  end


  ## parsing

  #
  # Parse record number +index+ of +string+ into [word, frequency].
  # When +length_range+ is given and the word's length falls outside it,
  # nil is returned instead. Raises if there is no such record.
  #
  def parse_record_into_array(string, index, length_range = nil)
    chunk = nth_chunk(index, string)
    raise "No chunk for index #{index}" unless chunk

    # first byte of the record holds the real word length
    actual_word_length = chunk.unpack("C")[0]
    return nil if length_range && !length_range.include?(actual_word_length)

    # skip the length byte, take the word, jump to the frequency field
    chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
  end

  #
  # Parse record number +index+ of +string+ into a Word. The Word to fill
  # in may be supplied (third argument) to avoid allocation; otherwise a
  # fresh one is built. The fourth argument is a length range acting as a
  # filter; when it is not satisfied, nil is returned.
  #
  def parse_record(string, index=0,
                   word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                   length_range = nil)
    bare = parse_record_into_array(string, index, length_range)
    return nil unless bare

    word.word      = bare[0]
    word.frequency = bare[1]
    word
  end

  #
  # Unpack the length prefix of a record.
  # NOTE(review): this returns the one-element Array produced by unpack,
  # not an Integer - confirm what callers expect before changing it.
  #
  def word_length(chunk_string)
    chunk_string.unpack("C")
  end

  # return a string representing the nth_record
  def nth_chunk(n, string)
    string[@entry_length * n, @entry_length]
  end

  ## some core Enumerable building blocks

  #
  # Yield every record in file order as a freshly built Word, including
  # the final one. Without a block an Enumerator is returned.
  #
  def each(&block)
    return enum_for(:each) unless block_given?

    string = records_string
    size.times do |index|
      yield parse_record(string, index)
    end
    self
  end

  def count; size; end

  # Number of whole records held in the body of the file.
  def size
    @size ||= records_size / @entry_length
  end


  ## our Corpus Enumerablish abstract methods

  # we presume that the ISAM file has been sorted
  def sorted_entries
    @sorted_entries ||= entries
  end


  ## optimized pick - does NOT support :filter, though

  #
  # Pick +count+ random words.
  #
  # Options read here:
  #   :percentile     - percentile range to draw from (default 0..100)
  #   :word_length    - range of acceptable word lengths (optional)
  #   :max_iterations - cap on random probes (at least count*10, default 1000)
  #
  # Raises NotImplementedError for :filter, ArgumentError when the
  # percentile range holds fewer than +count+ records, and RuntimeError
  # when not enough matching words were found within the iteration cap.
  #
  def pick(count, options = {})
    # incompat check
    raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

    # options parsing
    string         = record_percentile_range_read(options[:percentile] || (0..100))
    range_size     = string.length / @entry_length
    max_iterations = [options[:max_iterations] || 1000, count*10].max

    if range_size < count
      raise ArgumentError, "Percentile range contains fewer words than requested count"
    end

    # the real work
    result = _pick(string, count, options[:word_length], max_iterations)

    # validate that we succeeded
    raise "Cannot find #{count} words matching criteria" if result.length < count

    result
  end

  #
  # Probe random records of +string+ until +count+ words have been
  # collected or +max_iterations+ probes were made. Records rejected by
  # +length_range+ are remembered so they are not parsed twice.
  # Returns the array of picked Words (possibly shorter than +count+).
  #
  def _pick(string, count, length_range, max_iterations)
    result     = []
    iterations = 0

    # indexes already known not to satisfy length_range
    skip_cache = Set.new
    range_size = string.length / @entry_length

    # don't cons! one scratch Word is reused for every probe
    entry = CorrectHorseBatteryStaple::Word.new :word => ""
    while result.length < count && iterations < max_iterations
      i = random_number(range_size)
      unless skip_cache.include? i
        pr = parse_record(string, i, entry, length_range)
        if pr
          # copy, because the scratch Word is overwritten by the next probe
          result << pr.dup
        else
          skip_cache << i
        end
      end
      iterations += 1
    end
    result
  end


  ## file I/O

  # Number of bytes occupied by the record array (file size minus header).
  def records_size
    @records_size ||= (file_size(@file) - @record_offset)
  end

  # The entire file as a string.
  def file_string
    @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
  end

  #
  # Read the bytes covered by +file_range+ (whole file when nil) and
  # restore the file position afterwards. Results are memoized.
  #
  def file_range_read(file_range = nil)
    file_range ||= 0...file_size(@file)
    pos = @file.tell
    begin
      @file.seek(file_range.first)
      @file.read(range_count(file_range))
    ensure
      # only reached once pos is known, so we never seek to nil
      @file.seek(pos)
    end
  end
  memoize :file_range_read

  # returns a string representing the record-holding portion of the file
  def records_string
    @records_string ||=
      record_range_read(0 ... records_size)
  end

  #
  # Read +record_range+, a byte range relative to the start of the record
  # array (all records when nil), by translating it into a file range.
  #
  def record_range_read(record_range = nil)
    record_range ||= 0...records_size
    first_byte = record_range.first + @record_offset
    # the end must be measured from the translated start, not from the
    # header offset, or ranges that do not begin at 0 come up short
    file_range_read(first_byte...(first_byte + range_count(record_range)))
  end
  # memoize :record_range_read

  # Read the records that fall inside +percentile_range+.
  def record_percentile_range_read(percentile_range)
    record_range = record_range_for_percentile(percentile_range)
    record_range_read(record_range)
  end


  ## rather than using a StatisticalArray, we do direct indexing into the file/string

  #
  # Record index corresponding to +percentile+ (0..100). With +round+
  # false the unrounded Float is returned.
  #
  def percentile_index(percentile, round=true)
    r = percentile.to_f/100 * count + 0.5
    round ? r.round : r
  end

  #
  # Translate a percentile range (or a single percentile, widened by half
  # a point either side) into a byte range within the record array. The
  # result is clamped so it never starts before the first record nor ends
  # after the last one.
  #
  def record_range_for_percentile(range)
    range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
    limit = count * @entry_length

    first = percentile_index(range.begin, false).floor * @entry_length
    last  = percentile_index(range.end, false).ceil * @entry_length

    first = [[first, 0].max, limit].min
    last  = [[last, first].max, limit].min
    first...last
  end

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'json'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Format of header:
|
8
|
+
#
|
9
|
+
# 0..3 - OB - offset of body start in bytes; network byte order
|
10
|
+
# 4..7 - LP - length of prelude in network byte order
|
11
|
+
# 8..OB-1 - P - JSON-encoded prelude hash and space padding
|
12
|
+
# OB..EOF - array of fixed size records as described in prelude
|
13
|
+
#
|
14
|
+
# Contents of Prelude (after JSON decoding):
|
15
|
+
#
|
16
|
+
# P["wlen"] - length of word part of record
|
17
|
+
# P["flen"] - length of frequency part of record (always 4 bytes)
|
18
|
+
# P["entrylen"] - length of total part of record
|
19
|
+
# P["n"] - number of records
|
20
|
+
# P["sort"] - field name sorted by (word or frequency)
|
21
|
+
# P["stats"] - corpus statistics
|
22
|
+
#
|
23
|
+
# Format of record:
|
24
|
+
#
|
25
|
+
# 2 bytes - LW - actual length of word within field
|
26
|
+
# P["wlen"] bytes - LW bytes of word (W) + P["wlen"]-LW bytes of padding
|
27
|
+
# P["flen"] (4) bytes - frequency as network byte order long
|
28
|
+
#
|
29
|
+
|
30
|
+
#
# Corpus backed by an ISAM file with a KD-tree index. The parsing and
# index routines (parse_prelude, load_kdtree, ...) are not defined in this
# class; presumably they come from the Backend::IsamKD mixin - TODO confirm.
#
class CorrectHorseBatteryStaple::Corpus::IsamKD < CorrectHorseBatteryStaple::Corpus::Base
  include CorrectHorseBatteryStaple::Memoize
  include CorrectHorseBatteryStaple::Backend::IsamKD

  #
  # Open +filename+ for binary reading, parse its prelude and load the
  # index. +stats+ is forwarded untouched to the superclass constructor.
  #
  def initialize(filename, stats = nil)
    super
    @filename = filename
    @file     = CorrectHorseBatteryStaple::Util.open_binary(filename, "r")
    parse_prelude
    load_index
  end

  #
  # Slurp the whole file into memory so later reads never hit the disk.
  # Does nothing when +max+ is non-negative and the file is larger than
  # +max+ bytes. Returns the new in-memory IO (or nil when skipped).
  #
  def precache(max = -1)
    return if max > -1 && file_size(@file) > max

    source = @file
    source.seek 0
    contents = source.read
    # release the descriptor we are about to stop referencing
    source.close unless source.is_a?(StringIO)
    @file = StringIO.new contents, "r"
  end

  # Size in bytes of +file+, which may be a File or a StringIO.
  def file_size(file)
    file.respond_to?(:size) ? file.size : file.stat.size
  end

  # The decoded prelude, parsed on first use and cached.
  def prelude
    @prelude ||= parse_prelude
  end

  # Load the KD-tree once and keep it in @kdtree.
  def load_index
    @kdtree ||= load_kdtree
  end

end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
require 'bigdecimal'
|
2
|
+
require 'hiredis'
|
3
|
+
require 'redis'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
#
# Corpus whose words live in Redis sorted sets. Connection handling and
# key names (db, @words_key, @percentile_key, ...) are not defined in this
# class; presumably they come from the Backend::Redis mixin - TODO confirm.
#
class CorrectHorseBatteryStaple::Corpus::Redis < CorrectHorseBatteryStaple::Corpus::Base
  include CorrectHorseBatteryStaple::Backend::Redis

  MAX_ITERATIONS = 1000

  attr_accessor :dest
  attr_accessor :options

  #
  # +dest+ is handed to parse_uri to locate the Redis data; stored
  # statistics are loaded right away.
  #
  def initialize(dest)
    super
    self.dest    = dest
    self.options = {}
    parse_uri(dest)

    load_stats
  end

  # factory-ish constructor
  def self.read(file)
    new file
  end

  ## some core Enumerable building blocks

  # Iterate over every Word built by #entries.
  def each(&block)
    entries.each(&block)
  end

  # Cardinality of the words sorted set (cached after the first call).
  def count
    @count ||= db.zcard(@words_key)
  end

  # Corpus size from the stored stats, falling back to #count.
  def size
    stats[:size] || count
  end


  ## our own collection operations

  def entries
    table
  end

  def sorted_entries
    entries.sort
  end

  #
  # Pick +count+ random words using the strategy named by
  # options[:strategy], then ENV['pick_strategy'], then "drange".
  # The strategy must correspond to a pick_<strategy> method.
  #
  # The caller's +options+ hash is left untouched. Raises
  # NotImplementedError for :filter and ArgumentError for an unknown
  # strategy.
  #
  def pick(count, options = {})
    # incompat check
    raise NotImplementedError, "Redis does not support :filter option" if options[:filter]

    # work on a copy so removing :strategy does not leak to the caller
    options  = options.dup
    strategy = options.delete(:strategy) || ENV['pick_strategy'] || "drange"
    handler  = "pick_#{strategy}"

    # the name may come from the environment, so check it before dispatch
    unless respond_to?(handler, true)
      raise ArgumentError, "Unknown pick strategy: #{strategy}"
    end

    send(handler, count, options)
  end


  ## optimized pick implementations - they do NOT support :filter, though

  #
  # Fetch all candidate ids matching :percentile and/or :word_length,
  # intersect them client-side, and sample +count+ of them. A percentile
  # range of exactly 0..100 is treated as "no restriction".
  #
  def pick_standard(count, options = {})
    percentile_range = options[:percentile]
    length_range     = options[:word_length]

    if percentile_range && percentile_range.begin == 0 && percentile_range.end == 100
      percentile_range = nil
    end

    if !percentile_range && !length_range
      get_words_for_ids(pick_random_words(count))
    else
      sets = []
      sets << get_word_ids_in_zset(@percentile_key, percentile_range) if percentile_range
      sets << get_word_ids_in_zset(@lenprod_key, length_range) if length_range

      candidates = (sets.length == 1 ? sets[0] : intersection(*sets))
      get_words_for_ids(array_sample(candidates, count))
    end
  end

  #
  # Pick through a discontiguous range map over the length/percentile
  # sorted set. Restrictions that cover the full percentile span or the
  # whole corpus length range are dropped first.
  #
  def pick_drange(count, options = {})
    percentile_range = options[:percentile]
    length_range     = options[:word_length]

    if percentile_range && range_cover?(percentile_range, 0..100)
      percentile_range = nil
    end

    corpus_length_range = self.corpus_length_range
    if !length_range || range_cover?(length_range, corpus_length_range)
      length_range = nil
    end

    if !percentile_range && !length_range
      get_words_for_ids(pick_random_words(count))
    else
      dspace = discontiguous_range_map(@lenprod_key, length_range, percentile_range)
      max    = dspace.count
      ids    = count.times.map do
        dspace.pick_nth(random_number(max))
      end
      # STDERR.puts "ids from decimal are #{ids.inspect}"
      get_words_for_ids(ids)
    end
  end

  # Memoized ZCOUNT of +key+ between scores +min+ and +max+.
  def zcount(key, min, max)
    db.zcount(key, min, max)
  end
  memoize :zcount

  #
  # Build (and memoize) a DRange over +key+.
  # NOTE(review): this passes @db directly while the other methods go
  # through the db accessor - confirm @db is always set by this point.
  #
  def discontiguous_range_map(key, outer_range, inner_range, divisor=100)
    CorrectHorseBatteryStaple::Backend::Redis::DRange.new(@db, key, outer_range,
                                                          inner_range, divisor)
  end
  memoize :discontiguous_range_map

  # True when +outer+ contains both endpoints of +inner+.
  # XXX - does not handle exclusive endpoints
  def range_cover?(outer, inner)
    outer.cover?(inner.begin) && outer.cover?(inner.end)
  end

  # TODO: make this use actual data from stored stats
  def corpus_length_range
    3..18
  end

  #
  # Pick +count+ random members of the words sorted set by rank.
  # An index of -1 is valid for ZRANGE and addresses the last member.
  # NOTE(review): presumably random_number(n) yields 0...n, so the -1
  # shift still reaches every member - confirm.
  #
  def pick_random_words(count)
    count.times.map do
      idx = random_number(size)-1
      db.zrange(@words_key, idx, idx)[0]
    end
  end

  # Intersection of all the given arrays.
  def intersection(*sets)
    sets.reduce {|a,b| a & b }
  end

  # Members of sorted set +key+ whose score lies within +range+ (memoized).
  def get_word_ids_in_zset(key, range)
    db.zrangebyscore(key, range.begin, range.end)
  end
  memoize :get_word_ids_in_zset

  # Wrap each id's word in a Word object.
  def get_words_for_ids(ids)
    ids.map {|id| CorrectHorseBatteryStaple::Word.new(:word => get_word_by_id(id)) }
  end


  def close
    super
  end

  protected

  #
  # Build a Word for every member of the percentile sorted set, carrying
  # its percentile, frequency, 1-based index and rank (count - index + 1).
  # Assumes zrangebyscore with :withscores returns a flat
  # [member, score, member, score, ...] array - TODO confirm for the
  # redis client version in use.
  #
  def table
    percentiles = db.zrangebyscore(@percentile_key, -1, 101, :withscores => true)
    frequencies = db.zrangebyscore(@frequency_key, -1, 99999999, :withscores => true)

    phash = flat_pairs_to_hash(percentiles)
    fhash = flat_pairs_to_hash(frequencies)

    total = phash.length
    phash.keys.each_with_index.map do |w, i|
      position = i + 1
      word_from_hash :word => w, :percentile => phash[w].to_f, :index => position,
                     :rank => total-position+1, :frequency => fhash[w].to_f
    end
  end

  # Turn [k1, v1, k2, v2, ...] into a hash; a trailing unpaired item is ignored.
  def flat_pairs_to_hash(flat)
    hash = {}
    (0...flat.length / 2).each do |index|
      base = index * 2
      hash[flat[base]] = flat[base+1]
    end
    hash
  end

  def word_from_hash(hash)
    CorrectHorseBatteryStaple::Word.new(hash)
  end

end
|