symspell 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +32 -0
- data/Rakefile +33 -0
- data/lib/symspell.rb +264 -0
- data/symspell.gemspec +17 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1460d117d70b607f1e3cd2a82e1775239dcf6409
|
4
|
+
data.tar.gz: 077eec846704a48345ffe305d4491ec85c40da57
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d60beee5581b45443ed882fce50add34f6a4b26cf6792996156d2e14b396d31246708a9a30ded49baf7f0f35b28e877fdd4409bbff7020b4006f15b97c84e372
|
7
|
+
data.tar.gz: 89497ab96606792069f18ca01bebdaf8e4aaeb68cc555053b5e0352a3656ef70faf6443803806630d73926a1abda41d74ed640366ffbe469f3cd12704bfcb541
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
## Synopsis
|
2
|
+
|
3
|
+
The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
|
4
|
+
|
5
|
+
Same license as original (LGPL-3.0).
|
6
|
+
|
7
|
+
|
8
|
+
## About this port
|
9
|
+
|
10
|
+
This is a straight port of SymSpell from C# to Ruby. I've started moving things around a bit and also turned it into a gem.
|
11
|
+
|
12
|
+
Original source with inline comments and README is here: https://github.com/wolfgarbe/symspell.
|
13
|
+
|
14
|
+
I've changed very little from the original source (apart from removing the command-line interface), but please note that it has no test coverage at this time.
|
15
|
+
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
gem install symspell
|
20
|
+
|
21
|
+
require 'symspell'
|
22
|
+
|
23
|
+
speller = SymSpell.new <EDIT_DISTANCE_MAX>
|
24
|
+
speller.create_dictionary('words.txt')
|
25
|
+
speller.lookup('something')
|
26
|
+
|
27
|
+
## EDIT_DISTANCE_MAX
|
28
|
+
|
29
|
+
`EDIT_DISTANCE_MAX` is the maximum edit distance considered when looking up a word. The edit distance is the number of operations needed to transform one string into another.
|
30
|
+
|
31
|
+
For example, the edit distance between **CA** and **ABC** is 2 because **CA** => **AC** => **ABC**. Edit distances of 2-5 are normal. Note, however, that increasing `EDIT_DISTANCE_MAX` exponentially increases the number of combinations and therefore the time it takes to create the dictionary.
|
32
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rake/testtask'
|
2
|
+
|
3
|
+
desc 'Test, build and install the gem'
|
4
|
+
task :default => [:spec, :install]
|
5
|
+
|
6
|
+
Rake::TestTask.new(:spec) do |t|
|
7
|
+
t.pattern = 'tests/*_test.rb'
|
8
|
+
end
|
9
|
+
|
10
|
+
desc 'Build and install the gem'
|
11
|
+
task :install do
|
12
|
+
gemspec_path = Dir['*.gemspec'].first
|
13
|
+
spec = eval(File.read(gemspec_path))
|
14
|
+
|
15
|
+
result = `gem build #{gemspec_path} 2>&1`
|
16
|
+
if result =~ /Successfully built/
|
17
|
+
system "gem uninstall -I #{spec.name} 2>&1"
|
18
|
+
system "gem install #{spec.file_name} --no-rdoc --no-ri 2>&1"
|
19
|
+
else
|
20
|
+
raise result
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
desc 'Take the version in the gemspec, create a git tag and send the gem to rubygems'
|
25
|
+
task :release do
|
26
|
+
gemspec_path = Dir['*.gemspec'].first
|
27
|
+
spec = eval(File.read(gemspec_path))
|
28
|
+
|
29
|
+
system "git tag -f -a v#{spec.version} -m 'Version #{spec.version}'"
|
30
|
+
system "git push --tags"
|
31
|
+
system "gem push #{spec.file_name}"
|
32
|
+
end
|
33
|
+
|
data/lib/symspell.rb
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
require 'active_support/all'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# Spelling correction based on the Symmetric Delete algorithm.
#
# A dictionary is built by indexing every word together with all strings
# obtained by deleting up to +edit_distance_max+ characters from it. A lookup
# generates the same kind of deletes from the input and matches them against
# that index.
class SymSpell
  # Upper bound applied to per-word counts and to the number of distinct words.
  MAX_INT = 2**30 - 1

  # Dictionary value for a key that is a real word (count > 0) and/or a
  # delete shared by several words (indexes into the word list).
  class DictionaryItem
    attr_accessor :suggestions, :count

    def initialize
      @suggestions = []
      @count = 0
    end
  end

  # A single lookup result: the suggested term, its edit distance from the
  # input and how often the term occurred in the corpus.
  class SuggestItem
    attr_accessor :term, :distance, :count

    def initialize
      @term = ''
      @distance = 0
      @count = 0
    end

    # Two suggestions are equal when they suggest the same term.
    def ==(other)
      other.respond_to?(:term) && term == other.term
    end
    # Keep eql? consistent with == and hash so items behave in Hash/Set.
    alias eql? ==

    def hash
      term.hash
    end
  end

  # @param edit_distance_max [Integer] maximum number of deletes generated
  #   per word and maximum distance accepted for a suggestion
  def initialize(edit_distance_max)
    @edit_distance_max = edit_distance_max
    @maxlength = 0   # length of the longest word added so far
    @dictionary = {} # word/delete => Integer word index or DictionaryItem
    @wordlist = []   # distinct words, addressed by index
  end

  # Reads a corpus with one word per line and adds every word to the dictionary.
  # Blank lines are skipped and the file is closed once reading finishes.
  #
  # @param corpus [String] path of the word list
  # @return [Integer] number of distinct words that were newly added
  def create_dictionary(corpus)
    word_count = 0

    File.foreach(corpus) do |line|
      word = line.strip
      next if word.empty?

      word_count += 1 if create_dictionary_entry(word)
    end

    word_count
  end

  # Finds dictionary words close to +input+.
  #
  # @param input [String] the (possibly misspelled) word
  # @return [Array<SuggestItem>] suggestions ordered by descending count;
  #   empty when nothing within the maximum edit distance is found
  def lookup(input)
    return [] if (input.size - @edit_distance_max) > @maxlength

    candidates = [input]
    seen_candidates = Set.new
    suggestions = []
    seen_suggestions = Set.new

    until candidates.empty?
      candidate = candidates.shift
      deleted = input.size - candidate.size

      # Candidates are queued in order of increasing delete count, so once a
      # better suggestion exists nothing later can improve on it.
      return sort(suggestions) if !suggestions.empty? && deleted > suggestions[0].distance

      if (entry = @dictionary[candidate])
        item = entry
        if entry.is_a?(Integer)
          # A bare Integer is a single word index; wrap it for uniform handling.
          item = DictionaryItem.new
          item.suggestions << entry
        end

        # The candidate itself is a dictionary word.
        if item.count > 0 && seen_suggestions.add?(candidate)
          suggestions << suggest_item(candidate, item.count, deleted)
          return sort(suggestions) if deleted.zero?
        end

        item.suggestions.each do |word_index|
          suggestion = @wordlist[word_index]
          next unless seen_suggestions.add?(suggestion)

          distance = suggestion_distance(suggestion, input, candidate)

          # Keep only suggestions of the smallest distance found so far.
          suggestions.clear if !suggestions.empty? && suggestions[0].distance > distance
          next if !suggestions.empty? && distance > suggestions[0].distance
          next if distance > @edit_distance_max

          found = @dictionary[suggestion]
          suggestions << suggest_item(suggestion, found.count, distance) if found
        end
      end

      next unless deleted < @edit_distance_max
      next if !suggestions.empty? && deleted >= suggestions[0].distance

      # Queue every string obtained by deleting one more character.
      candidate.size.times do |i|
        delete = candidate.dup
        delete.slice!(i)
        candidates << delete if seen_candidates.add?(delete)
      end
    end

    sort(suggestions)
  end

  private

  # Returns the first run of letters in +text+, downcased, or nil if none.
  # Uses a POSIX bracket expression because Ruby regexps do not support the
  # .NET character-class subtraction syntax ([\w-[\d_]]).
  def parse_words(text)
    text.downcase.scan(/[[:alpha:]]+/).first
  end

  # Builds a SuggestItem from its three fields.
  def suggest_item(term, count, distance)
    item = SuggestItem.new
    item.term = term
    item.count = count
    item.distance = distance
    item
  end

  # Edit distance between +suggestion+ and +input+, given that +candidate+
  # (a delete of +input+) is the dictionary key that led to +suggestion+.
  # Length differences are used when either string has the candidate's
  # length; otherwise the common prefix and suffix are stripped before
  # running the full Damerau-Levenshtein computation.
  def suggestion_distance(suggestion, input, candidate)
    return 0 if suggestion == input
    return input.size - candidate.size if suggestion.size == candidate.size
    return suggestion.size - candidate.size if input.size == candidate.size

    shortest = [suggestion.size, input.size].min

    prefix = 0
    prefix += 1 while prefix < shortest && suggestion[prefix] == input[prefix]

    suffix = 0
    while suffix < shortest - prefix &&
          suggestion[suggestion.size - suffix - 1] == input[input.size - suffix - 1]
      suffix += 1
    end

    # (start, length) slices exclude the shared prefix and suffix.
    damerau_levenshtein_distance(
      suggestion[prefix, suggestion.size - prefix - suffix],
      input[prefix, input.size - prefix - suffix]
    )
  end

  # Adds +key+ to the dictionary, or increments its count if already present,
  # and indexes all of its deletes the first time it is seen as a word.
  #
  # @return [Boolean] true when +key+ was added as a new word
  def create_dictionary_entry(key)
    existing = @dictionary[key]

    if existing
      if existing.is_a?(Integer)
        # The key existed only as a delete of another word: promote it to a
        # full item and store it so the word count is not lost.
        value = DictionaryItem.new
        value.suggestions << existing
        @dictionary[key] = value
      else
        value = existing
      end
      value.count += 1 if value.count < MAX_INT
    elsif @wordlist.count < MAX_INT
      value = DictionaryItem.new
      value.count += 1
      @dictionary[key] = value
      @maxlength = key.size if key.size > @maxlength
    else
      # Word list is full; nothing was added.
      return false
    end

    # Deletes are generated only the first time the word is seen.
    return false unless value.count == 1

    @wordlist << key
    keyint = @wordlist.size - 1

    edits(key, 0, Set.new).each do |delete|
      entry = @dictionary[delete]
      if entry.nil?
        # Single suggestion: store the bare word index.
        @dictionary[delete] = keyint
      elsif entry.is_a?(Integer)
        item = DictionaryItem.new
        item.suggestions << entry
        @dictionary[delete] = item
        add_lowest_distance(item, key, keyint, delete) unless item.suggestions.include?(keyint)
      elsif !entry.suggestions.include?(keyint)
        add_lowest_distance(entry, key, keyint, delete)
      end
    end

    true
  end

  # Registers word index +suggestionint+ (the word +suggestion+) under
  # +item+, the entry for +delete+, keeping only the suggestions that are
  # closest to the delete.
  def add_lowest_distance(item, suggestion, suggestionint, delete)
    new_distance = suggestion.size - delete.size

    # Drop existing suggestions that are further away than the new one.
    if !item.suggestions.empty? && @wordlist[item.suggestions[0]].size - delete.size > new_distance
      item.suggestions.clear
    end

    # Add the new suggestion unless a strictly closer one is already stored.
    if item.suggestions.empty? || @wordlist[item.suggestions[0]].size - delete.size >= new_distance
      item.suggestions << suggestionint
    end
  end

  # Recursively collects into +deletes+ every string obtained by deleting
  # characters from +word+, up to @edit_distance_max deletions.
  #
  # @return [Set<String>] the +deletes+ set that was passed in
  def edits(word, edit_distance, deletes)
    edit_distance += 1
    if word.size > 1
      word.size.times do |i|
        delete = word.dup
        delete.slice!(i)
        next unless deletes.add?(delete)

        edits(delete, edit_distance, deletes) if edit_distance < @edit_distance_max
      end
    end
    deletes
  end

  # Sorts +suggestions+ in place by descending count and returns the array.
  def sort(suggestions)
    suggestions.sort_by! { |suggestion| -suggestion.count }
  end

  # Damerau-Levenshtein distance (insert, delete, substitute, transpose
  # adjacent characters) between +source+ and +target+. Either may be empty.
  def damerau_levenshtein_distance(source, target)
    m = source.size
    n = target.size
    inf = m + n

    # h carries an extra leading row and column holding +inf+ as sentinels.
    h = Array.new(m + 2) { Array.new(n + 2, 0) }
    h[0][0] = inf
    (0..m).each { |i| h[i + 1][1] = i; h[i + 1][0] = inf }
    (0..n).each { |j| h[1][j + 1] = j; h[0][j + 1] = inf }

    # Last row (1-based) in which each character of source was seen.
    last_row = Hash.new(0)

    (1..m).each do |i|
      db = 0 # last column in this row where source and target matched
      (1..n).each do |j|
        i1 = last_row[target[j - 1]]
        j1 = db
        if source[i - 1] == target[j - 1]
          cost = h[i][j]
          db = j
        else
          cost = [h[i][j], h[i + 1][j], h[i][j + 1]].min + 1
        end

        transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
        h[i + 1][j + 1] = [cost, transposition].min
      end

      last_row[source[i - 1]] = i
    end

    h[m + 1][n + 1]
  end
end
|
264
|
+
|
data/symspell.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'base64'
|
2
|
+
|
3
|
+
# Gem specification for symspell.
Gem::Specification.new do |s|
  s.name        = 'symspell'
  s.version     = '0.0.1'
  s.authors     = 'Phil Thompson'
  # The address is stored Base64-encoded and decoded when the spec is evaluated.
  s.email       = Base64.decode64("cGhpbEBlbGVjdHJpY3Zpc2lvbnMuY29t\n")
  s.summary     = 'Ruby port of the symmetric spell checking algorithm'
  s.homepage    = 'https://github.com/PhilT/symspell'
  # The README states the port keeps the original's licence.
  s.license     = 'LGPL-3.0'
  s.required_rubygems_version = '>= 2.4.5'

  # lib/symspell.rb requires 'active_support/all', so it must be installed
  # alongside the gem for `require 'symspell'` to succeed.
  s.add_runtime_dependency 'activesupport'

  s.files       = `git ls-files`.split("\n")
  # NOTE(review): the Rakefile runs tests from tests/*_test.rb while this
  # globs spec/* - confirm which directory is intended.
  s.test_files  = `git ls-files -- spec/*`.split("\n")

  s.require_path = 'lib'
end
|
17
|
+
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: symspell
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Phil Thompson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-31 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: phil@electricvisions.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- README.md
|
20
|
+
- Rakefile
|
21
|
+
- lib/symspell.rb
|
22
|
+
- symspell.gemspec
|
23
|
+
homepage: https://github.com/PhilT/symspell
|
24
|
+
licenses: []
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 2.4.5
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.5
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: Ruby port of the symetric spell checking algorithm
|
46
|
+
test_files: []
|