symspell 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +32 -0
- data/Rakefile +33 -0
- data/lib/symspell.rb +264 -0
- data/symspell.gemspec +17 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1460d117d70b607f1e3cd2a82e1775239dcf6409
|
4
|
+
data.tar.gz: 077eec846704a48345ffe305d4491ec85c40da57
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d60beee5581b45443ed882fce50add34f6a4b26cf6792996156d2e14b396d31246708a9a30ded49baf7f0f35b28e877fdd4409bbff7020b4006f15b97c84e372
|
7
|
+
data.tar.gz: 89497ab96606792069f18ca01bebdaf8e4aaeb68cc555053b5e0352a3656ef70faf6443803806630d73926a1abda41d74ed640366ffbe469f3cd12704bfcb541
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
## Synopsis
|
2
|
+
|
3
|
+
The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
|
4
|
+
|
5
|
+
Same license as original (LGPL-3.0).
|
6
|
+
|
7
|
+
|
8
|
+
## About this port
|
9
|
+
|
10
|
+
This is a straight port of SymSpell from C# to Ruby. I've started moving things around a bit and also turned it into a gem.
|
11
|
+
|
12
|
+
Original source with inline comments and README is here: https://github.com/wolfgarbe/symspell.
|
13
|
+
|
14
|
+
I've changed very little from the original source (apart from removing the command-line interface), but please note that it has no test coverage at this time.
|
15
|
+
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
gem install symspell
|
20
|
+
|
21
|
+
require 'symspell'
|
22
|
+
|
23
|
+
speller = SymSpell.new <EDIT_DISTANCE_MAX>
|
24
|
+
speller.create_dictionary('words.txt')
|
25
|
+
speller.lookup('something')
|
26
|
+
|
27
|
+
## EDIT_DISTANCE_MAX
|
28
|
+
|
29
|
+
`EDIT_DISTANCE_MAX` is the maximum edit distance considered when looking up suggestions. The edit distance is the number of operations needed to transform one string into another.
|
30
|
+
|
31
|
+
For example, the edit distance between **CA** and **ABC** is 2 because **CA** => **AC** => **ABC**. Edit distances of 2-5 are normal. Note, however, that increasing EDIT_DISTANCE_MAX exponentially increases the number of combinations and therefore the time it takes to create the dictionary.
|
32
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rake/testtask'

desc 'Test, build and install the gem'
task :default => [:spec, :install]

# Runs every file matching tests/*_test.rb through Rake's test runner.
Rake::TestTask.new(:spec) do |t|
  t.pattern = 'tests/*_test.rb'
end

desc 'Build and install the gem'
task :install do
  gemspec_path = Dir['*.gemspec'].first
  raise 'No *.gemspec file found in the current directory' unless gemspec_path

  # Gem::Specification.load evaluates the gemspec and returns nil if it cannot
  # be loaded, so check the result instead of eval-ing the file contents.
  spec = Gem::Specification.load(gemspec_path)
  raise "Could not load gemspec #{gemspec_path}" unless spec

  result = `gem build #{gemspec_path} 2>&1`
  raise result unless result =~ /Successfully built/

  # Uninstall is best effort: it fails harmlessly when the gem is not yet
  # installed, so its exit status is deliberately ignored.
  system "gem uninstall -I #{spec.name} 2>&1"
  # --no-document replaces --no-rdoc/--no-ri, which RubyGems 3 no longer accepts.
  sh "gem install #{spec.file_name} --no-document 2>&1"
end

desc 'Take the version in the gemspec, create a git tag and send the gem to rubygems'
task :release do
  gemspec_path = Dir['*.gemspec'].first
  raise 'No *.gemspec file found in the current directory' unless gemspec_path

  spec = Gem::Specification.load(gemspec_path)
  raise "Could not load gemspec #{gemspec_path}" unless spec

  # This task does not build the gem itself, so refuse to tag and push when
  # the package file is missing.
  unless File.exist?(spec.file_name)
    raise "#{spec.file_name} not found; build the gem first (rake install)"
  end

  # sh raises when a command exits non-zero, so a failed tag or push stops
  # the release instead of silently continuing to the next step.
  sh 'git', 'tag', '-f', '-a', "v#{spec.version}", '-m', "Version #{spec.version}"
  sh 'git', 'push', '--tags'
  sh 'gem', 'push', spec.file_name
end
|
33
|
+
|
data/lib/symspell.rb
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
require 'active_support/all'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# Spelling suggestions based on the Symmetric Delete algorithm.
#
# The dictionary maps every known word, and every string obtained by deleting
# up to +edit_distance_max+ characters from a known word, to either:
#
# * an Integer - the index in the word list of the single word that the
#   delete was derived from, or
# * a DictionaryItem - a list of such indices plus an occurrence count
#   (a count greater than zero marks the key as a real word).
class SymSpell
  # Cap applied to per-word occurrence counts and to the size of the word list.
  MAX_INT = 2**30 - 1

  # Dictionary value: word-list indices of suggested words plus the number of
  # times the key itself was seen as a word.
  class DictionaryItem
    attr_accessor :suggestions, :count

    def initialize
      @suggestions = []
      @count = 0
    end
  end

  # A single lookup result: the suggested term, its edit distance from the
  # input and how often the term occurred in the corpus.
  class SuggestItem
    attr_accessor :term, :distance, :count

    def initialize
      @term = ''
      @distance = 0
      @count = 0
    end

    # Two suggestions are equal when they suggest the same term.
    def ==(other)
      other.is_a?(SuggestItem) && term == other.term
    end
    alias eql? ==

    def hash
      term.hash
    end
  end

  # @param edit_distance_max [Integer] maximum edit distance considered when
  #   indexing deletes and when looking up suggestions
  def initialize(edit_distance_max)
    @edit_distance_max = edit_distance_max
    @maxlength = 0
    @dictionary = {}
    @wordlist = []
  end

  # Reads +corpus+ (a path to a file with one word per line) and indexes
  # every word. Surrounding whitespace is stripped and blank lines are skipped.
  #
  # @param corpus [String] path of the word file
  # @return [Integer] number of words that were new to the dictionary
  def create_dictionary(corpus)
    word_count = 0

    # Block form closes the file even when indexing raises.
    File.open(corpus, 'r') do |file|
      file.each_line do |line|
        word = line.strip
        next if word.empty?

        word_count += 1 if create_dictionary_entry(word)
      end
    end

    word_count
  end

  # Finds dictionary words close to +input+.
  #
  # Candidates are +input+ and the strings produced by deleting characters
  # from it, processed breadth first. The search stops as soon as the number
  # of deleted characters exceeds the distance of the first suggestion found.
  #
  # @param input [String] the (possibly misspelled) word
  # @return [Array<SuggestItem>] suggestions ordered by descending count;
  #   empty when nothing within the maximum edit distance is known
  def lookup(input)
    # No word can match when even after the allowed number of deletes the
    # input is still longer than the longest known word.
    return [] if input.size - @edit_distance_max > @maxlength

    candidates = [input]
    queued_deletes = Set.new
    suggestions = []
    seen_terms = Set.new

    until candidates.empty?
      candidate = candidates.shift
      removed = input.size - candidate.size

      # Early termination: remaining candidates have at least this many
      # characters removed.
      return sort(suggestions) if !suggestions.empty? && removed > suggestions.first.distance

      entry = @dictionary[candidate]
      unless entry.nil?
        item = to_dictionary_item(entry)

        # A positive count means the candidate itself is a dictionary word.
        if item.count > 0 && seen_terms.add?(candidate)
          suggestions << build_suggestion(candidate, item.count, removed)
          return sort(suggestions) if removed.zero?
        end

        item.suggestions.each do |word_index|
          suggestion = @wordlist[word_index]
          # Different deletes of the input can lead to the same word.
          next unless seen_terms.add?(suggestion)

          distance = suggestion_distance(suggestion, candidate, input)

          # Keep only suggestions at the smallest distance found so far.
          suggestions.clear if !suggestions.empty? && suggestions.first.distance > distance
          next if !suggestions.empty? && distance > suggestions.first.distance
          next if distance > @edit_distance_max

          word_entry = @dictionary[suggestion]
          next if word_entry.nil?

          suggestions << build_suggestion(suggestion, to_dictionary_item(word_entry).count, distance)
        end
      end

      next unless removed < @edit_distance_max
      # Further deletes cannot beat the distance that was already found.
      next if !suggestions.empty? && removed >= suggestions.first.distance

      candidate.size.times do |i|
        delete = candidate.dup
        delete.slice!(i)
        candidates << delete if queued_deletes.add?(delete)
      end
    end

    sort(suggestions)
  end

  private

  # Returns the first run of letters in the downcased +text+, or nil when
  # there is none. Digits and underscores are excluded from the match.
  def parse_words(text)
    # Ruby has no .NET-style class subtraction ([\w-[\d_]]); an intersection
    # with a negated class expresses "word characters except digits and _".
    text.downcase.scan(/[\w&&[^\d_]]+/).first
  end

  # Wraps a raw dictionary value in a DictionaryItem. An Integer value (a
  # single word-list index) becomes an item with that one suggestion and a
  # count of zero; DictionaryItems are returned unchanged.
  def to_dictionary_item(entry)
    return entry unless entry.is_a?(Integer)

    wrapped = DictionaryItem.new
    wrapped.suggestions << entry
    wrapped
  end

  # Builds a populated SuggestItem.
  def build_suggestion(term, count, distance)
    suggestion = SuggestItem.new
    suggestion.term = term
    suggestion.count = count
    suggestion.distance = distance
    suggestion
  end

  # Edit distance between +suggestion+ and +input+, given that +candidate+
  # (a delete of +input+) led to +suggestion+. When only one side had
  # characters deleted the distance is the length difference; otherwise the
  # common prefix and suffix are stripped and the rest is compared with
  # damerau_levenshtein_distance.
  def suggestion_distance(suggestion, candidate, input)
    return 0 if suggestion == input
    return input.size - candidate.size if suggestion.size == candidate.size
    return suggestion.size - candidate.size if input.size == candidate.size

    shortest = [suggestion.size, input.size].min

    prefix = 0
    prefix += 1 while prefix < shortest && suggestion[prefix] == input[prefix]

    suffix = 0
    suffix += 1 while suffix < shortest - prefix && suggestion[-1 - suffix] == input[-1 - suffix]

    # (start, length) slices drop exactly the shared prefix and suffix.
    damerau_levenshtein_distance(
      suggestion[prefix, suggestion.size - prefix - suffix],
      input[prefix, input.size - prefix - suffix]
    )
  end

  # Registers one occurrence of +key+. The first time a word is seen it is
  # appended to the word list and all of its deletes are indexed.
  #
  # @return [Boolean] true when +key+ was added as a new word
  def create_dictionary_entry(key)
    existing = @dictionary[key]

    if !existing.nil?
      # The key may so far only exist as a delete of another word (Integer);
      # promote it to a DictionaryItem and store it so the count persists.
      item = to_dictionary_item(existing)
      @dictionary[key] = item
      item.count += 1 if item.count < MAX_INT
    elsif @wordlist.count < MAX_INT
      item = DictionaryItem.new
      item.count = 1
      @dictionary[key] = item
      @maxlength = key.size if key.size > @maxlength
    else
      # Word list is full: nothing was added.
      return false
    end

    # Deletes are generated only once, when the word is first seen.
    return false unless item.count == 1

    @wordlist << key
    keyint = @wordlist.size - 1

    edits(key, 0, Set.new).each do |delete|
      entry = @dictionary[delete]
      if entry.nil?
        # First word producing this delete: store the bare index.
        @dictionary[delete] = keyint
        next
      end

      delete_item = to_dictionary_item(entry)
      @dictionary[delete] = delete_item
      add_lowest_distance(delete_item, key, keyint, delete) unless delete_item.suggestions.include?(keyint)
    end

    true
  end

  # Adds +suggestionint+ to +item+ while keeping only the suggestions with the
  # smallest length difference to +delete+: existing suggestions that are
  # longer than +suggestion+ are dropped, and +suggestion+ is only added when
  # it is not longer than the ones already stored.
  def add_lowest_distance(item, suggestion, suggestionint, delete)
    incoming = suggestion.size - delete.size
    current = item.suggestions.empty? ? nil : @wordlist[item.suggestions.first].size - delete.size

    item.suggestions.clear if current && current > incoming
    item.suggestions << suggestionint if current.nil? || current >= incoming
  end

  # Collects into +deletes+ every string obtained by removing between one and
  # (+@edit_distance_max+ - +edit_distance+) characters from +word+, never
  # shortening a word of length one.
  #
  # @return [Set<String>] the +deletes+ set that was passed in
  def edits(word, edit_distance, deletes)
    edit_distance += 1
    return deletes unless word.size > 1

    word.size.times do |i|
      delete = word.dup
      delete.slice!(i)
      next unless deletes.add?(delete)

      edits(delete, edit_distance, deletes) if edit_distance < @edit_distance_max
    end

    deletes
  end

  # Sorts +suggestions+ in place by descending count and returns the array.
  def sort(suggestions)
    suggestions.sort! { |x, y| y.count <=> x.count }
  end

  # Damerau-Levenshtein distance between +source+ and +target+, counting
  # insertions, deletions, substitutions and transpositions.
  def damerau_levenshtein_distance(source, target)
    m = source.size
    n = target.size
    h = Array.new(m + 2) { Array.new(n + 2, 0) }
    inf = m + n

    # Row 0 and column 0 hold a sentinel larger than any real distance;
    # row 1 and column 1 hold the distances from the empty string.
    h[0][0] = inf
    (0..m).each { |i| h[i + 1][1] = i; h[i + 1][0] = inf }
    (0..n).each { |j| h[1][j + 1] = j; h[0][j + 1] = inf }

    # Last row (1-based) of source in which each character was seen.
    last_row = Hash.new(0)

    (1..m).each do |i|
      db = 0
      # j starts at 1: column 1 is the initialised boundary and must not be
      # overwritten.
      (1..n).each do |j|
        i1 = last_row[target[j - 1]]
        j1 = db
        if source[i - 1] == target[j - 1]
          h[i + 1][j + 1] = h[i][j]
          db = j
        else
          h[i + 1][j + 1] = [h[i][j], h[i + 1][j], h[i][j + 1]].min + 1
        end

        transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
        h[i + 1][j + 1] = transposition if transposition < h[i + 1][j + 1]
      end

      last_row[source[i - 1]] = i
    end

    h[m + 1][n + 1]
  end
end
|
264
|
+
|
data/symspell.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Gem specification for symspell.
Gem::Specification.new do |s|
  s.name = 'symspell'
  s.version = '0.0.1'
  s.authors = 'Phil Thompson'
  # The address is kept Base64-encoded in the source. String#unpack('m') is
  # the core decoder, so no separate base64 library has to be loaded.
  s.email = "cGhpbEBlbGVjdHJpY3Zpc2lvbnMuY29t\n".unpack('m').first
  s.summary = 'Ruby port of the symetric spell checking algorithm'
  s.homepage = 'https://github.com/PhilT/symspell'
  # The README states the port carries the same license as the original.
  s.license = 'LGPL-3.0'
  s.required_rubygems_version = '>= 2.4.5'

  # lib/symspell.rb requires 'active_support/all', so the gem cannot be
  # loaded unless activesupport is installed alongside it.
  s.add_runtime_dependency 'activesupport'

  s.files = `git ls-files`.split("\n")
  # NOTE(review): the Rakefile runs tests from tests/*_test.rb while this
  # collects spec/* - confirm which directory is intended.
  s.test_files = `git ls-files -- spec/*`.split("\n")

  s.require_path = 'lib'
end
|
17
|
+
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: symspell
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Phil Thompson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-31 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: phil@electricvisions.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- README.md
|
20
|
+
- Rakefile
|
21
|
+
- lib/symspell.rb
|
22
|
+
- symspell.gemspec
|
23
|
+
homepage: https://github.com/PhilT/symspell
|
24
|
+
licenses: []
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 2.4.5
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.5
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: Ruby port of the symetric spell checking algorithm
|
46
|
+
test_files: []
|