symspell 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +32 -0
  3. data/Rakefile +33 -0
  4. data/lib/symspell.rb +264 -0
  5. data/symspell.gemspec +17 -0
  6. metadata +46 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1460d117d70b607f1e3cd2a82e1775239dcf6409
4
+ data.tar.gz: 077eec846704a48345ffe305d4491ec85c40da57
5
+ SHA512:
6
+ metadata.gz: d60beee5581b45443ed882fce50add34f6a4b26cf6792996156d2e14b396d31246708a9a30ded49baf7f0f35b28e877fdd4409bbff7020b4006f15b97c84e372
7
+ data.tar.gz: 89497ab96606792069f18ca01bebdaf8e4aaeb68cc555053b5e0352a3656ef70faf6443803806630d73926a1abda41d74ed640366ffbe469f3cd12704bfcb541
@@ -0,0 +1,32 @@
1
+ ## Synopsis
2
+
3
+ The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
4
+
5
+ Same license as original (LGPL-3.0).
6
+
7
+
8
+ ## About this port
9
+
10
+ This is a straight port of SymSpell from C# to Ruby. I've started moving things around a bit and also turned it into a gem.
11
+
12
+ Original source with inline comments and README is here: https://github.com/wolfgarbe/symspell.
13
+
14
+ I've changed very little from the original source (apart from removing the command-line interface), but please note that it has no test coverage at this time.
15
+
16
+
17
+ ## Usage
18
+
19
+ gem install symspell
20
+
21
+ require 'symspell'
22
+
23
+ speller = SymSpell.new <EDIT_DISTANCE_MAX>
24
+ speller.create_dictionary('words.txt')
25
+ speller.lookup('something')
26
+
27
+ ## EDIT_DISTANCE_MAX
28
+
29
+ `EDIT_DISTANCE_MAX` is the maximum edit distance that lookups will consider, where the edit distance is the number of operations needed to transform one string into another.
30
+
31
+ For example, the edit distance between **CA** and **ABC** is 2 because **CA** => **AC** => **ABC**. Edit distances of 2-5 are normal. Note, however, that increasing EDIT_DISTANCE_MAX exponentially increases the number of combinations and therefore the time it takes to create the dictionary.
32
+
@@ -0,0 +1,33 @@
1
require 'rake/testtask'

# Finds the first *.gemspec in the current directory and loads it through
# RubyGems instead of eval'ing the file contents by hand.
#
# @return [Array(String, Gem::Specification)] the gemspec path and the loaded spec
# @raise [RuntimeError] when no gemspec exists or it cannot be loaded
def load_gemspec
  gemspec_path = Dir['*.gemspec'].first
  raise 'No .gemspec file found in the current directory' unless gemspec_path

  spec = Gem::Specification.load(gemspec_path)
  raise "Unable to load #{gemspec_path}" unless spec

  [gemspec_path, spec]
end

desc 'Test, build and install the gem'
task :default => [:spec, :install]

Rake::TestTask.new(:spec) do |t|
  t.pattern = 'tests/*_test.rb'
end

desc 'Build and install the gem'
task :install do
  gemspec_path, spec = load_gemspec

  result = `gem build #{gemspec_path} 2>&1`
  # `gem build` reports success on stdout; anything else is treated as a failure
  # and the captured output is raised so the cause is visible.
  raise result unless result =~ /Successfully built/

  system "gem uninstall -I #{spec.name} 2>&1"
  # --no-document replaces the --no-rdoc/--no-ri flags removed in RubyGems 3.0.
  system "gem install #{spec.file_name} --no-document 2>&1"
end

desc 'Take the version in the gemspec, create a git tag and send the gem to rubygems'
task :release do
  _gemspec_path, spec = load_gemspec

  system "git tag -f -a v#{spec.version} -m 'Version #{spec.version}'"
  system "git push --tags"
  system "gem push #{spec.file_name}"
end
@@ -0,0 +1,264 @@
1
+ require 'active_support/all'
2
+ require 'set'
3
+
4
# Symmetric Delete spelling correction.
#
# Every dictionary word is indexed under itself and under all strings obtained
# by deleting up to +edit_distance_max+ characters from it. A lookup generates
# the same kind of deletes from the input and intersects them with the index,
# then verifies candidates with a Damerau-Levenshtein distance.
#
#   speller = SymSpell.new(2)
#   speller.create_dictionary('words.txt')
#   speller.lookup('helo').map(&:term)
class SymSpell
  # Upper bound applied to per-word counts and to the number of indexed words.
  MAX_INT = 2**30 - 1

  # Index entry stored in the dictionary hash. +count+ is the number of times
  # the key occurred as a real word (0 when the key is only a delete), and
  # +suggestions+ holds word-list indexes of words that reduce to the key.
  class DictionaryItem
    attr_accessor :suggestions, :count

    def initialize
      @suggestions = []
      @count = 0
    end
  end

  # A single lookup result: the suggested +term+, its edit +distance+ from
  # the input and the +count+ of occurrences seen while building the dictionary.
  class SuggestItem
    attr_accessor :term, :distance, :count

    def initialize
      @term = ''
      @distance = 0
      @count = 0
    end

    # Two suggestions are equal when they suggest the same term.
    def ==(other)
      other.respond_to?(:term) && term == other.term
    end
    # Keep eql? consistent with == and hash so items behave in Hash/Set.
    alias eql? ==

    def hash
      term.hash
    end
  end

  # @param edit_distance_max [Integer] largest edit distance that is indexed
  #   and that lookups will return
  def initialize(edit_distance_max)
    @edit_distance_max = edit_distance_max
    @maxlength = 0      # length of the longest word added so far
    @dictionary = {}    # String => DictionaryItem, or a bare Integer word index
    @wordlist = []      # word index => word
  end

  # Reads a word list (one word per line) and indexes every word.
  # Surrounding whitespace is stripped and blank lines are ignored.
  #
  # @param corpus [String] path of the word list file
  # @return [Integer] number of distinct words that were newly added
  def create_dictionary(corpus)
    word_count = 0
    # File.foreach closes the file when iteration finishes.
    File.foreach(corpus) do |line|
      word = line.strip
      next if word.empty?

      word_count += 1 if create_dictionary_entry(word)
    end
    word_count
  end

  # Finds dictionary words close to +input+.
  #
  # Only suggestions at the smallest distance found are returned, ordered by
  # descending count (ties broken by term).
  #
  # @param input [String] the possibly misspelled word
  # @return [Array<SuggestItem>] empty when nothing is within the maximum distance
  def lookup(input)
    # No word can match when the input is too long even after max deletes.
    return [] if input.size - @edit_distance_max > @maxlength

    candidates = [input]
    seen_candidates = Set.new
    seen_terms = Set.new
    suggestions = []

    until candidates.empty?
      candidate = candidates.shift
      candidate_distance = input.size - candidate.size

      # Candidates arrive in order of increasing distance, so once they are
      # further away than what we already found nothing better can follow.
      break if !suggestions.empty? && candidate_distance > suggestions.first.distance

      entry = @dictionary[candidate]
      if entry
        # A bare Integer is the compact form of "one suggestion, not a word".
        word_indexes = entry.is_a?(Integer) ? [entry] : entry.suggestions
        word_count = entry.is_a?(Integer) ? 0 : entry.count

        if word_count > 0 && seen_terms.add?(candidate)
          # Keep the result list limited to the smallest distance found.
          suggestions.clear if !suggestions.empty? && suggestions.first.distance > candidate_distance
          suggestions << build_suggestion(candidate, word_count, candidate_distance)
          break if candidate_distance.zero? # exact match, nothing can beat it
        end

        word_indexes.each do |word_index|
          suggestion = @wordlist[word_index]
          # Different deletes of the input can lead to the same word.
          next unless seen_terms.add?(suggestion)

          distance = suggestion_distance(suggestion, input, candidate)
          suggestions.clear if !suggestions.empty? && suggestions.first.distance > distance
          next if !suggestions.empty? && distance > suggestions.first.distance
          next if distance > @edit_distance_max

          target = @dictionary[suggestion]
          next unless target.is_a?(DictionaryItem)

          suggestions << build_suggestion(suggestion, target.count, distance)
        end
      end

      next unless candidate_distance < @edit_distance_max
      # Deleting more characters cannot yield anything closer than what we have.
      next if !suggestions.empty? && candidate_distance >= suggestions.first.distance

      candidate.size.times do |i|
        delete = delete_char_at(candidate, i)
        candidates << delete if seen_candidates.add?(delete)
      end
    end

    sort(suggestions)
  end

  private

  # Returns the first run of letters in +text+, lower-cased, or nil when
  # there is none.
  def parse_words(text)
    text.downcase.scan(/[[:alpha:]]+/).first
  end

  # Adds one occurrence of +key+ to the dictionary. The first time a word is
  # seen it is appended to the word list and all of its deletes are indexed.
  #
  # @return [Boolean] true when +key+ was added as a new word
  def create_dictionary_entry(key)
    existing = @dictionary[key]

    if existing.nil?
      return false if @wordlist.size >= MAX_INT

      item = DictionaryItem.new
      @dictionary[key] = item
      @maxlength = key.size if key.size > @maxlength
    elsif existing.is_a?(Integer)
      # The key existed only as a delete of another word; promote it to a
      # full item and store it so the count is not lost.
      item = DictionaryItem.new
      item.suggestions << existing
      @dictionary[key] = item
    else
      item = existing
    end

    item.count += 1 if item.count < MAX_INT
    # Deletes are generated once, on the first occurrence of the word.
    return false unless item.count == 1

    @wordlist << key
    keyint = @wordlist.size - 1

    edits(key, 0, Set.new).each do |delete|
      current = @dictionary[delete]
      if current.nil?
        @dictionary[delete] = keyint
      elsif current.is_a?(Integer)
        promoted = DictionaryItem.new
        promoted.suggestions << current
        @dictionary[delete] = promoted
        add_lowest_distance(promoted, key, keyint, delete) unless current == keyint
      elsif !current.suggestions.include?(keyint)
        add_lowest_distance(current, key, keyint, delete)
      end
    end

    true
  end

  # Registers the word at index +suggestionint+ as a suggestion for +delete+,
  # keeping only the suggestions that need the fewest deletes to reach it.
  def add_lowest_distance(item, suggestion, suggestionint, delete)
    new_distance = suggestion.size - delete.size
    unless item.suggestions.empty?
      current_distance = @wordlist[item.suggestions.first].size - delete.size
      return if current_distance < new_distance

      item.suggestions.clear if current_distance > new_distance
    end
    item.suggestions << suggestionint
  end

  # Recursively collects every string reachable from +word+ by deleting
  # single characters, up to the configured maximum number of deletes.
  #
  # @param word [String] string to delete characters from
  # @param edit_distance [Integer] number of deletes already applied to +word+
  # @param deletes [Set<String>] accumulator, also returned
  # @return [Set<String>]
  def edits(word, edit_distance, deletes)
    edit_distance += 1
    return deletes if word.size <= 1 || edit_distance > @edit_distance_max

    word.size.times do |i|
      delete = delete_char_at(word, i)
      next unless deletes.add?(delete)

      edits(delete, edit_distance, deletes) if edit_distance < @edit_distance_max
    end
    deletes
  end

  # Orders suggestions in place by descending count, then by term.
  def sort(suggestions)
    suggestions.sort_by! { |item| [-item.count, item.term] }
  end

  # Damerau-Levenshtein distance (insert, delete, substitute and adjacent
  # transposition) between +source+ and +target+.
  def damerau_levenshtein_distance(source, target)
    m = source.size
    n = target.size
    inf = m + n

    # h is offset by one row/column: row 0 and column 0 hold the sentinel.
    h = Array.new(m + 2) { Array.new(n + 2, 0) }
    h[0][0] = inf
    (0..m).each do |i|
      h[i + 1][1] = i
      h[i + 1][0] = inf
    end
    (0..n).each do |j|
      h[1][j + 1] = j
      h[0][j + 1] = inf
    end

    # Last source row (1-based) in which each character was seen; 0 = never.
    last_row = Hash.new(0)

    (1..m).each do |i|
      db = 0 # last target column in this row where the characters matched
      (1..n).each do |j|
        i1 = last_row[target[j - 1]]
        j1 = db
        if source[i - 1] == target[j - 1]
          h[i + 1][j + 1] = h[i][j]
          db = j
        else
          h[i + 1][j + 1] = [h[i][j], h[i + 1][j], h[i][j + 1]].min + 1
        end

        transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
        h[i + 1][j + 1] = transposition if transposition < h[i + 1][j + 1]
      end
      last_row[source[i - 1]] = i
    end

    h[m + 1][n + 1]
  end

  # Edit distance between +suggestion+ and +input+, given that +candidate+
  # is a delete of both. Falls back to a full Damerau-Levenshtein computation
  # only when characters were deleted on both sides.
  def suggestion_distance(suggestion, input, candidate)
    return 0 if suggestion == input
    return input.size - candidate.size if suggestion.size == candidate.size
    return suggestion.size - candidate.size if input.size == candidate.size

    # Common prefixes and suffixes do not affect the distance; skip them.
    limit = [suggestion.size, input.size].min
    prefix = 0
    prefix += 1 while prefix < limit && suggestion[prefix] == input[prefix]
    suffix = 0
    suffix += 1 while suffix < limit - prefix && suggestion[-suffix - 1] == input[-suffix - 1]

    damerau_levenshtein_distance(
      suggestion[prefix, suggestion.size - prefix - suffix],
      input[prefix, input.size - prefix - suffix]
    )
  end

  # Builds a populated SuggestItem.
  def build_suggestion(term, count, distance)
    item = SuggestItem.new
    item.term = term
    item.count = count
    item.distance = distance
    item
  end

  # Returns a copy of +word+ without the character at +index+.
  def delete_char_at(word, index)
    word[0, index] + word[(index + 1)..-1]
  end
end
264
+
@@ -0,0 +1,17 @@
1
require 'base64'

# Gem specification for symspell.
Gem::Specification.new do |s|
  s.name = 'symspell'
  s.version = '0.0.1'
  s.authors = 'Phil Thompson'
  # The address is stored Base64-encoded so it does not appear verbatim in the source.
  s.email = Base64.decode64("cGhpbEBlbGVjdHJpY3Zpc2lvbnMuY29t\n")
  s.summary = 'Ruby port of the symmetric spell checking algorithm'
  s.homepage = 'https://github.com/PhilT/symspell'
  # The README states the port keeps the original project's license.
  s.license = 'LGPL-3.0'
  s.required_rubygems_version = '>= 2.4.5'

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- spec/*`.split("\n")

  s.require_path = 'lib'

  # lib/symspell.rb requires 'active_support/all', so the gem cannot be
  # loaded unless activesupport is installed alongside it.
  s.add_runtime_dependency 'activesupport'
end
17
+
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: symspell
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Phil Thompson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-31 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: phil@electricvisions.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - README.md
20
+ - Rakefile
21
+ - lib/symspell.rb
22
+ - symspell.gemspec
23
+ homepage: https://github.com/PhilT/symspell
24
+ licenses: []
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 2.4.5
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.5
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: Ruby port of the symetric spell checking algorithm
46
+ test_files: []