symspell 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +32 -0
  3. data/Rakefile +33 -0
  4. data/lib/symspell.rb +264 -0
  5. data/symspell.gemspec +17 -0
  6. metadata +46 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1460d117d70b607f1e3cd2a82e1775239dcf6409
4
+ data.tar.gz: 077eec846704a48345ffe305d4491ec85c40da57
5
+ SHA512:
6
+ metadata.gz: d60beee5581b45443ed882fce50add34f6a4b26cf6792996156d2e14b396d31246708a9a30ded49baf7f0f35b28e877fdd4409bbff7020b4006f15b97c84e372
7
+ data.tar.gz: 89497ab96606792069f18ca01bebdaf8e4aaeb68cc555053b5e0352a3656ef70faf6443803806630d73926a1abda41d74ed640366ffbe469f3cd12704bfcb541
@@ -0,0 +1,32 @@
1
+ ## Synopsis
2
+
3
+ The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
4
+
5
+ Same license as original (LGPL-3.0).
6
+
7
+
8
+ ## About this port
9
+
10
+ This is a straight port of SymSpell from C# to Ruby. I've started moving things around a bit and also turned it into a gem.
11
+
12
+ Original source with inline comments and README is here: https://github.com/wolfgarbe/symspell.
13
+
14
+ I've changed very little from the original source (apart from removing the command-line interface), but please note that it has no test coverage at this time.
15
+
16
+
17
+ ## Usage
18
+
19
+ gem install symspell
20
+
21
+ require 'symspell'
22
+
23
+ speller = SymSpell.new <EDIT_DISTANCE_MAX>
24
+ speller.create_dictionary('words.txt')
25
+ speller.lookup('something')
26
+
27
+ ## EDIT_DISTANCE_MAX
28
+
29
+ `EDIT_DISTANCE_MAX` is the maximum edit distance considered when looking up suggestions. The edit distance is the number of operations needed to transform one string into another.
30
+
31
+ For example the edit distance between **CA** and **ABC** is 2 because **CA** => **AC** => **ABC**. Edit distances of 2-5 are normal. Note, however, increasing EDIT_DISTANCE_MAX exponentially increases the combinations and therefore the time it takes to create the dictionary.
32
+
@@ -0,0 +1,33 @@
1
require 'rake/testtask'

# Locates the first *.gemspec file in the current directory and loads it.
#
# @return [Array(String, Gem::Specification)] the gemspec path and the loaded spec
# @raise [RuntimeError] when no gemspec is found or it cannot be loaded
def load_gemspec
  gemspec_path = Dir['*.gemspec'].first
  raise 'No gemspec found in the current directory' unless gemspec_path

  # Gem::Specification.load returns nil when the file cannot be evaluated.
  spec = Gem::Specification.load(gemspec_path)
  raise "Unable to load #{gemspec_path}" unless spec

  [gemspec_path, spec]
end

desc 'Test, build and install the gem'
task :default => [:spec, :install]

Rake::TestTask.new(:spec) do |t|
  t.pattern = 'tests/*_test.rb'
end

desc 'Build and install the gem'
task :install do
  gemspec_path, spec = load_gemspec

  result = `gem build #{gemspec_path} 2>&1`
  raise result unless result =~ /Successfully built/

  # Best effort: the gem may simply not be installed yet, so a failure here is ignored.
  system 'gem', 'uninstall', '-I', spec.name
  # --no-document replaces the --no-rdoc/--no-ri flags, which current RubyGems rejects.
  sh 'gem', 'install', spec.file_name, '--no-document'
end

desc 'Take the version in the gemspec, create a git tag and send the gem to rubygems'
task :release do
  _gemspec_path, spec = load_gemspec

  # sh aborts the task on the first failing command, so a failed tag never leads to a push.
  sh 'git', 'tag', '-f', '-a', "v#{spec.version}", '-m', "Version #{spec.version}"
  sh 'git', 'push', '--tags'
  sh 'gem', 'push', spec.file_name
end
33
+
@@ -0,0 +1,264 @@
1
+ require 'active_support/all'
2
+ require 'set'
3
+
4
# Spelling corrector based on the Symmetric Delete algorithm.
#
# Every indexed word is stored together with all strings obtained by deleting
# up to +edit_distance_max+ characters from it. A lookup generates the same
# kind of deletes from the input and matches them against that index.
class SymSpell
  # Upper bound applied to per-word counts and to the number of indexed words.
  MAX_INT = 2**30 - 1

  # Index entry for a string that is a real word (count > 0), a delete of one
  # or more words (suggestions non-empty), or both.
  class DictionaryItem
    # suggestions: indexes into the word list; count: occurrences in the corpus.
    attr_accessor :suggestions, :count

    def initialize
      @suggestions = []
      @count = 0
    end
  end

  # A single lookup result.
  class SuggestItem
    attr_accessor :term, :distance, :count

    # All arguments are optional so the original zero-argument form keeps working.
    def initialize(term = '', distance = 0, count = 0)
      @term = term
      @distance = distance
      @count = count
    end

    # Two suggestions are equal when they propose the same term.
    def ==(other)
      other.respond_to?(:term) && term == other.term
    end
    alias eql? ==

    def hash
      term.hash
    end
  end

  # @param edit_distance_max [Integer] maximum edit distance used both when
  #   indexing deletes and when accepting suggestions
  def initialize(edit_distance_max)
    @edit_distance_max = edit_distance_max
    @maxlength = 0   # length of the longest indexed word
    @dictionary = {} # string => DictionaryItem, or a bare Integer word index for a delete
    @wordlist = []   # word index => word
  end

  # Indexes every non-blank line of the file at +corpus+ as one word.
  #
  # @param corpus [String] path to a file containing one word per line
  # @return [Integer] number of distinct words newly added to the index
  def create_dictionary(corpus)
    word_count = 0

    # File.foreach closes the file when the block finishes.
    File.foreach(corpus) do |line|
      word = line.strip
      # A blank line would otherwise be indexed as the word "".
      next if word.empty?

      word_count += 1 if create_dictionary_entry(word)
    end

    word_count
  end

  # Finds indexed words within the configured edit distance of +input+.
  #
  # Only suggestions sharing the smallest distance found are returned,
  # ordered by descending word count.
  #
  # @param input [String]
  # @return [Array<SuggestItem>]
  def lookup(input)
    # No indexed word can be close enough when the input is this much longer.
    return [] if input.size - @edit_distance_max > @maxlength

    candidates = [input]
    queued_deletes = Set.new
    suggestions = []
    seen_terms = Set.new

    until candidates.empty?
      candidate = candidates.shift
      candidate_distance = input.size - candidate.size

      # Candidates are processed in order of increasing distance, so nothing
      # better can follow once this one is further away than a found suggestion.
      break if !suggestions.empty? && candidate_distance > suggestions.first.distance

      entry = @dictionary[candidate]
      if entry
        item = as_item(entry)

        # count > 0 means the candidate itself is an indexed word.
        if item.count > 0 && seen_terms.add?(candidate)
          suggestions << SuggestItem.new(candidate, candidate_distance, item.count)
          break if candidate_distance.zero?
        end

        item.suggestions.each do |word_index|
          suggestion = @wordlist[word_index]
          # Different deletes of the input can lead to the same word.
          next unless seen_terms.add?(suggestion)

          distance = suggestion_distance(suggestion, input, candidate)

          suggestions.clear if !suggestions.empty? && suggestions.first.distance > distance
          next if !suggestions.empty? && distance > suggestions.first.distance
          next if distance > @edit_distance_max

          word_entry = @dictionary[suggestion]
          if word_entry.is_a?(DictionaryItem)
            suggestions << SuggestItem.new(suggestion, distance, word_entry.count)
          end
        end
      end

      next unless candidate_distance < @edit_distance_max
      # Deeper deletes cannot beat a suggestion that is already at least this close.
      next if !suggestions.empty? && candidate_distance >= suggestions.first.distance

      candidate.size.times do |i|
        delete = candidate.dup
        delete.slice!(i)
        candidates << delete if queued_deletes.add?(delete)
      end
    end

    sort(suggestions)
  end

  private

  # Returns the first run of letters in +text+ (lower-cased), or nil.
  def parse_words(text)
    # Word characters except digits and underscore. Ruby spells class
    # subtraction as an intersection with a negated class.
    text.downcase.scan(/[\w&&[^\d_]]+/).first
  end

  # Normalises an index entry: a bare Integer word index becomes a
  # DictionaryItem holding that single suggestion. The index is not modified.
  def as_item(entry)
    return entry unless entry.is_a?(Integer)

    item = DictionaryItem.new
    item.suggestions << entry
    item
  end

  # Edit distance between +suggestion+ and +input+, which both reduce to
  # +candidate+ through deletes.
  def suggestion_distance(suggestion, input, candidate)
    return 0 if suggestion == input
    return input.size - candidate.size if suggestion.size == candidate.size
    return suggestion.size - candidate.size if input.size == candidate.size

    # Deletes happened on both sides, so the true distance must be computed.
    # A common prefix and suffix do not affect it and are stripped first.
    shortest = [suggestion.size, input.size].min
    prefix = 0
    prefix += 1 while prefix < shortest && suggestion[prefix] == input[prefix]

    suffix = 0
    suffix += 1 while suffix < shortest - prefix && suggestion[-suffix - 1] == input[-suffix - 1]

    damerau_levenshtein_distance(
      suggestion[prefix, suggestion.size - prefix - suffix],
      input[prefix, input.size - prefix - suffix]
    )
  end

  # Registers one occurrence of +key+ and, on its first occurrence, indexes its deletes.
  #
  # @return [Boolean] true when +key+ was newly added as a word
  def create_dictionary_entry(key)
    entry = @dictionary[key]
    if entry
      # The key may have been stored earlier as a delete of another word.
      # Store the promoted item so its count persists.
      item = @dictionary[key] = as_item(entry)
      item.count += 1 if item.count < MAX_INT
    elsif @wordlist.size < MAX_INT
      item = DictionaryItem.new
      item.count = 1
      @dictionary[key] = item
      @maxlength = key.size if key.size > @maxlength
    else
      # The word list is full; nothing is added.
      return false
    end

    # Deletes are generated only once, no matter how often the word occurs.
    return false unless item.count == 1

    @wordlist << key
    keyint = @wordlist.size - 1

    edits(key, 0, Set.new).each do |delete|
      existing = @dictionary[delete]
      if existing.nil?
        # A bare index is enough while only one word maps to this delete.
        @dictionary[delete] = keyint
      else
        target = @dictionary[delete] = as_item(existing)
        add_lowest_distance(target, key, keyint, delete) unless target.suggestions.include?(keyint)
      end
    end

    true
  end

  # Adds +suggestionint+ to +item+ unless the item already holds suggestions
  # closer to +delete+; existing suggestions that are further away are dropped.
  def add_lowest_distance(item, suggestion, suggestionint, delete)
    new_distance = suggestion.size - delete.size

    unless item.suggestions.empty?
      current_distance = @wordlist[item.suggestions.first].size - delete.size
      return if current_distance < new_distance

      item.suggestions.clear if current_distance > new_distance
    end

    item.suggestions << suggestionint
  end

  # Collects into +deletes+ every string reachable from +word+ by removing
  # characters, recursing until @edit_distance_max removals have been made.
  #
  # @return [Set<String>] the +deletes+ set passed in
  def edits(word, edit_distance, deletes)
    edit_distance += 1
    return deletes unless word.size > 1

    word.size.times do |i|
      delete = word.dup
      delete.slice!(i)
      next unless deletes.add?(delete)

      edits(delete, edit_distance, deletes) if edit_distance < @edit_distance_max
    end

    deletes
  end

  # Sorts +suggestions+ in place by descending count and returns the array.
  def sort(suggestions)
    suggestions.sort! { |x, y| y.count <=> x.count }
  end

  # Damerau-Levenshtein distance (insert, delete, substitute, adjacent
  # transposition) between +source+ and +target+.
  def damerau_levenshtein_distance(source, target)
    m = source.size
    n = target.size
    inf = m + n
    h = Array.new(m + 2) { Array.new(n + 2, 0) }

    # Row/column 0 hold a sentinel; row/column 1 hold distances from the empty string.
    h[0][0] = inf
    (0..m).each do |i|
      h[i + 1][1] = i
      h[i + 1][0] = inf
    end
    (0..n).each do |j|
      h[1][j + 1] = j
      h[0][j + 1] = inf
    end

    # Last source row (1-based) in which each character was seen; 0 if never.
    last_row = Hash.new(0)

    (1..m).each do |i|
      last_match_column = 0
      # Columns start at 1: column 0 has no target character.
      (1..n).each do |j|
        i1 = last_row[target[j - 1]]
        j1 = last_match_column

        if source[i - 1] == target[j - 1]
          cost = h[i][j]
          last_match_column = j
        else
          cost = [h[i][j], h[i + 1][j], h[i][j + 1]].min + 1
        end

        transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
        h[i + 1][j + 1] = [cost, transposition].min
      end

      last_row[source[i - 1]] = i
    end

    h[m + 1][n + 1]
  end
end
264
+
@@ -0,0 +1,17 @@
1
require 'base64'

# Gem specification for symspell. File lists are taken from git, so the gem
# must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name = 'symspell'
  s.version = '0.0.1'
  s.authors = ['Phil Thompson']
  # Stored Base64-encoded so the address does not appear in plain text in the source.
  s.email = Base64.decode64("cGhpbEBlbGVjdHJpY3Zpc2lvbnMuY29t\n")
  s.summary = 'Ruby port of the symmetric spell checking algorithm'
  s.homepage = 'https://github.com/PhilT/symspell'
  # The README states the port keeps the license of the original project.
  s.license = 'LGPL-3.0'
  s.required_rubygems_version = '>= 2.4.5'

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- spec/*`.split("\n")

  s.require_path = 'lib'
end
17
+
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: symspell
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Phil Thompson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-31 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: phil@electricvisions.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - README.md
20
+ - Rakefile
21
+ - lib/symspell.rb
22
+ - symspell.gemspec
23
+ homepage: https://github.com/PhilT/symspell
24
+ licenses: []
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 2.4.5
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.5
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: Ruby port of the symetric spell checking algorithm
46
+ test_files: []