symspell 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -5
- data/Rakefile +2 -2
- data/lib/symspell.rb +18 -12
- data/symspell.gemspec +1 -1
- data/tests/symspell_test.rb +39 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4c6abf1253f5e16d82ae589bc6e35a6f5d72bc36
|
4
|
+
data.tar.gz: 6c9344ec4d40597700099e119b423ccfb85d3582
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f71318a0ded808acf0f3eac41d8b569edd908ec0b81f126421fcfa7d206fce6969bc47e565fcdc324e1b4fb02f492e482e52e36b608559cf4364760a1847baf
|
7
|
+
data.tar.gz: 62215df1ac0378e26fe131ed8c730ebf3ad235107858bebf37802704bb8f8dd8a88c0c15f9a15da02518e7034cdeaa34802c24d03b38e31d8dc834c86254c03f
|
data/README.md
CHANGED
@@ -11,7 +11,7 @@ This is a straight port of SymSpell from C# to Ruby. I've started moving things
|
|
11
11
|
|
12
12
|
Original source with inline comments and README is here: https://github.com/wolfgarbe/symspell.
|
13
13
|
|
14
|
-
I've changed very little from the original source (apart from removing the commandline interface) but please note it has
|
14
|
+
I've changed very little from the original source (apart from removing the command-line interface), but please note it has only some very basic end-to-end tests at this time.
|
15
15
|
|
16
16
|
|
17
17
|
## Usage
|
@@ -20,13 +20,19 @@ I've changed very little from the original source (apart from removing the comma
|
|
20
20
|
|
21
21
|
require 'symspell'
|
22
22
|
|
23
|
-
speller = SymSpell.new <EDIT_DISTANCE_MAX>
|
24
|
-
speller.create_dictionary(
|
25
|
-
speller.lookup
|
23
|
+
speller = SymSpell.new <EDIT_DISTANCE_MAX> <VERBOSE>
|
24
|
+
speller.create_dictionary %w(joe jo mark john peter mary andrew imogen)
|
25
|
+
speller.lookup 'jo'
|
26
26
|
|
27
|
-
|
27
|
+
### EDIT_DISTANCE_MAX
|
28
28
|
|
29
29
|
`EDIT_DISTANCE_MAX` is the maximum edit distance to consider, where the edit distance is the number of operations needed to transform one string into another.
|
30
30
|
|
31
31
|
For example the edit distance between **CA** and **ABC** is 2 because **CA** => **AC** => **ABC**. Edit distances of 2-5 are normal. Note, however, increasing EDIT_DISTANCE_MAX exponentially increases the combinations and therefore the time it takes to create the dictionary.
|
32
32
|
|
33
|
+
### VERBOSE
|
34
|
+
|
35
|
+
* 0 - Return the top suggestion
|
36
|
+
* 1 - Return the suggestions with the lowest edit distance
|
37
|
+
* 2 - Return all suggestions
|
38
|
+
|
data/Rakefile
CHANGED
data/lib/symspell.rb
CHANGED
@@ -4,8 +4,9 @@ require 'set'
|
|
4
4
|
class SymSpell
|
5
5
|
MAX_INT = 2**30 - 1
|
6
6
|
|
7
|
-
def initialize(edit_distance_max)
|
7
|
+
def initialize(edit_distance_max, verbose)
|
8
8
|
@edit_distance_max = edit_distance_max
|
9
|
+
@verbose = verbose
|
9
10
|
@maxlength = 0
|
10
11
|
@dictionary = {}
|
11
12
|
@wordlist = []
|
@@ -14,7 +15,7 @@ class SymSpell
|
|
14
15
|
def create_dictionary(corpus)
|
15
16
|
word_count = 0
|
16
17
|
|
17
|
-
|
18
|
+
corpus.each do |word|
|
18
19
|
word_count += 1 if create_dictionary_entry(word.strip)
|
19
20
|
end
|
20
21
|
end
|
@@ -35,7 +36,7 @@ class SymSpell
|
|
35
36
|
while (candidates.count > 0)
|
36
37
|
candidate = candidates.shift
|
37
38
|
|
38
|
-
return sort(suggestions) if
|
39
|
+
return sort(suggestions) if @verbose < 2 && suggestions.count > 0 && (input.size - candidate.size) > suggestions[0].distance
|
39
40
|
|
40
41
|
if valueo = @dictionary[candidate]
|
41
42
|
value = DictionaryItem.new
|
@@ -51,7 +52,7 @@ class SymSpell
|
|
51
52
|
si.count = value.count
|
52
53
|
si.distance = input.size - candidate.size
|
53
54
|
suggestions << si
|
54
|
-
return sort(suggestions) if input.size - candidate.size == 0
|
55
|
+
return sort(suggestions) if @verbose < 2 && input.size - candidate.size == 0
|
55
56
|
end
|
56
57
|
|
57
58
|
value2 = nil
|
@@ -85,12 +86,8 @@ class SymSpell
|
|
85
86
|
end
|
86
87
|
end
|
87
88
|
|
88
|
-
if suggestions.count > 0 && suggestions[0].distance > distance
|
89
|
-
|
90
|
-
end
|
91
|
-
if suggestions.count > 0 && distance > suggestions[0].distance
|
92
|
-
next
|
93
|
-
end
|
89
|
+
suggestions.clear if @verbose < 2 && suggestions.count > 0 && suggestions[0].distance > distance
|
90
|
+
next if @verbose < 2 && suggestions.count > 0 && distance > suggestions[0].distance
|
94
91
|
|
95
92
|
if (distance <= @edit_distance_max)
|
96
93
|
if value2 = @dictionary[suggestion]
|
@@ -200,9 +197,12 @@ class SymSpell
|
|
200
197
|
end
|
201
198
|
|
202
199
|
def add_lowest_distance(item, suggestion, suggestionint, delete)
|
203
|
-
if item.suggestions.count > 0 && @wordlist[item.suggestions[0]].size - delete.size > suggestion.size - delete.size
|
200
|
+
if @verbose < 2 && item.suggestions.count > 0 && @wordlist[item.suggestions[0]].size - delete.size > suggestion.size - delete.size
|
204
201
|
item.suggestions.clear
|
205
202
|
end
|
203
|
+
if @verbose == 2 || item.suggestions.size == 0 || (@wordlist[item.suggestions[0]].size - delete.size >= suggestion.size - delete.size)
|
204
|
+
item.suggestions << suggestionint
|
205
|
+
end
|
206
206
|
end
|
207
207
|
|
208
208
|
def edits(word, edit_distance, deletes)
|
@@ -221,7 +221,13 @@ class SymSpell
|
|
221
221
|
end
|
222
222
|
|
223
223
|
def sort(suggestions)
|
224
|
-
|
224
|
+
if @verbose < 2
|
225
|
+
suggestions.sort! {|x, y| -x.count <=> y.count}
|
226
|
+
else
|
227
|
+
suggestions.sort! {|x, y| (2 * x.distance <=> y.distance) - x.count <=> y.count}
|
228
|
+
end
|
229
|
+
|
230
|
+
@verbose == 0 ? suggestions[0..0] : suggestions
|
225
231
|
end
|
226
232
|
|
227
233
|
def damerau_levenshtein_distance(source, target)
|
data/symspell.gemspec
CHANGED
@@ -2,7 +2,7 @@ require 'base64'
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = 'symspell'
|
5
|
-
s.version = '0.0.
|
5
|
+
s.version = '0.0.2'
|
6
6
|
s.authors = 'Phil Thompson'
|
7
7
|
s.email = Base64.decode64("cGhpbEBlbGVjdHJpY3Zpc2lvbnMuY29t\n")
|
8
8
|
s.summary = 'Ruby port of the symetric spell checking algorithm'
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require_relative '../lib/symspell'
|
3
|
+
|
4
|
+
class SymSpellTest < Minitest::Test
|
5
|
+
def setup
|
6
|
+
@edit_distance_max = 2
|
7
|
+
@verbose = 0
|
8
|
+
end
|
9
|
+
|
10
|
+
def subject
|
11
|
+
@subject ||= SymSpell.new(@edit_distance_max, @verbose).tap do |subject|
|
12
|
+
words = %w(joe mark john peter mary andrew imogen)
|
13
|
+
subject.create_dictionary words
|
14
|
+
end
|
15
|
+
end
|
16
|
+
def test_lookup_correctly_spelled_word
|
17
|
+
assert_equal 'andrew', subject.lookup('andrew').first.term
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_lookup_misspelt_word
|
21
|
+
assert_equal 'andrew', subject.lookup('andre').first.term
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_lookup_fails_to_find_match
|
25
|
+
assert_equal nil, subject.lookup('amigon').first
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_lookup_finds_match_after_turning_up_edit_distance
|
29
|
+
@edit_distance_max = 3
|
30
|
+
assert_equal ['imogen'], subject.lookup('amigon').map(&:term)
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_lookup_returns_multiple_suggestions
|
34
|
+
@edit_distance_max = 2
|
35
|
+
@verbose = 2
|
36
|
+
assert_equal ['joe', 'john'], subject.lookup('jo').map(&:term)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: symspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Phil Thompson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: phil@electricvisions.com
|
@@ -20,6 +20,7 @@ files:
|
|
20
20
|
- Rakefile
|
21
21
|
- lib/symspell.rb
|
22
22
|
- symspell.gemspec
|
23
|
+
- tests/symspell_test.rb
|
23
24
|
homepage: https://github.com/PhilT/symspell
|
24
25
|
licenses: []
|
25
26
|
metadata: {}
|