spell_check 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/spell_check/dictionary.rb +42 -0
- data/lib/spell_check/version.rb +3 -0
- data/lib/spell_check/words.txt +234936 -0
- data/lib/spell_check.rb +129 -0
- metadata +78 -0
data/lib/spell_check.rb
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'spell_check/version'
|
2
|
+
require 'spell_check/dictionary'
|
3
|
+
|
4
|
+
module SpellCheck

  # Checks if the input word exists in the dictionary. Any letter case is accepted, and over-repeated characters
  # (e.g. "sheeeep") are corrected to the closest dictionary entry.
  #
  # Input that contains non-alphabet characters, is empty, or cannot be converted to a String (nil converts to an
  # empty String) yields the 'no correction found' message instead of raising.
  # @param [Object] aWordToCheck
  # @return [String] the dictionary word, the closest correction, or 'no correction found'
  def self.checkWord( aWordToCheck )
    not_found = 'no correction found'

    # Check for any non-alphabet characters. If any are found, cease word search
    return not_found if has_non_alphabet_chars aWordToCheck

    # Adjust word for case (this also converts the input to a String)
    word = adjust_case( aWordToCheck )

    # Nothing to look up for nil / empty input
    return not_found if word.empty?

    # Attempt to see if input word is found before doing regular expression search
    corrected_word = dictionary.find_word(word)
    return corrected_word unless corrected_word.nil?

    # Input word was not found, so try matching regular expression
    corrected_word = correct_repetitions(word)
    return corrected_word unless corrected_word.nil?

    not_found
  end

  # NOTE: the helpers below are internal, but they remain publicly callable. A bare `private` does not apply to
  # methods defined with `def self.`, so they have always been reachable from outside; their visibility is left
  # unchanged so that existing callers keep working.

  # Returns the shared Dictionary instance, creating it on first use so that a new Dictionary is not constructed
  # for every lookup.
  # @return [Dictionary]
  def self.dictionary
    @dict ||= Dictionary.new
  end

  # Returns true if any non-English alphabet characters exist in input object, false otherwise. This is also the
  # primary check to ensure that the object will convert to a String: if the conversion raises, true is returned
  # so that the word search is abandoned.
  # @param [Object] word
  # @return [Boolean]
  def self.has_non_alphabet_chars( word )
    !(word.to_s =~ /[^A-Za-z]/).nil?
  rescue StandardError
    true # object could not be converted to a String
  end

  # Takes an input object, converts it to a string, then returns it with all but the first character down cased. The
  # first character case is retained for comparing between words like god and God
  # @param [Object] word
  # @return [String]
  def self.adjust_case( word )
    str = word.to_s
    return str if str.length <= 1 # string contains one or fewer characters

    str[0] + str[1..-1].downcase
  end

  # This is the main logic block for correcting repetition spelling errors. This method removes all consecutive
  # repetition of characters in a word, builds a regular expression that allows repetitions of each character,
  # asks the dictionary for any matches to this regular expression, then finds and returns the closest match.
  # @param [String] word
  # @return [String, nil] the closest match, or nil when the dictionary returns no matches
  def self.correct_repetitions( word )
    # Remove all consecutive repetitions of characters in the word
    squeeze_str = word.downcase.squeeze

    # Create a set of potential matches using regular expression
    reg_ex_matches = find_reg_ex_matches squeeze_str
    return nil if reg_ex_matches.empty? # no corrections found

    # Compare matches against original input and return closest match
    get_best_match word, reg_ex_matches
  end

  # Builds a RegExp object and then matches it against the dictionary values
  # @param [String] word
  # @return [Array]
  def self.find_reg_ex_matches( word )
    dictionary.find_reg_ex_matches build_reg_ex(word)
  end

  # Creates a regular expression object that allows any number of consecutive repetitions of each character in the
  # input string. Each character is escaped, so characters that are special inside a character class cannot break
  # the pattern; for alphabetic input the pattern is unchanged.
  #
  # NOTE(review): the line anchors ^ and $ are kept as-is because how Dictionary applies the expression is not
  # visible here - confirm before switching to \A and \z.
  # @param [String] word
  # @return [RegExp]
  def self.build_reg_ex( word )
    # [c]+ allows 1 or more instances of each char
    repeated_chars = word.chars.map { |char| "[#{Regexp.escape(char)}]+" }.join

    # ^ disallows preceding characters, $ disallows trailing characters
    Regexp.new "^#{repeated_chars}$"
  end

  # Given a set of words that are possible corrections for the input word, this method compares the corrected words
  # to the input and returns the closest matching correction using Levenshtein distance. The distance is computed
  # once per candidate and the input collection is not modified.
  # @param [String] word
  # @param [Array] matches
  # @return [String, nil] the closest match, or nil when matches is empty
  def self.get_best_match( word, matches )
    # lowest number of changes means closest match to original input
    matches.to_a.min_by { |candidate| levenshtein_distance(candidate, word) }
  end

  # The Levenshtein Distance algorithm measures the number of changes required to match two strings. For this reason
  # it is a good method to compare the similarity of strings. Please note that this code was sourced at
  # http://stackoverflow.com/questions/16323571/measure-the-distance-between-two-strings-with-ruby
  # @param [String] s
  # @param [String] t
  # @return [Integer]
  def self.levenshtein_distance(s, t)
    m = s.length
    n = t.length
    return m if n == 0
    return n if m == 0

    # d[i][j] holds the distance between the first i chars of s and the first j chars of t
    d = Array.new(m + 1) { Array.new(n + 1) }
    (0..m).each { |i| d[i][0] = i }
    (0..n).each { |j| d[0][j] = j }

    (1..n).each do |j|
      (1..m).each do |i|
        d[i][j] = if s[i - 1] == t[j - 1] # adjust index into string
                    d[i - 1][j - 1] # no operation required
                  else
                    [
                      d[i - 1][j] + 1,    # deletion
                      d[i][j - 1] + 1,    # insertion
                      d[i - 1][j - 1] + 1 # substitution
                    ].min
                  end
      end
    end
    d[m][n]
  end

end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spell_check
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Solberg, Garrick L
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-09-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: ! "Checks if a word exists in text dictionary. If it does not, this
|
42
|
+
gem will attempt to correct\n the input by changing repeated
|
43
|
+
characters and case."
|
44
|
+
email:
|
45
|
+
- Garrick.Solberg@gmail.com
|
46
|
+
executables: []
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- lib/spell_check.rb
|
51
|
+
- lib/spell_check/dictionary.rb
|
52
|
+
- lib/spell_check/version.rb
|
53
|
+
- lib/spell_check/words.txt
|
54
|
+
homepage: http://www.linkedin.com/in/garricksolberg/
|
55
|
+
licenses:
|
56
|
+
- MIT
|
57
|
+
metadata: {}
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 2.2.2
|
75
|
+
signing_key:
|
76
|
+
specification_version: 4
|
77
|
+
summary: Checks to see if a word is correctly spelled
|
78
|
+
test_files: []
|