ruby-spellchecker 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/dictionaries/ngrams.csv +0 -6
- data/dictionaries/typos.csv +0 -2
- data/lib/spellchecker/detect_typo.rb +24 -1
- data/lib/spellchecker/dictionaries/ngram_list.rb +1 -1
- data/lib/spellchecker/tokenizer.rb +1 -1
- data/lib/spellchecker/tokenizer/token.rb +15 -0
- data/lib/spellchecker/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef027e0b01226b4df2d1ba0d4e4d63b9b28ee84070689d747f5573e52c94014b
|
4
|
+
data.tar.gz: 5daeaa089531bd755434545304c96a21501b0f2f5975611a9fc32f38cec92ee2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62237078c65b536f9ca986324f532ecf439f511184bf8ad738163b0681bb7f6cd66c783d5b1e63086d15802ea2c1ceb304ea35ddee9449fc6b3592c59b301d85
|
7
|
+
data.tar.gz: ff1bc1100afcdd792dcc600e8c42b696d2703bed837c2ea4fbccf2acdf8a7284bfb4b62341a72a3a675232a48e7131ebf025db4b3d32fb39aa0d7ff6cd629264
|
data/README.md
CHANGED
data/dictionaries/ngrams.csv
CHANGED
@@ -693,7 +693,6 @@ corona virus,coronavirus
|
|
693
693
|
cote chalonnaise,Côte Chalonnaise
|
694
694
|
cote d'argent,Côte d'Argent
|
695
695
|
cote d'azur,Côte d'Azur
|
696
|
-
cote d'ivoire,Côte d'Ivoire
|
697
696
|
cote d'opale,Côte d'Opale
|
698
697
|
cote d'or,Côte d'Or
|
699
698
|
cote d`argent,Côte d`Argent
|
@@ -2012,11 +2011,6 @@ salvador dali,Salvador Dalí
|
|
2012
2011
|
sam elliot,Sam Elliott
|
2013
2012
|
san luis potosi,San Luis Potosí
|
2014
2013
|
sao paolo,São Paulo
|
2015
|
-
sao paulo,São Paulo
|
2016
|
-
sao tome and principe,São Tomé and Príncipe
|
2017
|
-
sao tome and príncipe,São Tomé and Príncipe
|
2018
|
-
sao tomé and principe,São Tomé and Príncipe
|
2019
|
-
sao tomé and príncipe,São Tomé and Príncipe
|
2020
2014
|
sau paolo,São Paulo
|
2021
2015
|
sau paulo,São Paulo
|
2022
2016
|
saudia arabia,saudi Arabia
|
data/dictionaries/typos.csv
CHANGED
@@ -21157,7 +21157,6 @@ complexty,complexity
|
|
21157
21157
|
complexy,complexity
|
21158
21158
|
compliacted,complicate
|
21159
21159
|
compliactions,complication
|
21160
|
-
compliancy,compliance
|
21161
21160
|
complianed,compliance
|
21162
21161
|
complians,complains
|
21163
21162
|
compliants,complaints
|
@@ -89409,7 +89408,6 @@ reey,really
|
|
89409
89408
|
refacted,refactored
|
89410
89409
|
refactor's,refactored
|
89411
89410
|
refactorig,refactoring
|
89412
|
-
refactorings,refactors
|
89413
89411
|
refactorng,refactoring
|
89414
89412
|
refactorsing,refactoring
|
89415
89413
|
refarence,references
|
@@ -6,6 +6,9 @@ module Spellchecker
|
|
6
6
|
ABBREVIATION_REGEXP = /\A(?:[A-Z]{2,4})|(?:[A-Z][a-z])\z/.freeze
|
7
7
|
|
8
8
|
LENGTH_LIMIT = 2
|
9
|
+
ABBREVIATION_LENGTH = 2
|
10
|
+
NUMBER_SHORTENING_SUFFIX = 'th'
|
11
|
+
SHORTENINGS = Set.new(%w[ver]).freeze
|
9
12
|
|
10
13
|
module_function
|
11
14
|
|
@@ -20,7 +23,7 @@ module Spellchecker
|
|
20
23
|
|
21
24
|
return unless correction
|
22
25
|
return if PROPER_NAME_REGEXP.match?(word)
|
23
|
-
return if
|
26
|
+
return if abbreviation?(token) || shortening?(token)
|
24
27
|
return if Dictionaries::EnglishWords.include?(Utils.replace_quote(word))
|
25
28
|
|
26
29
|
return if token.capital? && proper_noun?(word)
|
@@ -38,5 +41,25 @@ module Spellchecker
|
|
38
41
|
Dictionaries::CompanyNames.include?(word) ||
|
39
42
|
Dictionaries::UsToponyms.include?(word)
|
40
43
|
end
|
44
|
+
|
45
|
+
# @param token [Spellchecker::Tokenizer::Token]
|
46
|
+
# @return [Boolean]
|
47
|
+
def abbreviation?(token)
|
48
|
+
return true if ABBREVIATION_REGEXP.match?(token.text)
|
49
|
+
return true if token.text.length <= ABBREVIATION_LENGTH &&
|
50
|
+
!token.prev.word? && !token.next.word?
|
51
|
+
|
52
|
+
false
|
53
|
+
end
|
54
|
+
|
55
|
+
# @param token [Spellchecker::Tokenizer::Token]
|
56
|
+
# @return [Boolean]
|
57
|
+
def shortening?(token)
|
58
|
+
return true if token.text == NUMBER_SHORTENING_SUFFIX && token.prev.digit?
|
59
|
+
return true if SHORTENINGS.include?(token.downcased) &&
|
60
|
+
(token.next.dot? || token.next.digit?)
|
61
|
+
|
62
|
+
false
|
63
|
+
end
|
41
64
|
end
|
42
65
|
end
|
@@ -16,7 +16,7 @@ module Spellchecker
|
|
16
16
|
SIMPLE_POST = ['!', '?', ',', ':', ';', '.'].freeze
|
17
17
|
PAIR_PRE = ['(', '{', '[', '<', '«', '„', '‘'].freeze
|
18
18
|
PAIR_POST = [')', '}', ']', '>', '»', '“', '’'].freeze
|
19
|
-
PRE_N_POST = ['"', "'", '`'].freeze
|
19
|
+
PRE_N_POST = ['"', "'", '`', '*'].freeze
|
20
20
|
|
21
21
|
SPLITTABLES = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
|
22
22
|
|
@@ -43,6 +43,21 @@ module Spellchecker
|
|
43
43
|
@capital ||= text.match?(/\A[A-Z]/)
|
44
44
|
end
|
45
45
|
|
46
|
+
# @return [Boolean]
|
47
|
+
def word?
|
48
|
+
@word ||= text.length > 1 || text.match?(/\w/)
|
49
|
+
end
|
50
|
+
|
51
|
+
# @return [Boolean]
|
52
|
+
def digit?
|
53
|
+
@digit ||= text.match?(/\A\d+\z/)
|
54
|
+
end
|
55
|
+
|
56
|
+
# @return [Boolean]
|
57
|
+
def dot?
|
58
|
+
@dot ||= text == Tokenizer::DOT
|
59
|
+
end
|
60
|
+
|
46
61
|
# @return [String]
|
47
62
|
def downcased
|
48
63
|
@downcased ||= text.downcase
|
data/lib/spellchecker/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-spellchecker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pete Matsyburka
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|