ruby-spellchecker 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/dictionaries/ngrams.csv +0 -6
- data/dictionaries/typos.csv +0 -2
- data/lib/spellchecker/detect_typo.rb +24 -1
- data/lib/spellchecker/dictionaries/ngram_list.rb +1 -1
- data/lib/spellchecker/tokenizer.rb +1 -1
- data/lib/spellchecker/tokenizer/token.rb +15 -0
- data/lib/spellchecker/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef027e0b01226b4df2d1ba0d4e4d63b9b28ee84070689d747f5573e52c94014b
|
4
|
+
data.tar.gz: 5daeaa089531bd755434545304c96a21501b0f2f5975611a9fc32f38cec92ee2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62237078c65b536f9ca986324f532ecf439f511184bf8ad738163b0681bb7f6cd66c783d5b1e63086d15802ea2c1ceb304ea35ddee9449fc6b3592c59b301d85
|
7
|
+
data.tar.gz: ff1bc1100afcdd792dcc600e8c42b696d2703bed837c2ea4fbccf2acdf8a7284bfb4b62341a72a3a675232a48e7131ebf025db4b3d32fb39aa0d7ff6cd629264
|
data/README.md
CHANGED
data/dictionaries/ngrams.csv
CHANGED
@@ -693,7 +693,6 @@ corona virus,coronavirus
|
|
693
693
|
cote chalonnaise,Côte Chalonnaise
|
694
694
|
cote d'argent,Côte d'Argent
|
695
695
|
cote d'azur,Côte d'Azur
|
696
|
-
cote d'ivoire,Côte d'Ivoire
|
697
696
|
cote d'opale,Côte d'Opale
|
698
697
|
cote d'or,Côte d'Or
|
699
698
|
cote d`argent,Côte d`Argent
|
@@ -2012,11 +2011,6 @@ salvador dali,Salvador Dalí
|
|
2012
2011
|
sam elliot,Sam Elliott
|
2013
2012
|
san luis potosi,San Luis Potosí
|
2014
2013
|
sao paolo,São Paulo
|
2015
|
-
sao paulo,São Paulo
|
2016
|
-
sao tome and principe,São Tomé and Príncipe
|
2017
|
-
sao tome and príncipe,São Tomé and Príncipe
|
2018
|
-
sao tomé and principe,São Tomé and Príncipe
|
2019
|
-
sao tomé and príncipe,São Tomé and Príncipe
|
2020
2014
|
sau paolo,São Paulo
|
2021
2015
|
sau paulo,São Paulo
|
2022
2016
|
saudia arabia,saudi Arabia
|
data/dictionaries/typos.csv
CHANGED
@@ -21157,7 +21157,6 @@ complexty,complexity
|
|
21157
21157
|
complexy,complexity
|
21158
21158
|
compliacted,complicate
|
21159
21159
|
compliactions,complication
|
21160
|
-
compliancy,compliance
|
21161
21160
|
complianed,compliance
|
21162
21161
|
complians,complains
|
21163
21162
|
compliants,complaints
|
@@ -89409,7 +89408,6 @@ reey,really
|
|
89409
89408
|
refacted,refactored
|
89410
89409
|
refactor's,refactored
|
89411
89410
|
refactorig,refactoring
|
89412
|
-
refactorings,refactors
|
89413
89411
|
refactorng,refactoring
|
89414
89412
|
refactorsing,refactoring
|
89415
89413
|
refarence,references
|
@@ -6,6 +6,9 @@ module Spellchecker
|
|
6
6
|
ABBREVIATION_REGEXP = /\A(?:[A-Z]{2,4})|(?:[A-Z][a-z])\z/.freeze
|
7
7
|
|
8
8
|
LENGTH_LIMIT = 2
|
9
|
+
ABBREVIATION_LENGTH = 2
|
10
|
+
NUMBER_SHORTENING_SUFFIX = 'th'
|
11
|
+
SHORTENINGS = Set.new(%w[ver]).freeze
|
9
12
|
|
10
13
|
module_function
|
11
14
|
|
@@ -20,7 +23,7 @@ module Spellchecker
|
|
20
23
|
|
21
24
|
return unless correction
|
22
25
|
return if PROPER_NAME_REGEXP.match?(word)
|
23
|
-
return if
|
26
|
+
return if abbreviation?(token) || shortening?(token)
|
24
27
|
return if Dictionaries::EnglishWords.include?(Utils.replace_quote(word))
|
25
28
|
|
26
29
|
return if token.capital? && proper_noun?(word)
|
@@ -38,5 +41,25 @@ module Spellchecker
|
|
38
41
|
Dictionaries::CompanyNames.include?(word) ||
|
39
42
|
Dictionaries::UsToponyms.include?(word)
|
40
43
|
end
|
44
|
+
|
45
|
+
# @param token [Spellchecker::Tokenizer::Token]
|
46
|
+
# @return [Boolean]
|
47
|
+
def abbreviation?(token)
|
48
|
+
return true if ABBREVIATION_REGEXP.match?(token.text)
|
49
|
+
return true if token.text.length <= ABBREVIATION_LENGTH &&
|
50
|
+
!token.prev.word? && !token.next.word?
|
51
|
+
|
52
|
+
false
|
53
|
+
end
|
54
|
+
|
55
|
+
# @param token [Spellchecker::Tokenizer::Token]
|
56
|
+
# @return [Boolean]
|
57
|
+
def shortening?(token)
|
58
|
+
return true if token.text == NUMBER_SHORTENING_SUFFIX && token.prev.digit?
|
59
|
+
return true if SHORTENINGS.include?(token.downcased) &&
|
60
|
+
(token.next.dot? || token.next.digit?)
|
61
|
+
|
62
|
+
false
|
63
|
+
end
|
41
64
|
end
|
42
65
|
end
|
@@ -16,7 +16,7 @@ module Spellchecker
|
|
16
16
|
SIMPLE_POST = ['!', '?', ',', ':', ';', '.'].freeze
|
17
17
|
PAIR_PRE = ['(', '{', '[', '<', '«', '„', '‘'].freeze
|
18
18
|
PAIR_POST = [')', '}', ']', '>', '»', '“', '’'].freeze
|
19
|
-
PRE_N_POST = ['"', "'", '`'].freeze
|
19
|
+
PRE_N_POST = ['"', "'", '`', '*'].freeze
|
20
20
|
|
21
21
|
SPLITTABLES = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
|
22
22
|
|
@@ -43,6 +43,21 @@ module Spellchecker
|
|
43
43
|
@capital ||= text.match?(/\A[A-Z]/)
|
44
44
|
end
|
45
45
|
|
46
|
+
# @return [Boolean]
|
47
|
+
def word?
|
48
|
+
@word ||= text.length > 1 || text.match?(/\w/)
|
49
|
+
end
|
50
|
+
|
51
|
+
# @return [Boolean]
|
52
|
+
def digit?
|
53
|
+
@digit ||= text.match?(/\A\d+\z/)
|
54
|
+
end
|
55
|
+
|
56
|
+
# @return [Boolean]
|
57
|
+
def dot?
|
58
|
+
@dot ||= text == Tokenizer::DOT
|
59
|
+
end
|
60
|
+
|
46
61
|
# @return [String]
|
47
62
|
def downcased
|
48
63
|
@downcased ||= text.downcase
|
data/lib/spellchecker/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-spellchecker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pete Matsyburka
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|