clausewitz-spelling 0.1.16 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -1
- data/clausewitz-spelling.gemspec +1 -0
- data/lib/clausewitz/spelling/checker.rb +35 -24
- data/lib/clausewitz/spelling/version.rb +1 -1
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8fe17c11c32b6260c764e55e2bd4d508e37bdc2f
|
4
|
+
data.tar.gz: 61083f1bc484d2a8b8dbf960b5226b947e17a520
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42e7327da7a213edae016e730acb1177415453e3d87aa9771dda061b36f7fdfd01fe05966c8a35ef1fb4a650097025ad8de2c7c5f9ac6917361059d21ac02d62
|
7
|
+
data.tar.gz: a7199d624932691fdf0827c34bd7bf1cda2ff112817f27026cec2efc1bb03b5973cb8eb5ca2728f4e11f233f5ab02601db6426a9a768c9b2c2e2cc60c6463f8d
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
clausewitz-spelling (0.1.16)
|
4
|
+
clausewitz-spelling (0.1.17)
|
5
5
|
colorize
|
6
6
|
damerau-levenshtein
|
7
7
|
ffi-aspell
|
8
8
|
optimist
|
9
|
+
pragmatic_tokenizer
|
9
10
|
|
10
11
|
GEM
|
11
12
|
remote: https://rubygems.org/
|
@@ -19,6 +20,8 @@ GEM
|
|
19
20
|
ffi
|
20
21
|
method_source (0.9.2)
|
21
22
|
optimist (3.0.0)
|
23
|
+
pragmatic_tokenizer (3.0.7)
|
24
|
+
unicode
|
22
25
|
pry (0.12.2)
|
23
26
|
coderay (~> 1.1.0)
|
24
27
|
method_source (~> 0.9.0)
|
@@ -36,6 +39,7 @@ GEM
|
|
36
39
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
40
|
rspec-support (~> 3.8.0)
|
38
41
|
rspec-support (3.8.0)
|
42
|
+
unicode (0.4.4.4)
|
39
43
|
|
40
44
|
PLATFORMS
|
41
45
|
ruby
|
data/clausewitz-spelling.gemspec
CHANGED
data/lib/clausewitz/spelling/checker.rb
CHANGED
@@ -5,6 +5,7 @@ require 'set'
|
|
5
5
|
require 'tmpdir'
|
6
6
|
require 'yaml'
|
7
7
|
require 'damerau-levenshtein'
|
8
|
+
require 'pragmatic_tokenizer'
|
8
9
|
require 'clausewitz/localisation'
|
9
10
|
require 'clausewitz/spelling/results'
|
10
11
|
|
@@ -53,6 +54,7 @@ module Clausewitz; module Spelling
|
|
53
54
|
aspell_checker = load_aspell_checker(lang)
|
54
55
|
spellcheck_ignore = entries&.delete('spellcheck_ignore')
|
55
56
|
ignored_keys = spellcheck_ignore ? spellcheck_ignore.split(',') : []
|
57
|
+
ignored_keys << 'spellcheck_ignore'
|
56
58
|
return IgnoredLangResult.new(lang) if ignored_keys.include?('all')
|
57
59
|
return LangResults.new(lang, []) unless entries
|
58
60
|
checks = entries.map do |key, entry|
|
@@ -75,30 +77,39 @@ module Clausewitz; module Spelling
|
|
75
77
|
# Remove other localisation bits we don't care about.
|
76
78
|
entry.gsub!(/§(%|\*|=|\d|W|G|R|B|Y|b|M|g|T|l|H|\+|-|!)/, '')
|
77
79
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
entry.gsub!(/(!|;|\?|"|“|”|…|:|\(|\))/, '')
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
words = entry.split(/\s|—/)
|
88
|
-
words.map! do |word|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
end.join(" ")
|
80
|
+
## We should also remove punctuation that is never part of words, like
|
81
|
+
## exclamation points, commas, semi-colons, and question marks.
|
82
|
+
## We should be using proper apostrophes for possessives in our loc.
|
83
|
+
#entry.gsub!(/(!|;|\?|"|“|”|…|:|\(|\))/, '')
|
84
|
+
|
85
|
+
## If a word has one full stop at the end with no other full stops
|
86
|
+
## elsewhere in the word, it's probably an acronym or initialism like
|
87
|
+
## U.S.A. and so we should avoid stripping it. Otherwise, it's probably
|
88
|
+
## the end of a sentence and can be ignored.
|
89
|
+
#words = entry.split(/\s|—/)
|
90
|
+
#words.map! do |word|
|
91
|
+
# word.sub!(/^'/, '')
|
92
|
+
# word.sub!(/'?,?'?$/, '')
|
93
|
+
|
94
|
+
# if word.end_with?('...')
|
95
|
+
# word.sub(/\.\.\.$/, '')
|
96
|
+
# elsif word =~ /[[:alpha:]]\.$/ && word.chars.count('.') == 1
|
97
|
+
# word.sub(/\.$/, '')
|
98
|
+
# elsif word =~ /\d\.$/ && word.chars.count('.') <= 2
|
99
|
+
# word.sub(/\.$/, '')
|
100
|
+
# else
|
101
|
+
# word
|
102
|
+
# end
|
103
|
+
#end.join(" ")
|
104
|
+
|
105
|
+
opts = {
|
106
|
+
language: :en,
|
107
|
+
punctuation: :none,
|
108
|
+
downcase: false
|
109
|
+
}
|
110
|
+
words = PragmaticTokenizer::Tokenizer.new(opts).tokenize(entry)
|
111
|
+
words = words.map { |word| word.split('—') }.flatten(1)
|
112
|
+
|
102
113
|
|
103
114
|
checks = words.map { |word| check_word(checker, wordlist, word) }.compact
|
104
115
|
EntryResults.new(key, checks)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clausewitz-spelling
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.16
|
4
|
+
version: 0.1.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Chappell
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pragmatic_tokenizer
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description:
|
126
140
|
email:
|
127
141
|
- wtchappell@gmail.com
|