clausewitz-spelling 0.1.16 → 0.1.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -1
- data/clausewitz-spelling.gemspec +1 -0
- data/lib/clausewitz/spelling/checker.rb +35 -24
- data/lib/clausewitz/spelling/version.rb +1 -1
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8fe17c11c32b6260c764e55e2bd4d508e37bdc2f
|
4
|
+
data.tar.gz: 61083f1bc484d2a8b8dbf960b5226b947e17a520
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42e7327da7a213edae016e730acb1177415453e3d87aa9771dda061b36f7fdfd01fe05966c8a35ef1fb4a650097025ad8de2c7c5f9ac6917361059d21ac02d62
|
7
|
+
data.tar.gz: a7199d624932691fdf0827c34bd7bf1cda2ff112817f27026cec2efc1bb03b5973cb8eb5ca2728f4e11f233f5ab02601db6426a9a768c9b2c2e2cc60c6463f8d
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
clausewitz-spelling (0.1.16)
|
4
|
+
clausewitz-spelling (0.1.17)
|
5
5
|
colorize
|
6
6
|
damerau-levenshtein
|
7
7
|
ffi-aspell
|
8
8
|
optimist
|
9
|
+
pragmatic_tokenizer
|
9
10
|
|
10
11
|
GEM
|
11
12
|
remote: https://rubygems.org/
|
@@ -19,6 +20,8 @@ GEM
|
|
19
20
|
ffi
|
20
21
|
method_source (0.9.2)
|
21
22
|
optimist (3.0.0)
|
23
|
+
pragmatic_tokenizer (3.0.7)
|
24
|
+
unicode
|
22
25
|
pry (0.12.2)
|
23
26
|
coderay (~> 1.1.0)
|
24
27
|
method_source (~> 0.9.0)
|
@@ -36,6 +39,7 @@ GEM
|
|
36
39
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
40
|
rspec-support (~> 3.8.0)
|
38
41
|
rspec-support (3.8.0)
|
42
|
+
unicode (0.4.4.4)
|
39
43
|
|
40
44
|
PLATFORMS
|
41
45
|
ruby
|
data/lib/clausewitz/spelling/checker.rb
CHANGED
@@ -5,6 +5,7 @@ require 'set'
|
|
5
5
|
require 'tmpdir'
|
6
6
|
require 'yaml'
|
7
7
|
require 'damerau-levenshtein'
|
8
|
+
require 'pragmatic_tokenizer'
|
8
9
|
require 'clausewitz/localisation'
|
9
10
|
require 'clausewitz/spelling/results'
|
10
11
|
|
@@ -53,6 +54,7 @@ module Clausewitz; module Spelling
|
|
53
54
|
aspell_checker = load_aspell_checker(lang)
|
54
55
|
spellcheck_ignore = entries&.delete('spellcheck_ignore')
|
55
56
|
ignored_keys = spellcheck_ignore ? spellcheck_ignore.split(',') : []
|
57
|
+
ignored_keys << 'spellcheck_ignore'
|
56
58
|
return IgnoredLangResult.new(lang) if ignored_keys.include?('all')
|
57
59
|
return LangResults.new(lang, []) unless entries
|
58
60
|
checks = entries.map do |key, entry|
|
@@ -75,30 +77,39 @@ module Clausewitz; module Spelling
|
|
75
77
|
# Remove other localisation bits we don't care about.
|
76
78
|
entry.gsub!(/§(%|\*|=|\d|W|G|R|B|Y|b|M|g|T|l|H|\+|-|!)/, '')
|
77
79
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
entry.gsub!(/(!|;|\?|"|“|”|…|:|\(|\))/, '')
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
words = entry.split(/\s|—/)
|
88
|
-
words.map! do |word|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
end.join(" ")
|
80
|
+
## We should also remove punctuation that is never part of words, like
|
81
|
+
## exclamation points, commas, semi-colons, and question marks.
|
82
|
+
## We should be using proper apostrophes for possessives in our loc.
|
83
|
+
#entry.gsub!(/(!|;|\?|"|“|”|…|:|\(|\))/, '')
|
84
|
+
|
85
|
+
## If a word has one full stop at the end with no other full stops
|
86
|
+
## elsewhere in the word, it's probably an acronym or initialism like
|
87
|
+
## U.S.A. and so we should avoid stripping it. Otherwise, it's probably
|
88
|
+
## the end of a sentence and can be ignored.
|
89
|
+
#words = entry.split(/\s|—/)
|
90
|
+
#words.map! do |word|
|
91
|
+
# word.sub!(/^'/, '')
|
92
|
+
# word.sub!(/'?,?'?$/, '')
|
93
|
+
|
94
|
+
# if word.end_with?('...')
|
95
|
+
# word.sub(/\.\.\.$/, '')
|
96
|
+
# elsif word =~ /[[:alpha:]]\.$/ && word.chars.count('.') == 1
|
97
|
+
# word.sub(/\.$/, '')
|
98
|
+
# elsif word =~ /\d\.$/ && word.chars.count('.') <= 2
|
99
|
+
# word.sub(/\.$/, '')
|
100
|
+
# else
|
101
|
+
# word
|
102
|
+
# end
|
103
|
+
#end.join(" ")
|
104
|
+
|
105
|
+
opts = {
|
106
|
+
language: :en,
|
107
|
+
punctuation: :none,
|
108
|
+
downcase: false
|
109
|
+
}
|
110
|
+
words = PragmaticTokenizer::Tokenizer.new(opts).tokenize(entry)
|
111
|
+
words = words.map { |word| word.split('—') }.flatten(1)
|
112
|
+
|
102
113
|
|
103
114
|
checks = words.map { |word| check_word(checker, wordlist, word) }.compact
|
104
115
|
EntryResults.new(key, checks)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clausewitz-spelling
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.16
|
4
|
+
version: 0.1.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Chappell
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pragmatic_tokenizer
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
description:
|
126
140
|
email:
|
127
141
|
- wtchappell@gmail.com
|