httpspell 1.2.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +14 -0
- data/.gitignore +2 -0
- data/.mergify.yml +8 -0
- data/.rubocop.yml +4 -9
- data/.ruby-version +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +142 -94
- data/Guardfile +25 -0
- data/README.markdown +23 -2
- data/Rakefile +1 -1
- data/TODO.markdown +5 -0
- data/exe/httpspell +24 -13
- data/httpspell.gemspec +3 -16
- data/lib/httpspell/spellchecker.rb +3 -3
- data/lib/httpspell/spider.rb +21 -7
- data/lib/httpspell/version.rb +1 -1
- metadata +16 -166
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e6245dacfbe0f2b1f7a95e95118a058e592f0fcbe88407b804e756bd41691054
|
|
4
|
+
data.tar.gz: 731ddea385cdca5f40c14c10289583b1544b9055bd82015ce2c5beb3b9495b52
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2295d29b287812f6f1330d4cc17a5eae9ab3a14e99bb9ab4d4215761d3146a3f27b8ddcb66331999156c68a1ac58e2ac33b47d7043b7a2771e56faca03eb7b62
|
|
7
|
+
data.tar.gz: 6cbab0f82ae37c684a6bf4ad60e30a7b5a9921e5d90956bd4baa088e4190c9e8b2b8359e41c457dc6d594c962e43b13e3483046b0208f5bb5273d0928e74950f
|
data/.gitignore
CHANGED
data/.mergify.yml
ADDED
data/.rubocop.yml
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
AllCops:
|
|
2
|
-
|
|
2
|
+
NewCops: enable
|
|
3
|
+
TargetRubyVersion: 3.3
|
|
3
4
|
Include:
|
|
4
5
|
- '**/Gemfile'
|
|
5
6
|
- '**/Rakefile'
|
|
@@ -8,23 +9,17 @@ AllCops:
|
|
|
8
9
|
Exclude:
|
|
9
10
|
- vendor/**/*
|
|
10
11
|
- db/migrations/**/*
|
|
11
|
-
|
|
12
12
|
DisplayCopNames:
|
|
13
13
|
Enabled: true
|
|
14
|
-
|
|
15
14
|
DisplayStyleGuide:
|
|
16
15
|
Enabled: true
|
|
17
|
-
|
|
18
16
|
Naming/FileName:
|
|
19
17
|
Exclude:
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
- Guardfile
|
|
22
19
|
Metrics/BlockLength:
|
|
23
20
|
Exclude:
|
|
24
21
|
- spec/**/*
|
|
25
|
-
|
|
26
|
-
Metrics/LineLength:
|
|
22
|
+
Layout/LineLength:
|
|
27
23
|
Max: 160
|
|
28
|
-
|
|
29
24
|
Style/Documentation:
|
|
30
25
|
Enabled: false
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby-3.3.1
|
data/Gemfile
CHANGED
|
@@ -2,3 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
gemspec
|
|
5
|
+
|
|
6
|
+
group :development do
|
|
7
|
+
gem 'aruba'
|
|
8
|
+
gem 'bundler'
|
|
9
|
+
gem 'guard'
|
|
10
|
+
gem 'guard-bundler'
|
|
11
|
+
gem 'guard-rspec'
|
|
12
|
+
gem 'httpx'
|
|
13
|
+
gem 'pry'
|
|
14
|
+
gem 'pry-byebug'
|
|
15
|
+
gem 'rake'
|
|
16
|
+
gem 'rspec'
|
|
17
|
+
gem 'rubocop'
|
|
18
|
+
gem 'rubocop-rake'
|
|
19
|
+
gem 'rubocop-rspec'
|
|
20
|
+
gem 'stub_server'
|
|
21
|
+
end
|
data/Gemfile.lock
CHANGED
|
@@ -1,61 +1,66 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
httpspell (1.2.1)
|
|
4
|
+
httpspell (1.4.0)
|
|
5
5
|
addressable
|
|
6
6
|
nokogiri
|
|
7
7
|
|
|
8
8
|
GEM
|
|
9
9
|
remote: https://rubygems.org/
|
|
10
10
|
specs:
|
|
11
|
-
addressable (2.
|
|
12
|
-
public_suffix (>= 2.0.2, <
|
|
13
|
-
aruba (
|
|
14
|
-
|
|
15
|
-
contracts (
|
|
16
|
-
cucumber (>=
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
cucumber-
|
|
31
|
-
cucumber-
|
|
32
|
-
cucumber-
|
|
33
|
-
diff-lcs (~> 1.
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
cucumber-
|
|
38
|
-
|
|
39
|
-
cucumber-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
cucumber-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
11
|
+
addressable (2.8.6)
|
|
12
|
+
public_suffix (>= 2.0.2, < 6.0)
|
|
13
|
+
aruba (2.2.0)
|
|
14
|
+
bundler (>= 1.17, < 3.0)
|
|
15
|
+
contracts (>= 0.16.0, < 0.18.0)
|
|
16
|
+
cucumber (>= 8.0, < 10.0)
|
|
17
|
+
rspec-expectations (~> 3.4)
|
|
18
|
+
thor (~> 1.0)
|
|
19
|
+
ast (2.4.2)
|
|
20
|
+
bigdecimal (3.1.8)
|
|
21
|
+
builder (3.2.4)
|
|
22
|
+
byebug (11.1.3)
|
|
23
|
+
coderay (1.1.3)
|
|
24
|
+
contracts (0.17)
|
|
25
|
+
cucumber (9.2.0)
|
|
26
|
+
builder (~> 3.2)
|
|
27
|
+
cucumber-ci-environment (> 9, < 11)
|
|
28
|
+
cucumber-core (> 13, < 14)
|
|
29
|
+
cucumber-cucumber-expressions (~> 17.0)
|
|
30
|
+
cucumber-gherkin (> 24, < 28)
|
|
31
|
+
cucumber-html-formatter (> 20.3, < 22)
|
|
32
|
+
cucumber-messages (> 19, < 25)
|
|
33
|
+
diff-lcs (~> 1.5)
|
|
34
|
+
mini_mime (~> 1.1)
|
|
35
|
+
multi_test (~> 1.1)
|
|
36
|
+
sys-uname (~> 1.2)
|
|
37
|
+
cucumber-ci-environment (10.0.1)
|
|
38
|
+
cucumber-core (13.0.2)
|
|
39
|
+
cucumber-gherkin (>= 27, < 28)
|
|
40
|
+
cucumber-messages (>= 20, < 23)
|
|
41
|
+
cucumber-tag-expressions (> 5, < 7)
|
|
42
|
+
cucumber-cucumber-expressions (17.1.0)
|
|
43
|
+
bigdecimal
|
|
44
|
+
cucumber-gherkin (27.0.0)
|
|
45
|
+
cucumber-messages (>= 19.1.4, < 23)
|
|
46
|
+
cucumber-html-formatter (21.3.1)
|
|
47
|
+
cucumber-messages (> 19, < 25)
|
|
48
|
+
cucumber-messages (22.0.0)
|
|
49
|
+
cucumber-tag-expressions (6.1.0)
|
|
50
|
+
diff-lcs (1.5.1)
|
|
51
|
+
ffi (1.16.3)
|
|
52
|
+
formatador (1.1.0)
|
|
53
|
+
guard (2.18.1)
|
|
49
54
|
formatador (>= 0.2.4)
|
|
50
55
|
listen (>= 2.7, < 4.0)
|
|
51
56
|
lumberjack (>= 1.0.12, < 2.0)
|
|
52
57
|
nenv (~> 0.1)
|
|
53
58
|
notiffany (~> 0.0)
|
|
54
|
-
pry (>= 0.
|
|
59
|
+
pry (>= 0.13.0)
|
|
55
60
|
shellany (~> 0.0)
|
|
56
61
|
thor (>= 0.18.1)
|
|
57
|
-
guard-bundler (
|
|
58
|
-
bundler (
|
|
62
|
+
guard-bundler (3.0.1)
|
|
63
|
+
bundler (>= 2.1, < 3)
|
|
59
64
|
guard (~> 2.2)
|
|
60
65
|
guard-compat (~> 1.1)
|
|
61
66
|
guard-compat (1.2.1)
|
|
@@ -63,68 +68,108 @@ GEM
|
|
|
63
68
|
guard (~> 2.1)
|
|
64
69
|
guard-compat (~> 1.1)
|
|
65
70
|
rspec (>= 2.99.0, < 4.0)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
71
|
+
http-2-next (1.0.3)
|
|
72
|
+
httpx (1.2.5)
|
|
73
|
+
http-2-next (>= 1.0.3)
|
|
74
|
+
json (2.7.2)
|
|
75
|
+
language_server-protocol (3.17.0.3)
|
|
76
|
+
listen (3.9.0)
|
|
77
|
+
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
78
|
+
rb-inotify (~> 0.9, >= 0.9.10)
|
|
79
|
+
lumberjack (1.2.10)
|
|
80
|
+
method_source (1.1.0)
|
|
81
|
+
mini_mime (1.1.5)
|
|
82
|
+
multi_test (1.1.0)
|
|
75
83
|
nenv (0.3.0)
|
|
76
|
-
nokogiri (1.
|
|
77
|
-
|
|
78
|
-
|
|
84
|
+
nokogiri (1.16.5-arm64-darwin)
|
|
85
|
+
racc (~> 1.4)
|
|
86
|
+
nokogiri (1.16.5-x86_64-darwin)
|
|
87
|
+
racc (~> 1.4)
|
|
88
|
+
nokogiri (1.16.5-x86_64-linux)
|
|
89
|
+
racc (~> 1.4)
|
|
90
|
+
notiffany (0.1.3)
|
|
79
91
|
nenv (~> 0.1)
|
|
80
92
|
shellany (~> 0.0)
|
|
81
|
-
parallel (1.
|
|
82
|
-
parser (
|
|
83
|
-
ast (~> 2.4.
|
|
84
|
-
|
|
85
|
-
pry (0.
|
|
86
|
-
coderay (~> 1.1
|
|
87
|
-
method_source (~>
|
|
88
|
-
pry-byebug (3.
|
|
89
|
-
byebug (~>
|
|
90
|
-
pry (
|
|
91
|
-
public_suffix (
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
93
|
+
parallel (1.24.0)
|
|
94
|
+
parser (3.3.1.0)
|
|
95
|
+
ast (~> 2.4.1)
|
|
96
|
+
racc
|
|
97
|
+
pry (0.14.2)
|
|
98
|
+
coderay (~> 1.1)
|
|
99
|
+
method_source (~> 1.0)
|
|
100
|
+
pry-byebug (3.10.1)
|
|
101
|
+
byebug (~> 11.0)
|
|
102
|
+
pry (>= 0.13, < 0.15)
|
|
103
|
+
public_suffix (5.0.5)
|
|
104
|
+
racc (1.8.0)
|
|
105
|
+
rack (3.0.11)
|
|
106
|
+
rackup (0.2.3)
|
|
107
|
+
rack (>= 3.0.0.beta1)
|
|
108
|
+
webrick
|
|
109
|
+
rainbow (3.1.1)
|
|
110
|
+
rake (13.2.1)
|
|
111
|
+
rb-fsevent (0.11.2)
|
|
112
|
+
rb-inotify (0.11.1)
|
|
113
|
+
ffi (~> 1.0)
|
|
114
|
+
regexp_parser (2.9.2)
|
|
115
|
+
rexml (3.2.8)
|
|
116
|
+
strscan (>= 3.0.9)
|
|
117
|
+
rspec (3.13.0)
|
|
118
|
+
rspec-core (~> 3.13.0)
|
|
119
|
+
rspec-expectations (~> 3.13.0)
|
|
120
|
+
rspec-mocks (~> 3.13.0)
|
|
121
|
+
rspec-core (3.13.0)
|
|
122
|
+
rspec-support (~> 3.13.0)
|
|
123
|
+
rspec-expectations (3.13.0)
|
|
105
124
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
106
|
-
rspec-support (~> 3.
|
|
107
|
-
rspec-mocks (3.
|
|
125
|
+
rspec-support (~> 3.13.0)
|
|
126
|
+
rspec-mocks (3.13.1)
|
|
108
127
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
109
|
-
rspec-support (~> 3.
|
|
110
|
-
rspec-support (3.
|
|
111
|
-
rubocop (
|
|
128
|
+
rspec-support (~> 3.13.0)
|
|
129
|
+
rspec-support (3.13.1)
|
|
130
|
+
rubocop (1.64.0)
|
|
131
|
+
json (~> 2.3)
|
|
132
|
+
language_server-protocol (>= 3.17.0)
|
|
112
133
|
parallel (~> 1.10)
|
|
113
|
-
parser (>= 2
|
|
114
|
-
powerpack (~> 0.1)
|
|
134
|
+
parser (>= 3.3.0.2)
|
|
115
135
|
rainbow (>= 2.2.2, < 4.0)
|
|
136
|
+
regexp_parser (>= 1.8, < 3.0)
|
|
137
|
+
rexml (>= 3.2.5, < 4.0)
|
|
138
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
|
116
139
|
ruby-progressbar (~> 1.7)
|
|
117
|
-
unicode-display_width (
|
|
118
|
-
|
|
119
|
-
|
|
140
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
|
141
|
+
rubocop-ast (1.31.3)
|
|
142
|
+
parser (>= 3.3.1.0)
|
|
143
|
+
rubocop-capybara (2.20.0)
|
|
144
|
+
rubocop (~> 1.41)
|
|
145
|
+
rubocop-factory_bot (2.25.1)
|
|
146
|
+
rubocop (~> 1.41)
|
|
147
|
+
rubocop-rake (0.6.0)
|
|
148
|
+
rubocop (~> 1.0)
|
|
149
|
+
rubocop-rspec (2.29.2)
|
|
150
|
+
rubocop (~> 1.40)
|
|
151
|
+
rubocop-capybara (~> 2.17)
|
|
152
|
+
rubocop-factory_bot (~> 2.22)
|
|
153
|
+
rubocop-rspec_rails (~> 2.28)
|
|
154
|
+
rubocop-rspec_rails (2.28.3)
|
|
155
|
+
rubocop (~> 1.40)
|
|
156
|
+
ruby-progressbar (1.13.0)
|
|
120
157
|
shellany (0.0.1)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
158
|
+
strscan (3.1.0)
|
|
159
|
+
stub_server (0.7.0)
|
|
160
|
+
rackup (~> 0.2.2)
|
|
161
|
+
webrick
|
|
162
|
+
sys-uname (1.2.3)
|
|
163
|
+
ffi (~> 1.1)
|
|
164
|
+
thor (1.3.1)
|
|
165
|
+
unicode-display_width (2.5.0)
|
|
166
|
+
webrick (1.8.1)
|
|
125
167
|
|
|
126
168
|
PLATFORMS
|
|
127
|
-
|
|
169
|
+
arm64-darwin-22
|
|
170
|
+
arm64-darwin-23
|
|
171
|
+
x86_64-darwin-21
|
|
172
|
+
x86_64-linux
|
|
128
173
|
|
|
129
174
|
DEPENDENCIES
|
|
130
175
|
aruba
|
|
@@ -133,12 +178,15 @@ DEPENDENCIES
|
|
|
133
178
|
guard-bundler
|
|
134
179
|
guard-rspec
|
|
135
180
|
httpspell!
|
|
181
|
+
httpx
|
|
136
182
|
pry
|
|
137
183
|
pry-byebug
|
|
138
184
|
rake
|
|
139
185
|
rspec
|
|
140
186
|
rubocop
|
|
187
|
+
rubocop-rake
|
|
188
|
+
rubocop-rspec
|
|
141
189
|
stub_server
|
|
142
190
|
|
|
143
191
|
BUNDLED WITH
|
|
144
|
-
|
|
192
|
+
2.5.9
|
data/Guardfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
guard :bundler do
|
|
2
|
+
require 'guard/bundler'
|
|
3
|
+
require 'guard/bundler/verify'
|
|
4
|
+
helper = Guard::Bundler::Verify.new
|
|
5
|
+
|
|
6
|
+
files = ['Gemfile']
|
|
7
|
+
files += Dir['*.gemspec'] if files.any? { |f| helper.uses_gemspec?(f) }
|
|
8
|
+
|
|
9
|
+
# Assume files are symlinked from somewhere
|
|
10
|
+
files.each { |file| watch(helper.real_path(file)) }
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
guard :rspec, cmd: "bundle exec rspec" do
|
|
14
|
+
require "guard/rspec/dsl"
|
|
15
|
+
dsl = Guard::RSpec::Dsl.new(self)
|
|
16
|
+
|
|
17
|
+
rspec = dsl.rspec
|
|
18
|
+
watch(rspec.spec_helper) { rspec.spec_dir }
|
|
19
|
+
watch(rspec.spec_support) { rspec.spec_dir }
|
|
20
|
+
watch(rspec.spec_files)
|
|
21
|
+
|
|
22
|
+
# Ruby files
|
|
23
|
+
ruby = dsl.ruby
|
|
24
|
+
dsl.watch_spec_files_for(ruby.lib_files)
|
|
25
|
+
end
|
data/README.markdown
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# `httpspell`
|
|
2
2
|
|
|
3
|
-
[![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
|
|
4
|
-
|
|
5
3
|
This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
|
|
6
4
|
|
|
7
5
|
Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
|
|
@@ -46,3 +44,26 @@ If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='f
|
|
|
46
44
|
```
|
|
47
45
|
{: spellcheck="false"}
|
|
48
46
|
```
|
|
47
|
+
|
|
48
|
+
# Dictionaries
|
|
49
|
+
|
|
50
|
+
Hunspell uses the system dictionary paths; on the Mac this is `~/Library/Spelling/`. Get some dictionaries as explained in the [hunspell](https://github.com/hunspell/hunspell) project:
|
|
51
|
+
|
|
52
|
+
```command
|
|
53
|
+
$ wget -O ~/Library/Spelling/en_US.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.aff
|
|
54
|
+
$ wget -O ~/Library/Spelling/en_US.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.dic
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
German:
|
|
58
|
+
|
|
59
|
+
```command
|
|
60
|
+
$ wget -O ~/Library/Spelling/de_DE.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/de/de_DE_frami.dic
|
|
61
|
+
$ wget -O ~/Library/Spelling/de_DE.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/de/de_DE_frami.aff
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Italian (for integration tests):
|
|
65
|
+
|
|
66
|
+
```command
|
|
67
|
+
$ wget -O ~/Library/Spelling/it_IT.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/it_IT/it_IT.dic
|
|
68
|
+
$ wget -O ~/Library/Spelling/it_IT.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/it_IT/it_IT.aff
|
|
69
|
+
```
|
data/Rakefile
CHANGED
data/TODO.markdown
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
* Bail out if lang cannot be inferred and is not given on cmdline
|
|
2
|
+
* exe/httpspell: # TODO: --recursive, defaults to false
|
|
3
|
+
* exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
|
|
4
|
+
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
|
|
5
|
+
* lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
|
data/exe/httpspell
CHANGED
|
@@ -59,27 +59,38 @@ if ARGV.size != 1
|
|
|
59
59
|
exit 1
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
-
|
|
62
|
+
def check(doc, lang, personal_dictionary_path, verbose)
|
|
63
|
+
unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
|
|
64
|
+
|
|
65
|
+
if unknown_words.empty?
|
|
66
|
+
warn 'No unknown words.' if verbose
|
|
67
|
+
else
|
|
68
|
+
warn "#{unknown_words.size} unknown words:" if verbose
|
|
69
|
+
puts unknown_words
|
|
70
|
+
true
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
63
74
|
has_unknown_words = false
|
|
64
75
|
|
|
65
|
-
spider_success = HttpSpell::Spider.new(ARGV.first, whitelist
|
|
66
|
-
lang = force_language || doc.root['lang'] || ENV
|
|
76
|
+
spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
|
|
77
|
+
lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
|
|
78
|
+
warn "Checking #{url} as #{lang}" if verbose
|
|
67
79
|
|
|
68
|
-
# Remove
|
|
80
|
+
# Remove elements that are not to be spellchecked
|
|
69
81
|
doc.css('pre').each(&:unlink)
|
|
70
82
|
doc.css('code').each(&:unlink)
|
|
71
83
|
doc.css('[spellcheck=false]').each(&:unlink)
|
|
72
84
|
|
|
73
|
-
#
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
else
|
|
79
|
-
warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
|
|
80
|
-
puts unknown_words
|
|
81
|
-
has_unknown_words = true
|
|
85
|
+
# Handle elements with a different lang attribute separately
|
|
86
|
+
doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
|
|
87
|
+
warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
|
|
88
|
+
has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
|
|
89
|
+
element.unlink
|
|
82
90
|
end
|
|
91
|
+
|
|
92
|
+
# Everything else
|
|
93
|
+
has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
|
|
83
94
|
end
|
|
84
95
|
|
|
85
96
|
exit 2 unless spider_success
|
data/httpspell.gemspec
CHANGED
|
@@ -4,7 +4,6 @@ lib = File.expand_path('lib', __dir__)
|
|
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
require 'httpspell/version'
|
|
6
6
|
|
|
7
|
-
# rubocop:disable Metrics/BlockLength
|
|
8
7
|
Gem::Specification.new do |spec|
|
|
9
8
|
spec.name = 'httpspell'
|
|
10
9
|
spec.version = HttpSpell::VERSION
|
|
@@ -13,8 +12,8 @@ Gem::Specification.new do |spec|
|
|
|
13
12
|
|
|
14
13
|
spec.summary = 'HTTP spellchecker'
|
|
15
14
|
spec.description = %(httpspell is a spellchecker that recursively fetches
|
|
16
|
-
|
|
17
|
-
|
|
15
|
+
HTML pages, converts them to plain text using pandoc, and
|
|
16
|
+
spellchecks them with hunspell.)
|
|
18
17
|
spec.license = 'MIT'
|
|
19
18
|
|
|
20
19
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
|
@@ -26,17 +25,5 @@ Gem::Specification.new do |spec|
|
|
|
26
25
|
|
|
27
26
|
spec.add_dependency 'addressable'
|
|
28
27
|
spec.add_dependency 'nokogiri'
|
|
29
|
-
|
|
30
|
-
spec.add_development_dependency 'aruba'
|
|
31
|
-
spec.add_development_dependency 'bundler'
|
|
32
|
-
spec.add_development_dependency 'guard'
|
|
33
|
-
spec.add_development_dependency 'guard-bundler'
|
|
34
|
-
spec.add_development_dependency 'guard-rspec'
|
|
35
|
-
spec.add_development_dependency 'pry'
|
|
36
|
-
spec.add_development_dependency 'pry-byebug'
|
|
37
|
-
spec.add_development_dependency 'rake'
|
|
38
|
-
spec.add_development_dependency 'rspec'
|
|
39
|
-
spec.add_development_dependency 'rubocop'
|
|
40
|
-
spec.add_development_dependency 'stub_server'
|
|
28
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
41
29
|
end
|
|
42
|
-
# rubocop:enable Metrics/BlockLength
|
data/lib/httpspell/spellchecker.rb
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module HttpSpell
|
|
2
2
|
class SpellChecker
|
|
3
|
-
def initialize(personal_dictionary_path = nil,
|
|
3
|
+
def initialize(personal_dictionary_path = nil, verbose: false)
|
|
4
4
|
@personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
|
|
5
|
-
@
|
|
5
|
+
@verbose = verbose
|
|
6
6
|
end
|
|
7
7
|
|
|
8
8
|
def check(doc, lang)
|
|
@@ -11,7 +11,7 @@ module HttpSpell
|
|
|
11
11
|
"hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
|
|
12
12
|
]
|
|
13
13
|
|
|
14
|
-
if @
|
|
14
|
+
if @verbose
|
|
15
15
|
warn "Piping the HTML document into the following chain of commands:"
|
|
16
16
|
warn commands
|
|
17
17
|
end
|
data/lib/httpspell/spider.rb
CHANGED
|
@@ -8,12 +8,13 @@ module HttpSpell
|
|
|
8
8
|
class Spider
|
|
9
9
|
attr_reader :todo, :done
|
|
10
10
|
|
|
11
|
-
def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
|
|
11
|
+
def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
|
|
12
12
|
@todo = []
|
|
13
13
|
@done = []
|
|
14
14
|
todo << Addressable::URI.parse(starting_point)
|
|
15
15
|
@whitelist = whitelist || [/^#{starting_point}/]
|
|
16
16
|
@blacklist = blacklist
|
|
17
|
+
@verbose = verbose
|
|
17
18
|
@tracing = tracing
|
|
18
19
|
end
|
|
19
20
|
|
|
@@ -46,10 +47,10 @@ module HttpSpell
|
|
|
46
47
|
private
|
|
47
48
|
|
|
48
49
|
def links(uri)
|
|
49
|
-
response =
|
|
50
|
+
response = http_get(uri)
|
|
50
51
|
|
|
51
|
-
if response.content_type != 'text/html'
|
|
52
|
-
warn "Skipping #{uri} because it is not HTML" if @tracing
|
|
52
|
+
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
|
53
|
+
warn "Skipping #{uri} because it is not HTML" if @verbose
|
|
53
54
|
return []
|
|
54
55
|
end
|
|
55
56
|
|
|
@@ -60,13 +61,13 @@ module HttpSpell
|
|
|
60
61
|
link = uri.join(link) if link.relative?
|
|
61
62
|
|
|
62
63
|
if @whitelist.none? { |re| re.match?(link.to_s) }
|
|
63
|
-
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
|
|
64
|
+
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
|
|
64
65
|
next
|
|
65
66
|
end
|
|
66
67
|
|
|
67
68
|
if @blacklist.any? { |re| re.match?(link.to_s) }
|
|
68
69
|
# TODO Print _which_ entry of the blacklist matches
|
|
69
|
-
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
|
|
70
|
+
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
|
|
70
71
|
next
|
|
71
72
|
end
|
|
72
73
|
|
|
@@ -79,8 +80,21 @@ module HttpSpell
|
|
|
79
80
|
|
|
80
81
|
yield uri, doc if block_given?
|
|
81
82
|
|
|
82
|
-
warn "Adding #{links.size} links from #{uri}" if @tracing
|
|
83
|
+
warn "Adding #{links.size} links from #{uri}" if @verbose
|
|
83
84
|
links
|
|
84
85
|
end
|
|
86
|
+
|
|
87
|
+
# https://twin.github.io/improving-open-uri/
|
|
88
|
+
def http_get(uri)
|
|
89
|
+
tries = 10
|
|
90
|
+
|
|
91
|
+
begin
|
|
92
|
+
URI.open(uri, redirect: false)
|
|
93
|
+
rescue OpenURI::HTTPRedirect => redirect
|
|
94
|
+
uri = redirect.uri
|
|
95
|
+
retry if (tries -= 1) > 0
|
|
96
|
+
raise
|
|
97
|
+
end
|
|
98
|
+
end
|
|
85
99
|
end
|
|
86
100
|
end
|
data/lib/httpspell/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: httpspell
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.2.1
|
|
4
|
+
version: 1.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steffen Uhlig
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-05-29 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: addressable
|
|
@@ -38,164 +38,10 @@ dependencies:
|
|
|
38
38
|
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0'
|
|
41
|
-
- !ruby/object:Gem::Dependency
|
|
42
|
-
name: aruba
|
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
|
44
|
-
requirements:
|
|
45
|
-
- - ">="
|
|
46
|
-
- !ruby/object:Gem::Version
|
|
47
|
-
version: '0'
|
|
48
|
-
type: :development
|
|
49
|
-
prerelease: false
|
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
-
requirements:
|
|
52
|
-
- - ">="
|
|
53
|
-
- !ruby/object:Gem::Version
|
|
54
|
-
version: '0'
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: bundler
|
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
|
58
|
-
requirements:
|
|
59
|
-
- - ">="
|
|
60
|
-
- !ruby/object:Gem::Version
|
|
61
|
-
version: '0'
|
|
62
|
-
type: :development
|
|
63
|
-
prerelease: false
|
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
-
requirements:
|
|
66
|
-
- - ">="
|
|
67
|
-
- !ruby/object:Gem::Version
|
|
68
|
-
version: '0'
|
|
69
|
-
- !ruby/object:Gem::Dependency
|
|
70
|
-
name: guard
|
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
|
72
|
-
requirements:
|
|
73
|
-
- - ">="
|
|
74
|
-
- !ruby/object:Gem::Version
|
|
75
|
-
version: '0'
|
|
76
|
-
type: :development
|
|
77
|
-
prerelease: false
|
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
-
requirements:
|
|
80
|
-
- - ">="
|
|
81
|
-
- !ruby/object:Gem::Version
|
|
82
|
-
version: '0'
|
|
83
|
-
- !ruby/object:Gem::Dependency
|
|
84
|
-
name: guard-bundler
|
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
|
86
|
-
requirements:
|
|
87
|
-
- - ">="
|
|
88
|
-
- !ruby/object:Gem::Version
|
|
89
|
-
version: '0'
|
|
90
|
-
type: :development
|
|
91
|
-
prerelease: false
|
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
-
requirements:
|
|
94
|
-
- - ">="
|
|
95
|
-
- !ruby/object:Gem::Version
|
|
96
|
-
version: '0'
|
|
97
|
-
- !ruby/object:Gem::Dependency
|
|
98
|
-
name: guard-rspec
|
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
|
100
|
-
requirements:
|
|
101
|
-
- - ">="
|
|
102
|
-
- !ruby/object:Gem::Version
|
|
103
|
-
version: '0'
|
|
104
|
-
type: :development
|
|
105
|
-
prerelease: false
|
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
-
requirements:
|
|
108
|
-
- - ">="
|
|
109
|
-
- !ruby/object:Gem::Version
|
|
110
|
-
version: '0'
|
|
111
|
-
- !ruby/object:Gem::Dependency
|
|
112
|
-
name: pry
|
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
|
114
|
-
requirements:
|
|
115
|
-
- - ">="
|
|
116
|
-
- !ruby/object:Gem::Version
|
|
117
|
-
version: '0'
|
|
118
|
-
type: :development
|
|
119
|
-
prerelease: false
|
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
-
requirements:
|
|
122
|
-
- - ">="
|
|
123
|
-
- !ruby/object:Gem::Version
|
|
124
|
-
version: '0'
|
|
125
|
-
- !ruby/object:Gem::Dependency
|
|
126
|
-
name: pry-byebug
|
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
|
128
|
-
requirements:
|
|
129
|
-
- - ">="
|
|
130
|
-
- !ruby/object:Gem::Version
|
|
131
|
-
version: '0'
|
|
132
|
-
type: :development
|
|
133
|
-
prerelease: false
|
|
134
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
-
requirements:
|
|
136
|
-
- - ">="
|
|
137
|
-
- !ruby/object:Gem::Version
|
|
138
|
-
version: '0'
|
|
139
|
-
- !ruby/object:Gem::Dependency
|
|
140
|
-
name: rake
|
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
|
142
|
-
requirements:
|
|
143
|
-
- - ">="
|
|
144
|
-
- !ruby/object:Gem::Version
|
|
145
|
-
version: '0'
|
|
146
|
-
type: :development
|
|
147
|
-
prerelease: false
|
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
-
requirements:
|
|
150
|
-
- - ">="
|
|
151
|
-
- !ruby/object:Gem::Version
|
|
152
|
-
version: '0'
|
|
153
|
-
- !ruby/object:Gem::Dependency
|
|
154
|
-
name: rspec
|
|
155
|
-
requirement: !ruby/object:Gem::Requirement
|
|
156
|
-
requirements:
|
|
157
|
-
- - ">="
|
|
158
|
-
- !ruby/object:Gem::Version
|
|
159
|
-
version: '0'
|
|
160
|
-
type: :development
|
|
161
|
-
prerelease: false
|
|
162
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
163
|
-
requirements:
|
|
164
|
-
- - ">="
|
|
165
|
-
- !ruby/object:Gem::Version
|
|
166
|
-
version: '0'
|
|
167
|
-
- !ruby/object:Gem::Dependency
|
|
168
|
-
name: rubocop
|
|
169
|
-
requirement: !ruby/object:Gem::Requirement
|
|
170
|
-
requirements:
|
|
171
|
-
- - ">="
|
|
172
|
-
- !ruby/object:Gem::Version
|
|
173
|
-
version: '0'
|
|
174
|
-
type: :development
|
|
175
|
-
prerelease: false
|
|
176
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
177
|
-
requirements:
|
|
178
|
-
- - ">="
|
|
179
|
-
- !ruby/object:Gem::Version
|
|
180
|
-
version: '0'
|
|
181
|
-
- !ruby/object:Gem::Dependency
|
|
182
|
-
name: stub_server
|
|
183
|
-
requirement: !ruby/object:Gem::Requirement
|
|
184
|
-
requirements:
|
|
185
|
-
- - ">="
|
|
186
|
-
- !ruby/object:Gem::Version
|
|
187
|
-
version: '0'
|
|
188
|
-
type: :development
|
|
189
|
-
prerelease: false
|
|
190
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
191
|
-
requirements:
|
|
192
|
-
- - ">="
|
|
193
|
-
- !ruby/object:Gem::Version
|
|
194
|
-
version: '0'
|
|
195
41
|
description: |-
|
|
196
42
|
httpspell is a spellchecker that recursively fetches
|
|
197
|
-
|
|
198
|
-
|
|
43
|
+
HTML pages, converts them to plain text using pandoc, and
|
|
44
|
+
spellchecks them with hunspell.
|
|
199
45
|
email:
|
|
200
46
|
- steffen@familie-uhlig.net
|
|
201
47
|
executables:
|
|
@@ -203,24 +49,29 @@ executables:
|
|
|
203
49
|
extensions: []
|
|
204
50
|
extra_rdoc_files: []
|
|
205
51
|
files:
|
|
52
|
+
- ".github/dependabot.yml"
|
|
206
53
|
- ".gitignore"
|
|
54
|
+
- ".mergify.yml"
|
|
207
55
|
- ".rspec"
|
|
208
56
|
- ".rubocop.yml"
|
|
209
|
-
- ".
|
|
57
|
+
- ".ruby-version"
|
|
210
58
|
- Gemfile
|
|
211
59
|
- Gemfile.lock
|
|
60
|
+
- Guardfile
|
|
212
61
|
- README.markdown
|
|
213
62
|
- Rakefile
|
|
63
|
+
- TODO.markdown
|
|
214
64
|
- exe/httpspell
|
|
215
65
|
- httpspell.gemspec
|
|
216
66
|
- lib/httpspell/spellchecker.rb
|
|
217
67
|
- lib/httpspell/spider.rb
|
|
218
68
|
- lib/httpspell/version.rb
|
|
219
|
-
homepage:
|
|
69
|
+
homepage:
|
|
220
70
|
licenses:
|
|
221
71
|
- MIT
|
|
222
|
-
metadata:
|
|
223
|
-
|
|
72
|
+
metadata:
|
|
73
|
+
rubygems_mfa_required: 'true'
|
|
74
|
+
post_install_message:
|
|
224
75
|
rdoc_options: []
|
|
225
76
|
require_paths:
|
|
226
77
|
- lib
|
|
@@ -235,9 +86,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
235
86
|
- !ruby/object:Gem::Version
|
|
236
87
|
version: '0'
|
|
237
88
|
requirements: []
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
signing_key:
|
|
89
|
+
rubygems_version: 3.5.9
|
|
90
|
+
signing_key:
|
|
241
91
|
specification_version: 4
|
|
242
92
|
summary: HTTP spellchecker
|
|
243
93
|
test_files: []
|