httpspell 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +14 -0
- data/.gitignore +2 -0
- data/.mergify.yml +8 -0
- data/.rubocop.yml +4 -9
- data/.ruby-version +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +140 -94
- data/Guardfile +25 -0
- data/README.markdown +23 -2
- data/Rakefile +1 -1
- data/TODO.markdown +0 -1
- data/exe/httpspell +24 -13
- data/httpspell.gemspec +3 -16
- data/lib/httpspell/spellchecker.rb +3 -3
- data/lib/httpspell/spider.rb +9 -8
- data/lib/httpspell/version.rb +1 -1
- metadata +15 -165
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6245dacfbe0f2b1f7a95e95118a058e592f0fcbe88407b804e756bd41691054
|
4
|
+
data.tar.gz: 731ddea385cdca5f40c14c10289583b1544b9055bd82015ce2c5beb3b9495b52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2295d29b287812f6f1330d4cc17a5eae9ab3a14e99bb9ab4d4215761d3146a3f27b8ddcb66331999156c68a1ac58e2ac33b47d7043b7a2771e56faca03eb7b62
|
7
|
+
data.tar.gz: 6cbab0f82ae37c684a6bf4ad60e30a7b5a9921e5d90956bd4baa088e4190c9e8b2b8359e41c457dc6d594c962e43b13e3483046b0208f5bb5273d0928e74950f
|
data/.gitignore
CHANGED
data/.mergify.yml
ADDED
data/.rubocop.yml
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
AllCops:
|
2
|
-
|
2
|
+
NewCops: enable
|
3
|
+
TargetRubyVersion: 3.3
|
3
4
|
Include:
|
4
5
|
- '**/Gemfile'
|
5
6
|
- '**/Rakefile'
|
@@ -8,23 +9,17 @@ AllCops:
|
|
8
9
|
Exclude:
|
9
10
|
- vendor/**/*
|
10
11
|
- db/migrations/**/*
|
11
|
-
|
12
12
|
DisplayCopNames:
|
13
13
|
Enabled: true
|
14
|
-
|
15
14
|
DisplayStyleGuide:
|
16
15
|
Enabled: true
|
17
|
-
|
18
16
|
Naming/FileName:
|
19
17
|
Exclude:
|
20
|
-
|
21
|
-
|
18
|
+
- Guardfile
|
22
19
|
Metrics/BlockLength:
|
23
20
|
Exclude:
|
24
21
|
- spec/**/*
|
25
|
-
|
26
|
-
Metrics/LineLength:
|
22
|
+
Layout/LineLength:
|
27
23
|
Max: 160
|
28
|
-
|
29
24
|
Style/Documentation:
|
30
25
|
Enabled: false
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-3.3.1
|
data/Gemfile
CHANGED
@@ -2,3 +2,20 @@
|
|
2
2
|
|
3
3
|
source 'https://rubygems.org'
|
4
4
|
gemspec
|
5
|
+
|
6
|
+
group :development do
|
7
|
+
gem 'aruba'
|
8
|
+
gem 'bundler'
|
9
|
+
gem 'guard'
|
10
|
+
gem 'guard-bundler'
|
11
|
+
gem 'guard-rspec'
|
12
|
+
gem 'httpx'
|
13
|
+
gem 'pry'
|
14
|
+
gem 'pry-byebug'
|
15
|
+
gem 'rake'
|
16
|
+
gem 'rspec'
|
17
|
+
gem 'rubocop'
|
18
|
+
gem 'rubocop-rake'
|
19
|
+
gem 'rubocop-rspec'
|
20
|
+
gem 'stub_server'
|
21
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,61 +1,66 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
httpspell (1.
|
4
|
+
httpspell (1.4.0)
|
5
5
|
addressable
|
6
6
|
nokogiri
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
addressable (2.6
|
12
|
-
public_suffix (>= 2.0.2, <
|
13
|
-
aruba (
|
14
|
-
|
15
|
-
contracts (
|
16
|
-
cucumber (>=
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
cucumber-
|
31
|
-
cucumber-
|
32
|
-
cucumber-
|
33
|
-
diff-lcs (~> 1.
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
cucumber-
|
38
|
-
|
39
|
-
cucumber-
|
40
|
-
|
41
|
-
|
42
|
-
cucumber-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
11
|
+
addressable (2.8.6)
|
12
|
+
public_suffix (>= 2.0.2, < 6.0)
|
13
|
+
aruba (2.2.0)
|
14
|
+
bundler (>= 1.17, < 3.0)
|
15
|
+
contracts (>= 0.16.0, < 0.18.0)
|
16
|
+
cucumber (>= 8.0, < 10.0)
|
17
|
+
rspec-expectations (~> 3.4)
|
18
|
+
thor (~> 1.0)
|
19
|
+
ast (2.4.2)
|
20
|
+
bigdecimal (3.1.8)
|
21
|
+
builder (3.2.4)
|
22
|
+
byebug (11.1.3)
|
23
|
+
coderay (1.1.3)
|
24
|
+
contracts (0.17)
|
25
|
+
cucumber (9.2.0)
|
26
|
+
builder (~> 3.2)
|
27
|
+
cucumber-ci-environment (> 9, < 11)
|
28
|
+
cucumber-core (> 13, < 14)
|
29
|
+
cucumber-cucumber-expressions (~> 17.0)
|
30
|
+
cucumber-gherkin (> 24, < 28)
|
31
|
+
cucumber-html-formatter (> 20.3, < 22)
|
32
|
+
cucumber-messages (> 19, < 25)
|
33
|
+
diff-lcs (~> 1.5)
|
34
|
+
mini_mime (~> 1.1)
|
35
|
+
multi_test (~> 1.1)
|
36
|
+
sys-uname (~> 1.2)
|
37
|
+
cucumber-ci-environment (10.0.1)
|
38
|
+
cucumber-core (13.0.2)
|
39
|
+
cucumber-gherkin (>= 27, < 28)
|
40
|
+
cucumber-messages (>= 20, < 23)
|
41
|
+
cucumber-tag-expressions (> 5, < 7)
|
42
|
+
cucumber-cucumber-expressions (17.1.0)
|
43
|
+
bigdecimal
|
44
|
+
cucumber-gherkin (27.0.0)
|
45
|
+
cucumber-messages (>= 19.1.4, < 23)
|
46
|
+
cucumber-html-formatter (21.3.1)
|
47
|
+
cucumber-messages (> 19, < 25)
|
48
|
+
cucumber-messages (22.0.0)
|
49
|
+
cucumber-tag-expressions (6.1.0)
|
50
|
+
diff-lcs (1.5.1)
|
51
|
+
ffi (1.16.3)
|
52
|
+
formatador (1.1.0)
|
53
|
+
guard (2.18.1)
|
49
54
|
formatador (>= 0.2.4)
|
50
55
|
listen (>= 2.7, < 4.0)
|
51
56
|
lumberjack (>= 1.0.12, < 2.0)
|
52
57
|
nenv (~> 0.1)
|
53
58
|
notiffany (~> 0.0)
|
54
|
-
pry (>= 0.
|
59
|
+
pry (>= 0.13.0)
|
55
60
|
shellany (~> 0.0)
|
56
61
|
thor (>= 0.18.1)
|
57
|
-
guard-bundler (
|
58
|
-
bundler (>= 1
|
62
|
+
guard-bundler (3.0.1)
|
63
|
+
bundler (>= 2.1, < 3)
|
59
64
|
guard (~> 2.2)
|
60
65
|
guard-compat (~> 1.1)
|
61
66
|
guard-compat (1.2.1)
|
@@ -63,70 +68,108 @@ GEM
|
|
63
68
|
guard (~> 2.1)
|
64
69
|
guard-compat (~> 1.1)
|
65
70
|
rspec (>= 2.99.0, < 4.0)
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
71
|
+
http-2-next (1.0.3)
|
72
|
+
httpx (1.2.5)
|
73
|
+
http-2-next (>= 1.0.3)
|
74
|
+
json (2.7.2)
|
75
|
+
language_server-protocol (3.17.0.3)
|
76
|
+
listen (3.9.0)
|
77
|
+
rb-fsevent (~> 0.10, >= 0.10.3)
|
78
|
+
rb-inotify (~> 0.9, >= 0.9.10)
|
79
|
+
lumberjack (1.2.10)
|
80
|
+
method_source (1.1.0)
|
81
|
+
mini_mime (1.1.5)
|
82
|
+
multi_test (1.1.0)
|
76
83
|
nenv (0.3.0)
|
77
|
-
nokogiri (1.
|
78
|
-
|
79
|
-
|
84
|
+
nokogiri (1.16.5-arm64-darwin)
|
85
|
+
racc (~> 1.4)
|
86
|
+
nokogiri (1.16.5-x86_64-darwin)
|
87
|
+
racc (~> 1.4)
|
88
|
+
nokogiri (1.16.5-x86_64-linux)
|
89
|
+
racc (~> 1.4)
|
90
|
+
notiffany (0.1.3)
|
80
91
|
nenv (~> 0.1)
|
81
92
|
shellany (~> 0.0)
|
82
|
-
parallel (1.
|
83
|
-
parser (
|
84
|
-
ast (~> 2.4.
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
93
|
+
parallel (1.24.0)
|
94
|
+
parser (3.3.1.0)
|
95
|
+
ast (~> 2.4.1)
|
96
|
+
racc
|
97
|
+
pry (0.14.2)
|
98
|
+
coderay (~> 1.1)
|
99
|
+
method_source (~> 1.0)
|
100
|
+
pry-byebug (3.10.1)
|
89
101
|
byebug (~> 11.0)
|
90
|
-
pry (
|
91
|
-
|
92
|
-
|
93
|
-
rack (
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
102
|
+
pry (>= 0.13, < 0.15)
|
103
|
+
public_suffix (5.0.5)
|
104
|
+
racc (1.8.0)
|
105
|
+
rack (3.0.11)
|
106
|
+
rackup (0.2.3)
|
107
|
+
rack (>= 3.0.0.beta1)
|
108
|
+
webrick
|
109
|
+
rainbow (3.1.1)
|
110
|
+
rake (13.2.1)
|
111
|
+
rb-fsevent (0.11.2)
|
112
|
+
rb-inotify (0.11.1)
|
98
113
|
ffi (~> 1.0)
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
rspec-
|
105
|
-
|
114
|
+
regexp_parser (2.9.2)
|
115
|
+
rexml (3.2.8)
|
116
|
+
strscan (>= 3.0.9)
|
117
|
+
rspec (3.13.0)
|
118
|
+
rspec-core (~> 3.13.0)
|
119
|
+
rspec-expectations (~> 3.13.0)
|
120
|
+
rspec-mocks (~> 3.13.0)
|
121
|
+
rspec-core (3.13.0)
|
122
|
+
rspec-support (~> 3.13.0)
|
123
|
+
rspec-expectations (3.13.0)
|
106
124
|
diff-lcs (>= 1.2.0, < 2.0)
|
107
|
-
rspec-support (~> 3.
|
108
|
-
rspec-mocks (3.
|
125
|
+
rspec-support (~> 3.13.0)
|
126
|
+
rspec-mocks (3.13.1)
|
109
127
|
diff-lcs (>= 1.2.0, < 2.0)
|
110
|
-
rspec-support (~> 3.
|
111
|
-
rspec-support (3.
|
112
|
-
rubocop (
|
113
|
-
|
128
|
+
rspec-support (~> 3.13.0)
|
129
|
+
rspec-support (3.13.1)
|
130
|
+
rubocop (1.64.0)
|
131
|
+
json (~> 2.3)
|
132
|
+
language_server-protocol (>= 3.17.0)
|
114
133
|
parallel (~> 1.10)
|
115
|
-
parser (>=
|
116
|
-
psych (>= 3.1.0)
|
134
|
+
parser (>= 3.3.0.2)
|
117
135
|
rainbow (>= 2.2.2, < 4.0)
|
136
|
+
regexp_parser (>= 1.8, < 3.0)
|
137
|
+
rexml (>= 3.2.5, < 4.0)
|
138
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
118
139
|
ruby-progressbar (~> 1.7)
|
119
|
-
unicode-display_width (>=
|
120
|
-
|
121
|
-
|
140
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
141
|
+
rubocop-ast (1.31.3)
|
142
|
+
parser (>= 3.3.1.0)
|
143
|
+
rubocop-capybara (2.20.0)
|
144
|
+
rubocop (~> 1.41)
|
145
|
+
rubocop-factory_bot (2.25.1)
|
146
|
+
rubocop (~> 1.41)
|
147
|
+
rubocop-rake (0.6.0)
|
148
|
+
rubocop (~> 1.0)
|
149
|
+
rubocop-rspec (2.29.2)
|
150
|
+
rubocop (~> 1.40)
|
151
|
+
rubocop-capybara (~> 2.17)
|
152
|
+
rubocop-factory_bot (~> 2.22)
|
153
|
+
rubocop-rspec_rails (~> 2.28)
|
154
|
+
rubocop-rspec_rails (2.28.3)
|
155
|
+
rubocop (~> 1.40)
|
156
|
+
ruby-progressbar (1.13.0)
|
122
157
|
shellany (0.0.1)
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
158
|
+
strscan (3.1.0)
|
159
|
+
stub_server (0.7.0)
|
160
|
+
rackup (~> 0.2.2)
|
161
|
+
webrick
|
162
|
+
sys-uname (1.2.3)
|
163
|
+
ffi (~> 1.1)
|
164
|
+
thor (1.3.1)
|
165
|
+
unicode-display_width (2.5.0)
|
166
|
+
webrick (1.8.1)
|
127
167
|
|
128
168
|
PLATFORMS
|
129
|
-
|
169
|
+
arm64-darwin-22
|
170
|
+
arm64-darwin-23
|
171
|
+
x86_64-darwin-21
|
172
|
+
x86_64-linux
|
130
173
|
|
131
174
|
DEPENDENCIES
|
132
175
|
aruba
|
@@ -135,12 +178,15 @@ DEPENDENCIES
|
|
135
178
|
guard-bundler
|
136
179
|
guard-rspec
|
137
180
|
httpspell!
|
181
|
+
httpx
|
138
182
|
pry
|
139
183
|
pry-byebug
|
140
184
|
rake
|
141
185
|
rspec
|
142
186
|
rubocop
|
187
|
+
rubocop-rake
|
188
|
+
rubocop-rspec
|
143
189
|
stub_server
|
144
190
|
|
145
191
|
BUNDLED WITH
|
146
|
-
|
192
|
+
2.5.9
|
data/Guardfile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
guard :bundler do
|
2
|
+
require 'guard/bundler'
|
3
|
+
require 'guard/bundler/verify'
|
4
|
+
helper = Guard::Bundler::Verify.new
|
5
|
+
|
6
|
+
files = ['Gemfile']
|
7
|
+
files += Dir['*.gemspec'] if files.any? { |f| helper.uses_gemspec?(f) }
|
8
|
+
|
9
|
+
# Assume files are symlinked from somewhere
|
10
|
+
files.each { |file| watch(helper.real_path(file)) }
|
11
|
+
end
|
12
|
+
|
13
|
+
guard :rspec, cmd: "bundle exec rspec" do
|
14
|
+
require "guard/rspec/dsl"
|
15
|
+
dsl = Guard::RSpec::Dsl.new(self)
|
16
|
+
|
17
|
+
rspec = dsl.rspec
|
18
|
+
watch(rspec.spec_helper) { rspec.spec_dir }
|
19
|
+
watch(rspec.spec_support) { rspec.spec_dir }
|
20
|
+
watch(rspec.spec_files)
|
21
|
+
|
22
|
+
# Ruby files
|
23
|
+
ruby = dsl.ruby
|
24
|
+
dsl.watch_spec_files_for(ruby.lib_files)
|
25
|
+
end
|
data/README.markdown
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
# `httpspell`
|
2
2
|
|
3
|
-
[](https://travis-ci.org/suhlig/httpspell)
|
4
|
-
|
5
3
|
This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
|
6
4
|
|
7
5
|
Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
|
@@ -46,3 +44,26 @@ If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='f
|
|
46
44
|
```
|
47
45
|
{: spellcheck="false"}
|
48
46
|
```
|
47
|
+
|
48
|
+
# Dictionaries
|
49
|
+
|
50
|
+
Hunspell uses the system dictionary paths; on the Mac this is `~/Library/Spelling/`. Get some dictionaries as explained in the [hunspell](https://github.com/hunspell/hunspell) project:
|
51
|
+
|
52
|
+
```command
|
53
|
+
$ wget -O ~/Library/Spelling/en_US.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.aff
|
54
|
+
$ wget -O ~/Library/Spelling/en_US.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.dic
|
55
|
+
```
|
56
|
+
|
57
|
+
German:
|
58
|
+
|
59
|
+
```command
|
60
|
+
$ wget -O ~/Library/Spelling/de_DE.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/de/de_DE_frami.dic
|
61
|
+
$ wget -O ~/Library/Spelling/de_DE.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/de/de_DE_frami.aff
|
62
|
+
```
|
63
|
+
|
64
|
+
Italian (for integration tests):
|
65
|
+
|
66
|
+
```command
|
67
|
+
$ wget -O ~/Library/Spelling/it_IT.dic https://cgit.freedesktop.org/libreoffice/dictionaries/plain/it_IT/it_IT.dic
|
68
|
+
$ wget -O ~/Library/Spelling/it_IT.aff https://cgit.freedesktop.org/libreoffice/dictionaries/plain/it_IT/it_IT.aff
|
69
|
+
```
|
data/Rakefile
CHANGED
data/TODO.markdown
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
* Bail out if lang cannot be inferred and is not given on cmdline
|
2
2
|
* exe/httpspell: # TODO: --recursive, defaults to false
|
3
3
|
* exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
|
4
|
-
* exe/httpspell: # TODO: Find sections with a lang attribute and handle them separately
|
5
4
|
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
|
6
5
|
* lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
|
data/exe/httpspell
CHANGED
@@ -59,27 +59,38 @@ if ARGV.size != 1
|
|
59
59
|
exit 1
|
60
60
|
end
|
61
61
|
|
62
|
-
|
62
|
+
def check(doc, lang, personal_dictionary_path, verbose)
|
63
|
+
unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
|
64
|
+
|
65
|
+
if unknown_words.empty?
|
66
|
+
warn 'No unknown words.' if verbose
|
67
|
+
else
|
68
|
+
warn "#{unknown_words.size} unknown words:" if verbose
|
69
|
+
puts unknown_words
|
70
|
+
true
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
63
74
|
has_unknown_words = false
|
64
75
|
|
65
|
-
spider_success = HttpSpell::Spider.new(ARGV.first, whitelist
|
66
|
-
lang = force_language || doc.root['lang'] || ENV
|
76
|
+
spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
|
77
|
+
lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
|
78
|
+
warn "Checking #{url} as #{lang}" if verbose
|
67
79
|
|
68
|
-
# Remove
|
80
|
+
# Remove elements that are not to be spellchecked
|
69
81
|
doc.css('pre').each(&:unlink)
|
70
82
|
doc.css('code').each(&:unlink)
|
71
83
|
doc.css('[spellcheck=false]').each(&:unlink)
|
72
84
|
|
73
|
-
#
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
else
|
79
|
-
warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
|
80
|
-
puts unknown_words
|
81
|
-
has_unknown_words = true
|
85
|
+
# Handle elements with a different lang attribute separately
|
86
|
+
doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
|
87
|
+
warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
|
88
|
+
has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
|
89
|
+
element.unlink
|
82
90
|
end
|
91
|
+
|
92
|
+
# Everything else
|
93
|
+
has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
|
83
94
|
end
|
84
95
|
|
85
96
|
exit 2 unless spider_success
|
data/httpspell.gemspec
CHANGED
@@ -4,7 +4,6 @@ lib = File.expand_path('lib', __dir__)
|
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
5
|
require 'httpspell/version'
|
6
6
|
|
7
|
-
# rubocop:disable Metrics/BlockLength
|
8
7
|
Gem::Specification.new do |spec|
|
9
8
|
spec.name = 'httpspell'
|
10
9
|
spec.version = HttpSpell::VERSION
|
@@ -13,8 +12,8 @@ Gem::Specification.new do |spec|
|
|
13
12
|
|
14
13
|
spec.summary = 'HTTP spellchecker'
|
15
14
|
spec.description = %(httpspell is a spellchecker that recursively fetches
|
16
|
-
|
17
|
-
|
15
|
+
HTML pages, converts them to plain text using pandoc, and
|
16
|
+
spellchecks them with hunspell.)
|
18
17
|
spec.license = 'MIT'
|
19
18
|
|
20
19
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
@@ -26,17 +25,5 @@ Gem::Specification.new do |spec|
|
|
26
25
|
|
27
26
|
spec.add_dependency 'addressable'
|
28
27
|
spec.add_dependency 'nokogiri'
|
29
|
-
|
30
|
-
spec.add_development_dependency 'aruba'
|
31
|
-
spec.add_development_dependency 'bundler'
|
32
|
-
spec.add_development_dependency 'guard'
|
33
|
-
spec.add_development_dependency 'guard-bundler'
|
34
|
-
spec.add_development_dependency 'guard-rspec'
|
35
|
-
spec.add_development_dependency 'pry'
|
36
|
-
spec.add_development_dependency 'pry-byebug'
|
37
|
-
spec.add_development_dependency 'rake'
|
38
|
-
spec.add_development_dependency 'rspec'
|
39
|
-
spec.add_development_dependency 'rubocop'
|
40
|
-
spec.add_development_dependency 'stub_server'
|
28
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
41
29
|
end
|
42
|
-
# rubocop:enable Metrics/BlockLength
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module HttpSpell
|
2
2
|
class SpellChecker
|
3
|
-
def initialize(personal_dictionary_path = nil,
|
3
|
+
def initialize(personal_dictionary_path = nil, verbose: false)
|
4
4
|
@personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
|
5
|
-
@
|
5
|
+
@verbose = verbose
|
6
6
|
end
|
7
7
|
|
8
8
|
def check(doc, lang)
|
@@ -11,7 +11,7 @@ module HttpSpell
|
|
11
11
|
"hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
|
12
12
|
]
|
13
13
|
|
14
|
-
if @
|
14
|
+
if @verbose
|
15
15
|
warn "Piping the HTML document into the following chain of commands:"
|
16
16
|
warn commands
|
17
17
|
end
|
data/lib/httpspell/spider.rb
CHANGED
@@ -8,12 +8,13 @@ module HttpSpell
|
|
8
8
|
class Spider
|
9
9
|
attr_reader :todo, :done
|
10
10
|
|
11
|
-
def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
|
11
|
+
def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
|
12
12
|
@todo = []
|
13
13
|
@done = []
|
14
14
|
todo << Addressable::URI.parse(starting_point)
|
15
15
|
@whitelist = whitelist || [/^#{starting_point}/]
|
16
16
|
@blacklist = blacklist
|
17
|
+
@verbose = verbose
|
17
18
|
@tracing = tracing
|
18
19
|
end
|
19
20
|
|
@@ -46,10 +47,10 @@ module HttpSpell
|
|
46
47
|
private
|
47
48
|
|
48
49
|
def links(uri)
|
49
|
-
response = http_get(
|
50
|
+
response = http_get(uri)
|
50
51
|
|
51
|
-
if response.content_type != 'text/html'
|
52
|
-
warn "Skipping #{uri} because it is not HTML" if @
|
52
|
+
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
53
|
+
warn "Skipping #{uri} because it is not HTML" if @verbose
|
53
54
|
return []
|
54
55
|
end
|
55
56
|
|
@@ -60,13 +61,13 @@ module HttpSpell
|
|
60
61
|
link = uri.join(link) if link.relative?
|
61
62
|
|
62
63
|
if @whitelist.none? { |re| re.match?(link.to_s) }
|
63
|
-
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @
|
64
|
+
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
|
64
65
|
next
|
65
66
|
end
|
66
67
|
|
67
68
|
if @blacklist.any? { |re| re.match?(link.to_s) }
|
68
69
|
# TODO Print _which_ entry of the blacklist matches
|
69
|
-
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @
|
70
|
+
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
|
70
71
|
next
|
71
72
|
end
|
72
73
|
|
@@ -79,7 +80,7 @@ module HttpSpell
|
|
79
80
|
|
80
81
|
yield uri, doc if block_given?
|
81
82
|
|
82
|
-
warn "Adding #{links.size} links from #{uri}" if @
|
83
|
+
warn "Adding #{links.size} links from #{uri}" if @verbose
|
83
84
|
links
|
84
85
|
end
|
85
86
|
|
@@ -88,7 +89,7 @@ module HttpSpell
|
|
88
89
|
tries = 10
|
89
90
|
|
90
91
|
begin
|
91
|
-
|
92
|
+
URI.open(uri, redirect: false)
|
92
93
|
rescue OpenURI::HTTPRedirect => redirect
|
93
94
|
uri = redirect.uri
|
94
95
|
retry if (tries -= 1) > 0
|
data/lib/httpspell/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: httpspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steffen Uhlig
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-05-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -38,164 +38,10 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: aruba
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: bundler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: guard
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: guard-bundler
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - ">="
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :development
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: guard-rspec
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - ">="
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
type: :development
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - ">="
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: pry
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
114
|
-
requirements:
|
115
|
-
- - ">="
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :development
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - ">="
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
125
|
-
- !ruby/object:Gem::Dependency
|
126
|
-
name: pry-byebug
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
128
|
-
requirements:
|
129
|
-
- - ">="
|
130
|
-
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
132
|
-
type: :development
|
133
|
-
prerelease: false
|
134
|
-
version_requirements: !ruby/object:Gem::Requirement
|
135
|
-
requirements:
|
136
|
-
- - ">="
|
137
|
-
- !ruby/object:Gem::Version
|
138
|
-
version: '0'
|
139
|
-
- !ruby/object:Gem::Dependency
|
140
|
-
name: rake
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
142
|
-
requirements:
|
143
|
-
- - ">="
|
144
|
-
- !ruby/object:Gem::Version
|
145
|
-
version: '0'
|
146
|
-
type: :development
|
147
|
-
prerelease: false
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
149
|
-
requirements:
|
150
|
-
- - ">="
|
151
|
-
- !ruby/object:Gem::Version
|
152
|
-
version: '0'
|
153
|
-
- !ruby/object:Gem::Dependency
|
154
|
-
name: rspec
|
155
|
-
requirement: !ruby/object:Gem::Requirement
|
156
|
-
requirements:
|
157
|
-
- - ">="
|
158
|
-
- !ruby/object:Gem::Version
|
159
|
-
version: '0'
|
160
|
-
type: :development
|
161
|
-
prerelease: false
|
162
|
-
version_requirements: !ruby/object:Gem::Requirement
|
163
|
-
requirements:
|
164
|
-
- - ">="
|
165
|
-
- !ruby/object:Gem::Version
|
166
|
-
version: '0'
|
167
|
-
- !ruby/object:Gem::Dependency
|
168
|
-
name: rubocop
|
169
|
-
requirement: !ruby/object:Gem::Requirement
|
170
|
-
requirements:
|
171
|
-
- - ">="
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: '0'
|
174
|
-
type: :development
|
175
|
-
prerelease: false
|
176
|
-
version_requirements: !ruby/object:Gem::Requirement
|
177
|
-
requirements:
|
178
|
-
- - ">="
|
179
|
-
- !ruby/object:Gem::Version
|
180
|
-
version: '0'
|
181
|
-
- !ruby/object:Gem::Dependency
|
182
|
-
name: stub_server
|
183
|
-
requirement: !ruby/object:Gem::Requirement
|
184
|
-
requirements:
|
185
|
-
- - ">="
|
186
|
-
- !ruby/object:Gem::Version
|
187
|
-
version: '0'
|
188
|
-
type: :development
|
189
|
-
prerelease: false
|
190
|
-
version_requirements: !ruby/object:Gem::Requirement
|
191
|
-
requirements:
|
192
|
-
- - ">="
|
193
|
-
- !ruby/object:Gem::Version
|
194
|
-
version: '0'
|
195
41
|
description: |-
|
196
42
|
httpspell is a spellchecker that recursively fetches
|
197
|
-
|
198
|
-
|
43
|
+
HTML pages, converts them to plain text using pandoc, and
|
44
|
+
spellchecks them with hunspell.
|
199
45
|
email:
|
200
46
|
- steffen@familie-uhlig.net
|
201
47
|
executables:
|
@@ -203,12 +49,15 @@ executables:
|
|
203
49
|
extensions: []
|
204
50
|
extra_rdoc_files: []
|
205
51
|
files:
|
52
|
+
- ".github/dependabot.yml"
|
206
53
|
- ".gitignore"
|
54
|
+
- ".mergify.yml"
|
207
55
|
- ".rspec"
|
208
56
|
- ".rubocop.yml"
|
209
|
-
- ".
|
57
|
+
- ".ruby-version"
|
210
58
|
- Gemfile
|
211
59
|
- Gemfile.lock
|
60
|
+
- Guardfile
|
212
61
|
- README.markdown
|
213
62
|
- Rakefile
|
214
63
|
- TODO.markdown
|
@@ -217,11 +66,12 @@ files:
|
|
217
66
|
- lib/httpspell/spellchecker.rb
|
218
67
|
- lib/httpspell/spider.rb
|
219
68
|
- lib/httpspell/version.rb
|
220
|
-
homepage:
|
69
|
+
homepage:
|
221
70
|
licenses:
|
222
71
|
- MIT
|
223
|
-
metadata:
|
224
|
-
|
72
|
+
metadata:
|
73
|
+
rubygems_mfa_required: 'true'
|
74
|
+
post_install_message:
|
225
75
|
rdoc_options: []
|
226
76
|
require_paths:
|
227
77
|
- lib
|
@@ -236,8 +86,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
236
86
|
- !ruby/object:Gem::Version
|
237
87
|
version: '0'
|
238
88
|
requirements: []
|
239
|
-
rubygems_version: 3.
|
240
|
-
signing_key:
|
89
|
+
rubygems_version: 3.5.9
|
90
|
+
signing_key:
|
241
91
|
specification_version: 4
|
242
92
|
summary: HTTP spellchecker
|
243
93
|
test_files: []
|