httpspell 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +5 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +33 -1
- data/README.markdown +29 -2
- data/Rakefile +6 -3
- data/exe/httpspell +45 -11
- data/httpspell.gemspec +1 -0
- data/lib/httpspell/spider.rb +23 -22
- data/lib/httpspell/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
|
4
|
+
data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
|
7
|
+
data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
httpspell (1.
|
4
|
+
httpspell (1.1.0)
|
5
5
|
addressable
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,12 +10,41 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.5.2)
|
12
12
|
public_suffix (>= 2.0.2, < 4.0)
|
13
|
+
aruba (0.14.5)
|
14
|
+
childprocess (>= 0.6.3, < 0.10.0)
|
15
|
+
contracts (~> 0.9)
|
16
|
+
cucumber (>= 1.3.19)
|
17
|
+
ffi (~> 1.9.10)
|
18
|
+
rspec-expectations (>= 2.99)
|
19
|
+
thor (~> 0.19)
|
13
20
|
ast (2.4.0)
|
21
|
+
backports (3.11.3)
|
22
|
+
builder (3.2.3)
|
14
23
|
byebug (10.0.2)
|
24
|
+
childprocess (0.9.0)
|
25
|
+
ffi (~> 1.0, >= 1.0.11)
|
15
26
|
coderay (1.1.2)
|
27
|
+
contracts (0.16.0)
|
28
|
+
cucumber (3.1.0)
|
29
|
+
builder (>= 2.1.2)
|
30
|
+
cucumber-core (~> 3.1.0)
|
31
|
+
cucumber-expressions (~> 5.0.4)
|
32
|
+
cucumber-wire (~> 0.0.1)
|
33
|
+
diff-lcs (~> 1.3)
|
34
|
+
gherkin (~> 5.0)
|
35
|
+
multi_json (>= 1.7.5, < 2.0)
|
36
|
+
multi_test (>= 0.1.2)
|
37
|
+
cucumber-core (3.1.0)
|
38
|
+
backports (>= 3.8.0)
|
39
|
+
cucumber-tag_expressions (~> 1.1.0)
|
40
|
+
gherkin (>= 5.0.0)
|
41
|
+
cucumber-expressions (5.0.18)
|
42
|
+
cucumber-tag_expressions (1.1.1)
|
43
|
+
cucumber-wire (0.0.1)
|
16
44
|
diff-lcs (1.3)
|
17
45
|
ffi (1.9.23)
|
18
46
|
formatador (0.2.5)
|
47
|
+
gherkin (5.1.0)
|
19
48
|
guard (2.14.2)
|
20
49
|
formatador (>= 0.2.4)
|
21
50
|
listen (>= 2.7, < 4.0)
|
@@ -41,6 +70,8 @@ GEM
|
|
41
70
|
lumberjack (1.0.13)
|
42
71
|
method_source (0.9.0)
|
43
72
|
mini_portile2 (2.3.0)
|
73
|
+
multi_json (1.13.1)
|
74
|
+
multi_test (0.1.2)
|
44
75
|
nenv (0.3.0)
|
45
76
|
nokogiri (1.8.2)
|
46
77
|
mini_portile2 (~> 2.3.0)
|
@@ -93,6 +124,7 @@ PLATFORMS
|
|
93
124
|
ruby
|
94
125
|
|
95
126
|
DEPENDENCIES
|
127
|
+
aruba
|
96
128
|
bundler
|
97
129
|
guard
|
98
130
|
guard-bundler
|
data/README.markdown
CHANGED
@@ -1,16 +1,43 @@
|
|
1
1
|
# `httpspell`
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
|
4
|
+
|
3
5
|
This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
|
4
6
|
|
5
7
|
Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
|
6
8
|
|
9
|
+
# Usage
|
10
|
+
|
11
|
+
* The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
|
12
|
+
|
13
|
+
```bash
|
14
|
+
$ httpspell https://example.com
|
15
|
+
```
|
16
|
+
|
17
|
+
The exit code is `0`.
|
18
|
+
|
19
|
+
* The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
|
20
|
+
|
21
|
+
```bash
|
22
|
+
$ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
|
23
|
+
suhlig
|
24
|
+
Permalink
|
25
|
+
httpspell
|
26
|
+
sloc
|
27
|
+
pandoc
|
28
|
+
hunspell
|
29
|
+
...
|
30
|
+
```
|
31
|
+
|
32
|
+
The exit code is `1`.
|
33
|
+
|
7
34
|
# What is *not* checked
|
8
35
|
|
9
|
-
* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html
|
36
|
+
* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless pointing it to a file, in which case it accepts anything).
|
10
37
|
* Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
|
11
38
|
- `code`
|
12
39
|
- `pre`
|
13
|
-
- Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
|
40
|
+
- Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
|
14
41
|
|
15
42
|
# Misc
|
16
43
|
|
data/Rakefile
CHANGED
@@ -10,9 +10,12 @@ task default: ['spec:all']
|
|
10
10
|
|
11
11
|
namespace :spec do
|
12
12
|
desc 'Run all specs'
|
13
|
-
task all: [
|
13
|
+
task all: %i[rubocop:auto_correct unit system]
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
%w[unit system].each do |type|
|
16
|
+
desc "Run #{type} tests"
|
17
|
+
RSpec::Core::RakeTask.new(type) do |t|
|
18
|
+
t.pattern = "spec/#{type}/**/*_spec.rb"
|
19
|
+
end
|
17
20
|
end
|
18
21
|
end
|
data/exe/httpspell
CHANGED
@@ -4,8 +4,13 @@
|
|
4
4
|
require 'optparse'
|
5
5
|
require 'httpspell/spider'
|
6
6
|
require 'httpspell/spellchecker'
|
7
|
+
require 'httpspell/version'
|
7
8
|
|
8
9
|
personal_dictionary_path = nil
|
10
|
+
force_language = nil
|
11
|
+
tracing = nil
|
12
|
+
verbose = nil
|
13
|
+
limit = nil
|
9
14
|
|
10
15
|
begin
|
11
16
|
OptionParser.new do |parser|
|
@@ -13,11 +18,28 @@ begin
|
|
13
18
|
Spellchecks a website via HTTP.
|
14
19
|
|
15
20
|
BANNER
|
21
|
+
parser.version = HttpSpell::VERSION
|
16
22
|
|
17
23
|
parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
|
18
24
|
personal_dictionary_path = p
|
19
25
|
end
|
20
26
|
|
27
|
+
parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
|
28
|
+
force_language = l
|
29
|
+
end
|
30
|
+
|
31
|
+
parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
|
32
|
+
limit = Regexp.new(l)
|
33
|
+
end
|
34
|
+
|
35
|
+
parser.on('-t', '--trace', 'enable error tracing') do
|
36
|
+
tracing = true
|
37
|
+
end
|
38
|
+
|
39
|
+
parser.on('-V', '--verbose', "explain what's happening") do
|
40
|
+
verbose = true
|
41
|
+
end
|
42
|
+
|
21
43
|
# TODO: --recursive, defaults to false
|
22
44
|
# TODO wget has some additional options for recursive behavior that should be reviewed
|
23
45
|
end.parse!
|
@@ -32,20 +54,32 @@ if ARGV.size != 1
|
|
32
54
|
end
|
33
55
|
|
34
56
|
spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
|
57
|
+
has_unknown_words = false
|
35
58
|
|
36
|
-
|
37
|
-
|
59
|
+
begin
|
60
|
+
HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
|
61
|
+
lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
|
38
62
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
63
|
+
# Remove sections that are not to be spellchecked
|
64
|
+
doc.css('pre').each(&:unlink)
|
65
|
+
doc.css('code').each(&:unlink)
|
66
|
+
doc.css('[spellcheck=false]').each(&:unlink)
|
43
67
|
|
44
|
-
|
45
|
-
|
68
|
+
# TODO: Find sections with a lang attribute and handle them separately
|
69
|
+
unknown_words = spell_checker.check(doc.to_s, lang)
|
46
70
|
|
47
|
-
|
48
|
-
|
49
|
-
|
71
|
+
if unknown_words.empty?
|
72
|
+
warn "No unknown words (language is #{lang}) at #{url}." if verbose
|
73
|
+
else
|
74
|
+
warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
|
75
|
+
puts unknown_words
|
76
|
+
has_unknown_words = true
|
77
|
+
end
|
50
78
|
end
|
79
|
+
rescue StandardError
|
80
|
+
warn $ERROR_INFO.message
|
81
|
+
warn $ERROR_INFO.backtrace if tracing
|
82
|
+
exit 2
|
51
83
|
end
|
84
|
+
|
85
|
+
exit 1 if has_unknown_words
|
data/httpspell.gemspec
CHANGED
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_dependency 'addressable'
|
28
28
|
spec.add_dependency 'nokogiri'
|
29
29
|
|
30
|
+
spec.add_development_dependency 'aruba'
|
30
31
|
spec.add_development_dependency 'bundler'
|
31
32
|
spec.add_development_dependency 'guard'
|
32
33
|
spec.add_development_dependency 'guard-bundler'
|
data/lib/httpspell/spider.rb
CHANGED
@@ -5,34 +5,29 @@ require 'addressable/uri'
|
|
5
5
|
require 'English'
|
6
6
|
|
7
7
|
module HttpSpell
|
8
|
-
# rubocop:disable Metrics/AbcSize
|
9
|
-
# rubocop:disable Metrics/MethodLength
|
10
8
|
class Spider
|
11
9
|
attr_reader :todo, :done
|
12
10
|
|
13
|
-
def initialize(starting_point,
|
11
|
+
def initialize(starting_point, limit: nil, tracing: false)
|
14
12
|
@todo = []
|
15
13
|
@done = []
|
16
14
|
todo << Addressable::URI.parse(starting_point)
|
17
|
-
@
|
15
|
+
@limit = limit || /^#{starting_point}/
|
16
|
+
@tracing = tracing
|
18
17
|
end
|
19
18
|
|
20
19
|
def start
|
21
20
|
while todo.any?
|
22
21
|
url = todo.pop
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
29
|
-
end
|
30
|
-
|
31
|
-
done.append(url)
|
32
|
-
todo.concat(extracted - done - todo)
|
33
|
-
rescue StandardError
|
34
|
-
warn "Could not fetch #{url}: #{$ERROR_INFO}"
|
22
|
+
extracted = links(url) do |u, d|
|
23
|
+
yield u, d if block_given?
|
24
|
+
rescue
|
25
|
+
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
26
|
+
warn $ERROR_INFO.backtrace if @tracing
|
35
27
|
end
|
28
|
+
|
29
|
+
done.append(url)
|
30
|
+
todo.concat(extracted - done - todo)
|
36
31
|
end
|
37
32
|
end
|
38
33
|
|
@@ -40,23 +35,29 @@ module HttpSpell
|
|
40
35
|
|
41
36
|
def links(uri)
|
42
37
|
# We are using open-uri, which follows redirects and also provides the content-type.
|
43
|
-
response =
|
44
|
-
|
38
|
+
response = open(uri).read
|
39
|
+
|
40
|
+
if response.respond_to?(:content_type)
|
41
|
+
return [] unless response.content_type == 'text/html'
|
42
|
+
end
|
43
|
+
|
45
44
|
doc = Nokogiri::HTML(response)
|
46
45
|
|
47
46
|
links = doc.css('a[href]').map do |e|
|
48
47
|
link = Addressable::URI.parse(e['href'])
|
49
48
|
link = uri.join(link) if link.relative?
|
50
|
-
next unless
|
49
|
+
next unless @limit.match?(link.to_s)
|
50
|
+
# TODO Ignore same page links (some anchor)
|
51
51
|
link
|
52
52
|
rescue StandardError
|
53
|
-
warn $ERROR_INFO
|
53
|
+
warn $ERROR_INFO.message
|
54
|
+
warn $ERROR_INFO.backtrace if @tracing
|
54
55
|
end.compact
|
55
56
|
|
56
57
|
yield uri, doc if block_given?
|
58
|
+
|
59
|
+
warn "Adding #{links.size} links from #{uri}" if @tracing
|
57
60
|
links
|
58
61
|
end
|
59
62
|
end
|
60
|
-
# rubocop:enable Metrics/AbcSize
|
61
|
-
# rubocop:enable Metrics/MethodLength
|
62
63
|
end
|
data/lib/httpspell/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: httpspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steffen Uhlig
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: aruba
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: bundler
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -176,7 +190,9 @@ extensions: []
|
|
176
190
|
extra_rdoc_files: []
|
177
191
|
files:
|
178
192
|
- ".gitignore"
|
193
|
+
- ".rspec"
|
179
194
|
- ".rubocop.yml"
|
195
|
+
- ".travis.yml"
|
180
196
|
- Gemfile
|
181
197
|
- Gemfile.lock
|
182
198
|
- README.markdown
|