httpspell 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +5 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +33 -1
- data/README.markdown +29 -2
- data/Rakefile +6 -3
- data/exe/httpspell +45 -11
- data/httpspell.gemspec +1 -0
- data/lib/httpspell/spider.rb +23 -22
- data/lib/httpspell/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
|
4
|
+
data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
|
7
|
+
data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
httpspell (1.
|
4
|
+
httpspell (1.1.0)
|
5
5
|
addressable
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,12 +10,41 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.5.2)
|
12
12
|
public_suffix (>= 2.0.2, < 4.0)
|
13
|
+
aruba (0.14.5)
|
14
|
+
childprocess (>= 0.6.3, < 0.10.0)
|
15
|
+
contracts (~> 0.9)
|
16
|
+
cucumber (>= 1.3.19)
|
17
|
+
ffi (~> 1.9.10)
|
18
|
+
rspec-expectations (>= 2.99)
|
19
|
+
thor (~> 0.19)
|
13
20
|
ast (2.4.0)
|
21
|
+
backports (3.11.3)
|
22
|
+
builder (3.2.3)
|
14
23
|
byebug (10.0.2)
|
24
|
+
childprocess (0.9.0)
|
25
|
+
ffi (~> 1.0, >= 1.0.11)
|
15
26
|
coderay (1.1.2)
|
27
|
+
contracts (0.16.0)
|
28
|
+
cucumber (3.1.0)
|
29
|
+
builder (>= 2.1.2)
|
30
|
+
cucumber-core (~> 3.1.0)
|
31
|
+
cucumber-expressions (~> 5.0.4)
|
32
|
+
cucumber-wire (~> 0.0.1)
|
33
|
+
diff-lcs (~> 1.3)
|
34
|
+
gherkin (~> 5.0)
|
35
|
+
multi_json (>= 1.7.5, < 2.0)
|
36
|
+
multi_test (>= 0.1.2)
|
37
|
+
cucumber-core (3.1.0)
|
38
|
+
backports (>= 3.8.0)
|
39
|
+
cucumber-tag_expressions (~> 1.1.0)
|
40
|
+
gherkin (>= 5.0.0)
|
41
|
+
cucumber-expressions (5.0.18)
|
42
|
+
cucumber-tag_expressions (1.1.1)
|
43
|
+
cucumber-wire (0.0.1)
|
16
44
|
diff-lcs (1.3)
|
17
45
|
ffi (1.9.23)
|
18
46
|
formatador (0.2.5)
|
47
|
+
gherkin (5.1.0)
|
19
48
|
guard (2.14.2)
|
20
49
|
formatador (>= 0.2.4)
|
21
50
|
listen (>= 2.7, < 4.0)
|
@@ -41,6 +70,8 @@ GEM
|
|
41
70
|
lumberjack (1.0.13)
|
42
71
|
method_source (0.9.0)
|
43
72
|
mini_portile2 (2.3.0)
|
73
|
+
multi_json (1.13.1)
|
74
|
+
multi_test (0.1.2)
|
44
75
|
nenv (0.3.0)
|
45
76
|
nokogiri (1.8.2)
|
46
77
|
mini_portile2 (~> 2.3.0)
|
@@ -93,6 +124,7 @@ PLATFORMS
|
|
93
124
|
ruby
|
94
125
|
|
95
126
|
DEPENDENCIES
|
127
|
+
aruba
|
96
128
|
bundler
|
97
129
|
guard
|
98
130
|
guard-bundler
|
data/README.markdown
CHANGED
@@ -1,16 +1,43 @@
|
|
1
1
|
# `httpspell`
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
|
4
|
+
|
3
5
|
This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
|
4
6
|
|
5
7
|
Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
|
6
8
|
|
9
|
+
# Usage
|
10
|
+
|
11
|
+
* The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
|
12
|
+
|
13
|
+
```bash
|
14
|
+
$ httpspell https://example.com
|
15
|
+
```
|
16
|
+
|
17
|
+
The exit code is `0`.
|
18
|
+
|
19
|
+
* The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
|
20
|
+
|
21
|
+
```bash
|
22
|
+
$ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
|
23
|
+
suhlig
|
24
|
+
Permalink
|
25
|
+
httpspell
|
26
|
+
sloc
|
27
|
+
pandoc
|
28
|
+
hunspell
|
29
|
+
...
|
30
|
+
```
|
31
|
+
|
32
|
+
The exit code is `1`.
|
33
|
+
|
7
34
|
# What is *not* checked
|
8
35
|
|
9
|
-
* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html
|
36
|
+
* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless pointing it to a file, in which case it accepts anything).
|
10
37
|
* Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
|
11
38
|
- `code`
|
12
39
|
- `pre`
|
13
|
-
- Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
|
40
|
+
- Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
|
14
41
|
|
15
42
|
# Misc
|
16
43
|
|
data/Rakefile
CHANGED
@@ -10,9 +10,12 @@ task default: ['spec:all']
|
|
10
10
|
|
11
11
|
namespace :spec do
|
12
12
|
desc 'Run all specs'
|
13
|
-
task all: [
|
13
|
+
task all: %i[rubocop:auto_correct unit system]
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
%w[unit system].each do |type|
|
16
|
+
desc "Run #{type} tests"
|
17
|
+
RSpec::Core::RakeTask.new(type) do |t|
|
18
|
+
t.pattern = "spec/#{type}/**/*_spec.rb"
|
19
|
+
end
|
17
20
|
end
|
18
21
|
end
|
data/exe/httpspell
CHANGED
@@ -4,8 +4,13 @@
|
|
4
4
|
require 'optparse'
|
5
5
|
require 'httpspell/spider'
|
6
6
|
require 'httpspell/spellchecker'
|
7
|
+
require 'httpspell/version'
|
7
8
|
|
8
9
|
personal_dictionary_path = nil
|
10
|
+
force_language = nil
|
11
|
+
tracing = nil
|
12
|
+
verbose = nil
|
13
|
+
limit = nil
|
9
14
|
|
10
15
|
begin
|
11
16
|
OptionParser.new do |parser|
|
@@ -13,11 +18,28 @@ begin
|
|
13
18
|
Spellchecks a website via HTTP.
|
14
19
|
|
15
20
|
BANNER
|
21
|
+
parser.version = HttpSpell::VERSION
|
16
22
|
|
17
23
|
parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
|
18
24
|
personal_dictionary_path = p
|
19
25
|
end
|
20
26
|
|
27
|
+
parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
|
28
|
+
force_language = l
|
29
|
+
end
|
30
|
+
|
31
|
+
parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
|
32
|
+
limit = Regexp.new(l)
|
33
|
+
end
|
34
|
+
|
35
|
+
parser.on('-t', '--trace', 'enable error tracing') do
|
36
|
+
tracing = true
|
37
|
+
end
|
38
|
+
|
39
|
+
parser.on('-V', '--verbose', "explain what's happening") do
|
40
|
+
verbose = true
|
41
|
+
end
|
42
|
+
|
21
43
|
# TODO: --recursive, defaults to false
|
22
44
|
# TODO wget has some additional options for recursive behavior that should be reviewed
|
23
45
|
end.parse!
|
@@ -32,20 +54,32 @@ if ARGV.size != 1
|
|
32
54
|
end
|
33
55
|
|
34
56
|
spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
|
57
|
+
has_unknown_words = false
|
35
58
|
|
36
|
-
|
37
|
-
|
59
|
+
begin
|
60
|
+
HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
|
61
|
+
lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
|
38
62
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
63
|
+
# Remove sections that are not to be spellchecked
|
64
|
+
doc.css('pre').each(&:unlink)
|
65
|
+
doc.css('code').each(&:unlink)
|
66
|
+
doc.css('[spellcheck=false]').each(&:unlink)
|
43
67
|
|
44
|
-
|
45
|
-
|
68
|
+
# TODO: Find sections with a lang attribute and handle them separately
|
69
|
+
unknown_words = spell_checker.check(doc.to_s, lang)
|
46
70
|
|
47
|
-
|
48
|
-
|
49
|
-
|
71
|
+
if unknown_words.empty?
|
72
|
+
warn "No unknown words (language is #{lang}) at #{url}." if verbose
|
73
|
+
else
|
74
|
+
warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
|
75
|
+
puts unknown_words
|
76
|
+
has_unknown_words = true
|
77
|
+
end
|
50
78
|
end
|
79
|
+
rescue StandardError
|
80
|
+
warn $ERROR_INFO.message
|
81
|
+
warn $ERROR_INFO.backtrace if tracing
|
82
|
+
exit 2
|
51
83
|
end
|
84
|
+
|
85
|
+
exit 1 if has_unknown_words
|
data/httpspell.gemspec
CHANGED
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_dependency 'addressable'
|
28
28
|
spec.add_dependency 'nokogiri'
|
29
29
|
|
30
|
+
spec.add_development_dependency 'aruba'
|
30
31
|
spec.add_development_dependency 'bundler'
|
31
32
|
spec.add_development_dependency 'guard'
|
32
33
|
spec.add_development_dependency 'guard-bundler'
|
data/lib/httpspell/spider.rb
CHANGED
@@ -5,34 +5,29 @@ require 'addressable/uri'
|
|
5
5
|
require 'English'
|
6
6
|
|
7
7
|
module HttpSpell
|
8
|
-
# rubocop:disable Metrics/AbcSize
|
9
|
-
# rubocop:disable Metrics/MethodLength
|
10
8
|
class Spider
|
11
9
|
attr_reader :todo, :done
|
12
10
|
|
13
|
-
def initialize(starting_point,
|
11
|
+
def initialize(starting_point, limit: nil, tracing: false)
|
14
12
|
@todo = []
|
15
13
|
@done = []
|
16
14
|
todo << Addressable::URI.parse(starting_point)
|
17
|
-
@
|
15
|
+
@limit = limit || /^#{starting_point}/
|
16
|
+
@tracing = tracing
|
18
17
|
end
|
19
18
|
|
20
19
|
def start
|
21
20
|
while todo.any?
|
22
21
|
url = todo.pop
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
29
|
-
end
|
30
|
-
|
31
|
-
done.append(url)
|
32
|
-
todo.concat(extracted - done - todo)
|
33
|
-
rescue StandardError
|
34
|
-
warn "Could not fetch #{url}: #{$ERROR_INFO}"
|
22
|
+
extracted = links(url) do |u, d|
|
23
|
+
yield u, d if block_given?
|
24
|
+
rescue
|
25
|
+
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
26
|
+
warn $ERROR_INFO.backtrace if @tracing
|
35
27
|
end
|
28
|
+
|
29
|
+
done.append(url)
|
30
|
+
todo.concat(extracted - done - todo)
|
36
31
|
end
|
37
32
|
end
|
38
33
|
|
@@ -40,23 +35,29 @@ module HttpSpell
|
|
40
35
|
|
41
36
|
def links(uri)
|
42
37
|
# We are using open-uri, which follows redirects and also provides the content-type.
|
43
|
-
response =
|
44
|
-
|
38
|
+
response = open(uri).read
|
39
|
+
|
40
|
+
if response.respond_to?(:content_type)
|
41
|
+
return [] unless response.content_type == 'text/html'
|
42
|
+
end
|
43
|
+
|
45
44
|
doc = Nokogiri::HTML(response)
|
46
45
|
|
47
46
|
links = doc.css('a[href]').map do |e|
|
48
47
|
link = Addressable::URI.parse(e['href'])
|
49
48
|
link = uri.join(link) if link.relative?
|
50
|
-
next unless
|
49
|
+
next unless @limit.match?(link.to_s)
|
50
|
+
# TODO Ignore same page links (some anchor)
|
51
51
|
link
|
52
52
|
rescue StandardError
|
53
|
-
warn $ERROR_INFO
|
53
|
+
warn $ERROR_INFO.message
|
54
|
+
warn $ERROR_INFO.backtrace if @tracing
|
54
55
|
end.compact
|
55
56
|
|
56
57
|
yield uri, doc if block_given?
|
58
|
+
|
59
|
+
warn "Adding #{links.size} links from #{uri}" if @tracing
|
57
60
|
links
|
58
61
|
end
|
59
62
|
end
|
60
|
-
# rubocop:enable Metrics/AbcSize
|
61
|
-
# rubocop:enable Metrics/MethodLength
|
62
63
|
end
|
data/lib/httpspell/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: httpspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steffen Uhlig
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: aruba
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: bundler
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -176,7 +190,9 @@ extensions: []
|
|
176
190
|
extra_rdoc_files: []
|
177
191
|
files:
|
178
192
|
- ".gitignore"
|
193
|
+
- ".rspec"
|
179
194
|
- ".rubocop.yml"
|
195
|
+
- ".travis.yml"
|
180
196
|
- Gemfile
|
181
197
|
- Gemfile.lock
|
182
198
|
- README.markdown
|