httpspell 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0fe73f8f1ff3740d6e3ae3af685d3554879a2819fc9d7803c994a21dd3694d91
4
- data.tar.gz: 8354f5c3bdc325a073310aa534a6171164d3dfbe7a1c4154f77737f20108eb91
3
+ metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
4
+ data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
5
5
  SHA512:
6
- metadata.gz: '073693d2520238d10012e4c02057c4966ab8af80f1c9db868e5ae2a4b95e4ae59a7d0989c162f62649aa0d2194290da0bca1ac5e1186f8ff3569cca581d571ae'
7
- data.tar.gz: 8eb778ffa3bcc1f56e8362d160117f695ec5f3ca146592219f4ef43a160ea28b96c67c5a6edeba52fbdc6dc3413b9f4967243fc60431a81861506d2c46435b7b
6
+ metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
7
+ data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
data/.rspec ADDED
@@ -0,0 +1,5 @@
1
+ --color
2
+ --format documentation
3
+ --tty
4
+ --order random
5
+ --require 'spec_helper'
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.5.1
4
+ before_install:
5
+ - sudo apt-get -qq update
6
+ - sudo apt-get install -y pandoc hunspell hunspell-en-us
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.0.0)
4
+ httpspell (1.1.0)
5
5
  addressable
6
6
  nokogiri
7
7
 
@@ -10,12 +10,41 @@ GEM
10
10
  specs:
11
11
  addressable (2.5.2)
12
12
  public_suffix (>= 2.0.2, < 4.0)
13
+ aruba (0.14.5)
14
+ childprocess (>= 0.6.3, < 0.10.0)
15
+ contracts (~> 0.9)
16
+ cucumber (>= 1.3.19)
17
+ ffi (~> 1.9.10)
18
+ rspec-expectations (>= 2.99)
19
+ thor (~> 0.19)
13
20
  ast (2.4.0)
21
+ backports (3.11.3)
22
+ builder (3.2.3)
14
23
  byebug (10.0.2)
24
+ childprocess (0.9.0)
25
+ ffi (~> 1.0, >= 1.0.11)
15
26
  coderay (1.1.2)
27
+ contracts (0.16.0)
28
+ cucumber (3.1.0)
29
+ builder (>= 2.1.2)
30
+ cucumber-core (~> 3.1.0)
31
+ cucumber-expressions (~> 5.0.4)
32
+ cucumber-wire (~> 0.0.1)
33
+ diff-lcs (~> 1.3)
34
+ gherkin (~> 5.0)
35
+ multi_json (>= 1.7.5, < 2.0)
36
+ multi_test (>= 0.1.2)
37
+ cucumber-core (3.1.0)
38
+ backports (>= 3.8.0)
39
+ cucumber-tag_expressions (~> 1.1.0)
40
+ gherkin (>= 5.0.0)
41
+ cucumber-expressions (5.0.18)
42
+ cucumber-tag_expressions (1.1.1)
43
+ cucumber-wire (0.0.1)
16
44
  diff-lcs (1.3)
17
45
  ffi (1.9.23)
18
46
  formatador (0.2.5)
47
+ gherkin (5.1.0)
19
48
  guard (2.14.2)
20
49
  formatador (>= 0.2.4)
21
50
  listen (>= 2.7, < 4.0)
@@ -41,6 +70,8 @@ GEM
41
70
  lumberjack (1.0.13)
42
71
  method_source (0.9.0)
43
72
  mini_portile2 (2.3.0)
73
+ multi_json (1.13.1)
74
+ multi_test (0.1.2)
44
75
  nenv (0.3.0)
45
76
  nokogiri (1.8.2)
46
77
  mini_portile2 (~> 2.3.0)
@@ -93,6 +124,7 @@ PLATFORMS
93
124
  ruby
94
125
 
95
126
  DEPENDENCIES
127
+ aruba
96
128
  bundler
97
129
  guard
98
130
  guard-bundler
@@ -1,16 +1,43 @@
1
1
  # `httpspell`
2
2
 
3
+ [![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
4
+
3
5
  This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
4
6
 
5
7
  Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
6
8
 
9
+ # Usage
10
+
11
+ * The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
12
+
13
+ ```bash
14
+ $ httpspell https://example.com
15
+ ```
16
+
17
+ The exit code is `0`.
18
+
19
+ * The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as the document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
20
+
21
+ ```bash
22
+ $ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
23
+ suhlig
24
+ Permalink
25
+ httpspell
26
+ sloc
27
+ pandoc
28
+ hunspell
29
+ ...
30
+ ```
31
+
32
+ The exit code is `1`.
33
+
7
34
  # What is *not* checked
8
35
 
9
- * When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html`.
36
+ * When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless it is pointed at a file, in which case it accepts anything).
10
37
  * Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
11
38
  - `code`
12
39
  - `pre`
13
- - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
40
+ - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
14
41
 
15
42
  # Misc
16
43
 
data/Rakefile CHANGED
@@ -10,9 +10,12 @@ task default: ['spec:all']
10
10
 
11
11
  namespace :spec do
12
12
  desc 'Run all specs'
13
- task all: ['rubocop:auto_correct', :unit]
13
+ task all: %i[rubocop:auto_correct unit system]
14
14
 
15
- RSpec::Core::RakeTask.new(:unit) do |t|
16
- t.pattern = 'spec/unit/**/*_spec.rb'
15
+ %w[unit system].each do |type|
16
+ desc "Run #{type} tests"
17
+ RSpec::Core::RakeTask.new(type) do |t|
18
+ t.pattern = "spec/#{type}/**/*_spec.rb"
19
+ end
17
20
  end
18
21
  end
@@ -4,8 +4,13 @@
4
4
  require 'optparse'
5
5
  require 'httpspell/spider'
6
6
  require 'httpspell/spellchecker'
7
+ require 'httpspell/version'
7
8
 
8
9
  personal_dictionary_path = nil
10
+ force_language = nil
11
+ tracing = nil
12
+ verbose = nil
13
+ limit = nil
9
14
 
10
15
  begin
11
16
  OptionParser.new do |parser|
@@ -13,11 +18,28 @@ begin
13
18
  Spellchecks a website via HTTP.
14
19
 
15
20
  BANNER
21
+ parser.version = HttpSpell::VERSION
16
22
 
17
23
  parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
18
24
  personal_dictionary_path = p
19
25
  end
20
26
 
27
+ parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
28
+ force_language = l
29
+ end
30
+
31
+ parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
32
+ limit = Regexp.new(l)
33
+ end
34
+
35
+ parser.on('-t', '--trace', 'enable error tracing') do
36
+ tracing = true
37
+ end
38
+
39
+ parser.on('-V', '--verbose', "explain what's happening") do
40
+ verbose = true
41
+ end
42
+
21
43
  # TODO: --recursive, defaults to false
22
44
  # TODO wget has some additional options for recursive behavior that should be reviewed
23
45
  end.parse!
@@ -32,20 +54,32 @@ if ARGV.size != 1
32
54
  end
33
55
 
34
56
  spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
57
+ has_unknown_words = false
35
58
 
36
- HttpSpell::Spider.new(ARGV.first).start do |url, doc|
37
- lang = doc.root['lang'] || 'de-DE'
59
+ begin
60
+ HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
61
+ lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
38
62
 
39
- # Remove sections that are not to be spellchecked
40
- doc.css('pre').each(&:unlink)
41
- doc.css('code').each(&:unlink)
42
- doc.css('[spellcheck=false]').each(&:unlink)
63
+ # Remove sections that are not to be spellchecked
64
+ doc.css('pre').each(&:unlink)
65
+ doc.css('code').each(&:unlink)
66
+ doc.css('[spellcheck=false]').each(&:unlink)
43
67
 
44
- # TODO: Find sections with a lang attribute and handle them separately
45
- unknown_words = spell_checker.check(doc.to_s, lang)
68
+ # TODO: Find sections with a lang attribute and handle them separately
69
+ unknown_words = spell_checker.check(doc.to_s, lang)
46
70
 
47
- unless unknown_words.empty?
48
- warn "#{unknown_words.size} unknown words at #{url}:"
49
- puts unknown_words
71
+ if unknown_words.empty?
72
+ warn "No unknown words (language is #{lang}) at #{url}." if verbose
73
+ else
74
+ warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
75
+ puts unknown_words
76
+ has_unknown_words = true
77
+ end
50
78
  end
79
+ rescue StandardError
80
+ warn $ERROR_INFO.message
81
+ warn $ERROR_INFO.backtrace if tracing
82
+ exit 2
51
83
  end
84
+
85
+ exit 1 if has_unknown_words
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency 'addressable'
28
28
  spec.add_dependency 'nokogiri'
29
29
 
30
+ spec.add_development_dependency 'aruba'
30
31
  spec.add_development_dependency 'bundler'
31
32
  spec.add_development_dependency 'guard'
32
33
  spec.add_development_dependency 'guard-bundler'
@@ -5,34 +5,29 @@ require 'addressable/uri'
5
5
  require 'English'
6
6
 
7
7
  module HttpSpell
8
- # rubocop:disable Metrics/AbcSize
9
- # rubocop:disable Metrics/MethodLength
10
8
  class Spider
11
9
  attr_reader :todo, :done
12
10
 
13
- def initialize(starting_point, base_url = starting_point)
11
+ def initialize(starting_point, limit: nil, tracing: false)
14
12
  @todo = []
15
13
  @done = []
16
14
  todo << Addressable::URI.parse(starting_point)
17
- @base_url = Addressable::URI.parse(base_url)
15
+ @limit = limit || /^#{starting_point}/
16
+ @tracing = tracing
18
17
  end
19
18
 
20
19
  def start
21
20
  while todo.any?
22
21
  url = todo.pop
23
-
24
- begin
25
- extracted = links(url) do |u, d|
26
- yield u, d if block_given?
27
- rescue
28
- warn "Callback error for #{url}: #{$ERROR_INFO}"
29
- end
30
-
31
- done.append(url)
32
- todo.concat(extracted - done - todo)
33
- rescue StandardError
34
- warn "Could not fetch #{url}: #{$ERROR_INFO}"
22
+ extracted = links(url) do |u, d|
23
+ yield u, d if block_given?
24
+ rescue
25
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
26
+ warn $ERROR_INFO.backtrace if @tracing
35
27
  end
28
+
29
+ done.append(url)
30
+ todo.concat(extracted - done - todo)
36
31
  end
37
32
  end
38
33
 
@@ -40,23 +35,29 @@ module HttpSpell
40
35
 
41
36
  def links(uri)
42
37
  # We are using open-uri, which follows redirects and also provides the content-type.
43
- response = URI(uri).read
44
- return [] unless response.content_type == 'text/html'
38
+ response = open(uri).read
39
+
40
+ if response.respond_to?(:content_type)
41
+ return [] unless response.content_type == 'text/html'
42
+ end
43
+
45
44
  doc = Nokogiri::HTML(response)
46
45
 
47
46
  links = doc.css('a[href]').map do |e|
48
47
  link = Addressable::URI.parse(e['href'])
49
48
  link = uri.join(link) if link.relative?
50
- next unless link.to_s.start_with?(@base_url.to_s)
49
+ next unless @limit.match?(link.to_s)
50
+ # TODO Ignore same page links (some anchor)
51
51
  link
52
52
  rescue StandardError
53
- warn $ERROR_INFO
53
+ warn $ERROR_INFO.message
54
+ warn $ERROR_INFO.backtrace if @tracing
54
55
  end.compact
55
56
 
56
57
  yield uri, doc if block_given?
58
+
59
+ warn "Adding #{links.size} links from #{uri}" if @tracing
57
60
  links
58
61
  end
59
62
  end
60
- # rubocop:enable Metrics/AbcSize
61
- # rubocop:enable Metrics/MethodLength
62
63
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.0.0'
4
+ VERSION = '1.1.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-05-30 00:00:00.000000000 Z
11
+ date: 2018-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: aruba
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: bundler
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -176,7 +190,9 @@ extensions: []
176
190
  extra_rdoc_files: []
177
191
  files:
178
192
  - ".gitignore"
193
+ - ".rspec"
179
194
  - ".rubocop.yml"
195
+ - ".travis.yml"
180
196
  - Gemfile
181
197
  - Gemfile.lock
182
198
  - README.markdown