httpspell 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0fe73f8f1ff3740d6e3ae3af685d3554879a2819fc9d7803c994a21dd3694d91
4
- data.tar.gz: 8354f5c3bdc325a073310aa534a6171164d3dfbe7a1c4154f77737f20108eb91
3
+ metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
4
+ data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
5
5
  SHA512:
6
- metadata.gz: '073693d2520238d10012e4c02057c4966ab8af80f1c9db868e5ae2a4b95e4ae59a7d0989c162f62649aa0d2194290da0bca1ac5e1186f8ff3569cca581d571ae'
7
- data.tar.gz: 8eb778ffa3bcc1f56e8362d160117f695ec5f3ca146592219f4ef43a160ea28b96c67c5a6edeba52fbdc6dc3413b9f4967243fc60431a81861506d2c46435b7b
6
+ metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
7
+ data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
data/.rspec ADDED
@@ -0,0 +1,5 @@
1
+ --color
2
+ --format documentation
3
+ --tty
4
+ --order random
5
+ --require 'spec_helper'
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.5.1
4
+ before_install:
5
+ - sudo apt-get -qq update
6
+ - sudo apt-get install -y pandoc hunspell hunspell-en-us
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.0.0)
4
+ httpspell (1.1.0)
5
5
  addressable
6
6
  nokogiri
7
7
 
@@ -10,12 +10,41 @@ GEM
10
10
  specs:
11
11
  addressable (2.5.2)
12
12
  public_suffix (>= 2.0.2, < 4.0)
13
+ aruba (0.14.5)
14
+ childprocess (>= 0.6.3, < 0.10.0)
15
+ contracts (~> 0.9)
16
+ cucumber (>= 1.3.19)
17
+ ffi (~> 1.9.10)
18
+ rspec-expectations (>= 2.99)
19
+ thor (~> 0.19)
13
20
  ast (2.4.0)
21
+ backports (3.11.3)
22
+ builder (3.2.3)
14
23
  byebug (10.0.2)
24
+ childprocess (0.9.0)
25
+ ffi (~> 1.0, >= 1.0.11)
15
26
  coderay (1.1.2)
27
+ contracts (0.16.0)
28
+ cucumber (3.1.0)
29
+ builder (>= 2.1.2)
30
+ cucumber-core (~> 3.1.0)
31
+ cucumber-expressions (~> 5.0.4)
32
+ cucumber-wire (~> 0.0.1)
33
+ diff-lcs (~> 1.3)
34
+ gherkin (~> 5.0)
35
+ multi_json (>= 1.7.5, < 2.0)
36
+ multi_test (>= 0.1.2)
37
+ cucumber-core (3.1.0)
38
+ backports (>= 3.8.0)
39
+ cucumber-tag_expressions (~> 1.1.0)
40
+ gherkin (>= 5.0.0)
41
+ cucumber-expressions (5.0.18)
42
+ cucumber-tag_expressions (1.1.1)
43
+ cucumber-wire (0.0.1)
16
44
  diff-lcs (1.3)
17
45
  ffi (1.9.23)
18
46
  formatador (0.2.5)
47
+ gherkin (5.1.0)
19
48
  guard (2.14.2)
20
49
  formatador (>= 0.2.4)
21
50
  listen (>= 2.7, < 4.0)
@@ -41,6 +70,8 @@ GEM
41
70
  lumberjack (1.0.13)
42
71
  method_source (0.9.0)
43
72
  mini_portile2 (2.3.0)
73
+ multi_json (1.13.1)
74
+ multi_test (0.1.2)
44
75
  nenv (0.3.0)
45
76
  nokogiri (1.8.2)
46
77
  mini_portile2 (~> 2.3.0)
@@ -93,6 +124,7 @@ PLATFORMS
93
124
  ruby
94
125
 
95
126
  DEPENDENCIES
127
+ aruba
96
128
  bundler
97
129
  guard
98
130
  guard-bundler
data/README.markdown CHANGED
@@ -1,16 +1,43 @@
1
1
  # `httpspell`
2
2
 
3
+ [![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
4
+
3
5
  This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
4
6
 
5
7
  Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
6
8
 
9
+ # Usage
10
+
11
+ * The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
12
+
13
+ ```bash
14
+ $ httpspell https://example.com
15
+ ```
16
+
17
+ The exit code is `0`.
18
+
19
+ * The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as the document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
20
+
21
+ ```bash
22
+ $ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
23
+ suhlig
24
+ Permalink
25
+ httpspell
26
+ sloc
27
+ pandoc
28
+ hunspell
29
+ ...
30
+ ```
31
+
32
+ The exit code is `1`.
33
+
7
34
  # What is *not* checked
8
35
 
9
- * When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html`.
36
+ * When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless it is pointed at a file, in which case it accepts anything).
10
37
  * Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
11
38
  - `code`
12
39
  - `pre`
13
- - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
40
+ - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
14
41
 
15
42
  # Misc
16
43
 
data/Rakefile CHANGED
@@ -10,9 +10,12 @@ task default: ['spec:all']
10
10
 
11
11
  namespace :spec do
12
12
  desc 'Run all specs'
13
- task all: ['rubocop:auto_correct', :unit]
13
+ task all: %i[rubocop:auto_correct unit system]
14
14
 
15
- RSpec::Core::RakeTask.new(:unit) do |t|
16
- t.pattern = 'spec/unit/**/*_spec.rb'
15
+ %w[unit system].each do |type|
16
+ desc "Run #{type} tests"
17
+ RSpec::Core::RakeTask.new(type) do |t|
18
+ t.pattern = "spec/#{type}/**/*_spec.rb"
19
+ end
17
20
  end
18
21
  end
data/exe/httpspell CHANGED
@@ -4,8 +4,13 @@
4
4
  require 'optparse'
5
5
  require 'httpspell/spider'
6
6
  require 'httpspell/spellchecker'
7
+ require 'httpspell/version'
7
8
 
8
9
  personal_dictionary_path = nil
10
+ force_language = nil
11
+ tracing = nil
12
+ verbose = nil
13
+ limit = nil
9
14
 
10
15
  begin
11
16
  OptionParser.new do |parser|
@@ -13,11 +18,28 @@ begin
13
18
  Spellchecks a website via HTTP.
14
19
 
15
20
  BANNER
21
+ parser.version = HttpSpell::VERSION
16
22
 
17
23
  parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
18
24
  personal_dictionary_path = p
19
25
  end
20
26
 
27
+ parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
28
+ force_language = l
29
+ end
30
+
31
+ parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
32
+ limit = Regexp.new(l)
33
+ end
34
+
35
+ parser.on('-t', '--trace', 'enable error tracing') do
36
+ tracing = true
37
+ end
38
+
39
+ parser.on('-V', '--verbose', "explain what's happening") do
40
+ verbose = true
41
+ end
42
+
21
43
  # TODO: --recursive, defaults to false
22
44
  # TODO wget has some additional options for recursive behavior that should be reviewed
23
45
  end.parse!
@@ -32,20 +54,32 @@ if ARGV.size != 1
32
54
  end
33
55
 
34
56
  spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
57
+ has_unknown_words = false
35
58
 
36
- HttpSpell::Spider.new(ARGV.first).start do |url, doc|
37
- lang = doc.root['lang'] || 'de-DE'
59
+ begin
60
+ HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
61
+ lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
38
62
 
39
- # Remove sections that are not to be spellchecked
40
- doc.css('pre').each(&:unlink)
41
- doc.css('code').each(&:unlink)
42
- doc.css('[spellcheck=false]').each(&:unlink)
63
+ # Remove sections that are not to be spellchecked
64
+ doc.css('pre').each(&:unlink)
65
+ doc.css('code').each(&:unlink)
66
+ doc.css('[spellcheck=false]').each(&:unlink)
43
67
 
44
- # TODO: Find sections with a lang attribute and handle them separately
45
- unknown_words = spell_checker.check(doc.to_s, lang)
68
+ # TODO: Find sections with a lang attribute and handle them separately
69
+ unknown_words = spell_checker.check(doc.to_s, lang)
46
70
 
47
- unless unknown_words.empty?
48
- warn "#{unknown_words.size} unknown words at #{url}:"
49
- puts unknown_words
71
+ if unknown_words.empty?
72
+ warn "No unknown words (language is #{lang}) at #{url}." if verbose
73
+ else
74
+ warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
75
+ puts unknown_words
76
+ has_unknown_words = true
77
+ end
50
78
  end
79
+ rescue StandardError
80
+ warn $ERROR_INFO.message
81
+ warn $ERROR_INFO.backtrace if tracing
82
+ exit 2
51
83
  end
84
+
85
+ exit 1 if has_unknown_words
data/httpspell.gemspec CHANGED
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency 'addressable'
28
28
  spec.add_dependency 'nokogiri'
29
29
 
30
+ spec.add_development_dependency 'aruba'
30
31
  spec.add_development_dependency 'bundler'
31
32
  spec.add_development_dependency 'guard'
32
33
  spec.add_development_dependency 'guard-bundler'
data/lib/httpspell/spider.rb CHANGED
@@ -5,34 +5,29 @@ require 'addressable/uri'
5
5
  require 'English'
6
6
 
7
7
  module HttpSpell
8
- # rubocop:disable Metrics/AbcSize
9
- # rubocop:disable Metrics/MethodLength
10
8
  class Spider
11
9
  attr_reader :todo, :done
12
10
 
13
- def initialize(starting_point, base_url = starting_point)
11
+ def initialize(starting_point, limit: nil, tracing: false)
14
12
  @todo = []
15
13
  @done = []
16
14
  todo << Addressable::URI.parse(starting_point)
17
- @base_url = Addressable::URI.parse(base_url)
15
+ @limit = limit || /^#{starting_point}/
16
+ @tracing = tracing
18
17
  end
19
18
 
20
19
  def start
21
20
  while todo.any?
22
21
  url = todo.pop
23
-
24
- begin
25
- extracted = links(url) do |u, d|
26
- yield u, d if block_given?
27
- rescue
28
- warn "Callback error for #{url}: #{$ERROR_INFO}"
29
- end
30
-
31
- done.append(url)
32
- todo.concat(extracted - done - todo)
33
- rescue StandardError
34
- warn "Could not fetch #{url}: #{$ERROR_INFO}"
22
+ extracted = links(url) do |u, d|
23
+ yield u, d if block_given?
24
+ rescue
25
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
26
+ warn $ERROR_INFO.backtrace if @tracing
35
27
  end
28
+
29
+ done.append(url)
30
+ todo.concat(extracted - done - todo)
36
31
  end
37
32
  end
38
33
 
@@ -40,23 +35,29 @@ module HttpSpell
40
35
 
41
36
  def links(uri)
42
37
  # We are using open-uri, which follows redirects and also provides the content-type.
43
- response = URI(uri).read
44
- return [] unless response.content_type == 'text/html'
38
+ response = open(uri).read
39
+
40
+ if response.respond_to?(:content_type)
41
+ return [] unless response.content_type == 'text/html'
42
+ end
43
+
45
44
  doc = Nokogiri::HTML(response)
46
45
 
47
46
  links = doc.css('a[href]').map do |e|
48
47
  link = Addressable::URI.parse(e['href'])
49
48
  link = uri.join(link) if link.relative?
50
- next unless link.to_s.start_with?(@base_url.to_s)
49
+ next unless @limit.match?(link.to_s)
50
+ # TODO Ignore same page links (some anchor)
51
51
  link
52
52
  rescue StandardError
53
- warn $ERROR_INFO
53
+ warn $ERROR_INFO.message
54
+ warn $ERROR_INFO.backtrace if @tracing
54
55
  end.compact
55
56
 
56
57
  yield uri, doc if block_given?
58
+
59
+ warn "Adding #{links.size} links from #{uri}" if @tracing
57
60
  links
58
61
  end
59
62
  end
60
- # rubocop:enable Metrics/AbcSize
61
- # rubocop:enable Metrics/MethodLength
62
63
  end
data/lib/httpspell/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.0.0'
4
+ VERSION = '1.1.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-05-30 00:00:00.000000000 Z
11
+ date: 2018-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: aruba
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: bundler
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -176,7 +190,9 @@ extensions: []
176
190
  extra_rdoc_files: []
177
191
  files:
178
192
  - ".gitignore"
193
+ - ".rspec"
179
194
  - ".rubocop.yml"
195
+ - ".travis.yml"
180
196
  - Gemfile
181
197
  - Gemfile.lock
182
198
  - README.markdown