httpspell 1.4.1 → 1.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc09324c003c7b14e08fa255b7a31c0a9aeb143df033da9aea300619a47268ba
4
- data.tar.gz: 6890352a3cef38e243e2506398d58736c8179c2e0443a2b6ff341165e724dba0
3
+ metadata.gz: 509242695286e955675a85e15957752f1ac19eba7a5ffda317f6e45fd41c6c01
4
+ data.tar.gz: 4537ecafb9c882a23024c00246b0c1a07359d5180b2ee052d68a25ea23a64f6f
5
5
  SHA512:
6
- metadata.gz: 826bb8e875b2f1584dd5c052ab9777e616e1da0d6844263589b027c3eabfb07955155e0c43b8b1b8dc253d720eba952e80330c38035fff53fc1943420dea7454
7
- data.tar.gz: 7a4e3c9aaa586d4fbdc41971424cd5f064793ff18cba8d8606a452b3cee36070af44aa2f78ab307c71a613404cc1e490af1f56eca11068675183625f5360790e
6
+ metadata.gz: ddf6cb8856cf025e21956c49efe2d94c35204c273a086f60b6ae5e61c7bd56ec9fddda5ec8890f78c0ff106b03baba6ced6bfcf733f1e93622721ebf0b966a08
7
+ data.tar.gz: c217f2635966096b1ab86df7c52d6dd76145359d761b3cccc6829fde018760407e7207b7d6b89e3f709b8a966f82de93964656e64ad32474e4e8e3ad7cea43f8
data/Gemfile.lock CHANGED
@@ -1,15 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.4.1)
5
- addressable
4
+ httpspell (1.5.1)
6
5
  nokogiri
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.8.6)
12
- public_suffix (>= 2.0.2, < 6.0)
13
10
  aruba (2.2.0)
14
11
  bundler (>= 1.17, < 3.0)
15
12
  contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
91
88
  nenv (~> 0.1)
92
89
  shellany (~> 0.0)
93
90
  parallel (1.24.0)
94
- parser (3.3.1.0)
91
+ parser (3.3.2.0)
95
92
  ast (~> 2.4.1)
96
93
  racc
97
94
  pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
100
97
  pry-byebug (3.10.1)
101
98
  byebug (~> 11.0)
102
99
  pry (>= 0.13, < 0.15)
103
- public_suffix (5.0.5)
104
100
  racc (1.8.0)
105
101
  rack (3.0.11)
106
102
  rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
127
123
  diff-lcs (>= 1.2.0, < 2.0)
128
124
  rspec-support (~> 3.13.0)
129
125
  rspec-support (3.13.1)
130
- rubocop (1.64.0)
126
+ rubocop (1.64.1)
131
127
  json (~> 2.3)
132
128
  language_server-protocol (>= 3.17.0)
133
129
  parallel (~> 1.10)
data/README.markdown CHANGED
@@ -39,7 +39,7 @@ Words that are not in the dictionary for the given language (inferred from the `
39
39
 
40
40
  # Misc
41
41
 
42
- If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='false'` for an element is a simple as adding this line *after* the element (e.g. heading):
42
+ If you produce content with kramdown (e.g. using Jekyll), an [Inline Attribute List](https://kramdown.gettalong.org/syntax.html#inline-attribute-lists) can be used to set `spellcheck='false'` for an element by adding this line *after* the element (e.g. heading):
43
43
 
44
44
  ```
45
45
  {: spellcheck="false"}
data/TODO.markdown CHANGED
@@ -1,4 +1,4 @@
1
1
  * Bail out if lang cannot be inferred and is not given on cmdline
2
2
  * exe/httpspell: # TODO: --recursive, defaults to false
3
3
  * exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
4
- * lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
4
+ * lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
data/exe/httpspell CHANGED
@@ -7,13 +7,15 @@ require 'http_spell/spellchecker'
7
7
  require 'http_spell/version'
8
8
 
9
9
  personal_dictionary_path = nil
10
+ ignore_file_path = nil
10
11
  force_language = nil
11
12
  tracing = nil
12
13
  verbose = nil
13
- whitelist = nil
14
- blacklist = []
14
+ included = nil
15
+ excluded = []
15
16
 
16
17
  begin
18
+ # rubocop:disable Metrics/BlockLength
17
19
  OptionParser.new do |parser|
18
20
  parser.banner.prepend <<~BANNER
19
21
  Spellchecks a website via HTTP.
@@ -25,13 +27,17 @@ begin
25
27
  personal_dictionary_path = p
26
28
  end
27
29
 
30
+ parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |i|
31
+ ignore_file_path = i
32
+ end
33
+
28
34
  parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
29
35
  force_language = l
30
36
  end
31
37
 
32
- parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
- whitelist ||= []
34
- whitelist << Regexp.new(w)
38
+ parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
39
+ included ||= []
40
+ included << Regexp.new(w)
35
41
  end
36
42
 
37
43
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +48,16 @@ begin
42
48
  verbose = true
43
49
  end
44
50
 
45
- parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
- blacklist << Regexp.new(b)
51
+ parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
52
+ excluded << Regexp.new(b)
47
53
  end
48
54
 
49
55
  # TODO: --recursive, defaults to false
50
56
  # TODO wget has some additional options for recursive behavior that should be reviewed
51
57
  end.parse!
58
+ # rubocop:enable Metrics/BlockLength
52
59
  rescue StandardError
53
- warn "Error - #{$ERROR_INFO}"
60
+ warn "Error: #{$ERROR_INFO}"
54
61
  exit 1
55
62
  end
56
63
 
@@ -59,38 +66,51 @@ if ARGV.size != 1
59
66
  exit 1
60
67
  end
61
68
 
62
- def check(doc, lang, personal_dictionary_path, verbose)
63
- unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
69
+ # rubocop:disable Metrics/ParameterLists
70
+ def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
71
+ has_unknown_words = false
72
+
73
+ # Handle elements with a different lang attribute separately
74
+ doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
75
+ has_unknown_words |= check("#{url} => #{element.name} with", element, element['lang'], personal_dictionary_path, ignore_file_path, verbose)
76
+ element.unlink
77
+ end
78
+
79
+ unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc.to_s, lang)
80
+
81
+ if ignore_file_path && unknown_words.any?
82
+ ignore_words = File.read(ignore_file_path).lines.map(&:chomp)
83
+ ignored_words = unknown_words.intersection(ignore_words)
84
+
85
+ if ignored_words.any?
86
+ warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{ignored_words}" if verbose
87
+ unknown_words -= ignore_words
88
+ end
89
+ end
64
90
 
65
91
  if unknown_words.empty?
66
- warn 'No unknown words.' if verbose
92
+ warn "#{url} (lang=#{lang}): No unknown words" if verbose
93
+ has_unknown_words # no unknown words in doc, but maybe in elements with a different language
67
94
  else
68
- warn "#{unknown_words.size} unknown words:" if verbose
95
+ warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
69
96
  puts unknown_words
70
- true
97
+ true # regardless of what elements with a different language had, at least doc has unknown words
71
98
  end
72
99
  end
100
+ # rubocop:enable Metrics/ParameterLists
73
101
 
74
102
  has_unknown_words = false
75
103
 
76
- spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
104
+ spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
77
105
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
78
- warn "Checking #{url} as #{lang}" if verbose
79
106
 
80
107
  # Remove elements that are not to be spellchecked
81
108
  doc.css('pre').each(&:unlink)
82
109
  doc.css('code').each(&:unlink)
110
+ doc.css('iframe').each(&:unlink)
83
111
  doc.css('[spellcheck=false]').each(&:unlink)
84
112
 
85
- # Handle elements with a different lang attribute separately
86
- doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
87
- warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
88
- has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
89
- element.unlink
90
- end
91
-
92
- # Everything else
93
- has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
113
+ has_unknown_words |= check("#{url} => document with", doc, lang, personal_dictionary_path, ignore_file_path, verbose)
94
114
  end
95
115
 
96
116
  exit 2 unless spider_success
data/httpspell.gemspec CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
24
  spec.require_paths = ['lib']
25
25
 
26
- spec.add_dependency 'addressable'
27
26
  spec.add_dependency 'nokogiri'
28
27
  spec.metadata['rubygems_mfa_required'] = 'true'
29
28
  end
@@ -13,11 +13,6 @@ module HttpSpell
13
13
  "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
14
14
  ]
15
15
 
16
- if @verbose
17
- warn 'Piping the HTML document into the following chain of commands:'
18
- warn commands
19
- end
20
-
21
16
  Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
22
17
  stdin.puts(doc)
23
18
  stdin.close
@@ -1,21 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'nokogiri'
4
+ require 'uri'
4
5
  require 'open-uri'
5
6
  require 'open3'
6
- require 'addressable/uri'
7
7
  require 'English'
8
8
 
9
9
  module HttpSpell
10
10
  class Spider
11
11
  attr_reader :todo, :done
12
12
 
13
- def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
13
+ def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
14
14
  @todo = []
15
15
  @done = []
16
- todo << Addressable::URI.parse(starting_point)
17
- @whitelist = whitelist || [/^#{starting_point}/]
18
- @blacklist = blacklist
16
+ todo << URI(starting_point)
17
+ @included = included || [/^#{starting_point}/]
18
+ @excluded = excluded
19
19
  @verbose = verbose
20
20
  @tracing = tracing
21
21
  end
@@ -35,7 +35,12 @@ module HttpSpell
35
35
  end
36
36
 
37
37
  done.append(url)
38
- todo.concat(extracted - done - todo).uniq!
38
+ new_links = (extracted - done - todo).uniq
39
+
40
+ if new_links.any?
41
+ warn "Adding #{new_links.size} new links found at #{url}" if @verbose
42
+ todo.concat(extracted - done - todo).uniq!
43
+ end
39
44
  rescue StandardError
40
45
  warn "Skipping #{url} because of #{$ERROR_INFO.message}"
41
46
  warn $ERROR_INFO.backtrace if @tracing
@@ -52,46 +57,43 @@ module HttpSpell
52
57
  response = http_get(uri)
53
58
 
54
59
  if response.respond_to?(:content_type) && response.content_type != 'text/html'
55
- warn "Skipping #{uri} because it is not HTML" if @verbose
60
+ warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
56
61
  return []
57
62
  end
58
63
 
59
64
  doc = Nokogiri::HTML(response)
60
65
 
61
66
  links = doc.css('a[href]').map do |e|
62
- link = Addressable::URI.parse(e['href'])
63
- link = uri.join(link) if link.relative?
67
+ next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
68
+
69
+ link = URI.join(response.base_uri, e['href'])
70
+ link.fragment = nil # Ignore fragment in links to other pages, too
64
71
 
65
- if @whitelist.none? { |re| re.match?(link.to_s) }
66
- warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
72
+ if @included.none? { |re| re.match?(link.to_s) }
73
+ warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
67
74
  next
68
75
  end
69
76
 
70
- if @blacklist.any? { |re| re.match?(link.to_s) }
71
- # TODO: Print _which_ entry of the blacklist matches
72
- warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
77
+ if @excluded.any? { |re| re.match?(link.to_s) }
78
+ # TODO: Print _which_ entry of the excluded matches
79
+ warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
73
80
  next
74
81
  end
75
82
 
76
- # Ignore fragment; we always check the whole page
77
- link.fragment = nil
78
-
79
83
  link
80
84
  rescue StandardError
81
- warn $ERROR_INFO.message
85
+ warn "Error: #{$ERROR_INFO}"
82
86
  warn $ERROR_INFO.backtrace if @tracing
83
87
  end.compact
84
88
 
85
- yield uri, doc if block_given?
89
+ yield response.base_uri, doc if block_given?
86
90
 
87
- warn "Adding #{links.size} links from #{uri}" if @verbose
88
91
  links
89
92
  end
90
93
 
91
94
  # https://twin.github.io/improving-open-uri/
92
95
  def http_get(uri)
93
96
  tries = 10
94
-
95
97
  begin
96
98
  URI.parse(uri).open(redirect: false)
97
99
  rescue OpenURI::HTTPRedirect => e
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.4.1'
4
+ VERSION = '1.5.1'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.1
4
+ version: 1.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-30 00:00:00.000000000 Z
11
+ date: 2024-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: addressable
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: nokogiri
29
15
  requirement: !ruby/object:Gem::Requirement