httpspell 1.4.1 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc09324c003c7b14e08fa255b7a31c0a9aeb143df033da9aea300619a47268ba
4
- data.tar.gz: 6890352a3cef38e243e2506398d58736c8179c2e0443a2b6ff341165e724dba0
3
+ metadata.gz: 509242695286e955675a85e15957752f1ac19eba7a5ffda317f6e45fd41c6c01
4
+ data.tar.gz: 4537ecafb9c882a23024c00246b0c1a07359d5180b2ee052d68a25ea23a64f6f
5
5
  SHA512:
6
- metadata.gz: 826bb8e875b2f1584dd5c052ab9777e616e1da0d6844263589b027c3eabfb07955155e0c43b8b1b8dc253d720eba952e80330c38035fff53fc1943420dea7454
7
- data.tar.gz: 7a4e3c9aaa586d4fbdc41971424cd5f064793ff18cba8d8606a452b3cee36070af44aa2f78ab307c71a613404cc1e490af1f56eca11068675183625f5360790e
6
+ metadata.gz: ddf6cb8856cf025e21956c49efe2d94c35204c273a086f60b6ae5e61c7bd56ec9fddda5ec8890f78c0ff106b03baba6ced6bfcf733f1e93622721ebf0b966a08
7
+ data.tar.gz: c217f2635966096b1ab86df7c52d6dd76145359d761b3cccc6829fde018760407e7207b7d6b89e3f709b8a966f82de93964656e64ad32474e4e8e3ad7cea43f8
data/Gemfile.lock CHANGED
@@ -1,15 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.4.1)
5
- addressable
4
+ httpspell (1.5.1)
6
5
  nokogiri
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.8.6)
12
- public_suffix (>= 2.0.2, < 6.0)
13
10
  aruba (2.2.0)
14
11
  bundler (>= 1.17, < 3.0)
15
12
  contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
91
88
  nenv (~> 0.1)
92
89
  shellany (~> 0.0)
93
90
  parallel (1.24.0)
94
- parser (3.3.1.0)
91
+ parser (3.3.2.0)
95
92
  ast (~> 2.4.1)
96
93
  racc
97
94
  pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
100
97
  pry-byebug (3.10.1)
101
98
  byebug (~> 11.0)
102
99
  pry (>= 0.13, < 0.15)
103
- public_suffix (5.0.5)
104
100
  racc (1.8.0)
105
101
  rack (3.0.11)
106
102
  rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
127
123
  diff-lcs (>= 1.2.0, < 2.0)
128
124
  rspec-support (~> 3.13.0)
129
125
  rspec-support (3.13.1)
130
- rubocop (1.64.0)
126
+ rubocop (1.64.1)
131
127
  json (~> 2.3)
132
128
  language_server-protocol (>= 3.17.0)
133
129
  parallel (~> 1.10)
data/README.markdown CHANGED
@@ -39,7 +39,7 @@ Words that are not in the dictionary for the given language (inferred from the `
39
39
 
40
40
  # Misc
41
41
 
42
- If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='false'` for an element is a simple as adding this line *after* the element (e.g. heading):
42
+ If you produce content with kramdown (e.g. using Jekyll), an [Inline Attribute List](https://kramdown.gettalong.org/syntax.html#inline-attribute-lists) can be used to set `spellcheck='false'` for an element by adding this line *after* the element (e.g. heading):
43
43
 
44
44
  ```
45
45
  {: spellcheck="false"}
data/TODO.markdown CHANGED
@@ -1,4 +1,4 @@
1
1
  * Bail out if lang cannot be inferred and is not given on cmdline
2
2
  * exe/httpspell: # TODO: --recursive, defaults to false
3
3
  * exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
4
- * lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
4
+ * lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
data/exe/httpspell CHANGED
@@ -7,13 +7,15 @@ require 'http_spell/spellchecker'
7
7
  require 'http_spell/version'
8
8
 
9
9
  personal_dictionary_path = nil
10
+ ignore_file_path = nil
10
11
  force_language = nil
11
12
  tracing = nil
12
13
  verbose = nil
13
- whitelist = nil
14
- blacklist = []
14
+ included = nil
15
+ excluded = []
15
16
 
16
17
  begin
18
+ # rubocop:disable Metrics/BlockLength
17
19
  OptionParser.new do |parser|
18
20
  parser.banner.prepend <<~BANNER
19
21
  Spellchecks a website via HTTP.
@@ -25,13 +27,17 @@ begin
25
27
  personal_dictionary_path = p
26
28
  end
27
29
 
30
+ parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |i|
31
+ ignore_file_path = i
32
+ end
33
+
28
34
  parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
29
35
  force_language = l
30
36
  end
31
37
 
32
- parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
- whitelist ||= []
34
- whitelist << Regexp.new(w)
38
+ parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
39
+ included ||= []
40
+ included << Regexp.new(w)
35
41
  end
36
42
 
37
43
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +48,16 @@ begin
42
48
  verbose = true
43
49
  end
44
50
 
45
- parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
- blacklist << Regexp.new(b)
51
+ parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
52
+ excluded << Regexp.new(b)
47
53
  end
48
54
 
49
55
  # TODO: --recursive, defaults to false
50
56
  # TODO wget has some additional options for recursive behavior that should be reviewed
51
57
  end.parse!
58
+ # rubocop:enable Metrics/BlockLength
52
59
  rescue StandardError
53
- warn "Error - #{$ERROR_INFO}"
60
+ warn "Error: #{$ERROR_INFO}"
54
61
  exit 1
55
62
  end
56
63
 
@@ -59,38 +66,51 @@ if ARGV.size != 1
59
66
  exit 1
60
67
  end
61
68
 
62
- def check(doc, lang, personal_dictionary_path, verbose)
63
- unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
69
+ # rubocop:disable Metrics/ParameterLists
70
+ def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
71
+ has_unknown_words = false
72
+
73
+ # Handle elements with a different lang attribute separately
74
+ doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
75
+ has_unknown_words |= check("#{url} => #{element.name} with", element, element['lang'], personal_dictionary_path, ignore_file_path, verbose)
76
+ element.unlink
77
+ end
78
+
79
+ unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc.to_s, lang)
80
+
81
+ if ignore_file_path && unknown_words.any?
82
+ ignore_words = File.read(ignore_file_path).lines.map(&:chomp)
83
+ ignored_words = unknown_words.intersection(ignore_words)
84
+
85
+ if ignored_words.any?
86
+ warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{ignored_words}" if verbose
87
+ unknown_words -= ignore_words
88
+ end
89
+ end
64
90
 
65
91
  if unknown_words.empty?
66
- warn 'No unknown words.' if verbose
92
+ warn "#{url} (lang=#{lang}): No unknown words" if verbose
93
+ has_unknown_words # no unknown words in doc, but maybe in elements with a different language
67
94
  else
68
- warn "#{unknown_words.size} unknown words:" if verbose
95
+ warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
69
96
  puts unknown_words
70
- true
97
+ true # regardless of what elements with a different language had, at least doc has unknown words
71
98
  end
72
99
  end
100
+ # rubocop:enable Metrics/ParameterLists
73
101
 
74
102
  has_unknown_words = false
75
103
 
76
- spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
104
+ spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
77
105
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
78
- warn "Checking #{url} as #{lang}" if verbose
79
106
 
80
107
  # Remove elements that are not to be spellchecked
81
108
  doc.css('pre').each(&:unlink)
82
109
  doc.css('code').each(&:unlink)
110
+ doc.css('iframe').each(&:unlink)
83
111
  doc.css('[spellcheck=false]').each(&:unlink)
84
112
 
85
- # Handle elements with a different lang attribute separately
86
- doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
87
- warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
88
- has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
89
- element.unlink
90
- end
91
-
92
- # Everything else
93
- has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
113
+ has_unknown_words |= check("#{url} => document with", doc, lang, personal_dictionary_path, ignore_file_path, verbose)
94
114
  end
95
115
 
96
116
  exit 2 unless spider_success
data/httpspell.gemspec CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
24
  spec.require_paths = ['lib']
25
25
 
26
- spec.add_dependency 'addressable'
27
26
  spec.add_dependency 'nokogiri'
28
27
  spec.metadata['rubygems_mfa_required'] = 'true'
29
28
  end
@@ -13,11 +13,6 @@ module HttpSpell
13
13
  "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
14
14
  ]
15
15
 
16
- if @verbose
17
- warn 'Piping the HTML document into the following chain of commands:'
18
- warn commands
19
- end
20
-
21
16
  Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
22
17
  stdin.puts(doc)
23
18
  stdin.close
@@ -1,21 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'nokogiri'
4
+ require 'uri'
4
5
  require 'open-uri'
5
6
  require 'open3'
6
- require 'addressable/uri'
7
7
  require 'English'
8
8
 
9
9
  module HttpSpell
10
10
  class Spider
11
11
  attr_reader :todo, :done
12
12
 
13
- def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
13
+ def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
14
14
  @todo = []
15
15
  @done = []
16
- todo << Addressable::URI.parse(starting_point)
17
- @whitelist = whitelist || [/^#{starting_point}/]
18
- @blacklist = blacklist
16
+ todo << URI(starting_point)
17
+ @included = included || [/^#{starting_point}/]
18
+ @excluded = excluded
19
19
  @verbose = verbose
20
20
  @tracing = tracing
21
21
  end
@@ -35,7 +35,12 @@ module HttpSpell
35
35
  end
36
36
 
37
37
  done.append(url)
38
- todo.concat(extracted - done - todo).uniq!
38
+ new_links = (extracted - done - todo).uniq
39
+
40
+ if new_links.any?
41
+ warn "Adding #{new_links.size} new links found at #{url}" if @verbose
42
+ todo.concat(extracted - done - todo).uniq!
43
+ end
39
44
  rescue StandardError
40
45
  warn "Skipping #{url} because of #{$ERROR_INFO.message}"
41
46
  warn $ERROR_INFO.backtrace if @tracing
@@ -52,46 +57,43 @@ module HttpSpell
52
57
  response = http_get(uri)
53
58
 
54
59
  if response.respond_to?(:content_type) && response.content_type != 'text/html'
55
- warn "Skipping #{uri} because it is not HTML" if @verbose
60
+ warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
56
61
  return []
57
62
  end
58
63
 
59
64
  doc = Nokogiri::HTML(response)
60
65
 
61
66
  links = doc.css('a[href]').map do |e|
62
- link = Addressable::URI.parse(e['href'])
63
- link = uri.join(link) if link.relative?
67
+ next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
68
+
69
+ link = URI.join(response.base_uri, e['href'])
70
+ link.fragment = nil # Ignore fragment in links to other pages, too
64
71
 
65
- if @whitelist.none? { |re| re.match?(link.to_s) }
66
- warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
72
+ if @included.none? { |re| re.match?(link.to_s) }
73
+ warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
67
74
  next
68
75
  end
69
76
 
70
- if @blacklist.any? { |re| re.match?(link.to_s) }
71
- # TODO: Print _which_ entry of the blacklist matches
72
- warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
77
+ if @excluded.any? { |re| re.match?(link.to_s) }
78
+ # TODO: Print _which_ entry of the excluded matches
79
+ warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
73
80
  next
74
81
  end
75
82
 
76
- # Ignore fragment; we always check the whole page
77
- link.fragment = nil
78
-
79
83
  link
80
84
  rescue StandardError
81
- warn $ERROR_INFO.message
85
+ warn "Error: #{$ERROR_INFO}"
82
86
  warn $ERROR_INFO.backtrace if @tracing
83
87
  end.compact
84
88
 
85
- yield uri, doc if block_given?
89
+ yield response.base_uri, doc if block_given?
86
90
 
87
- warn "Adding #{links.size} links from #{uri}" if @verbose
88
91
  links
89
92
  end
90
93
 
91
94
  # https://twin.github.io/improving-open-uri/
92
95
  def http_get(uri)
93
96
  tries = 10
94
-
95
97
  begin
96
98
  URI.parse(uri).open(redirect: false)
97
99
  rescue OpenURI::HTTPRedirect => e
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.4.1'
4
+ VERSION = '1.5.1'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.1
4
+ version: 1.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-30 00:00:00.000000000 Z
11
+ date: 2024-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: addressable
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: nokogiri
29
15
  requirement: !ruby/object:Gem::Requirement