httpspell 1.4.1 → 1.5.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc09324c003c7b14e08fa255b7a31c0a9aeb143df033da9aea300619a47268ba
4
- data.tar.gz: 6890352a3cef38e243e2506398d58736c8179c2e0443a2b6ff341165e724dba0
3
+ metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
4
+ data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
5
5
  SHA512:
6
- metadata.gz: 826bb8e875b2f1584dd5c052ab9777e616e1da0d6844263589b027c3eabfb07955155e0c43b8b1b8dc253d720eba952e80330c38035fff53fc1943420dea7454
7
- data.tar.gz: 7a4e3c9aaa586d4fbdc41971424cd5f064793ff18cba8d8606a452b3cee36070af44aa2f78ab307c71a613404cc1e490af1f56eca11068675183625f5360790e
6
+ metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
7
+ data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
data/Gemfile.lock CHANGED
@@ -1,15 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.4.1)
5
- addressable
4
+ httpspell (1.5.0)
6
5
  nokogiri
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.8.6)
12
- public_suffix (>= 2.0.2, < 6.0)
13
10
  aruba (2.2.0)
14
11
  bundler (>= 1.17, < 3.0)
15
12
  contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
91
88
  nenv (~> 0.1)
92
89
  shellany (~> 0.0)
93
90
  parallel (1.24.0)
94
- parser (3.3.1.0)
91
+ parser (3.3.2.0)
95
92
  ast (~> 2.4.1)
96
93
  racc
97
94
  pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
100
97
  pry-byebug (3.10.1)
101
98
  byebug (~> 11.0)
102
99
  pry (>= 0.13, < 0.15)
103
- public_suffix (5.0.5)
104
100
  racc (1.8.0)
105
101
  rack (3.0.11)
106
102
  rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
127
123
  diff-lcs (>= 1.2.0, < 2.0)
128
124
  rspec-support (~> 3.13.0)
129
125
  rspec-support (3.13.1)
130
- rubocop (1.64.0)
126
+ rubocop (1.64.1)
131
127
  json (~> 2.3)
132
128
  language_server-protocol (>= 3.17.0)
133
129
  parallel (~> 1.10)
data/TODO.markdown CHANGED
@@ -1,4 +1,4 @@
1
1
  * Bail out if lang cannot be inferred and is not given on cmdline
2
2
  * exe/httpspell: # TODO: --recursive, defaults to false
3
3
  * exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
4
- * lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
4
+ * lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
data/exe/httpspell CHANGED
@@ -10,8 +10,8 @@ personal_dictionary_path = nil
10
10
  force_language = nil
11
11
  tracing = nil
12
12
  verbose = nil
13
- whitelist = nil
14
- blacklist = []
13
+ included = nil
14
+ excluded = []
15
15
 
16
16
  begin
17
17
  OptionParser.new do |parser|
@@ -29,9 +29,9 @@ begin
29
29
  force_language = l
30
30
  end
31
31
 
32
- parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
- whitelist ||= []
34
- whitelist << Regexp.new(w)
32
+ parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
+ included ||= []
34
+ included << Regexp.new(w)
35
35
  end
36
36
 
37
37
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +42,15 @@ begin
42
42
  verbose = true
43
43
  end
44
44
 
45
- parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
- blacklist << Regexp.new(b)
45
+ parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
46
+ excluded << Regexp.new(b)
47
47
  end
48
48
 
49
49
  # TODO: --recursive, defaults to false
50
50
  # TODO wget has some additional options for recursive behavior that should be reviewed
51
51
  end.parse!
52
52
  rescue StandardError
53
- warn "Error - #{$ERROR_INFO}"
53
+ warn "Error: #{$ERROR_INFO}"
54
54
  exit 1
55
55
  end
56
56
 
@@ -59,13 +59,14 @@ if ARGV.size != 1
59
59
  exit 1
60
60
  end
61
61
 
62
- def check(doc, lang, personal_dictionary_path, verbose)
62
+ def check(url, doc, lang, personal_dictionary_path, verbose)
63
63
  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
64
64
 
65
65
  if unknown_words.empty?
66
- warn 'No unknown words.' if verbose
66
+ warn "#{url} (lang=#{lang}): No unknown words" if verbose
67
+ false
67
68
  else
68
- warn "#{unknown_words.size} unknown words:" if verbose
69
+ warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
69
70
  puts unknown_words
70
71
  true
71
72
  end
@@ -73,24 +74,23 @@ end
73
74
 
74
75
  has_unknown_words = false
75
76
 
76
- spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
77
+ spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
77
78
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
78
- warn "Checking #{url} as #{lang}" if verbose
79
79
 
80
80
  # Remove elements that are not to be spellchecked
81
81
  doc.css('pre').each(&:unlink)
82
82
  doc.css('code').each(&:unlink)
83
+ doc.css('iframe').each(&:unlink)
83
84
  doc.css('[spellcheck=false]').each(&:unlink)
84
85
 
85
86
  # Handle elements with a different lang attribute separately
86
87
  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
87
- warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
88
- has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
88
+ has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
89
89
  element.unlink
90
90
  end
91
91
 
92
92
  # Everything else
93
- has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
93
+ has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
94
94
  end
95
95
 
96
96
  exit 2 unless spider_success
data/httpspell.gemspec CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
24
  spec.require_paths = ['lib']
25
25
 
26
- spec.add_dependency 'addressable'
27
26
  spec.add_dependency 'nokogiri'
28
27
  spec.metadata['rubygems_mfa_required'] = 'true'
29
28
  end
@@ -13,11 +13,6 @@ module HttpSpell
13
13
  "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
14
14
  ]
15
15
 
16
- if @verbose
17
- warn 'Piping the HTML document into the following chain of commands:'
18
- warn commands
19
- end
20
-
21
16
  Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
22
17
  stdin.puts(doc)
23
18
  stdin.close
@@ -1,21 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'nokogiri'
4
+ require 'uri'
4
5
  require 'open-uri'
5
6
  require 'open3'
6
- require 'addressable/uri'
7
7
  require 'English'
8
8
 
9
9
  module HttpSpell
10
10
  class Spider
11
11
  attr_reader :todo, :done
12
12
 
13
- def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
13
+ def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
14
14
  @todo = []
15
15
  @done = []
16
- todo << Addressable::URI.parse(starting_point)
17
- @whitelist = whitelist || [/^#{starting_point}/]
18
- @blacklist = blacklist
16
+ todo << URI(starting_point)
17
+ @included = included || [/^#{starting_point}/]
18
+ @excluded = excluded
19
19
  @verbose = verbose
20
20
  @tracing = tracing
21
21
  end
@@ -35,7 +35,12 @@ module HttpSpell
35
35
  end
36
36
 
37
37
  done.append(url)
38
- todo.concat(extracted - done - todo).uniq!
38
+ new_links = (extracted - done - todo).uniq
39
+
40
+ if new_links.any?
41
+ warn "Adding #{new_links.size} new links found at #{url}" if @verbose
42
+ todo.concat(extracted - done - todo).uniq!
43
+ end
39
44
  rescue StandardError
40
45
  warn "Skipping #{url} because of #{$ERROR_INFO.message}"
41
46
  warn $ERROR_INFO.backtrace if @tracing
@@ -52,46 +57,43 @@ module HttpSpell
52
57
  response = http_get(uri)
53
58
 
54
59
  if response.respond_to?(:content_type) && response.content_type != 'text/html'
55
- warn "Skipping #{uri} because it is not HTML" if @verbose
60
+ warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
56
61
  return []
57
62
  end
58
63
 
59
64
  doc = Nokogiri::HTML(response)
60
65
 
61
66
  links = doc.css('a[href]').map do |e|
62
- link = Addressable::URI.parse(e['href'])
63
- link = uri.join(link) if link.relative?
67
+ next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
68
+
69
+ link = URI.join(response.base_uri, e['href'])
70
+ link.fragment = nil # Ignore fragment in links to other pages, too
64
71
 
65
- if @whitelist.none? { |re| re.match?(link.to_s) }
66
- warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
72
+ if @included.none? { |re| re.match?(link.to_s) }
73
+ warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
67
74
  next
68
75
  end
69
76
 
70
- if @blacklist.any? { |re| re.match?(link.to_s) }
71
- # TODO: Print _which_ entry of the blacklist matches
72
- warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
77
+ if @excluded.any? { |re| re.match?(link.to_s) }
78
+ # TODO: Print _which_ entry of the excluded matches
79
+ warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
73
80
  next
74
81
  end
75
82
 
76
- # Ignore fragment; we always check the whole page
77
- link.fragment = nil
78
-
79
83
  link
80
84
  rescue StandardError
81
- warn $ERROR_INFO.message
85
+ warn "Error: #{$ERROR_INFO}"
82
86
  warn $ERROR_INFO.backtrace if @tracing
83
87
  end.compact
84
88
 
85
- yield uri, doc if block_given?
89
+ yield response.base_uri, doc if block_given?
86
90
 
87
- warn "Adding #{links.size} links from #{uri}" if @verbose
88
91
  links
89
92
  end
90
93
 
91
94
  # https://twin.github.io/improving-open-uri/
92
95
  def http_get(uri)
93
96
  tries = 10
94
-
95
97
  begin
96
98
  URI.parse(uri).open(redirect: false)
97
99
  rescue OpenURI::HTTPRedirect => e
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.4.1'
4
+ VERSION = '1.5.0'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.1
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-30 00:00:00.000000000 Z
11
+ date: 2024-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: addressable
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: nokogiri
29
15
  requirement: !ruby/object:Gem::Requirement