httpspell 1.4.0 → 1.5.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6245dacfbe0f2b1f7a95e95118a058e592f0fcbe88407b804e756bd41691054
4
- data.tar.gz: 731ddea385cdca5f40c14c10289583b1544b9055bd82015ce2c5beb3b9495b52
3
+ metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
4
+ data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
5
5
  SHA512:
6
- metadata.gz: 2295d29b287812f6f1330d4cc17a5eae9ab3a14e99bb9ab4d4215761d3146a3f27b8ddcb66331999156c68a1ac58e2ac33b47d7043b7a2771e56faca03eb7b62
7
- data.tar.gz: 6cbab0f82ae37c684a6bf4ad60e30a7b5a9921e5d90956bd4baa088e4190c9e8b2b8359e41c457dc6d594c962e43b13e3483046b0208f5bb5273d0928e74950f
6
+ metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
7
+ data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
data/.rubocop.yml CHANGED
@@ -1,7 +1,11 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-rspec
1
4
  AllCops:
2
5
  NewCops: enable
3
6
  TargetRubyVersion: 3.3
4
7
  Include:
8
+ - '**/*.rb'
5
9
  - '**/Gemfile'
6
10
  - '**/Rakefile'
7
11
  - '**/config.ru'
@@ -23,3 +27,19 @@ Layout/LineLength:
23
27
  Max: 160
24
28
  Style/Documentation:
25
29
  Enabled: false
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+ Metrics/MethodLength:
33
+ Enabled: false
34
+ Metrics/CyclomaticComplexity:
35
+ Enabled: false
36
+ Style/TrailingCommaInArrayLiteral:
37
+ Enabled: false
38
+ RSpec/ExampleWording:
39
+ Enabled: false
40
+ RSpec/InstanceVariable:
41
+ AssignmentOnly: true
42
+ RSpec/ExampleLength:
43
+ Max: 10
44
+ Metrics/PerceivedComplexity:
45
+ Max: 16
data/Gemfile.lock CHANGED
@@ -1,15 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.4.0)
5
- addressable
4
+ httpspell (1.5.0)
6
5
  nokogiri
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.8.6)
12
- public_suffix (>= 2.0.2, < 6.0)
13
10
  aruba (2.2.0)
14
11
  bundler (>= 1.17, < 3.0)
15
12
  contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
91
88
  nenv (~> 0.1)
92
89
  shellany (~> 0.0)
93
90
  parallel (1.24.0)
94
- parser (3.3.1.0)
91
+ parser (3.3.2.0)
95
92
  ast (~> 2.4.1)
96
93
  racc
97
94
  pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
100
97
  pry-byebug (3.10.1)
101
98
  byebug (~> 11.0)
102
99
  pry (>= 0.13, < 0.15)
103
- public_suffix (5.0.5)
104
100
  racc (1.8.0)
105
101
  rack (3.0.11)
106
102
  rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
127
123
  diff-lcs (>= 1.2.0, < 2.0)
128
124
  rspec-support (~> 3.13.0)
129
125
  rspec-support (3.13.1)
130
- rubocop (1.64.0)
126
+ rubocop (1.64.1)
131
127
  json (~> 2.3)
132
128
  language_server-protocol (>= 3.17.0)
133
129
  parallel (~> 1.10)
data/TODO.markdown CHANGED
@@ -1,5 +1,4 @@
1
1
  * Bail out if lang cannot be inferred and is not given on cmdline
2
2
  * exe/httpspell: # TODO: --recursive, defaults to false
3
3
  * exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
4
- * lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
5
- * lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
4
+ * lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
data/exe/httpspell CHANGED
@@ -2,16 +2,16 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'optparse'
5
- require 'httpspell/spider'
6
- require 'httpspell/spellchecker'
7
- require 'httpspell/version'
5
+ require 'http_spell/spider'
6
+ require 'http_spell/spellchecker'
7
+ require 'http_spell/version'
8
8
 
9
9
  personal_dictionary_path = nil
10
10
  force_language = nil
11
11
  tracing = nil
12
12
  verbose = nil
13
- whitelist = nil
14
- blacklist = []
13
+ included = nil
14
+ excluded = []
15
15
 
16
16
  begin
17
17
  OptionParser.new do |parser|
@@ -29,9 +29,9 @@ begin
29
29
  force_language = l
30
30
  end
31
31
 
32
- parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
- whitelist ||= []
34
- whitelist << Regexp.new(w)
32
+ parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
+ included ||= []
34
+ included << Regexp.new(w)
35
35
  end
36
36
 
37
37
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +42,15 @@ begin
42
42
  verbose = true
43
43
  end
44
44
 
45
- parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
- blacklist << Regexp.new(b)
45
+ parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
46
+ excluded << Regexp.new(b)
47
47
  end
48
48
 
49
49
  # TODO: --recursive, defaults to false
50
50
  # TODO wget has some additional options for recursive behavior that should be reviewed
51
51
  end.parse!
52
52
  rescue StandardError
53
- warn "Error - #{$ERROR_INFO}"
53
+ warn "Error: #{$ERROR_INFO}"
54
54
  exit 1
55
55
  end
56
56
 
@@ -59,13 +59,14 @@ if ARGV.size != 1
59
59
  exit 1
60
60
  end
61
61
 
62
- def check(doc, lang, personal_dictionary_path, verbose)
62
+ def check(url, doc, lang, personal_dictionary_path, verbose)
63
63
  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
64
64
 
65
65
  if unknown_words.empty?
66
- warn 'No unknown words.' if verbose
66
+ warn "#{url} (lang=#{lang}): No unknown words" if verbose
67
+ false
67
68
  else
68
- warn "#{unknown_words.size} unknown words:" if verbose
69
+ warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
69
70
  puts unknown_words
70
71
  true
71
72
  end
@@ -73,24 +74,23 @@ end
73
74
 
74
75
  has_unknown_words = false
75
76
 
76
- spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
77
+ spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
77
78
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
78
- warn "Checking #{url} as #{lang}" if verbose
79
79
 
80
80
  # Remove elements that are not to be spellchecked
81
81
  doc.css('pre').each(&:unlink)
82
82
  doc.css('code').each(&:unlink)
83
+ doc.css('iframe').each(&:unlink)
83
84
  doc.css('[spellcheck=false]').each(&:unlink)
84
85
 
85
86
  # Handle elements with a different lang attribute separately
86
87
  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
87
- warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
88
- has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
88
+ has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
89
89
  element.unlink
90
90
  end
91
91
 
92
92
  # Everything else
93
- has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
93
+ has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
94
94
  end
95
95
 
96
96
  exit 2 unless spider_success
data/httpspell.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  lib = File.expand_path('lib', __dir__)
4
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
- require 'httpspell/version'
5
+ require 'http_spell/version'
6
6
 
7
7
  Gem::Specification.new do |spec|
8
8
  spec.name = 'httpspell'
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
24
  spec.require_paths = ['lib']
25
25
 
26
- spec.add_dependency 'addressable'
27
26
  spec.add_dependency 'nokogiri'
28
27
  spec.metadata['rubygems_mfa_required'] = 'true'
29
28
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module HttpSpell
2
4
  class SpellChecker
3
5
  def initialize(personal_dictionary_path = nil, verbose: false)
@@ -11,11 +13,6 @@ module HttpSpell
11
13
  "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
12
14
  ]
13
15
 
14
- if @verbose
15
- warn "Piping the HTML document into the following chain of commands:"
16
- warn commands
17
- end
18
-
19
16
  Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
20
17
  stdin.puts(doc)
21
18
  stdin.close
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'uri'
5
+ require 'open-uri'
6
+ require 'open3'
7
+ require 'English'
8
+
9
+ module HttpSpell
10
+ class Spider
11
+ attr_reader :todo, :done
12
+
13
+ def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
14
+ @todo = []
15
+ @done = []
16
+ todo << URI(starting_point)
17
+ @included = included || [/^#{starting_point}/]
18
+ @excluded = excluded
19
+ @verbose = verbose
20
+ @tracing = tracing
21
+ end
22
+
23
+ def start
24
+ success = true
25
+
26
+ while todo.any?
27
+ url = todo.pop
28
+
29
+ begin
30
+ extracted = links(url) do |u, d|
31
+ yield u, d if block_given?
32
+ rescue StandardError
33
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
34
+ warn $ERROR_INFO.backtrace if @tracing
35
+ end
36
+
37
+ done.append(url)
38
+ new_links = (extracted - done - todo).uniq
39
+
40
+ if new_links.any?
41
+ warn "Adding #{new_links.size} new links found at #{url}" if @verbose
42
+ todo.concat(extracted - done - todo).uniq!
43
+ end
44
+ rescue StandardError
45
+ warn "Skipping #{url} because of #{$ERROR_INFO.message}"
46
+ warn $ERROR_INFO.backtrace if @tracing
47
+ success = false
48
+ end
49
+ end
50
+
51
+ success
52
+ end
53
+
54
+ private
55
+
56
+ def links(uri)
57
+ response = http_get(uri)
58
+
59
+ if response.respond_to?(:content_type) && response.content_type != 'text/html'
60
+ warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
61
+ return []
62
+ end
63
+
64
+ doc = Nokogiri::HTML(response)
65
+
66
+ links = doc.css('a[href]').map do |e|
67
+ next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
68
+
69
+ link = URI.join(response.base_uri, e['href'])
70
+ link.fragment = nil # Ignore fragment in links to other pages, too
71
+
72
+ if @included.none? { |re| re.match?(link.to_s) }
73
+ warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
74
+ next
75
+ end
76
+
77
+ if @excluded.any? { |re| re.match?(link.to_s) }
78
+ # TODO: Print _which_ entry of the excluded matches
79
+ warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
80
+ next
81
+ end
82
+
83
+ link
84
+ rescue StandardError
85
+ warn "Error: #{$ERROR_INFO}"
86
+ warn $ERROR_INFO.backtrace if @tracing
87
+ end.compact
88
+
89
+ yield response.base_uri, doc if block_given?
90
+
91
+ links
92
+ end
93
+
94
+ # https://twin.github.io/improving-open-uri/
95
+ def http_get(uri)
96
+ tries = 10
97
+ begin
98
+ URI.parse(uri).open(redirect: false)
99
+ rescue OpenURI::HTTPRedirect => e
100
+ uri = e.uri
101
+ retry if (tries -= 1).positive?
102
+ raise
103
+ end
104
+ end
105
+ end
106
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.4.0'
4
+ VERSION = '1.5.0'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-29 00:00:00.000000000 Z
11
+ date: 2024-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: addressable
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: nokogiri
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -63,9 +49,9 @@ files:
63
49
  - TODO.markdown
64
50
  - exe/httpspell
65
51
  - httpspell.gemspec
66
- - lib/httpspell/spellchecker.rb
67
- - lib/httpspell/spider.rb
68
- - lib/httpspell/version.rb
52
+ - lib/http_spell/spellchecker.rb
53
+ - lib/http_spell/spider.rb
54
+ - lib/http_spell/version.rb
69
55
  homepage:
70
56
  licenses:
71
57
  - MIT
@@ -1,100 +0,0 @@
1
- require 'nokogiri'
2
- require 'open-uri'
3
- require 'open3'
4
- require 'addressable/uri'
5
- require 'English'
6
-
7
- module HttpSpell
8
- class Spider
9
- attr_reader :todo, :done
10
-
11
- def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
12
- @todo = []
13
- @done = []
14
- todo << Addressable::URI.parse(starting_point)
15
- @whitelist = whitelist || [/^#{starting_point}/]
16
- @blacklist = blacklist
17
- @verbose = verbose
18
- @tracing = tracing
19
- end
20
-
21
- def start
22
- success = true
23
-
24
- while todo.any?
25
- url = todo.pop
26
-
27
- begin
28
- extracted = links(url) do |u, d|
29
- yield u, d if block_given?
30
- rescue
31
- warn "Callback error for #{url}: #{$ERROR_INFO}"
32
- warn $ERROR_INFO.backtrace if @tracing
33
- end
34
-
35
- done.append(url)
36
- todo.concat(extracted - done - todo)
37
- rescue StandardError
38
- warn "Skipping #{url} because of #{$ERROR_INFO.message}"
39
- warn $ERROR_INFO.backtrace if @tracing
40
- success = false
41
- end
42
- end
43
-
44
- return success
45
- end
46
-
47
- private
48
-
49
- def links(uri)
50
- response = http_get(uri)
51
-
52
- if response.respond_to?(:content_type) && response.content_type != 'text/html'
53
- warn "Skipping #{uri} because it is not HTML" if @verbose
54
- return []
55
- end
56
-
57
- doc = Nokogiri::HTML(response)
58
-
59
- links = doc.css('a[href]').map do |e|
60
- link = Addressable::URI.parse(e['href'])
61
- link = uri.join(link) if link.relative?
62
-
63
- if @whitelist.none? { |re| re.match?(link.to_s) }
64
- warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
65
- next
66
- end
67
-
68
- if @blacklist.any? { |re| re.match?(link.to_s) }
69
- # TODO Print _which_ entry of the blacklist matches
70
- warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
71
- next
72
- end
73
-
74
- # TODO Ignore same page links (some anchor)
75
- link
76
- rescue StandardError
77
- warn $ERROR_INFO.message
78
- warn $ERROR_INFO.backtrace if @tracing
79
- end.compact
80
-
81
- yield uri, doc if block_given?
82
-
83
- warn "Adding #{links.size} links from #{uri}" if @verbose
84
- links
85
- end
86
-
87
- # https://twin.github.io/improving-open-uri/
88
- def http_get(uri)
89
- tries = 10
90
-
91
- begin
92
- URI.open(uri, redirect: false)
93
- rescue OpenURI::HTTPRedirect => redirect
94
- uri = redirect.uri
95
- retry if (tries -= 1) > 0
96
- raise
97
- end
98
- end
99
- end
100
- end