httpspell 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6245dacfbe0f2b1f7a95e95118a058e592f0fcbe88407b804e756bd41691054
4
- data.tar.gz: 731ddea385cdca5f40c14c10289583b1544b9055bd82015ce2c5beb3b9495b52
3
+ metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
4
+ data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
5
5
  SHA512:
6
- metadata.gz: 2295d29b287812f6f1330d4cc17a5eae9ab3a14e99bb9ab4d4215761d3146a3f27b8ddcb66331999156c68a1ac58e2ac33b47d7043b7a2771e56faca03eb7b62
7
- data.tar.gz: 6cbab0f82ae37c684a6bf4ad60e30a7b5a9921e5d90956bd4baa088e4190c9e8b2b8359e41c457dc6d594c962e43b13e3483046b0208f5bb5273d0928e74950f
6
+ metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
7
+ data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
data/.rubocop.yml CHANGED
@@ -1,7 +1,11 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-rspec
1
4
  AllCops:
2
5
  NewCops: enable
3
6
  TargetRubyVersion: 3.3
4
7
  Include:
8
+ - '**/*.rb'
5
9
  - '**/Gemfile'
6
10
  - '**/Rakefile'
7
11
  - '**/config.ru'
@@ -23,3 +27,19 @@ Layout/LineLength:
23
27
  Max: 160
24
28
  Style/Documentation:
25
29
  Enabled: false
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+ Metrics/MethodLength:
33
+ Enabled: false
34
+ Metrics/CyclomaticComplexity:
35
+ Enabled: false
36
+ Style/TrailingCommaInArrayLiteral:
37
+ Enabled: false
38
+ RSpec/ExampleWording:
39
+ Enabled: false
40
+ RSpec/InstanceVariable:
41
+ AssignmentOnly: true
42
+ RSpec/ExampleLength:
43
+ Max: 10
44
+ Metrics/PerceivedComplexity:
45
+ Max: 16
data/Gemfile.lock CHANGED
@@ -1,15 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- httpspell (1.4.0)
5
- addressable
4
+ httpspell (1.5.0)
6
5
  nokogiri
7
6
 
8
7
  GEM
9
8
  remote: https://rubygems.org/
10
9
  specs:
11
- addressable (2.8.6)
12
- public_suffix (>= 2.0.2, < 6.0)
13
10
  aruba (2.2.0)
14
11
  bundler (>= 1.17, < 3.0)
15
12
  contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
91
88
  nenv (~> 0.1)
92
89
  shellany (~> 0.0)
93
90
  parallel (1.24.0)
94
- parser (3.3.1.0)
91
+ parser (3.3.2.0)
95
92
  ast (~> 2.4.1)
96
93
  racc
97
94
  pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
100
97
  pry-byebug (3.10.1)
101
98
  byebug (~> 11.0)
102
99
  pry (>= 0.13, < 0.15)
103
- public_suffix (5.0.5)
104
100
  racc (1.8.0)
105
101
  rack (3.0.11)
106
102
  rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
127
123
  diff-lcs (>= 1.2.0, < 2.0)
128
124
  rspec-support (~> 3.13.0)
129
125
  rspec-support (3.13.1)
130
- rubocop (1.64.0)
126
+ rubocop (1.64.1)
131
127
  json (~> 2.3)
132
128
  language_server-protocol (>= 3.17.0)
133
129
  parallel (~> 1.10)
data/TODO.markdown CHANGED
@@ -1,5 +1,4 @@
1
1
  * Bail out if lang cannot be inferred and is not given on cmdline
2
2
  * exe/httpspell: # TODO: --recursive, defaults to false
3
3
  * exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
4
- * lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
5
- * lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
4
+ * lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
data/exe/httpspell CHANGED
@@ -2,16 +2,16 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'optparse'
5
- require 'httpspell/spider'
6
- require 'httpspell/spellchecker'
7
- require 'httpspell/version'
5
+ require 'http_spell/spider'
6
+ require 'http_spell/spellchecker'
7
+ require 'http_spell/version'
8
8
 
9
9
  personal_dictionary_path = nil
10
10
  force_language = nil
11
11
  tracing = nil
12
12
  verbose = nil
13
- whitelist = nil
14
- blacklist = []
13
+ included = nil
14
+ excluded = []
15
15
 
16
16
  begin
17
17
  OptionParser.new do |parser|
@@ -29,9 +29,9 @@ begin
29
29
  force_language = l
30
30
  end
31
31
 
32
- parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
- whitelist ||= []
34
- whitelist << Regexp.new(w)
32
+ parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
33
+ included ||= []
34
+ included << Regexp.new(w)
35
35
  end
36
36
 
37
37
  parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +42,15 @@ begin
42
42
  verbose = true
43
43
  end
44
44
 
45
- parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
46
- blacklist << Regexp.new(b)
45
+ parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
46
+ excluded << Regexp.new(b)
47
47
  end
48
48
 
49
49
  # TODO: --recursive, defaults to false
50
50
  # TODO wget has some additional options for recursive behavior that should be reviewed
51
51
  end.parse!
52
52
  rescue StandardError
53
- warn "Error - #{$ERROR_INFO}"
53
+ warn "Error: #{$ERROR_INFO}"
54
54
  exit 1
55
55
  end
56
56
 
@@ -59,13 +59,14 @@ if ARGV.size != 1
59
59
  exit 1
60
60
  end
61
61
 
62
- def check(doc, lang, personal_dictionary_path, verbose)
62
+ def check(url, doc, lang, personal_dictionary_path, verbose)
63
63
  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
64
64
 
65
65
  if unknown_words.empty?
66
- warn 'No unknown words.' if verbose
66
+ warn "#{url} (lang=#{lang}): No unknown words" if verbose
67
+ false
67
68
  else
68
- warn "#{unknown_words.size} unknown words:" if verbose
69
+ warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
69
70
  puts unknown_words
70
71
  true
71
72
  end
@@ -73,24 +74,23 @@ end
73
74
 
74
75
  has_unknown_words = false
75
76
 
76
- spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
77
+ spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
77
78
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
78
- warn "Checking #{url} as #{lang}" if verbose
79
79
 
80
80
  # Remove elements that are not to be spellchecked
81
81
  doc.css('pre').each(&:unlink)
82
82
  doc.css('code').each(&:unlink)
83
+ doc.css('iframe').each(&:unlink)
83
84
  doc.css('[spellcheck=false]').each(&:unlink)
84
85
 
85
86
  # Handle elements with a different lang attribute separately
86
87
  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
87
- warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
88
- has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
88
+ has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
89
89
  element.unlink
90
90
  end
91
91
 
92
92
  # Everything else
93
- has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
93
+ has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
94
94
  end
95
95
 
96
96
  exit 2 unless spider_success
data/httpspell.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  lib = File.expand_path('lib', __dir__)
4
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
- require 'httpspell/version'
5
+ require 'http_spell/version'
6
6
 
7
7
  Gem::Specification.new do |spec|
8
8
  spec.name = 'httpspell'
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
24
  spec.require_paths = ['lib']
25
25
 
26
- spec.add_dependency 'addressable'
27
26
  spec.add_dependency 'nokogiri'
28
27
  spec.metadata['rubygems_mfa_required'] = 'true'
29
28
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module HttpSpell
2
4
  class SpellChecker
3
5
  def initialize(personal_dictionary_path = nil, verbose: false)
@@ -11,11 +13,6 @@ module HttpSpell
11
13
  "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
12
14
  ]
13
15
 
14
- if @verbose
15
- warn "Piping the HTML document into the following chain of commands:"
16
- warn commands
17
- end
18
-
19
16
  Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
20
17
  stdin.puts(doc)
21
18
  stdin.close
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'uri'
5
+ require 'open-uri'
6
+ require 'open3'
7
+ require 'English'
8
+
9
+ module HttpSpell
10
+ class Spider
11
+ attr_reader :todo, :done
12
+
13
+ def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
14
+ @todo = []
15
+ @done = []
16
+ todo << URI(starting_point)
17
+ @included = included || [/^#{starting_point}/]
18
+ @excluded = excluded
19
+ @verbose = verbose
20
+ @tracing = tracing
21
+ end
22
+
23
+ def start
24
+ success = true
25
+
26
+ while todo.any?
27
+ url = todo.pop
28
+
29
+ begin
30
+ extracted = links(url) do |u, d|
31
+ yield u, d if block_given?
32
+ rescue StandardError
33
+ warn "Callback error for #{url}: #{$ERROR_INFO}"
34
+ warn $ERROR_INFO.backtrace if @tracing
35
+ end
36
+
37
+ done.append(url)
38
+ new_links = (extracted - done - todo).uniq
39
+
40
+ if new_links.any?
41
+ warn "Adding #{new_links.size} new links found at #{url}" if @verbose
42
+ todo.concat(extracted - done - todo).uniq!
43
+ end
44
+ rescue StandardError
45
+ warn "Skipping #{url} because of #{$ERROR_INFO.message}"
46
+ warn $ERROR_INFO.backtrace if @tracing
47
+ success = false
48
+ end
49
+ end
50
+
51
+ success
52
+ end
53
+
54
+ private
55
+
56
+ def links(uri)
57
+ response = http_get(uri)
58
+
59
+ if response.respond_to?(:content_type) && response.content_type != 'text/html'
60
+ warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
61
+ return []
62
+ end
63
+
64
+ doc = Nokogiri::HTML(response)
65
+
66
+ links = doc.css('a[href]').map do |e|
67
+ next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
68
+
69
+ link = URI.join(response.base_uri, e['href'])
70
+ link.fragment = nil # Ignore fragment in links to other pages, too
71
+
72
+ if @included.none? { |re| re.match?(link.to_s) }
73
+ warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
74
+ next
75
+ end
76
+
77
+ if @excluded.any? { |re| re.match?(link.to_s) }
78
+ # TODO: Print _which_ entry of the excluded matches
79
+ warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
80
+ next
81
+ end
82
+
83
+ link
84
+ rescue StandardError
85
+ warn "Error: #{$ERROR_INFO}"
86
+ warn $ERROR_INFO.backtrace if @tracing
87
+ end.compact
88
+
89
+ yield response.base_uri, doc if block_given?
90
+
91
+ links
92
+ end
93
+
94
+ # https://twin.github.io/improving-open-uri/
95
+ def http_get(uri)
96
+ tries = 10
97
+ begin
98
+ URI.parse(uri).open(redirect: false)
99
+ rescue OpenURI::HTTPRedirect => e
100
+ uri = e.uri
101
+ retry if (tries -= 1).positive?
102
+ raise
103
+ end
104
+ end
105
+ end
106
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HttpSpell
4
- VERSION = '1.4.0'
4
+ VERSION = '1.5.0'
5
5
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: httpspell
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steffen Uhlig
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-29 00:00:00.000000000 Z
11
+ date: 2024-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: addressable
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: nokogiri
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -63,9 +49,9 @@ files:
63
49
  - TODO.markdown
64
50
  - exe/httpspell
65
51
  - httpspell.gemspec
66
- - lib/httpspell/spellchecker.rb
67
- - lib/httpspell/spider.rb
68
- - lib/httpspell/version.rb
52
+ - lib/http_spell/spellchecker.rb
53
+ - lib/http_spell/spider.rb
54
+ - lib/http_spell/version.rb
69
55
  homepage:
70
56
  licenses:
71
57
  - MIT
@@ -1,100 +0,0 @@
1
- require 'nokogiri'
2
- require 'open-uri'
3
- require 'open3'
4
- require 'addressable/uri'
5
- require 'English'
6
-
7
- module HttpSpell
8
- class Spider
9
- attr_reader :todo, :done
10
-
11
- def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
12
- @todo = []
13
- @done = []
14
- todo << Addressable::URI.parse(starting_point)
15
- @whitelist = whitelist || [/^#{starting_point}/]
16
- @blacklist = blacklist
17
- @verbose = verbose
18
- @tracing = tracing
19
- end
20
-
21
- def start
22
- success = true
23
-
24
- while todo.any?
25
- url = todo.pop
26
-
27
- begin
28
- extracted = links(url) do |u, d|
29
- yield u, d if block_given?
30
- rescue
31
- warn "Callback error for #{url}: #{$ERROR_INFO}"
32
- warn $ERROR_INFO.backtrace if @tracing
33
- end
34
-
35
- done.append(url)
36
- todo.concat(extracted - done - todo)
37
- rescue StandardError
38
- warn "Skipping #{url} because of #{$ERROR_INFO.message}"
39
- warn $ERROR_INFO.backtrace if @tracing
40
- success = false
41
- end
42
- end
43
-
44
- return success
45
- end
46
-
47
- private
48
-
49
- def links(uri)
50
- response = http_get(uri)
51
-
52
- if response.respond_to?(:content_type) && response.content_type != 'text/html'
53
- warn "Skipping #{uri} because it is not HTML" if @verbose
54
- return []
55
- end
56
-
57
- doc = Nokogiri::HTML(response)
58
-
59
- links = doc.css('a[href]').map do |e|
60
- link = Addressable::URI.parse(e['href'])
61
- link = uri.join(link) if link.relative?
62
-
63
- if @whitelist.none? { |re| re.match?(link.to_s) }
64
- warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
65
- next
66
- end
67
-
68
- if @blacklist.any? { |re| re.match?(link.to_s) }
69
- # TODO Print _which_ entry of the blacklist matches
70
- warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
71
- next
72
- end
73
-
74
- # TODO Ignore same page links (some anchor)
75
- link
76
- rescue StandardError
77
- warn $ERROR_INFO.message
78
- warn $ERROR_INFO.backtrace if @tracing
79
- end.compact
80
-
81
- yield uri, doc if block_given?
82
-
83
- warn "Adding #{links.size} links from #{uri}" if @verbose
84
- links
85
- end
86
-
87
- # https://twin.github.io/improving-open-uri/
88
- def http_get(uri)
89
- tries = 10
90
-
91
- begin
92
- URI.open(uri, redirect: false)
93
- rescue OpenURI::HTTPRedirect => redirect
94
- uri = redirect.uri
95
- retry if (tries -= 1) > 0
96
- raise
97
- end
98
- end
99
- end
100
- end