httpspell 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -7
- data/TODO.markdown +1 -1
- data/exe/httpspell +16 -16
- data/httpspell.gemspec +0 -1
- data/lib/http_spell/spellchecker.rb +0 -5
- data/lib/http_spell/spider.rb +23 -21
- data/lib/http_spell/version.rb +1 -1
- metadata +2 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
|
4
|
+
data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
|
7
|
+
data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
|
data/Gemfile.lock
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
httpspell (1.4.1)
|
5
|
-
addressable
|
4
|
+
httpspell (1.5.0)
|
6
5
|
nokogiri
|
7
6
|
|
8
7
|
GEM
|
9
8
|
remote: https://rubygems.org/
|
10
9
|
specs:
|
11
|
-
addressable (2.8.6)
|
12
|
-
public_suffix (>= 2.0.2, < 6.0)
|
13
10
|
aruba (2.2.0)
|
14
11
|
bundler (>= 1.17, < 3.0)
|
15
12
|
contracts (>= 0.16.0, < 0.18.0)
|
@@ -91,7 +88,7 @@ GEM
|
|
91
88
|
nenv (~> 0.1)
|
92
89
|
shellany (~> 0.0)
|
93
90
|
parallel (1.24.0)
|
94
|
-
parser (3.3.
|
91
|
+
parser (3.3.2.0)
|
95
92
|
ast (~> 2.4.1)
|
96
93
|
racc
|
97
94
|
pry (0.14.2)
|
@@ -100,7 +97,6 @@ GEM
|
|
100
97
|
pry-byebug (3.10.1)
|
101
98
|
byebug (~> 11.0)
|
102
99
|
pry (>= 0.13, < 0.15)
|
103
|
-
public_suffix (5.0.5)
|
104
100
|
racc (1.8.0)
|
105
101
|
rack (3.0.11)
|
106
102
|
rackup (0.2.3)
|
@@ -127,7 +123,7 @@ GEM
|
|
127
123
|
diff-lcs (>= 1.2.0, < 2.0)
|
128
124
|
rspec-support (~> 3.13.0)
|
129
125
|
rspec-support (3.13.1)
|
130
|
-
rubocop (1.64.
|
126
|
+
rubocop (1.64.1)
|
131
127
|
json (~> 2.3)
|
132
128
|
language_server-protocol (>= 3.17.0)
|
133
129
|
parallel (~> 1.10)
|
data/TODO.markdown
CHANGED
@@ -1,4 +1,4 @@
|
|
1
1
|
* Bail out if lang cannot be inferred and is not given on cmdline
|
2
2
|
* exe/httpspell: # TODO: --recursive, defaults to false
|
3
3
|
* exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
|
4
|
-
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the
|
4
|
+
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
|
data/exe/httpspell
CHANGED
@@ -10,8 +10,8 @@ personal_dictionary_path = nil
|
|
10
10
|
force_language = nil
|
11
11
|
tracing = nil
|
12
12
|
verbose = nil
|
13
|
-
|
14
|
-
|
13
|
+
included = nil
|
14
|
+
excluded = []
|
15
15
|
|
16
16
|
begin
|
17
17
|
OptionParser.new do |parser|
|
@@ -29,9 +29,9 @@ begin
|
|
29
29
|
force_language = l
|
30
30
|
end
|
31
31
|
|
32
|
-
parser.on('-
|
33
|
-
|
34
|
-
|
32
|
+
parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
|
33
|
+
included ||= []
|
34
|
+
included << Regexp.new(w)
|
35
35
|
end
|
36
36
|
|
37
37
|
parser.on('-t', '--trace', 'enable error tracing') do
|
@@ -42,15 +42,15 @@ begin
|
|
42
42
|
verbose = true
|
43
43
|
end
|
44
44
|
|
45
|
-
parser.on('-
|
46
|
-
|
45
|
+
parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
|
46
|
+
excluded << Regexp.new(b)
|
47
47
|
end
|
48
48
|
|
49
49
|
# TODO: --recursive, defaults to false
|
50
50
|
# TODO wget has some additional options for recursive behavior that should be reviewed
|
51
51
|
end.parse!
|
52
52
|
rescue StandardError
|
53
|
-
warn "Error
|
53
|
+
warn "Error: #{$ERROR_INFO}"
|
54
54
|
exit 1
|
55
55
|
end
|
56
56
|
|
@@ -59,13 +59,14 @@ if ARGV.size != 1
|
|
59
59
|
exit 1
|
60
60
|
end
|
61
61
|
|
62
|
-
def check(doc, lang, personal_dictionary_path, verbose)
|
62
|
+
def check(url, doc, lang, personal_dictionary_path, verbose)
|
63
63
|
unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
|
64
64
|
|
65
65
|
if unknown_words.empty?
|
66
|
-
warn
|
66
|
+
warn "#{url} (lang=#{lang}): No unknown words" if verbose
|
67
|
+
false
|
67
68
|
else
|
68
|
-
warn "#{unknown_words.size} unknown words:" if verbose
|
69
|
+
warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
|
69
70
|
puts unknown_words
|
70
71
|
true
|
71
72
|
end
|
@@ -73,24 +74,23 @@ end
|
|
73
74
|
|
74
75
|
has_unknown_words = false
|
75
76
|
|
76
|
-
spider_success = HttpSpell::Spider.new(ARGV.first,
|
77
|
+
spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
|
77
78
|
lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
|
78
|
-
warn "Checking #{url} as #{lang}" if verbose
|
79
79
|
|
80
80
|
# Remove elements that are not to be spellchecked
|
81
81
|
doc.css('pre').each(&:unlink)
|
82
82
|
doc.css('code').each(&:unlink)
|
83
|
+
doc.css('iframe').each(&:unlink)
|
83
84
|
doc.css('[spellcheck=false]').each(&:unlink)
|
84
85
|
|
85
86
|
# Handle elements with a different lang attribute separately
|
86
87
|
doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
|
87
|
-
|
88
|
-
has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
|
88
|
+
has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
|
89
89
|
element.unlink
|
90
90
|
end
|
91
91
|
|
92
92
|
# Everything else
|
93
|
-
has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
|
93
|
+
has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
|
94
94
|
end
|
95
95
|
|
96
96
|
exit 2 unless spider_success
|
data/httpspell.gemspec
CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
24
|
spec.require_paths = ['lib']
|
25
25
|
|
26
|
-
spec.add_dependency 'addressable'
|
27
26
|
spec.add_dependency 'nokogiri'
|
28
27
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
29
28
|
end
|
@@ -13,11 +13,6 @@ module HttpSpell
|
|
13
13
|
"hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
|
14
14
|
]
|
15
15
|
|
16
|
-
if @verbose
|
17
|
-
warn 'Piping the HTML document into the following chain of commands:'
|
18
|
-
warn commands
|
19
|
-
end
|
20
|
-
|
21
16
|
Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
|
22
17
|
stdin.puts(doc)
|
23
18
|
stdin.close
|
data/lib/http_spell/spider.rb
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'nokogiri'
|
4
|
+
require 'uri'
|
4
5
|
require 'open-uri'
|
5
6
|
require 'open3'
|
6
|
-
require 'addressable/uri'
|
7
7
|
require 'English'
|
8
8
|
|
9
9
|
module HttpSpell
|
10
10
|
class Spider
|
11
11
|
attr_reader :todo, :done
|
12
12
|
|
13
|
-
def initialize(starting_point,
|
13
|
+
def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
|
14
14
|
@todo = []
|
15
15
|
@done = []
|
16
|
-
todo <<
|
17
|
-
@
|
18
|
-
@
|
16
|
+
todo << URI(starting_point)
|
17
|
+
@included = included || [/^#{starting_point}/]
|
18
|
+
@excluded = excluded
|
19
19
|
@verbose = verbose
|
20
20
|
@tracing = tracing
|
21
21
|
end
|
@@ -35,7 +35,12 @@ module HttpSpell
|
|
35
35
|
end
|
36
36
|
|
37
37
|
done.append(url)
|
38
|
-
|
38
|
+
new_links = (extracted - done - todo).uniq
|
39
|
+
|
40
|
+
if new_links.any?
|
41
|
+
warn "Adding #{new_links.size} new links found at #{url}" if @verbose
|
42
|
+
todo.concat(extracted - done - todo).uniq!
|
43
|
+
end
|
39
44
|
rescue StandardError
|
40
45
|
warn "Skipping #{url} because of #{$ERROR_INFO.message}"
|
41
46
|
warn $ERROR_INFO.backtrace if @tracing
|
@@ -52,46 +57,43 @@ module HttpSpell
|
|
52
57
|
response = http_get(uri)
|
53
58
|
|
54
59
|
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
55
|
-
warn "Skipping #{
|
60
|
+
warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
|
56
61
|
return []
|
57
62
|
end
|
58
63
|
|
59
64
|
doc = Nokogiri::HTML(response)
|
60
65
|
|
61
66
|
links = doc.css('a[href]').map do |e|
|
62
|
-
|
63
|
-
|
67
|
+
next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
|
68
|
+
|
69
|
+
link = URI.join(response.base_uri, e['href'])
|
70
|
+
link.fragment = nil # Ignore fragment in links to other pages, too
|
64
71
|
|
65
|
-
if @
|
66
|
-
warn "Skipping #{link} because it is not on the
|
72
|
+
if @included.none? { |re| re.match?(link.to_s) }
|
73
|
+
warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
|
67
74
|
next
|
68
75
|
end
|
69
76
|
|
70
|
-
if @
|
71
|
-
# TODO: Print _which_ entry of the
|
72
|
-
warn "Skipping #{link} because it is on the
|
77
|
+
if @excluded.any? { |re| re.match?(link.to_s) }
|
78
|
+
# TODO: Print _which_ entry of the excluded matches
|
79
|
+
warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
|
73
80
|
next
|
74
81
|
end
|
75
82
|
|
76
|
-
# Ignore fragment; we always check the whole page
|
77
|
-
link.fragment = nil
|
78
|
-
|
79
83
|
link
|
80
84
|
rescue StandardError
|
81
|
-
warn $ERROR_INFO
|
85
|
+
warn "Error: #{$ERROR_INFO}"
|
82
86
|
warn $ERROR_INFO.backtrace if @tracing
|
83
87
|
end.compact
|
84
88
|
|
85
|
-
yield
|
89
|
+
yield response.base_uri, doc if block_given?
|
86
90
|
|
87
|
-
warn "Adding #{links.size} links from #{uri}" if @verbose
|
88
91
|
links
|
89
92
|
end
|
90
93
|
|
91
94
|
# https://twin.github.io/improving-open-uri/
|
92
95
|
def http_get(uri)
|
93
96
|
tries = 10
|
94
|
-
|
95
97
|
begin
|
96
98
|
URI.parse(uri).open(redirect: false)
|
97
99
|
rescue OpenURI::HTTPRedirect => e
|
data/lib/http_spell/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: httpspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.1
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steffen Uhlig
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: addressable
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: nokogiri
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|