httpspell 1.4.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +20 -0
- data/Gemfile.lock +3 -7
- data/TODO.markdown +1 -2
- data/exe/httpspell +19 -19
- data/httpspell.gemspec +1 -2
- data/lib/{httpspell → http_spell}/spellchecker.rb +2 -5
- data/lib/http_spell/spider.rb +106 -0
- data/lib/{httpspell → http_spell}/version.rb +1 -1
- metadata +5 -19
- data/lib/httpspell/spider.rb +0 -100
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
|
4
|
+
data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
|
7
|
+
data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
|
data/.rubocop.yml
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rake
|
3
|
+
- rubocop-rspec
|
1
4
|
AllCops:
|
2
5
|
NewCops: enable
|
3
6
|
TargetRubyVersion: 3.3
|
4
7
|
Include:
|
8
|
+
- '**/*.rb'
|
5
9
|
- '**/Gemfile'
|
6
10
|
- '**/Rakefile'
|
7
11
|
- '**/config.ru'
|
@@ -23,3 +27,19 @@ Layout/LineLength:
|
|
23
27
|
Max: 160
|
24
28
|
Style/Documentation:
|
25
29
|
Enabled: false
|
30
|
+
Metrics/AbcSize:
|
31
|
+
Enabled: false
|
32
|
+
Metrics/MethodLength:
|
33
|
+
Enabled: false
|
34
|
+
Metrics/CyclomaticComplexity:
|
35
|
+
Enabled: false
|
36
|
+
Style/TrailingCommaInArrayLiteral:
|
37
|
+
Enabled: false
|
38
|
+
RSpec/ExampleWording:
|
39
|
+
Enabled: false
|
40
|
+
RSpec/InstanceVariable:
|
41
|
+
AssignmentOnly: true
|
42
|
+
RSpec/ExampleLength:
|
43
|
+
Max: 10
|
44
|
+
Metrics/PerceivedComplexity:
|
45
|
+
Max: 16
|
data/Gemfile.lock
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
httpspell (1.4.0)
|
5
|
-
addressable
|
4
|
+
httpspell (1.5.0)
|
6
5
|
nokogiri
|
7
6
|
|
8
7
|
GEM
|
9
8
|
remote: https://rubygems.org/
|
10
9
|
specs:
|
11
|
-
addressable (2.8.6)
|
12
|
-
public_suffix (>= 2.0.2, < 6.0)
|
13
10
|
aruba (2.2.0)
|
14
11
|
bundler (>= 1.17, < 3.0)
|
15
12
|
contracts (>= 0.16.0, < 0.18.0)
|
@@ -91,7 +88,7 @@ GEM
|
|
91
88
|
nenv (~> 0.1)
|
92
89
|
shellany (~> 0.0)
|
93
90
|
parallel (1.24.0)
|
94
|
-
parser (3.3.
|
91
|
+
parser (3.3.2.0)
|
95
92
|
ast (~> 2.4.1)
|
96
93
|
racc
|
97
94
|
pry (0.14.2)
|
@@ -100,7 +97,6 @@ GEM
|
|
100
97
|
pry-byebug (3.10.1)
|
101
98
|
byebug (~> 11.0)
|
102
99
|
pry (>= 0.13, < 0.15)
|
103
|
-
public_suffix (5.0.5)
|
104
100
|
racc (1.8.0)
|
105
101
|
rack (3.0.11)
|
106
102
|
rackup (0.2.3)
|
@@ -127,7 +123,7 @@ GEM
|
|
127
123
|
diff-lcs (>= 1.2.0, < 2.0)
|
128
124
|
rspec-support (~> 3.13.0)
|
129
125
|
rspec-support (3.13.1)
|
130
|
-
rubocop (1.64.
|
126
|
+
rubocop (1.64.1)
|
131
127
|
json (~> 2.3)
|
132
128
|
language_server-protocol (>= 3.17.0)
|
133
129
|
parallel (~> 1.10)
|
data/TODO.markdown
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
* Bail out if lang cannot be inferred and is not given on cmdline
|
2
2
|
* exe/httpspell: # TODO: --recursive, defaults to false
|
3
3
|
* exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
|
4
|
-
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
|
5
|
-
* lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
|
4
|
+
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
|
data/exe/httpspell
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require 'optparse'
|
5
|
-
require 'httpspell/spider'
|
6
|
-
require 'httpspell/spellchecker'
|
7
|
-
require 'httpspell/version'
|
5
|
+
require 'http_spell/spider'
|
6
|
+
require 'http_spell/spellchecker'
|
7
|
+
require 'http_spell/version'
|
8
8
|
|
9
9
|
personal_dictionary_path = nil
|
10
10
|
force_language = nil
|
11
11
|
tracing = nil
|
12
12
|
verbose = nil
|
13
|
-
|
14
|
-
|
13
|
+
included = nil
|
14
|
+
excluded = []
|
15
15
|
|
16
16
|
begin
|
17
17
|
OptionParser.new do |parser|
|
@@ -29,9 +29,9 @@ begin
|
|
29
29
|
force_language = l
|
30
30
|
end
|
31
31
|
|
32
|
-
parser.on('-
|
33
|
-
|
34
|
-
|
32
|
+
parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
|
33
|
+
included ||= []
|
34
|
+
included << Regexp.new(w)
|
35
35
|
end
|
36
36
|
|
37
37
|
parser.on('-t', '--trace', 'enable error tracing') do
|
@@ -42,15 +42,15 @@ begin
|
|
42
42
|
verbose = true
|
43
43
|
end
|
44
44
|
|
45
|
-
parser.on('-
|
46
|
-
|
45
|
+
parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
|
46
|
+
excluded << Regexp.new(b)
|
47
47
|
end
|
48
48
|
|
49
49
|
# TODO: --recursive, defaults to false
|
50
50
|
# TODO wget has some additional options for recursive behavior that should be reviewed
|
51
51
|
end.parse!
|
52
52
|
rescue StandardError
|
53
|
-
warn "Error
|
53
|
+
warn "Error: #{$ERROR_INFO}"
|
54
54
|
exit 1
|
55
55
|
end
|
56
56
|
|
@@ -59,13 +59,14 @@ if ARGV.size != 1
|
|
59
59
|
exit 1
|
60
60
|
end
|
61
61
|
|
62
|
-
def check(doc, lang, personal_dictionary_path, verbose)
|
62
|
+
def check(url, doc, lang, personal_dictionary_path, verbose)
|
63
63
|
unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
|
64
64
|
|
65
65
|
if unknown_words.empty?
|
66
|
-
warn
|
66
|
+
warn "#{url} (lang=#{lang}): No unknown words" if verbose
|
67
|
+
false
|
67
68
|
else
|
68
|
-
warn "#{unknown_words.size} unknown words:" if verbose
|
69
|
+
warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
|
69
70
|
puts unknown_words
|
70
71
|
true
|
71
72
|
end
|
@@ -73,24 +74,23 @@ end
|
|
73
74
|
|
74
75
|
has_unknown_words = false
|
75
76
|
|
76
|
-
spider_success = HttpSpell::Spider.new(ARGV.first,
|
77
|
+
spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
|
77
78
|
lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
|
78
|
-
warn "Checking #{url} as #{lang}" if verbose
|
79
79
|
|
80
80
|
# Remove elements that are not to be spellchecked
|
81
81
|
doc.css('pre').each(&:unlink)
|
82
82
|
doc.css('code').each(&:unlink)
|
83
|
+
doc.css('iframe').each(&:unlink)
|
83
84
|
doc.css('[spellcheck=false]').each(&:unlink)
|
84
85
|
|
85
86
|
# Handle elements with a different lang attribute separately
|
86
87
|
doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
|
87
|
-
|
88
|
-
has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
|
88
|
+
has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
|
89
89
|
element.unlink
|
90
90
|
end
|
91
91
|
|
92
92
|
# Everything else
|
93
|
-
has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
|
93
|
+
has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
|
94
94
|
end
|
95
95
|
|
96
96
|
exit 2 unless spider_success
|
data/httpspell.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
lib = File.expand_path('lib', __dir__)
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
-
require 'httpspell/version'
|
5
|
+
require 'http_spell/version'
|
6
6
|
|
7
7
|
Gem::Specification.new do |spec|
|
8
8
|
spec.name = 'httpspell'
|
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
24
|
spec.require_paths = ['lib']
|
25
25
|
|
26
|
-
spec.add_dependency 'addressable'
|
27
26
|
spec.add_dependency 'nokogiri'
|
28
27
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
29
28
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module HttpSpell
|
2
4
|
class SpellChecker
|
3
5
|
def initialize(personal_dictionary_path = nil, verbose: false)
|
@@ -11,11 +13,6 @@ module HttpSpell
|
|
11
13
|
"hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
|
12
14
|
]
|
13
15
|
|
14
|
-
if @verbose
|
15
|
-
warn "Piping the HTML document into the following chain of commands:"
|
16
|
-
warn commands
|
17
|
-
end
|
18
|
-
|
19
16
|
Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
|
20
17
|
stdin.puts(doc)
|
21
18
|
stdin.close
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'uri'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'open3'
|
7
|
+
require 'English'
|
8
|
+
|
9
|
+
module HttpSpell
|
10
|
+
class Spider
|
11
|
+
attr_reader :todo, :done
|
12
|
+
|
13
|
+
def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
|
14
|
+
@todo = []
|
15
|
+
@done = []
|
16
|
+
todo << URI(starting_point)
|
17
|
+
@included = included || [/^#{starting_point}/]
|
18
|
+
@excluded = excluded
|
19
|
+
@verbose = verbose
|
20
|
+
@tracing = tracing
|
21
|
+
end
|
22
|
+
|
23
|
+
def start
|
24
|
+
success = true
|
25
|
+
|
26
|
+
while todo.any?
|
27
|
+
url = todo.pop
|
28
|
+
|
29
|
+
begin
|
30
|
+
extracted = links(url) do |u, d|
|
31
|
+
yield u, d if block_given?
|
32
|
+
rescue StandardError
|
33
|
+
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
34
|
+
warn $ERROR_INFO.backtrace if @tracing
|
35
|
+
end
|
36
|
+
|
37
|
+
done.append(url)
|
38
|
+
new_links = (extracted - done - todo).uniq
|
39
|
+
|
40
|
+
if new_links.any?
|
41
|
+
warn "Adding #{new_links.size} new links found at #{url}" if @verbose
|
42
|
+
todo.concat(extracted - done - todo).uniq!
|
43
|
+
end
|
44
|
+
rescue StandardError
|
45
|
+
warn "Skipping #{url} because of #{$ERROR_INFO.message}"
|
46
|
+
warn $ERROR_INFO.backtrace if @tracing
|
47
|
+
success = false
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
success
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
def links(uri)
|
57
|
+
response = http_get(uri)
|
58
|
+
|
59
|
+
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
60
|
+
warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
|
61
|
+
return []
|
62
|
+
end
|
63
|
+
|
64
|
+
doc = Nokogiri::HTML(response)
|
65
|
+
|
66
|
+
links = doc.css('a[href]').map do |e|
|
67
|
+
next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
|
68
|
+
|
69
|
+
link = URI.join(response.base_uri, e['href'])
|
70
|
+
link.fragment = nil # Ignore fragment in links to other pages, too
|
71
|
+
|
72
|
+
if @included.none? { |re| re.match?(link.to_s) }
|
73
|
+
warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
|
74
|
+
next
|
75
|
+
end
|
76
|
+
|
77
|
+
if @excluded.any? { |re| re.match?(link.to_s) }
|
78
|
+
# TODO: Print _which_ entry of the excluded matches
|
79
|
+
warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
|
80
|
+
next
|
81
|
+
end
|
82
|
+
|
83
|
+
link
|
84
|
+
rescue StandardError
|
85
|
+
warn "Error: #{$ERROR_INFO}"
|
86
|
+
warn $ERROR_INFO.backtrace if @tracing
|
87
|
+
end.compact
|
88
|
+
|
89
|
+
yield response.base_uri, doc if block_given?
|
90
|
+
|
91
|
+
links
|
92
|
+
end
|
93
|
+
|
94
|
+
# https://twin.github.io/improving-open-uri/
|
95
|
+
def http_get(uri)
|
96
|
+
tries = 10
|
97
|
+
begin
|
98
|
+
URI.parse(uri).open(redirect: false)
|
99
|
+
rescue OpenURI::HTTPRedirect => e
|
100
|
+
uri = e.uri
|
101
|
+
retry if (tries -= 1).positive?
|
102
|
+
raise
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: httpspell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.0
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steffen Uhlig
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: addressable
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: nokogiri
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,9 +49,9 @@ files:
|
|
63
49
|
- TODO.markdown
|
64
50
|
- exe/httpspell
|
65
51
|
- httpspell.gemspec
|
66
|
-
- lib/httpspell/spellchecker.rb
|
67
|
-
- lib/httpspell/spider.rb
|
68
|
-
- lib/httpspell/version.rb
|
52
|
+
- lib/http_spell/spellchecker.rb
|
53
|
+
- lib/http_spell/spider.rb
|
54
|
+
- lib/http_spell/version.rb
|
69
55
|
homepage:
|
70
56
|
licenses:
|
71
57
|
- MIT
|
data/lib/httpspell/spider.rb
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'open3'
|
4
|
-
require 'addressable/uri'
|
5
|
-
require 'English'
|
6
|
-
|
7
|
-
module HttpSpell
|
8
|
-
class Spider
|
9
|
-
attr_reader :todo, :done
|
10
|
-
|
11
|
-
def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
|
12
|
-
@todo = []
|
13
|
-
@done = []
|
14
|
-
todo << Addressable::URI.parse(starting_point)
|
15
|
-
@whitelist = whitelist || [/^#{starting_point}/]
|
16
|
-
@blacklist = blacklist
|
17
|
-
@verbose = verbose
|
18
|
-
@tracing = tracing
|
19
|
-
end
|
20
|
-
|
21
|
-
def start
|
22
|
-
success = true
|
23
|
-
|
24
|
-
while todo.any?
|
25
|
-
url = todo.pop
|
26
|
-
|
27
|
-
begin
|
28
|
-
extracted = links(url) do |u, d|
|
29
|
-
yield u, d if block_given?
|
30
|
-
rescue
|
31
|
-
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
32
|
-
warn $ERROR_INFO.backtrace if @tracing
|
33
|
-
end
|
34
|
-
|
35
|
-
done.append(url)
|
36
|
-
todo.concat(extracted - done - todo)
|
37
|
-
rescue StandardError
|
38
|
-
warn "Skipping #{url} because of #{$ERROR_INFO.message}"
|
39
|
-
warn $ERROR_INFO.backtrace if @tracing
|
40
|
-
success = false
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
return success
|
45
|
-
end
|
46
|
-
|
47
|
-
private
|
48
|
-
|
49
|
-
def links(uri)
|
50
|
-
response = http_get(uri)
|
51
|
-
|
52
|
-
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
53
|
-
warn "Skipping #{uri} because it is not HTML" if @verbose
|
54
|
-
return []
|
55
|
-
end
|
56
|
-
|
57
|
-
doc = Nokogiri::HTML(response)
|
58
|
-
|
59
|
-
links = doc.css('a[href]').map do |e|
|
60
|
-
link = Addressable::URI.parse(e['href'])
|
61
|
-
link = uri.join(link) if link.relative?
|
62
|
-
|
63
|
-
if @whitelist.none? { |re| re.match?(link.to_s) }
|
64
|
-
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
|
65
|
-
next
|
66
|
-
end
|
67
|
-
|
68
|
-
if @blacklist.any? { |re| re.match?(link.to_s) }
|
69
|
-
# TODO Print _which_ entry of the blacklist matches
|
70
|
-
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
|
71
|
-
next
|
72
|
-
end
|
73
|
-
|
74
|
-
# TODO Ignore same page links (some anchor)
|
75
|
-
link
|
76
|
-
rescue StandardError
|
77
|
-
warn $ERROR_INFO.message
|
78
|
-
warn $ERROR_INFO.backtrace if @tracing
|
79
|
-
end.compact
|
80
|
-
|
81
|
-
yield uri, doc if block_given?
|
82
|
-
|
83
|
-
warn "Adding #{links.size} links from #{uri}" if @verbose
|
84
|
-
links
|
85
|
-
end
|
86
|
-
|
87
|
-
# https://twin.github.io/improving-open-uri/
|
88
|
-
def http_get(uri)
|
89
|
-
tries = 10
|
90
|
-
|
91
|
-
begin
|
92
|
-
URI.open(uri, redirect: false)
|
93
|
-
rescue OpenURI::HTTPRedirect => redirect
|
94
|
-
uri = redirect.uri
|
95
|
-
retry if (tries -= 1) > 0
|
96
|
-
raise
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|