httpspell 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +20 -0
- data/Gemfile.lock +3 -7
- data/TODO.markdown +1 -2
- data/exe/httpspell +19 -19
- data/httpspell.gemspec +1 -2
- data/lib/{httpspell → http_spell}/spellchecker.rb +2 -5
- data/lib/http_spell/spider.rb +106 -0
- data/lib/{httpspell → http_spell}/version.rb +1 -1
- metadata +5 -19
- data/lib/httpspell/spider.rb +0 -100
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d66cfcc88c0bc7e0e237033b8c76f1aaccc40f9aba3f68766d45204a2b133401
|
|
4
|
+
data.tar.gz: 6c488170f95d0f33fdcbc5c55f2416d654f2b4558214943f21d19e1220f2ad96
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f493b3411cd162e4a714203b05277f26810d71a0c23e52d69c36297c19e03db7b97692c1e4928c8f8fb0b9bc9a59f04b9ae4113c5cdd309c1edcf2a493d68687
|
|
7
|
+
data.tar.gz: ed44c8adf0dcd63330e8e7f837d9f515fd6131e58ae02808a01ed430ef42e5ee258a8c8d5a6fb9552c6b2c0dc460966453d3f1cda6e359739a18b6a01a25cdeb
|
data/.rubocop.yml
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
require:
|
|
2
|
+
- rubocop-rake
|
|
3
|
+
- rubocop-rspec
|
|
1
4
|
AllCops:
|
|
2
5
|
NewCops: enable
|
|
3
6
|
TargetRubyVersion: 3.3
|
|
4
7
|
Include:
|
|
8
|
+
- '**/*.rb'
|
|
5
9
|
- '**/Gemfile'
|
|
6
10
|
- '**/Rakefile'
|
|
7
11
|
- '**/config.ru'
|
|
@@ -23,3 +27,19 @@ Layout/LineLength:
|
|
|
23
27
|
Max: 160
|
|
24
28
|
Style/Documentation:
|
|
25
29
|
Enabled: false
|
|
30
|
+
Metrics/AbcSize:
|
|
31
|
+
Enabled: false
|
|
32
|
+
Metrics/MethodLength:
|
|
33
|
+
Enabled: false
|
|
34
|
+
Metrics/CyclomaticComplexity:
|
|
35
|
+
Enabled: false
|
|
36
|
+
Style/TrailingCommaInArrayLiteral:
|
|
37
|
+
Enabled: false
|
|
38
|
+
RSpec/ExampleWording:
|
|
39
|
+
Enabled: false
|
|
40
|
+
RSpec/InstanceVariable:
|
|
41
|
+
AssignmentOnly: true
|
|
42
|
+
RSpec/ExampleLength:
|
|
43
|
+
Max: 10
|
|
44
|
+
Metrics/PerceivedComplexity:
|
|
45
|
+
Max: 16
|
data/Gemfile.lock
CHANGED
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
httpspell (1.4.0)
|
|
5
|
-
addressable
|
|
4
|
+
httpspell (1.5.0)
|
|
6
5
|
nokogiri
|
|
7
6
|
|
|
8
7
|
GEM
|
|
9
8
|
remote: https://rubygems.org/
|
|
10
9
|
specs:
|
|
11
|
-
addressable (2.8.6)
|
|
12
|
-
public_suffix (>= 2.0.2, < 6.0)
|
|
13
10
|
aruba (2.2.0)
|
|
14
11
|
bundler (>= 1.17, < 3.0)
|
|
15
12
|
contracts (>= 0.16.0, < 0.18.0)
|
|
@@ -91,7 +88,7 @@ GEM
|
|
|
91
88
|
nenv (~> 0.1)
|
|
92
89
|
shellany (~> 0.0)
|
|
93
90
|
parallel (1.24.0)
|
|
94
|
-
parser (3.3.
|
|
91
|
+
parser (3.3.2.0)
|
|
95
92
|
ast (~> 2.4.1)
|
|
96
93
|
racc
|
|
97
94
|
pry (0.14.2)
|
|
@@ -100,7 +97,6 @@ GEM
|
|
|
100
97
|
pry-byebug (3.10.1)
|
|
101
98
|
byebug (~> 11.0)
|
|
102
99
|
pry (>= 0.13, < 0.15)
|
|
103
|
-
public_suffix (5.0.5)
|
|
104
100
|
racc (1.8.0)
|
|
105
101
|
rack (3.0.11)
|
|
106
102
|
rackup (0.2.3)
|
|
@@ -127,7 +123,7 @@ GEM
|
|
|
127
123
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
128
124
|
rspec-support (~> 3.13.0)
|
|
129
125
|
rspec-support (3.13.1)
|
|
130
|
-
rubocop (1.64.0)
|
|
126
|
+
rubocop (1.64.1)
|
|
131
127
|
json (~> 2.3)
|
|
132
128
|
language_server-protocol (>= 3.17.0)
|
|
133
129
|
parallel (~> 1.10)
|
data/TODO.markdown
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
* Bail out if lang cannot be inferred and is not given on cmdline
|
|
2
2
|
* exe/httpspell: # TODO: --recursive, defaults to false
|
|
3
3
|
* exe/httpspell: # TODO wget has some additional options for recursive behavior that should be reviewed
|
|
4
|
-
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the blacklist matches
|
|
5
|
-
* lib/httpspell/spider.rb: # TODO Ignore same page links (some anchor)
|
|
4
|
+
* lib/httpspell/spider.rb: # TODO Print _which_ entry of the exclude list matches
|
data/exe/httpspell
CHANGED
|
@@ -2,16 +2,16 @@
|
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
4
|
require 'optparse'
|
|
5
|
-
require 'httpspell/spider'
|
|
6
|
-
require 'httpspell/spellchecker'
|
|
7
|
-
require 'httpspell/version'
|
|
5
|
+
require 'http_spell/spider'
|
|
6
|
+
require 'http_spell/spellchecker'
|
|
7
|
+
require 'http_spell/version'
|
|
8
8
|
|
|
9
9
|
personal_dictionary_path = nil
|
|
10
10
|
force_language = nil
|
|
11
11
|
tracing = nil
|
|
12
12
|
verbose = nil
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
included = nil
|
|
14
|
+
excluded = []
|
|
15
15
|
|
|
16
16
|
begin
|
|
17
17
|
OptionParser.new do |parser|
|
|
@@ -29,9 +29,9 @@ begin
|
|
|
29
29
|
force_language = l
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
parser.on('-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
|
|
33
|
+
included ||= []
|
|
34
|
+
included << Regexp.new(w)
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
parser.on('-t', '--trace', 'enable error tracing') do
|
|
@@ -42,15 +42,15 @@ begin
|
|
|
42
42
|
verbose = true
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
parser.on('-
|
|
46
|
-
|
|
45
|
+
parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
|
|
46
|
+
excluded << Regexp.new(b)
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
# TODO: --recursive, defaults to false
|
|
50
50
|
# TODO wget has some additional options for recursive behavior that should be reviewed
|
|
51
51
|
end.parse!
|
|
52
52
|
rescue StandardError
|
|
53
|
-
warn "Error
|
|
53
|
+
warn "Error: #{$ERROR_INFO}"
|
|
54
54
|
exit 1
|
|
55
55
|
end
|
|
56
56
|
|
|
@@ -59,13 +59,14 @@ if ARGV.size != 1
|
|
|
59
59
|
exit 1
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
-
def check(doc, lang, personal_dictionary_path, verbose)
|
|
62
|
+
def check(url, doc, lang, personal_dictionary_path, verbose)
|
|
63
63
|
unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
|
|
64
64
|
|
|
65
65
|
if unknown_words.empty?
|
|
66
|
-
warn
|
|
66
|
+
warn "#{url} (lang=#{lang}): No unknown words" if verbose
|
|
67
|
+
false
|
|
67
68
|
else
|
|
68
|
-
warn "#{unknown_words.size} unknown words:" if verbose
|
|
69
|
+
warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
|
|
69
70
|
puts unknown_words
|
|
70
71
|
true
|
|
71
72
|
end
|
|
@@ -73,24 +74,23 @@ end
|
|
|
73
74
|
|
|
74
75
|
has_unknown_words = false
|
|
75
76
|
|
|
76
|
-
spider_success = HttpSpell::Spider.new(ARGV.first,
|
|
77
|
+
spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
|
|
77
78
|
lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
|
|
78
|
-
warn "Checking #{url} as #{lang}" if verbose
|
|
79
79
|
|
|
80
80
|
# Remove elements that are not to be spellchecked
|
|
81
81
|
doc.css('pre').each(&:unlink)
|
|
82
82
|
doc.css('code').each(&:unlink)
|
|
83
|
+
doc.css('iframe').each(&:unlink)
|
|
83
84
|
doc.css('[spellcheck=false]').each(&:unlink)
|
|
84
85
|
|
|
85
86
|
# Handle elements with a different lang attribute separately
|
|
86
87
|
doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
|
|
87
|
-
|
|
88
|
-
has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
|
|
88
|
+
has_unknown_words |= check("#{url} => #{element.name} with", element.to_s, element['lang'], personal_dictionary_path, verbose)
|
|
89
89
|
element.unlink
|
|
90
90
|
end
|
|
91
91
|
|
|
92
92
|
# Everything else
|
|
93
|
-
has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
|
|
93
|
+
has_unknown_words |= check("#{url} => document with", doc.to_s, lang, personal_dictionary_path, verbose)
|
|
94
94
|
end
|
|
95
95
|
|
|
96
96
|
exit 2 unless spider_success
|
data/httpspell.gemspec
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
lib = File.expand_path('lib', __dir__)
|
|
4
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
|
-
require 'httpspell/version'
|
|
5
|
+
require 'http_spell/version'
|
|
6
6
|
|
|
7
7
|
Gem::Specification.new do |spec|
|
|
8
8
|
spec.name = 'httpspell'
|
|
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
|
|
|
23
23
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
24
24
|
spec.require_paths = ['lib']
|
|
25
25
|
|
|
26
|
-
spec.add_dependency 'addressable'
|
|
27
26
|
spec.add_dependency 'nokogiri'
|
|
28
27
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
29
28
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module HttpSpell
|
|
2
4
|
class SpellChecker
|
|
3
5
|
def initialize(personal_dictionary_path = nil, verbose: false)
|
|
@@ -11,11 +13,6 @@ module HttpSpell
|
|
|
11
13
|
"hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
|
|
12
14
|
]
|
|
13
15
|
|
|
14
|
-
if @verbose
|
|
15
|
-
warn "Piping the HTML document into the following chain of commands:"
|
|
16
|
-
warn commands
|
|
17
|
-
end
|
|
18
|
-
|
|
19
16
|
Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
|
|
20
17
|
stdin.puts(doc)
|
|
21
18
|
stdin.close
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
require 'uri'
|
|
5
|
+
require 'open-uri'
|
|
6
|
+
require 'open3'
|
|
7
|
+
require 'English'
|
|
8
|
+
|
|
9
|
+
module HttpSpell
|
|
10
|
+
class Spider
|
|
11
|
+
attr_reader :todo, :done
|
|
12
|
+
|
|
13
|
+
def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
|
|
14
|
+
@todo = []
|
|
15
|
+
@done = []
|
|
16
|
+
todo << URI(starting_point)
|
|
17
|
+
@included = included || [/^#{starting_point}/]
|
|
18
|
+
@excluded = excluded
|
|
19
|
+
@verbose = verbose
|
|
20
|
+
@tracing = tracing
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def start
|
|
24
|
+
success = true
|
|
25
|
+
|
|
26
|
+
while todo.any?
|
|
27
|
+
url = todo.pop
|
|
28
|
+
|
|
29
|
+
begin
|
|
30
|
+
extracted = links(url) do |u, d|
|
|
31
|
+
yield u, d if block_given?
|
|
32
|
+
rescue StandardError
|
|
33
|
+
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
|
34
|
+
warn $ERROR_INFO.backtrace if @tracing
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
done.append(url)
|
|
38
|
+
new_links = (extracted - done - todo).uniq
|
|
39
|
+
|
|
40
|
+
if new_links.any?
|
|
41
|
+
warn "Adding #{new_links.size} new links found at #{url}" if @verbose
|
|
42
|
+
todo.concat(extracted - done - todo).uniq!
|
|
43
|
+
end
|
|
44
|
+
rescue StandardError
|
|
45
|
+
warn "Skipping #{url} because of #{$ERROR_INFO.message}"
|
|
46
|
+
warn $ERROR_INFO.backtrace if @tracing
|
|
47
|
+
success = false
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
success
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
private
|
|
55
|
+
|
|
56
|
+
def links(uri)
|
|
57
|
+
response = http_get(uri)
|
|
58
|
+
|
|
59
|
+
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
|
60
|
+
warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
|
|
61
|
+
return []
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
doc = Nokogiri::HTML(response)
|
|
65
|
+
|
|
66
|
+
links = doc.css('a[href]').map do |e|
|
|
67
|
+
next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
|
|
68
|
+
|
|
69
|
+
link = URI.join(response.base_uri, e['href'])
|
|
70
|
+
link.fragment = nil # Ignore fragment in links to other pages, too
|
|
71
|
+
|
|
72
|
+
if @included.none? { |re| re.match?(link.to_s) }
|
|
73
|
+
warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
|
|
74
|
+
next
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
if @excluded.any? { |re| re.match?(link.to_s) }
|
|
78
|
+
# TODO: Print _which_ entry of the excluded matches
|
|
79
|
+
warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
|
|
80
|
+
next
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
link
|
|
84
|
+
rescue StandardError
|
|
85
|
+
warn "Error: #{$ERROR_INFO}"
|
|
86
|
+
warn $ERROR_INFO.backtrace if @tracing
|
|
87
|
+
end.compact
|
|
88
|
+
|
|
89
|
+
yield response.base_uri, doc if block_given?
|
|
90
|
+
|
|
91
|
+
links
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# https://twin.github.io/improving-open-uri/
|
|
95
|
+
def http_get(uri)
|
|
96
|
+
tries = 10
|
|
97
|
+
begin
|
|
98
|
+
URI.parse(uri).open(redirect: false)
|
|
99
|
+
rescue OpenURI::HTTPRedirect => e
|
|
100
|
+
uri = e.uri
|
|
101
|
+
retry if (tries -= 1).positive?
|
|
102
|
+
raise
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
metadata
CHANGED
|
@@ -1,29 +1,15 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: httpspell
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.0
|
|
4
|
+
version: 1.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steffen Uhlig
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-05-
|
|
11
|
+
date: 2024-05-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
-
- !ruby/object:Gem::Dependency
|
|
14
|
-
name: addressable
|
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
|
16
|
-
requirements:
|
|
17
|
-
- - ">="
|
|
18
|
-
- !ruby/object:Gem::Version
|
|
19
|
-
version: '0'
|
|
20
|
-
type: :runtime
|
|
21
|
-
prerelease: false
|
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
-
requirements:
|
|
24
|
-
- - ">="
|
|
25
|
-
- !ruby/object:Gem::Version
|
|
26
|
-
version: '0'
|
|
27
13
|
- !ruby/object:Gem::Dependency
|
|
28
14
|
name: nokogiri
|
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -63,9 +49,9 @@ files:
|
|
|
63
49
|
- TODO.markdown
|
|
64
50
|
- exe/httpspell
|
|
65
51
|
- httpspell.gemspec
|
|
66
|
-
- lib/httpspell/spellchecker.rb
|
|
67
|
-
- lib/httpspell/spider.rb
|
|
68
|
-
- lib/httpspell/version.rb
|
|
52
|
+
- lib/http_spell/spellchecker.rb
|
|
53
|
+
- lib/http_spell/spider.rb
|
|
54
|
+
- lib/http_spell/version.rb
|
|
69
55
|
homepage:
|
|
70
56
|
licenses:
|
|
71
57
|
- MIT
|
data/lib/httpspell/spider.rb
DELETED
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
require 'nokogiri'
|
|
2
|
-
require 'open-uri'
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'addressable/uri'
|
|
5
|
-
require 'English'
|
|
6
|
-
|
|
7
|
-
module HttpSpell
|
|
8
|
-
class Spider
|
|
9
|
-
attr_reader :todo, :done
|
|
10
|
-
|
|
11
|
-
def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
|
|
12
|
-
@todo = []
|
|
13
|
-
@done = []
|
|
14
|
-
todo << Addressable::URI.parse(starting_point)
|
|
15
|
-
@whitelist = whitelist || [/^#{starting_point}/]
|
|
16
|
-
@blacklist = blacklist
|
|
17
|
-
@verbose = verbose
|
|
18
|
-
@tracing = tracing
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def start
|
|
22
|
-
success = true
|
|
23
|
-
|
|
24
|
-
while todo.any?
|
|
25
|
-
url = todo.pop
|
|
26
|
-
|
|
27
|
-
begin
|
|
28
|
-
extracted = links(url) do |u, d|
|
|
29
|
-
yield u, d if block_given?
|
|
30
|
-
rescue
|
|
31
|
-
warn "Callback error for #{url}: #{$ERROR_INFO}"
|
|
32
|
-
warn $ERROR_INFO.backtrace if @tracing
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
done.append(url)
|
|
36
|
-
todo.concat(extracted - done - todo)
|
|
37
|
-
rescue StandardError
|
|
38
|
-
warn "Skipping #{url} because of #{$ERROR_INFO.message}"
|
|
39
|
-
warn $ERROR_INFO.backtrace if @tracing
|
|
40
|
-
success = false
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
return success
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
private
|
|
48
|
-
|
|
49
|
-
def links(uri)
|
|
50
|
-
response = http_get(uri)
|
|
51
|
-
|
|
52
|
-
if response.respond_to?(:content_type) && response.content_type != 'text/html'
|
|
53
|
-
warn "Skipping #{uri} because it is not HTML" if @verbose
|
|
54
|
-
return []
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
doc = Nokogiri::HTML(response)
|
|
58
|
-
|
|
59
|
-
links = doc.css('a[href]').map do |e|
|
|
60
|
-
link = Addressable::URI.parse(e['href'])
|
|
61
|
-
link = uri.join(link) if link.relative?
|
|
62
|
-
|
|
63
|
-
if @whitelist.none? { |re| re.match?(link.to_s) }
|
|
64
|
-
warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
|
|
65
|
-
next
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
if @blacklist.any? { |re| re.match?(link.to_s) }
|
|
69
|
-
# TODO Print _which_ entry of the blacklist matches
|
|
70
|
-
warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
|
|
71
|
-
next
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# TODO Ignore same page links (some anchor)
|
|
75
|
-
link
|
|
76
|
-
rescue StandardError
|
|
77
|
-
warn $ERROR_INFO.message
|
|
78
|
-
warn $ERROR_INFO.backtrace if @tracing
|
|
79
|
-
end.compact
|
|
80
|
-
|
|
81
|
-
yield uri, doc if block_given?
|
|
82
|
-
|
|
83
|
-
warn "Adding #{links.size} links from #{uri}" if @verbose
|
|
84
|
-
links
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
# https://twin.github.io/improving-open-uri/
|
|
88
|
-
def http_get(uri)
|
|
89
|
-
tries = 10
|
|
90
|
-
|
|
91
|
-
begin
|
|
92
|
-
URI.open(uri, redirect: false)
|
|
93
|
-
rescue OpenURI::HTTPRedirect => redirect
|
|
94
|
-
uri = redirect.uri
|
|
95
|
-
retry if (tries -= 1) > 0
|
|
96
|
-
raise
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
end
|