metainspector 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +24 -6
- data/lib/meta_inspector/document.rb +3 -1
- data/lib/meta_inspector/request.rb +8 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/request_spec.rb +44 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 46f73f718a436065bc4353e279e997afebb0c43b
|
4
|
+
data.tar.gz: ed7aa01afb850aacca6ed942beb7ba9d5e596d3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 469d195a8b0f8fb5417bff510f3703b5cfde6ce5dba9866851e629f8c8d64dd73a680da7e3fb7614f556302ad1f12f7a59014df66b10ca1cf4f8cdead00ad67a
|
7
|
+
data.tar.gz: 0d7e3ae71b4beec707e293067c2f76db5698d399ea42071ef8845b551fedee2e7f400a62a066225572dc195ae21fc2504043419ef60b75ef0adf22ae6220034a
|
data/README.md
CHANGED
@@ -184,12 +184,30 @@ And the full scraped document is accessible from:
|
|
184
184
|
|
185
185
|
## Options
|
186
186
|
|
187
|
-
### Timeout
|
188
|
-
|
189
|
-
By default, MetaInspector times out after 20 seconds of waiting for a page to respond
|
190
|
-
|
191
|
-
|
192
|
-
|
187
|
+
### Timeout & Retries
|
188
|
+
|
189
|
+
By default, MetaInspector times out after 20 seconds of waiting for a page to respond,
|
190
|
+
and it will retry fetching the page 3 times.
|
191
|
+
You can specify different values for both of these, like this:
|
192
|
+
|
193
|
+
# timeout after 5 seconds, retry 4 times
|
194
|
+
page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4)
|
195
|
+
|
196
|
+
If MetaInspector fails to fetch the page after it has exhausted its retries,
|
197
|
+
it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
|
198
|
+
application code.
|
199
|
+
|
200
|
+
begin
|
201
|
+
data = MetaInspector.new(url)
|
202
|
+
rescue MetaInspector::Request::TimeoutError
|
203
|
+
enqueue_for_future_fetch_attempt(url)
|
204
|
+
render_simple(url)
|
205
|
+
rescue
|
206
|
+
log_fetch_error($!)
|
207
|
+
render_simple(url)
|
208
|
+
else
|
209
|
+
render_rich(data)
|
210
|
+
end
|
193
211
|
|
194
212
|
### Redirections
|
195
213
|
|
@@ -64,10 +64,12 @@ module MetaInspector
|
|
64
64
|
|
65
65
|
def defaults
|
66
66
|
{ :timeout => 20,
|
67
|
+
:retries => 3,
|
67
68
|
:html_content_only => false,
|
68
69
|
:warn_level => :raise,
|
69
70
|
:headers => {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"},
|
70
|
-
:allow_redirections => true
|
71
|
+
:allow_redirections => true,
|
72
|
+
:exception_log => MetaInspector::ExceptionLog.new
|
71
73
|
}
|
72
74
|
end
|
73
75
|
|
@@ -12,12 +12,11 @@ module MetaInspector
|
|
12
12
|
include MetaInspector::Exceptionable
|
13
13
|
|
14
14
|
def initialize(initial_url, options = {})
|
15
|
-
options = defaults.merge(options)
|
16
|
-
|
17
15
|
@url = initial_url
|
18
16
|
|
19
17
|
@allow_redirections = options[:allow_redirections]
|
20
18
|
@timeout = options[:timeout]
|
19
|
+
@retries = options[:retries]
|
21
20
|
@exception_log = options[:exception_log]
|
22
21
|
@headers = options[:headers]
|
23
22
|
|
@@ -38,8 +37,13 @@ module MetaInspector
|
|
38
37
|
private
|
39
38
|
|
40
39
|
def response
|
40
|
+
request_count ||= 0
|
41
|
+
request_count += 1
|
41
42
|
Timeout::timeout(@timeout) { @response ||= fetch }
|
42
|
-
rescue
|
43
|
+
rescue Timeout::Error
|
44
|
+
retry unless @retries == request_count
|
45
|
+
@exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
|
46
|
+
rescue Faraday::ConnectionFailed, RuntimeError => e
|
43
47
|
@exception_log << e
|
44
48
|
nil
|
45
49
|
end
|
@@ -60,8 +64,7 @@ module MetaInspector
|
|
60
64
|
response
|
61
65
|
end
|
62
66
|
|
63
|
-
|
64
|
-
{ timeout: 20, exception_log: MetaInspector::ExceptionLog.new, allow_redirections: true }
|
67
|
+
class TimeoutError < StandardError
|
65
68
|
end
|
66
69
|
end
|
67
70
|
end
|
data/spec/request_spec.rb
CHANGED
@@ -37,12 +37,6 @@ describe MetaInspector::Request do
|
|
37
37
|
FakeWeb.allow_net_connect = false
|
38
38
|
end
|
39
39
|
|
40
|
-
it "should handle timeouts" do
|
41
|
-
logger.should receive(:<<).with(an_instance_of(Timeout::Error))
|
42
|
-
|
43
|
-
MetaInspector::Request.new(url('http://example.com/timeout'), timeout: 0.0000000000000000001, exception_log: logger)
|
44
|
-
end
|
45
|
-
|
46
40
|
it "should handle socket errors" do
|
47
41
|
TCPSocket.stub(:open).and_raise(SocketError)
|
48
42
|
logger.should receive(:<<).with(an_instance_of(Faraday::ConnectionFailed))
|
@@ -51,6 +45,50 @@ describe MetaInspector::Request do
|
|
51
45
|
end
|
52
46
|
end
|
53
47
|
|
48
|
+
describe "retrying on timeouts" do
|
49
|
+
let(:logger) { MetaInspector::ExceptionLog.new }
|
50
|
+
subject do
|
51
|
+
MetaInspector::Request.new(url('http://pagerankalert.com'),
|
52
|
+
exception_log: logger, retries: 3)
|
53
|
+
end
|
54
|
+
|
55
|
+
context "when request never succeeds" do
|
56
|
+
before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
|
57
|
+
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
58
|
+
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
59
|
+
subject
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
context "when request succeeds on third try" do
|
64
|
+
before do
|
65
|
+
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
66
|
+
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
67
|
+
Timeout.stub(:timeout).and_call_original
|
68
|
+
end
|
69
|
+
it "doesn't raise an exception" do
|
70
|
+
logger.should_not receive(:<<)
|
71
|
+
subject
|
72
|
+
end
|
73
|
+
it "succeeds as normal" do
|
74
|
+
subject.content_type.should == "text/html"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
context "when request succeeds on fourth try" do
|
79
|
+
before do
|
80
|
+
Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
|
81
|
+
# if it were called a fourth time, rspec would raise an error
|
82
|
+
# so this implicitly tests the correct behavior
|
83
|
+
end
|
84
|
+
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
85
|
+
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
86
|
+
subject
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
54
92
|
private
|
55
93
|
|
56
94
|
def url(initial_url)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|