metainspector 3.0.0 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +24 -6
- data/lib/meta_inspector/document.rb +3 -1
- data/lib/meta_inspector/request.rb +8 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/request_spec.rb +44 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 46f73f718a436065bc4353e279e997afebb0c43b
|
4
|
+
data.tar.gz: ed7aa01afb850aacca6ed942beb7ba9d5e596d3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 469d195a8b0f8fb5417bff510f3703b5cfde6ce5dba9866851e629f8c8d64dd73a680da7e3fb7614f556302ad1f12f7a59014df66b10ca1cf4f8cdead00ad67a
|
7
|
+
data.tar.gz: 0d7e3ae71b4beec707e293067c2f76db5698d399ea42071ef8845b551fedee2e7f400a62a066225572dc195ae21fc2504043419ef60b75ef0adf22ae6220034a
|
data/README.md
CHANGED
@@ -184,12 +184,30 @@ And the full scraped document is accessible from:
|
|
184
184
|
|
185
185
|
## Options
|
186
186
|
|
187
|
-
### Timeout
|
188
|
-
|
189
|
-
By default, MetaInspector times out after 20 seconds of waiting for a page to respond
|
190
|
-
|
191
|
-
|
192
|
-
|
187
|
+
### Timeout & Retries
|
188
|
+
|
189
|
+
By default, MetaInspector times out after 20 seconds of waiting for a page to respond,
|
190
|
+
and it will retry fetching the page 3 times.
|
191
|
+
You can specify different values for both of these, like this:
|
192
|
+
|
193
|
+
# timeout after 5 seconds, retry 4 times
|
194
|
+
page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4)
|
195
|
+
|
196
|
+
If MetaInspector fails to fetch the page after it has exhausted its retries,
|
197
|
+
it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
|
198
|
+
application code.
|
199
|
+
|
200
|
+
begin
|
201
|
+
data = MetaInspector.new(url)
|
202
|
+
rescue MetaInspector::Request::TimeoutError
|
203
|
+
enqueue_for_future_fetch_attempt(url)
|
204
|
+
render_simple(url)
|
205
|
+
rescue
|
206
|
+
log_fetch_error($!)
|
207
|
+
render_simple(url)
|
208
|
+
else
|
209
|
+
render_rich(data)
|
210
|
+
end
|
193
211
|
|
194
212
|
### Redirections
|
195
213
|
|
@@ -64,10 +64,12 @@ module MetaInspector
|
|
64
64
|
|
65
65
|
def defaults
|
66
66
|
{ :timeout => 20,
|
67
|
+
:retries => 3,
|
67
68
|
:html_content_only => false,
|
68
69
|
:warn_level => :raise,
|
69
70
|
:headers => {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"},
|
70
|
-
:allow_redirections => true
|
71
|
+
:allow_redirections => true,
|
72
|
+
:exception_log => MetaInspector::ExceptionLog.new
|
71
73
|
}
|
72
74
|
end
|
73
75
|
|
@@ -12,12 +12,11 @@ module MetaInspector
|
|
12
12
|
include MetaInspector::Exceptionable
|
13
13
|
|
14
14
|
def initialize(initial_url, options = {})
|
15
|
-
options = defaults.merge(options)
|
16
|
-
|
17
15
|
@url = initial_url
|
18
16
|
|
19
17
|
@allow_redirections = options[:allow_redirections]
|
20
18
|
@timeout = options[:timeout]
|
19
|
+
@retries = options[:retries]
|
21
20
|
@exception_log = options[:exception_log]
|
22
21
|
@headers = options[:headers]
|
23
22
|
|
@@ -38,8 +37,13 @@ module MetaInspector
|
|
38
37
|
private
|
39
38
|
|
40
39
|
def response
|
40
|
+
request_count ||= 0
|
41
|
+
request_count += 1
|
41
42
|
Timeout::timeout(@timeout) { @response ||= fetch }
|
42
|
-
rescue
|
43
|
+
rescue Timeout::Error
|
44
|
+
retry unless @retries == request_count
|
45
|
+
@exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
|
46
|
+
rescue Faraday::ConnectionFailed, RuntimeError => e
|
43
47
|
@exception_log << e
|
44
48
|
nil
|
45
49
|
end
|
@@ -60,8 +64,7 @@ module MetaInspector
|
|
60
64
|
response
|
61
65
|
end
|
62
66
|
|
63
|
-
|
64
|
-
{ timeout: 20, exception_log: MetaInspector::ExceptionLog.new, allow_redirections: true }
|
67
|
+
class TimeoutError < StandardError
|
65
68
|
end
|
66
69
|
end
|
67
70
|
end
|
data/spec/request_spec.rb
CHANGED
@@ -37,12 +37,6 @@ describe MetaInspector::Request do
|
|
37
37
|
FakeWeb.allow_net_connect = false
|
38
38
|
end
|
39
39
|
|
40
|
-
it "should handle timeouts" do
|
41
|
-
logger.should receive(:<<).with(an_instance_of(Timeout::Error))
|
42
|
-
|
43
|
-
MetaInspector::Request.new(url('http://example.com/timeout'), timeout: 0.0000000000000000001, exception_log: logger)
|
44
|
-
end
|
45
|
-
|
46
40
|
it "should handle socket errors" do
|
47
41
|
TCPSocket.stub(:open).and_raise(SocketError)
|
48
42
|
logger.should receive(:<<).with(an_instance_of(Faraday::ConnectionFailed))
|
@@ -51,6 +45,50 @@ describe MetaInspector::Request do
|
|
51
45
|
end
|
52
46
|
end
|
53
47
|
|
48
|
+
describe "retrying on timeouts" do
|
49
|
+
let(:logger) { MetaInspector::ExceptionLog.new }
|
50
|
+
subject do
|
51
|
+
MetaInspector::Request.new(url('http://pagerankalert.com'),
|
52
|
+
exception_log: logger, retries: 3)
|
53
|
+
end
|
54
|
+
|
55
|
+
context "when request never succeeds" do
|
56
|
+
before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
|
57
|
+
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
58
|
+
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
59
|
+
subject
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
context "when request succeeds on third try" do
|
64
|
+
before do
|
65
|
+
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
66
|
+
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
67
|
+
Timeout.stub(:timeout).and_call_original
|
68
|
+
end
|
69
|
+
it "doesn't raise an exception" do
|
70
|
+
logger.should_not receive(:<<)
|
71
|
+
subject
|
72
|
+
end
|
73
|
+
it "succeeds as normal" do
|
74
|
+
subject.content_type.should == "text/html"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
context "when request succeeds on fourth try" do
|
79
|
+
before do
|
80
|
+
Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
|
81
|
+
# if it were called a fourth time, rspec would raise an error
|
82
|
+
# so this implicitly tests the correct behavior
|
83
|
+
end
|
84
|
+
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
85
|
+
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
86
|
+
subject
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
54
92
|
private
|
55
93
|
|
56
94
|
def url(initial_url)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|