metainspector 4.0.0.rc2 → 4.0.0.rc3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -17
- data/lib/meta_inspector/document.rb +10 -4
- data/lib/meta_inspector/request.rb +12 -11
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/request_spec.rb +0 -44
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf5c2667ff165768d1a0e0c49ebd47ea5f8de28e
|
4
|
+
data.tar.gz: 15b2f4fb7a2f090a75fe06ab98959e35d5f97a3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eeb60786169e979dd8bb257832f2bf2c0270af8b2bf63056330826677a4943373aea51269a1ddfc397ae296cb786b5285997a1721b5ae412cc006214c872af18
|
7
|
+
data.tar.gz: ae891af393d3746df5048a1e512e70f11718fc8357a2c8212376119afb174e8b7e0ccd180f48c252813581a4ed5671b0f01e35ca555b475efc9997238c29c952
|
data/README.md
CHANGED
@@ -25,6 +25,8 @@ page.links.external # Returns all external HTTP links found
|
|
25
25
|
|
26
26
|
* Now `page.image` will return the first image in `page.images` if no OG or Twitter image found, instead of returning `nil`.
|
27
27
|
|
28
|
+
* You can now specify 2 different timeouts, `connection_timeout` and `read_timeout`, instead of the previous single `timeout`.
|
29
|
+
|
28
30
|
## Changes in 3.0
|
29
31
|
|
30
32
|
* The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
|
@@ -213,28 +215,36 @@ And the full scraped document is accessible from:
|
|
213
215
|
|
214
216
|
### Timeout & Retries
|
215
217
|
|
216
|
-
|
217
|
-
|
218
|
-
|
218
|
+
You can specify 2 different timeouts when requesting a page:
|
219
|
+
|
220
|
+
* `connection_timeout` sets the maximum number of seconds to wait to get a connection to the page.
|
221
|
+
* `read_timeout` sets the maximum number of seconds to wait to read the page, once connected.
|
222
|
+
|
223
|
+
Both timeouts default to 20 seconds each.
|
219
224
|
|
220
|
-
|
221
|
-
|
225
|
+
You can also specify the number of `retries`, which defaults to 3.
|
226
|
+
|
227
|
+
For example, this will time out after 10 seconds waiting for a connection, or after 5 seconds waiting
|
228
|
+
to read its contents, and will retry 4 times:
|
229
|
+
|
230
|
+
```ruby
|
231
|
+
page = MetaInspector.new('www.google', :connection_timeout => 10, :read_timeout => 5, :retries => 4)
|
232
|
+
```
|
222
233
|
|
223
234
|
If MetaInspector fails to fetch the page after it has exhausted its retries,
|
224
|
-
it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
|
235
|
+
it will raise `Faraday::TimeoutError`, which you can rescue in your
|
225
236
|
application code.
|
226
237
|
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
end
|
238
|
+
```ruby
|
239
|
+
begin
|
240
|
+
page = MetaInspector.new(url)
|
241
|
+
rescue Faraday::TimeoutError
|
242
|
+
enqueue_for_future_fetch_attempt(url)
|
243
|
+
render_simple(url)
|
244
|
+
else
|
245
|
+
render_rich(page)
|
246
|
+
end
|
247
|
+
```
|
238
248
|
|
239
249
|
### Redirections
|
240
250
|
|
data/lib/meta_inspector/document.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
# A MetaInspector::Document knows about its URL and its contents
|
3
3
|
class Document
|
4
|
-
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
|
4
|
+
attr_reader :html_content_only, :allow_redirections, :warn_level, :headers
|
5
5
|
|
6
6
|
include MetaInspector::Exceptionable
|
7
7
|
|
8
8
|
# Initializes a new instance of MetaInspector::Document, setting the URL to the one given
|
9
9
|
# Options:
|
10
|
-
# =>
|
10
|
+
# => connection_timeout: defaults to 20 seconds
|
11
|
+
# => read_timeout: defaults to 20 seconds
|
12
|
+
# => retries: defaults to 3 times
|
11
13
|
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
12
14
|
# => allow_redirections: when true, follow HTTP redirects. Defaults to true
|
13
15
|
# => document: the html of the url as a string
|
@@ -15,7 +17,9 @@ module MetaInspector
|
|
15
17
|
# => headers: object containing custom headers for the request
|
16
18
|
def initialize(initial_url, options = {})
|
17
19
|
options = defaults.merge(options)
|
18
|
-
@timeout = options[:timeout]
|
20
|
+
@connection_timeout = options[:connection_timeout]
|
21
|
+
@read_timeout = options[:read_timeout]
|
22
|
+
@retries = options[:retries]
|
19
23
|
@html_content_only = options[:html_content_only]
|
20
24
|
@allow_redirections = options[:allow_redirections]
|
21
25
|
@document = options[:document]
|
@@ -24,7 +28,9 @@ module MetaInspector
|
|
24
28
|
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
25
29
|
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
|
26
30
|
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
27
|
-
|
31
|
+
connection_timeout: @connection_timeout,
|
32
|
+
read_timeout: @read_timeout,
|
33
|
+
retries: @retries,
|
28
34
|
exception_log: @exception_log,
|
29
35
|
headers: @headers) unless @document
|
30
36
|
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
data/lib/meta_inspector/request.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday-cookie_jar'
|
4
|
-
require 'timeout'
|
5
4
|
|
6
5
|
module MetaInspector
|
7
6
|
|
@@ -13,7 +12,8 @@ module MetaInspector
|
|
13
12
|
@url = initial_url
|
14
13
|
|
15
14
|
@allow_redirections = options[:allow_redirections]
|
16
|
-
@timeout = options[:timeout]
|
15
|
+
@connection_timeout = options[:connection_timeout]
|
16
|
+
@read_timeout = options[:read_timeout]
|
17
17
|
@retries = options[:retries]
|
18
18
|
@exception_log = options[:exception_log]
|
19
19
|
@headers = options[:headers]
|
@@ -35,11 +35,8 @@ module MetaInspector
|
|
35
35
|
def response
|
36
36
|
request_count ||= 0
|
37
37
|
request_count += 1
|
38
|
-
|
39
|
-
rescue
|
40
|
-
retry unless @retries == request_count
|
41
|
-
@exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
|
42
|
-
rescue Faraday::Error::ConnectionFailed, RuntimeError => e
|
38
|
+
@response ||= fetch
|
39
|
+
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError => e
|
43
40
|
@exception_log << e
|
44
41
|
nil
|
45
42
|
end
|
@@ -48,21 +45,25 @@ module MetaInspector
|
|
48
45
|
|
49
46
|
def fetch
|
50
47
|
session = Faraday.new(:url => url) do |faraday|
|
48
|
+
faraday.request :retry, max: @retries
|
49
|
+
|
51
50
|
if @allow_redirections
|
52
51
|
faraday.use FaradayMiddleware::FollowRedirects, limit: 10
|
53
52
|
faraday.use :cookie_jar
|
54
53
|
end
|
54
|
+
|
55
55
|
faraday.headers.merge!(@headers || {})
|
56
56
|
faraday.adapter :net_http
|
57
57
|
end
|
58
|
-
|
58
|
+
|
59
|
+
response = session.get do |req|
|
60
|
+
req.options.timeout = @connection_timeout
|
61
|
+
req.options.open_timeout = @read_timeout
|
62
|
+
end
|
59
63
|
|
60
64
|
@url.url = response.env.url.to_s
|
61
65
|
|
62
66
|
response
|
63
67
|
end
|
64
|
-
|
65
|
-
class TimeoutError < StandardError
|
66
|
-
end
|
67
68
|
end
|
68
69
|
end
|
data/spec/request_spec.rb
CHANGED
@@ -60,50 +60,6 @@ describe MetaInspector::Request do
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
-
describe "retrying on timeouts" do
|
64
|
-
let(:logger) { MetaInspector::ExceptionLog.new }
|
65
|
-
subject do
|
66
|
-
MetaInspector::Request.new(url('http://pagerankalert.com'),
|
67
|
-
exception_log: logger, retries: 3)
|
68
|
-
end
|
69
|
-
|
70
|
-
context "when request never succeeds" do
|
71
|
-
before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
|
72
|
-
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
73
|
-
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
74
|
-
subject
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
context "when request succeeds on third try" do
|
79
|
-
before do
|
80
|
-
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
81
|
-
Timeout.stub(:timeout).and_raise(Timeout::Error)
|
82
|
-
Timeout.stub(:timeout).and_call_original
|
83
|
-
end
|
84
|
-
it "doesn't raise an exception" do
|
85
|
-
logger.should_not receive(:<<)
|
86
|
-
subject
|
87
|
-
end
|
88
|
-
it "succeeds as normal" do
|
89
|
-
subject.content_type.should == "text/html"
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
context "when request succeeds on fourth try" do
|
94
|
-
before do
|
95
|
-
Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
|
96
|
-
# if it were called a fourth time, rspec would raise an error
|
97
|
-
# so this implicitely tests the correct behavior
|
98
|
-
end
|
99
|
-
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
|
100
|
-
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
|
101
|
-
subject
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
63
|
private
|
108
64
|
|
109
65
|
def url(initial_url)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.0.rc2
|
4
|
+
version: 4.0.0.rc3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|