metainspector 4.0.0.rc2 → 4.0.0.rc3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ba263e5a232d332082575e310c75d31c971a419
4
- data.tar.gz: befb5ddec99b4b36db95e37eb2ab33e22bfd1e2f
3
+ metadata.gz: bf5c2667ff165768d1a0e0c49ebd47ea5f8de28e
4
+ data.tar.gz: 15b2f4fb7a2f090a75fe06ab98959e35d5f97a3f
5
5
  SHA512:
6
- metadata.gz: 9d813747e71e15d058104398fcc53eefd8aeeee2c2eb224b53cb0d2dcf6bf786c1bcd3de9111152d7e583e6fd96a42b4b34fa34e4bfb53cf4b7fc8127e27dc01
7
- data.tar.gz: fe41a9cb0a176c9d03892ab18d48194203d12aeb2df22faf47c6a2d32d91fa0422aabb5acd7c12e0b3f9a336898ca888601a5ebe1dae36a0de1a790d5bc60d51
6
+ metadata.gz: eeb60786169e979dd8bb257832f2bf2c0270af8b2bf63056330826677a4943373aea51269a1ddfc397ae296cb786b5285997a1721b5ae412cc006214c872af18
7
+ data.tar.gz: ae891af393d3746df5048a1e512e70f11718fc8357a2c8212376119afb174e8b7e0ccd180f48c252813581a4ed5671b0f01e35ca555b475efc9997238c29c952
data/README.md CHANGED
@@ -25,6 +25,8 @@ page.links.external # Returns all external HTTP links found
25
25
 
26
26
  * Now `page.image` will return the first image in `page.images` if no OG or Twitter image found, instead of returning `nil`.
27
27
 
28
+ * You can now specify 2 different timeouts, `connection_timeout` and `read_timeout`, instead of the previous single `timeout`.
29
+
28
30
  ## Changes in 3.0
29
31
 
30
32
  * The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
@@ -213,28 +215,36 @@ And the full scraped document is accessible from:
213
215
 
214
216
  ### Timeout & Retries
215
217
 
216
- By default, MetaInspector times out after 20 seconds of waiting for a page to respond,
217
- and it will retry fetching the page 3 times.
218
- You can specify different values for both of these, like this:
218
+ You can specify 2 different timeouts when requesting a page:
219
+
220
+ * `connection_timeout` sets the maximum number of seconds to wait to get a connection to the page.
221
+ * `read_timeout` sets the maximum number of seconds to wait to read the page, once connected.
222
+
223
+ Both timeouts default to 20 seconds each.
219
224
 
220
- # timeout after 5 seconds, retry 4 times
221
- page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4)
225
+ You can also specify the number of `retries`, which defaults to 3.
226
+
227
+ For example, this will time out after 10 seconds waiting for a connection, or after 5 seconds waiting
228
+ to read its contents, and will retry 4 times:
229
+
230
+ ```ruby
231
+ page = MetaInspector.new('www.google', :connection_timeout => 10, :read_timeout => 5, :retries => 4)
232
+ ```
222
233
 
223
234
  If MetaInspector fails to fetch the page after it has exhausted its retries,
224
- it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
235
+ it will raise `Faraday::TimeoutError`, which you can rescue in your
225
236
  application code.
226
237
 
227
- begin
228
- data = MetaInspector.new(url)
229
- rescue MetaInspector::Request::TimeoutError
230
- enqueue_for_future_fetch_attempt(url)
231
- render_simple(url)
232
- rescue
233
- log_fetch_error($!)
234
- render_simple(url)
235
- else
236
- render_rich(data)
237
- end
238
+ ```ruby
239
+ begin
240
+ page = MetaInspector.new(url)
241
+ rescue Faraday::TimeoutError
242
+ enqueue_for_future_fetch_attempt(url)
243
+ render_simple(url)
244
+ else
245
+ render_rich(page)
246
+ end
247
+ ```
238
248
 
239
249
  ### Redirections
240
250
 
data/lib/meta_inspector/document.rb CHANGED
@@ -1,13 +1,15 @@
1
1
  module MetaInspector
2
2
  # A MetaInspector::Document knows about its URL and its contents
3
3
  class Document
4
- attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
4
+ attr_reader :html_content_only, :allow_redirections, :warn_level, :headers
5
5
 
6
6
  include MetaInspector::Exceptionable
7
7
 
8
8
  # Initializes a new instance of MetaInspector::Document, setting the URL to the one given
9
9
  # Options:
10
- # => timeout: defaults to 20 seconds
10
+ # => connection_timeout: defaults to 20 seconds
11
+ # => read_timeout: defaults to 20 seconds
12
+ # => retries: defaults to 3 times
11
13
  # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
12
14
  # => allow_redirections: when true, follow HTTP redirects. Defaults to true
13
15
  # => document: the html of the url as a string
@@ -15,7 +17,9 @@ module MetaInspector
15
17
  # => headers: object containing custom headers for the request
16
18
  def initialize(initial_url, options = {})
17
19
  options = defaults.merge(options)
18
- @timeout = options[:timeout]
20
+ @connection_timeout = options[:connection_timeout]
21
+ @read_timeout = options[:read_timeout]
22
+ @retries = options[:retries]
19
23
  @html_content_only = options[:html_content_only]
20
24
  @allow_redirections = options[:allow_redirections]
21
25
  @document = options[:document]
@@ -24,7 +28,9 @@ module MetaInspector
24
28
  @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
25
29
  @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
26
30
  @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
27
- timeout: @timeout,
31
+ connection_timeout: @connection_timeout,
32
+ read_timeout: @read_timeout,
33
+ retries: @retries,
28
34
  exception_log: @exception_log,
29
35
  headers: @headers) unless @document
30
36
  @parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
data/lib/meta_inspector/request.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
- require 'timeout'
5
4
 
6
5
  module MetaInspector
7
6
 
@@ -13,7 +12,8 @@ module MetaInspector
13
12
  @url = initial_url
14
13
 
15
14
  @allow_redirections = options[:allow_redirections]
16
- @timeout = options[:timeout]
15
+ @connection_timeout = options[:connection_timeout]
16
+ @read_timeout = options[:read_timeout]
17
17
  @retries = options[:retries]
18
18
  @exception_log = options[:exception_log]
19
19
  @headers = options[:headers]
@@ -35,11 +35,8 @@ module MetaInspector
35
35
  def response
36
36
  request_count ||= 0
37
37
  request_count += 1
38
- Timeout::timeout(@timeout) { @response ||= fetch }
39
- rescue Timeout::Error
40
- retry unless @retries == request_count
41
- @exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
42
- rescue Faraday::Error::ConnectionFailed, RuntimeError => e
38
+ @response ||= fetch
39
+ rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError => e
43
40
  @exception_log << e
44
41
  nil
45
42
  end
@@ -48,21 +45,25 @@ module MetaInspector
48
45
 
49
46
  def fetch
50
47
  session = Faraday.new(:url => url) do |faraday|
48
+ faraday.request :retry, max: @retries
49
+
51
50
  if @allow_redirections
52
51
  faraday.use FaradayMiddleware::FollowRedirects, limit: 10
53
52
  faraday.use :cookie_jar
54
53
  end
54
+
55
55
  faraday.headers.merge!(@headers || {})
56
56
  faraday.adapter :net_http
57
57
  end
58
- response = session.get
58
+
59
+ response = session.get do |req|
60
+ req.options.timeout = @read_timeout
61
+ req.options.open_timeout = @connection_timeout
62
+ end
59
63
 
60
64
  @url.url = response.env.url.to_s
61
65
 
62
66
  response
63
67
  end
64
-
65
- class TimeoutError < StandardError
66
- end
67
68
  end
68
69
  end
data/lib/meta_inspector/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = "4.0.0.rc2"
2
+ VERSION = "4.0.0.rc3"
3
3
  end
data/spec/request_spec.rb CHANGED
@@ -60,50 +60,6 @@ describe MetaInspector::Request do
60
60
  end
61
61
  end
62
62
 
63
- describe "retrying on timeouts" do
64
- let(:logger) { MetaInspector::ExceptionLog.new }
65
- subject do
66
- MetaInspector::Request.new(url('http://pagerankalert.com'),
67
- exception_log: logger, retries: 3)
68
- end
69
-
70
- context "when request never succeeds" do
71
- before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
72
- it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
73
- logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
74
- subject
75
- end
76
- end
77
-
78
- context "when request succeeds on third try" do
79
- before do
80
- Timeout.stub(:timeout).and_raise(Timeout::Error)
81
- Timeout.stub(:timeout).and_raise(Timeout::Error)
82
- Timeout.stub(:timeout).and_call_original
83
- end
84
- it "doesn't raise an exception" do
85
- logger.should_not receive(:<<)
86
- subject
87
- end
88
- it "succeeds as normal" do
89
- subject.content_type.should == "text/html"
90
- end
91
- end
92
-
93
- context "when request succeeds on fourth try" do
94
- before do
95
- Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
96
- # if it were called a fourth time, rspec would raise an error
97
- # so this implicitely tests the correct behavior
98
- end
99
- it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
100
- logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
101
- subject
102
- end
103
- end
104
-
105
- end
106
-
107
63
  private
108
64
 
109
65
  def url(initial_url)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0.rc2
4
+ version: 4.0.0.rc3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-16 00:00:00.000000000 Z
11
+ date: 2014-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri