metainspector 4.0.0.rc2 → 4.0.0.rc3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ba263e5a232d332082575e310c75d31c971a419
4
- data.tar.gz: befb5ddec99b4b36db95e37eb2ab33e22bfd1e2f
3
+ metadata.gz: bf5c2667ff165768d1a0e0c49ebd47ea5f8de28e
4
+ data.tar.gz: 15b2f4fb7a2f090a75fe06ab98959e35d5f97a3f
5
5
  SHA512:
6
- metadata.gz: 9d813747e71e15d058104398fcc53eefd8aeeee2c2eb224b53cb0d2dcf6bf786c1bcd3de9111152d7e583e6fd96a42b4b34fa34e4bfb53cf4b7fc8127e27dc01
7
- data.tar.gz: fe41a9cb0a176c9d03892ab18d48194203d12aeb2df22faf47c6a2d32d91fa0422aabb5acd7c12e0b3f9a336898ca888601a5ebe1dae36a0de1a790d5bc60d51
6
+ metadata.gz: eeb60786169e979dd8bb257832f2bf2c0270af8b2bf63056330826677a4943373aea51269a1ddfc397ae296cb786b5285997a1721b5ae412cc006214c872af18
7
+ data.tar.gz: ae891af393d3746df5048a1e512e70f11718fc8357a2c8212376119afb174e8b7e0ccd180f48c252813581a4ed5671b0f01e35ca555b475efc9997238c29c952
data/README.md CHANGED
@@ -25,6 +25,8 @@ page.links.external # Returns all external HTTP links found
25
25
 
26
26
  * Now `page.image` will return the first image in `page.images` if no OG or Twitter image found, instead of returning `nil`.
27
27
 
28
+ * You can now specify 2 different timeouts, `connection_timeout` and `read_timeout`, instead of the previous single `timeout`.
29
+
28
30
  ## Changes in 3.0
29
31
 
30
32
  * The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
@@ -213,28 +215,36 @@ And the full scraped document is accessible from:
213
215
 
214
216
  ### Timeout & Retries
215
217
 
216
- By default, MetaInspector times out after 20 seconds of waiting for a page to respond,
217
- and it will retry fetching the page 3 times.
218
- You can specify different values for both of these, like this:
218
+ You can specify 2 different timeouts when requesting a page:
219
+
220
+ * `connection_timeout` sets the maximum number of seconds to wait to get a connection to the page.
221
+ * `read_timeout` sets the maximum number of seconds to wait to read the page, once connected.
222
+
223
+ Both timeouts default to 20 seconds each.
219
224
 
220
- # timeout after 5 seconds, retry 4 times
221
- page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4)
225
+ You can also specify the number of `retries`, which defaults to 3.
226
+
227
+ For example, this will time out after 10 seconds waiting for a connection, or after 5 seconds waiting
228
+ to read its contents, and will retry 4 times:
229
+
230
+ ```ruby
231
+ page = MetaInspector.new('www.google', :connection_timeout => 10, :read_timeout => 5, :retries => 4)
232
+ ```
222
233
 
223
234
  If MetaInspector fails to fetch the page after it has exhausted its retries,
224
- it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
235
+ it will raise `Faraday::TimeoutError`, which you can rescue in your
225
236
  application code.
226
237
 
227
- begin
228
- data = MetaInspector.new(url)
229
- rescue MetaInspector::Request::TimeoutError
230
- enqueue_for_future_fetch_attempt(url)
231
- render_simple(url)
232
- rescue
233
- log_fetch_error($!)
234
- render_simple(url)
235
- else
236
- render_rich(data)
237
- end
238
+ ```ruby
239
+ begin
240
+ page = MetaInspector.new(url)
241
+ rescue Faraday::TimeoutError
242
+ enqueue_for_future_fetch_attempt(url)
243
+ render_simple(url)
244
+ else
245
+ render_rich(page)
246
+ end
247
+ ```
238
248
 
239
249
  ### Redirections
240
250
 
@@ -1,13 +1,15 @@
1
1
  module MetaInspector
2
2
  # A MetaInspector::Document knows about its URL and its contents
3
3
  class Document
4
- attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
4
+ attr_reader :html_content_only, :allow_redirections, :warn_level, :headers
5
5
 
6
6
  include MetaInspector::Exceptionable
7
7
 
8
8
  # Initializes a new instance of MetaInspector::Document, setting the URL to the one given
9
9
  # Options:
10
- # => timeout: defaults to 20 seconds
10
+ # => connection_timeout: defaults to 20 seconds
11
+ # => read_timeout: defaults to 20 seconds
12
+ # => retries: defaults to 3 times
11
13
  # => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
12
14
  # => allow_redirections: when true, follow HTTP redirects. Defaults to true
13
15
  # => document: the html of the url as a string
@@ -15,7 +17,9 @@ module MetaInspector
15
17
  # => headers: object containing custom headers for the request
16
18
  def initialize(initial_url, options = {})
17
19
  options = defaults.merge(options)
18
- @timeout = options[:timeout]
20
+ @connection_timeout = options[:connection_timeout]
21
+ @read_timeout = options[:read_timeout]
22
+ @retries = options[:retries]
19
23
  @html_content_only = options[:html_content_only]
20
24
  @allow_redirections = options[:allow_redirections]
21
25
  @document = options[:document]
@@ -24,7 +28,9 @@ module MetaInspector
24
28
  @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
25
29
  @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
26
30
  @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
27
- timeout: @timeout,
31
+ connection_timeout: @connection_timeout,
32
+ read_timeout: @read_timeout,
33
+ retries: @retries,
28
34
  exception_log: @exception_log,
29
35
  headers: @headers) unless @document
30
36
  @parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
@@ -1,7 +1,6 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
- require 'timeout'
5
4
 
6
5
  module MetaInspector
7
6
 
@@ -13,7 +12,8 @@ module MetaInspector
13
12
  @url = initial_url
14
13
 
15
14
  @allow_redirections = options[:allow_redirections]
16
- @timeout = options[:timeout]
15
+ @connection_timeout = options[:connection_timeout]
16
+ @read_timeout = options[:read_timeout]
17
17
  @retries = options[:retries]
18
18
  @exception_log = options[:exception_log]
19
19
  @headers = options[:headers]
@@ -35,11 +35,8 @@ module MetaInspector
35
35
  def response
36
36
  request_count ||= 0
37
37
  request_count += 1
38
- Timeout::timeout(@timeout) { @response ||= fetch }
39
- rescue Timeout::Error
40
- retry unless @retries == request_count
41
- @exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
42
- rescue Faraday::Error::ConnectionFailed, RuntimeError => e
38
+ @response ||= fetch
39
+ rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError => e
43
40
  @exception_log << e
44
41
  nil
45
42
  end
@@ -48,21 +45,25 @@ module MetaInspector
48
45
 
49
46
  def fetch
50
47
  session = Faraday.new(:url => url) do |faraday|
48
+ faraday.request :retry, max: @retries
49
+
51
50
  if @allow_redirections
52
51
  faraday.use FaradayMiddleware::FollowRedirects, limit: 10
53
52
  faraday.use :cookie_jar
54
53
  end
54
+
55
55
  faraday.headers.merge!(@headers || {})
56
56
  faraday.adapter :net_http
57
57
  end
58
- response = session.get
58
+
59
+ response = session.get do |req|
60
+ req.options.timeout = @connection_timeout
61
+ req.options.open_timeout = @read_timeout
62
+ end
59
63
 
60
64
  @url.url = response.env.url.to_s
61
65
 
62
66
  response
63
67
  end
64
-
65
- class TimeoutError < StandardError
66
- end
67
68
  end
68
69
  end
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = "4.0.0.rc2"
2
+ VERSION = "4.0.0.rc3"
3
3
  end
data/spec/request_spec.rb CHANGED
@@ -60,50 +60,6 @@ describe MetaInspector::Request do
60
60
  end
61
61
  end
62
62
 
63
- describe "retrying on timeouts" do
64
- let(:logger) { MetaInspector::ExceptionLog.new }
65
- subject do
66
- MetaInspector::Request.new(url('http://pagerankalert.com'),
67
- exception_log: logger, retries: 3)
68
- end
69
-
70
- context "when request never succeeds" do
71
- before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
72
- it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
73
- logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
74
- subject
75
- end
76
- end
77
-
78
- context "when request succeeds on third try" do
79
- before do
80
- Timeout.stub(:timeout).and_raise(Timeout::Error)
81
- Timeout.stub(:timeout).and_raise(Timeout::Error)
82
- Timeout.stub(:timeout).and_call_original
83
- end
84
- it "doesn't raise an exception" do
85
- logger.should_not receive(:<<)
86
- subject
87
- end
88
- it "succeeds as normal" do
89
- subject.content_type.should == "text/html"
90
- end
91
- end
92
-
93
- context "when request succeeds on fourth try" do
94
- before do
95
- Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
96
- # if it were called a fourth time, rspec would raise an error
97
- # so this implicitely tests the correct behavior
98
- end
99
- it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
100
- logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
101
- subject
102
- end
103
- end
104
-
105
- end
106
-
107
63
  private
108
64
 
109
65
  def url(initial_url)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0.rc2
4
+ version: 4.0.0.rc3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-16 00:00:00.000000000 Z
11
+ date: 2014-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri