wayback_archiver 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
4
- data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
2
+ SHA256:
3
+ metadata.gz: 1d97461d94b8ec02e1cae528f939be17991d054a1d29953c8672bfc8e29ea7cf
4
+ data.tar.gz: 75bf5c3a7214001df417678c7a084ea3bc1e80dc87773a8da17c1c1defcdaff8
5
5
  SHA512:
6
- metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
7
- data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
6
+ metadata.gz: 00d2eef71ef692688249dc97cbaca906f6725679a19deab2fd3c5998e06319032ee9e9aff24bb68dd80254f1b18ef06d1aeadfeabd7fcaafa326f9bdcca10688
7
+ data.tar.gz: d05a72cfb6fbb2636f43c8d8737f72aa0587415a8ce344d1c1d01db9a3cdcecfc278d81d27e2962868dab0674e4ffdcc33fe9115f0969e8c32f1c490473e13aa
@@ -33,6 +33,19 @@ module WaybackArchiver
33
33
  # Max number of redirects before an error is raised
34
34
  MAX_REDIRECTS = 10
35
35
 
36
+ # Known request errors
37
+ REQUEST_ERRORS = {
38
+ # server
39
+ Timeout::Error => ServerError,
40
+ OpenSSL::SSL::SSLError => ServerError,
41
+ Net::HTTPBadResponse => ServerError,
42
+ Zlib::Error => ServerError,
43
+ # client
44
+ SystemCallError => ClientError,
45
+ SocketError => ClientError,
46
+ IOError => ClientError
47
+ }.freeze
48
+
36
49
  # Get response.
37
50
  # @return [Response] the http response representation.
38
51
  # @param [String, URI] uri to retrieve.
@@ -184,20 +197,11 @@ module WaybackArchiver
184
197
  private
185
198
 
186
199
  def self.perform_request(uri, http, request)
187
- # TODO: Consider retrying failed requests
200
+ # TODO: Consider retrying on certain HTTP response codes, e.g. 429, 503
188
201
  response = http.request(request)
189
202
  GETStruct.new(response)
190
- rescue Timeout::Error,
191
- OpenSSL::SSL::SSLError,
192
- Net::HTTPBadResponse,
193
- Zlib::Error => e
194
-
195
- build_request_error(uri, e, ServerError)
196
- rescue SystemCallError,
197
- SocketError,
198
- IOError => e
199
-
200
- build_request_error(uri, e, ClientError)
203
+ rescue *REQUEST_ERRORS.keys => e
204
+ build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
201
205
  end
202
206
 
203
207
  def self.build_request_error(uri, error, error_wrapper_klass)
@@ -1,3 +1,4 @@
1
+ require 'set'
1
2
  require 'robots'
2
3
 
3
4
  require 'wayback_archiver/sitemap'
@@ -27,6 +28,7 @@ module WaybackArchiver
27
28
  WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
28
29
  robots = Robots.new(WaybackArchiver.user_agent)
29
30
  sitemaps = robots.other_values(url)['Sitemap']
31
+
30
32
  if sitemaps
31
33
  return sitemaps.flat_map do |sitemap|
32
34
  WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
@@ -61,12 +63,21 @@ module WaybackArchiver
61
63
  # @example Get URLs defined in Sitemap
62
64
  # Sitemapper.urls(xml: xml)
63
65
  # @see http://www.sitemaps.org
64
- def self.urls(url: nil, xml: nil)
66
+ def self.urls(url: nil, xml: nil, visited: Set.new)
67
+ if visited.include?(url)
68
+ WaybackArchiver.logger.debug "Already visited #{url} skipping.."
69
+ return []
70
+ end
71
+
72
+ visited << url if url
73
+
65
74
  xml = Request.get(url).body unless xml
66
75
  sitemap = Sitemap.new(xml)
67
76
 
68
77
  if sitemap.sitemap_index?
69
- sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
78
+ sitemap.sitemaps.flat_map do |sitemap_url|
79
+ urls(url: sitemap_url, visited: visited)
80
+ end
70
81
  else
71
82
  sitemap.urls
72
83
  end
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.2.0'.freeze
3
+ VERSION = '1.2.1'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-20 00:00:00.000000000 Z
11
+ date: 2018-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -100,14 +100,28 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '0.8'
103
+ version: '0.9'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '0.8'
110
+ version: '0.9'
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: 0.14.1
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: 0.14.1
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: coveralls
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -207,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
207
221
  version: '0'
208
222
  requirements: []
209
223
  rubyforge_project:
210
- rubygems_version: 2.6.11
224
+ rubygems_version: 2.7.6
211
225
  signing_key:
212
226
  specification_version: 4
213
227
  summary: Post URLs to Wayback Machine (Internet Archive)