wayback_archiver 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/wayback_archiver/request.rb +16 -12
- data/lib/wayback_archiver/sitemapper.rb +13 -2
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1d97461d94b8ec02e1cae528f939be17991d054a1d29953c8672bfc8e29ea7cf
|
4
|
+
data.tar.gz: 75bf5c3a7214001df417678c7a084ea3bc1e80dc87773a8da17c1c1defcdaff8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00d2eef71ef692688249dc97cbaca906f6725679a19deab2fd3c5998e06319032ee9e9aff24bb68dd80254f1b18ef06d1aeadfeabd7fcaafa326f9bdcca10688
|
7
|
+
data.tar.gz: d05a72cfb6fbb2636f43c8d8737f72aa0587415a8ce344d1c1d01db9a3cdcecfc278d81d27e2962868dab0674e4ffdcc33fe9115f0969e8c32f1c490473e13aa
|
data/lib/wayback_archiver/request.rb
CHANGED
@@ -33,6 +33,19 @@ module WaybackArchiver
|
|
33
33
|
# Max number of redirects before an error is raised
|
34
34
|
MAX_REDIRECTS = 10
|
35
35
|
|
36
|
+
# Known request errors
|
37
|
+
REQUEST_ERRORS = {
|
38
|
+
# server
|
39
|
+
Timeout::Error => ServerError,
|
40
|
+
OpenSSL::SSL::SSLError => ServerError,
|
41
|
+
Net::HTTPBadResponse => ServerError,
|
42
|
+
Zlib::Error => ServerError,
|
43
|
+
# client
|
44
|
+
SystemCallError => ClientError,
|
45
|
+
SocketError => ClientError,
|
46
|
+
IOError => ClientError
|
47
|
+
}.freeze
|
48
|
+
|
36
49
|
# Get reponse.
|
37
50
|
# @return [Response] the http response representation.
|
38
51
|
# @param [String, URI] uri to retrieve.
|
@@ -184,20 +197,11 @@ module WaybackArchiver
|
|
184
197
|
private
|
185
198
|
|
186
199
|
def self.perform_request(uri, http, request)
|
187
|
-
# TODO: Consider retrying
|
200
|
+
# TODO: Consider retrying on certain HTTP response codes, i.e 429, 503
|
188
201
|
response = http.request(request)
|
189
202
|
GETStruct.new(response)
|
190
|
-
rescue
|
191
|
-
|
192
|
-
Net::HTTPBadResponse,
|
193
|
-
Zlib::Error => e
|
194
|
-
|
195
|
-
build_request_error(uri, e, ServerError)
|
196
|
-
rescue SystemCallError,
|
197
|
-
SocketError,
|
198
|
-
IOError => e
|
199
|
-
|
200
|
-
build_request_error(uri, e, ClientError)
|
203
|
+
rescue *REQUEST_ERRORS.keys => e
|
204
|
+
build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
|
201
205
|
end
|
202
206
|
|
203
207
|
def self.build_request_error(uri, error, error_wrapper_klass)
|
data/lib/wayback_archiver/sitemapper.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'set'
|
1
2
|
require 'robots'
|
2
3
|
|
3
4
|
require 'wayback_archiver/sitemap'
|
@@ -27,6 +28,7 @@ module WaybackArchiver
|
|
27
28
|
WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
|
28
29
|
robots = Robots.new(WaybackArchiver.user_agent)
|
29
30
|
sitemaps = robots.other_values(url)['Sitemap']
|
31
|
+
|
30
32
|
if sitemaps
|
31
33
|
return sitemaps.flat_map do |sitemap|
|
32
34
|
WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
|
@@ -61,12 +63,21 @@ module WaybackArchiver
|
|
61
63
|
# @example Get URLs defined in Sitemap
|
62
64
|
# Sitemapper.urls(xml: xml)
|
63
65
|
# @see http://www.sitemaps.org
|
64
|
-
def self.urls(url: nil, xml: nil)
|
66
|
+
def self.urls(url: nil, xml: nil, visited: Set.new)
|
67
|
+
if visited.include?(url)
|
68
|
+
WaybackArchiver.logger.debug "Already visited #{url} skipping.."
|
69
|
+
return []
|
70
|
+
end
|
71
|
+
|
72
|
+
visited << url if url
|
73
|
+
|
65
74
|
xml = Request.get(url).body unless xml
|
66
75
|
sitemap = Sitemap.new(xml)
|
67
76
|
|
68
77
|
if sitemap.sitemap_index?
|
69
|
-
sitemap.sitemaps.flat_map
|
78
|
+
sitemap.sitemaps.flat_map do |sitemap_url|
|
79
|
+
urls(url: sitemap_url, visited: visited)
|
80
|
+
end
|
70
81
|
else
|
71
82
|
sitemap.urls
|
72
83
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -100,14 +100,28 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0.
|
103
|
+
version: '0.9'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0.
|
110
|
+
version: '0.9'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: simplecov
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 0.14.1
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: 0.14.1
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: coveralls
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -207,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
207
221
|
version: '0'
|
208
222
|
requirements: []
|
209
223
|
rubyforge_project:
|
210
|
-
rubygems_version: 2.6
|
224
|
+
rubygems_version: 2.7.6
|
211
225
|
signing_key:
|
212
226
|
specification_version: 4
|
213
227
|
summary: Post URLs to Wayback Machine (Internet Archive)
|