wayback_archiver 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wayback_archiver/request.rb +16 -12
- data/lib/wayback_archiver/sitemapper.rb +13 -2
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1d97461d94b8ec02e1cae528f939be17991d054a1d29953c8672bfc8e29ea7cf
|
4
|
+
data.tar.gz: 75bf5c3a7214001df417678c7a084ea3bc1e80dc87773a8da17c1c1defcdaff8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00d2eef71ef692688249dc97cbaca906f6725679a19deab2fd3c5998e06319032ee9e9aff24bb68dd80254f1b18ef06d1aeadfeabd7fcaafa326f9bdcca10688
|
7
|
+
data.tar.gz: d05a72cfb6fbb2636f43c8d8737f72aa0587415a8ce344d1c1d01db9a3cdcecfc278d81d27e2962868dab0674e4ffdcc33fe9115f0969e8c32f1c490473e13aa
|
data/lib/wayback_archiver/request.rb
CHANGED
@@ -33,6 +33,19 @@ module WaybackArchiver
|
|
33
33
|
# Max number of redirects before an error is raised
|
34
34
|
MAX_REDIRECTS = 10
|
35
35
|
|
36
|
+
# Known request errors
|
37
|
+
REQUEST_ERRORS = {
|
38
|
+
# server
|
39
|
+
Timeout::Error => ServerError,
|
40
|
+
OpenSSL::SSL::SSLError => ServerError,
|
41
|
+
Net::HTTPBadResponse => ServerError,
|
42
|
+
Zlib::Error => ServerError,
|
43
|
+
# client
|
44
|
+
SystemCallError => ClientError,
|
45
|
+
SocketError => ClientError,
|
46
|
+
IOError => ClientError
|
47
|
+
}.freeze
|
48
|
+
|
36
49
|
# Get reponse.
|
37
50
|
# @return [Response] the http response representation.
|
38
51
|
# @param [String, URI] uri to retrieve.
|
@@ -184,20 +197,11 @@ module WaybackArchiver
|
|
184
197
|
private
|
185
198
|
|
186
199
|
def self.perform_request(uri, http, request)
|
187
|
-
# TODO: Consider retrying
|
200
|
+
# TODO: Consider retrying on certain HTTP response codes, i.e 429, 503
|
188
201
|
response = http.request(request)
|
189
202
|
GETStruct.new(response)
|
190
|
-
rescue Timeout::Error,
|
191
|
-
OpenSSL::SSL::SSLError,
192
|
-
Net::HTTPBadResponse,
|
193
|
-
Zlib::Error => e
|
194
|
-
|
195
|
-
build_request_error(uri, e, ServerError)
|
196
|
-
rescue SystemCallError,
|
197
|
-
SocketError,
|
198
|
-
IOError => e
|
199
|
-
|
200
|
-
build_request_error(uri, e, ClientError)
|
203
|
+
rescue *REQUEST_ERRORS.keys => e
|
204
|
+
build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
|
201
205
|
end
|
202
206
|
|
203
207
|
def self.build_request_error(uri, error, error_wrapper_klass)
|
data/lib/wayback_archiver/sitemapper.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'set'
|
1
2
|
require 'robots'
|
2
3
|
|
3
4
|
require 'wayback_archiver/sitemap'
|
@@ -27,6 +28,7 @@ module WaybackArchiver
|
|
27
28
|
WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
|
28
29
|
robots = Robots.new(WaybackArchiver.user_agent)
|
29
30
|
sitemaps = robots.other_values(url)['Sitemap']
|
31
|
+
|
30
32
|
if sitemaps
|
31
33
|
return sitemaps.flat_map do |sitemap|
|
32
34
|
WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
|
@@ -61,12 +63,21 @@ module WaybackArchiver
|
|
61
63
|
# @example Get URLs defined in Sitemap
|
62
64
|
# Sitemapper.urls(xml: xml)
|
63
65
|
# @see http://www.sitemaps.org
|
64
|
-
def self.urls(url: nil, xml: nil)
|
66
|
+
def self.urls(url: nil, xml: nil, visited: Set.new)
|
67
|
+
if visited.include?(url)
|
68
|
+
WaybackArchiver.logger.debug "Already visited #{url} skipping.."
|
69
|
+
return []
|
70
|
+
end
|
71
|
+
|
72
|
+
visited << url if url
|
73
|
+
|
65
74
|
xml = Request.get(url).body unless xml
|
66
75
|
sitemap = Sitemap.new(xml)
|
67
76
|
|
68
77
|
if sitemap.sitemap_index?
|
69
|
-
sitemap.sitemaps.flat_map
|
78
|
+
sitemap.sitemaps.flat_map do |sitemap_url|
|
79
|
+
urls(url: sitemap_url, visited: visited)
|
80
|
+
end
|
70
81
|
else
|
71
82
|
sitemap.urls
|
72
83
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -100,14 +100,28 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0.
|
103
|
+
version: '0.9'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0.
|
110
|
+
version: '0.9'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: simplecov
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 0.14.1
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: 0.14.1
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: coveralls
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -207,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
207
221
|
version: '0'
|
208
222
|
requirements: []
|
209
223
|
rubyforge_project:
|
210
|
-
rubygems_version: 2.6
|
224
|
+
rubygems_version: 2.7.6
|
211
225
|
signing_key:
|
212
226
|
specification_version: 4
|
213
227
|
summary: Post URLs to Wayback Machine (Internet Archive)
|