wayback_archiver 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
4
- data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
2
+ SHA256:
3
+ metadata.gz: 1d97461d94b8ec02e1cae528f939be17991d054a1d29953c8672bfc8e29ea7cf
4
+ data.tar.gz: 75bf5c3a7214001df417678c7a084ea3bc1e80dc87773a8da17c1c1defcdaff8
5
5
  SHA512:
6
- metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
7
- data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
6
+ metadata.gz: 00d2eef71ef692688249dc97cbaca906f6725679a19deab2fd3c5998e06319032ee9e9aff24bb68dd80254f1b18ef06d1aeadfeabd7fcaafa326f9bdcca10688
7
+ data.tar.gz: d05a72cfb6fbb2636f43c8d8737f72aa0587415a8ce344d1c1d01db9a3cdcecfc278d81d27e2962868dab0674e4ffdcc33fe9115f0969e8c32f1c490473e13aa
@@ -33,6 +33,19 @@ module WaybackArchiver
33
33
  # Max number of redirects before an error is raised
34
34
  MAX_REDIRECTS = 10
35
35
 
36
+ # Known request errors
37
+ REQUEST_ERRORS = {
38
+ # server
39
+ Timeout::Error => ServerError,
40
+ OpenSSL::SSL::SSLError => ServerError,
41
+ Net::HTTPBadResponse => ServerError,
42
+ Zlib::Error => ServerError,
43
+ # client
44
+ SystemCallError => ClientError,
45
+ SocketError => ClientError,
46
+ IOError => ClientError
47
+ }.freeze
48
+
36
49
  # Get response.
37
50
  # @return [Response] the http response representation.
38
51
  # @param [String, URI] uri to retrieve.
@@ -184,20 +197,11 @@ module WaybackArchiver
184
197
  private
185
198
 
186
199
  def self.perform_request(uri, http, request)
187
- # TODO: Consider retrying failed requests
200
+ # TODO: Consider retrying on certain HTTP response codes, e.g. 429, 503
188
201
  response = http.request(request)
189
202
  GETStruct.new(response)
190
- rescue Timeout::Error,
191
- OpenSSL::SSL::SSLError,
192
- Net::HTTPBadResponse,
193
- Zlib::Error => e
194
-
195
- build_request_error(uri, e, ServerError)
196
- rescue SystemCallError,
197
- SocketError,
198
- IOError => e
199
-
200
- build_request_error(uri, e, ClientError)
203
+ rescue *REQUEST_ERRORS.keys => e
204
+ build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
201
205
  end
202
206
 
203
207
  def self.build_request_error(uri, error, error_wrapper_klass)
@@ -1,3 +1,4 @@
1
+ require 'set'
1
2
  require 'robots'
2
3
 
3
4
  require 'wayback_archiver/sitemap'
@@ -27,6 +28,7 @@ module WaybackArchiver
27
28
  WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
28
29
  robots = Robots.new(WaybackArchiver.user_agent)
29
30
  sitemaps = robots.other_values(url)['Sitemap']
31
+
30
32
  if sitemaps
31
33
  return sitemaps.flat_map do |sitemap|
32
34
  WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
@@ -61,12 +63,21 @@ module WaybackArchiver
61
63
  # @example Get URLs defined in Sitemap
62
64
  # Sitemapper.urls(xml: xml)
63
65
  # @see http://www.sitemaps.org
64
- def self.urls(url: nil, xml: nil)
66
+ def self.urls(url: nil, xml: nil, visited: Set.new)
67
+ if visited.include?(url)
68
+ WaybackArchiver.logger.debug "Already visited #{url} skipping.."
69
+ return []
70
+ end
71
+
72
+ visited << url if url
73
+
65
74
  xml = Request.get(url).body unless xml
66
75
  sitemap = Sitemap.new(xml)
67
76
 
68
77
  if sitemap.sitemap_index?
69
- sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
78
+ sitemap.sitemaps.flat_map do |sitemap_url|
79
+ urls(url: sitemap_url, visited: visited)
80
+ end
70
81
  else
71
82
  sitemap.urls
72
83
  end
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.2.0'.freeze
3
+ VERSION = '1.2.1'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-20 00:00:00.000000000 Z
11
+ date: 2018-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -100,14 +100,28 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '0.8'
103
+ version: '0.9'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '0.8'
110
+ version: '0.9'
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: 0.14.1
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: 0.14.1
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: coveralls
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -207,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
207
221
  version: '0'
208
222
  requirements: []
209
223
  rubyforge_project:
210
- rubygems_version: 2.6.11
224
+ rubygems_version: 2.7.6
211
225
  signing_key:
212
226
  specification_version: 4
213
227
  summary: Post URLs to Wayback Machine (Internet Archive)