wayback_archiver 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
|
4
|
+
data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
|
7
|
+
data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
|
data/lib/wayback_archiver.rb
CHANGED
@@ -43,15 +43,15 @@ module WaybackArchiver
|
|
43
43
|
# WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
|
44
44
|
# WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
|
45
45
|
# WaybackArchiver.archive('example.com', :url)
|
46
|
-
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
46
|
+
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
47
47
|
strategy = legacy_strategy || strategy
|
48
48
|
|
49
49
|
case strategy.to_s
|
50
|
-
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
|
51
|
-
when 'auto' then auto(source, concurrency: concurrency, limit: limit)
|
52
|
-
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
|
53
|
-
when 'urls' then urls(source, concurrency: concurrency, limit: limit)
|
54
|
-
when 'url' then urls(source, concurrency: concurrency, limit: limit)
|
50
|
+
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
|
51
|
+
when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
|
52
|
+
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
|
53
|
+
when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
|
54
|
+
when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
|
55
55
|
else
|
56
56
|
raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
|
57
57
|
end
|
@@ -69,11 +69,11 @@ module WaybackArchiver
|
|
69
69
|
# @example Auto archive example.com and archive max 100 URLs
|
70
70
|
# WaybackArchiver.auto('example.com', limit: 100)
|
71
71
|
# @see http://www.sitemaps.org
|
72
|
-
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
72
|
+
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
73
73
|
urls = Sitemapper.autodiscover(source)
|
74
|
-
return urls(urls, concurrency: concurrency) if urls.any?
|
74
|
+
return urls(urls, concurrency: concurrency, &block) if urls.any?
|
75
75
|
|
76
|
-
crawl(source, concurrency: concurrency)
|
76
|
+
crawl(source, concurrency: concurrency, &block)
|
77
77
|
end
|
78
78
|
|
79
79
|
# Crawl site for URLs to send to the Wayback Machine.
|
@@ -86,9 +86,9 @@ module WaybackArchiver
|
|
86
86
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
87
87
|
# @example Crawl example.com and archive max 100 URLs
|
88
88
|
# WaybackArchiver.crawl('example.com', limit: 100)
|
89
|
-
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
89
|
+
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
90
90
|
WaybackArchiver.logger.info "Crawling #{url}"
|
91
|
-
Archive.crawl(url, concurrency: concurrency, limit: limit)
|
91
|
+
Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
|
92
92
|
end
|
93
93
|
|
94
94
|
# Get URLs from sitemap and send found URLs to the Wayback Machine.
|
@@ -102,9 +102,9 @@ module WaybackArchiver
|
|
102
102
|
# @example Get example.com sitemap and archive max 100 URLs
|
103
103
|
# WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
|
104
104
|
# @see http://www.sitemaps.org
|
105
|
-
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
105
|
+
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
106
106
|
WaybackArchiver.logger.info "Fetching Sitemap"
|
107
|
-
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
|
107
|
+
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
|
108
108
|
end
|
109
109
|
|
110
110
|
# Send URL to the Wayback Machine.
|
@@ -117,8 +117,8 @@ module WaybackArchiver
|
|
117
117
|
# WaybackArchiver.urls(%w(example.com google.com))
|
118
118
|
# @example Archive example.com, max 100 URLs
|
119
119
|
# WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
|
120
|
-
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
121
|
-
Archive.post(Array(urls), concurrency: concurrency)
|
120
|
+
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
121
|
+
Archive.post(Array(urls), concurrency: concurrency, &block)
|
122
122
|
end
|
123
123
|
|
124
124
|
# Set logger
|
@@ -180,4 +180,21 @@ module WaybackArchiver
|
|
180
180
|
def self.max_limit
|
181
181
|
@max_limit ||= DEFAULT_MAX_LIMIT
|
182
182
|
end
|
183
|
+
|
184
|
+
# Sets the adapter
|
185
|
+
# @return [#call] the configured adapter
|
186
|
+
# @param adapter [#call] the adapter
|
187
|
+
def self.adapter=(adapter)
|
188
|
+
unless adapter.respond_to?(:call)
|
189
|
+
raise(ArgumentError, 'adapter must implement #call')
|
190
|
+
end
|
191
|
+
|
192
|
+
@adapter = adapter
|
193
|
+
end
|
194
|
+
|
195
|
+
# Returns the configured adapter
|
196
|
+
# @return [#call] the configured or the default adapter
|
197
|
+
def self.adapter
|
198
|
+
@adapter ||= WaybackMachine
|
199
|
+
end
|
183
200
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'wayback_archiver/archive_result'
|
2
|
+
require 'wayback_archiver/request'
|
3
|
+
|
4
|
+
module WaybackArchiver
|
5
|
+
# WaybackMachine adapter
|
6
|
+
class WaybackMachine
|
7
|
+
# Wayback Machine base URL.
|
8
|
+
BASE_URL = 'https://web.archive.org/save/'.freeze
|
9
|
+
|
10
|
+
# Send URL to Wayback Machine.
|
11
|
+
# @return [ArchiveResult] the sent URL.
|
12
|
+
# @param [String] url to send.
|
13
|
+
# @example Archive example.com, with default options
|
14
|
+
# WaybackMachine.call('http://example.com')
|
15
|
+
def self.call(url)
|
16
|
+
request_url = "#{BASE_URL}#{url}"
|
17
|
+
response = Request.get(request_url, follow_redirects: false)
|
18
|
+
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
19
|
+
ArchiveResult.new(
|
20
|
+
url,
|
21
|
+
code: response.code,
|
22
|
+
request_url: response.uri,
|
23
|
+
response_error: response.error
|
24
|
+
)
|
25
|
+
rescue Request::Error => e
|
26
|
+
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
27
|
+
ArchiveResult.new(url, error: e)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,15 +1,11 @@
|
|
1
1
|
require 'concurrent'
|
2
2
|
|
3
|
-
require 'wayback_archiver/archive_result'
|
4
3
|
require 'wayback_archiver/thread_pool'
|
5
|
-
require 'wayback_archiver/request'
|
4
|
+
require 'wayback_archiver/adapters/wayback_machine'
|
6
5
|
|
7
6
|
module WaybackArchiver
|
8
7
|
# Post URL(s) to Wayback Machine
|
9
8
|
class Archive
|
10
|
-
# Wayback Machine base URL.
|
11
|
-
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
12
|
-
|
13
9
|
# Send URLs to Wayback Machine.
|
14
10
|
# @return [Array<ArchiveResult>] with sent URLs.
|
15
11
|
# @param [Array<String>] urls to send to the Wayback Machine.
|
@@ -97,18 +93,7 @@ module WaybackArchiver
|
|
97
93
|
# @example Archive example.com, with default options
|
98
94
|
# Archive.post_url('http://example.com')
|
99
95
|
def self.post_url(url)
|
100
|
-
|
101
|
-
response = Request.get(request_url, follow_redirects: false)
|
102
|
-
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
103
|
-
ArchiveResult.new(
|
104
|
-
url,
|
105
|
-
code: response.code,
|
106
|
-
request_url: response.uri,
|
107
|
-
response_error: response.error
|
108
|
-
)
|
109
|
-
rescue Request::Error => e
|
110
|
-
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
111
|
-
ArchiveResult.new(url, error: e)
|
96
|
+
WaybackArchiver.adapter.call(url)
|
112
97
|
end
|
113
98
|
end
|
114
99
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -175,6 +175,7 @@ extra_rdoc_files: []
|
|
175
175
|
files:
|
176
176
|
- bin/wayback_archiver
|
177
177
|
- lib/wayback_archiver.rb
|
178
|
+
- lib/wayback_archiver/adapters/wayback_machine.rb
|
178
179
|
- lib/wayback_archiver/archive.rb
|
179
180
|
- lib/wayback_archiver/archive_result.rb
|
180
181
|
- lib/wayback_archiver/http_code.rb
|