wayback_archiver 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
|
4
|
+
data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
|
7
|
+
data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
|
data/lib/wayback_archiver.rb
CHANGED
@@ -43,15 +43,15 @@ module WaybackArchiver
|
|
43
43
|
# WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
|
44
44
|
# WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
|
45
45
|
# WaybackArchiver.archive('example.com', :url)
|
46
|
-
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
46
|
+
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
47
47
|
strategy = legacy_strategy || strategy
|
48
48
|
|
49
49
|
case strategy.to_s
|
50
|
-
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
|
51
|
-
when 'auto' then auto(source, concurrency: concurrency, limit: limit)
|
52
|
-
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
|
53
|
-
when 'urls' then urls(source, concurrency: concurrency, limit: limit)
|
54
|
-
when 'url' then urls(source, concurrency: concurrency, limit: limit)
|
50
|
+
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
|
51
|
+
when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
|
52
|
+
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
|
53
|
+
when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
|
54
|
+
when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
|
55
55
|
else
|
56
56
|
raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
|
57
57
|
end
|
@@ -69,11 +69,11 @@ module WaybackArchiver
|
|
69
69
|
# @example Auto archive example.com and archive max 100 URLs
|
70
70
|
# WaybackArchiver.auto('example.com', limit: 100)
|
71
71
|
# @see http://www.sitemaps.org
|
72
|
-
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
72
|
+
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
73
73
|
urls = Sitemapper.autodiscover(source)
|
74
|
-
return urls(urls, concurrency: concurrency) if urls.any?
|
74
|
+
return urls(urls, concurrency: concurrency, &block) if urls.any?
|
75
75
|
|
76
|
-
crawl(source, concurrency: concurrency)
|
76
|
+
crawl(source, concurrency: concurrency, &block)
|
77
77
|
end
|
78
78
|
|
79
79
|
# Crawl site for URLs to send to the Wayback Machine.
|
@@ -86,9 +86,9 @@ module WaybackArchiver
|
|
86
86
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
87
87
|
# @example Crawl example.com and archive max 100 URLs
|
88
88
|
# WaybackArchiver.crawl('example.com', limit: 100)
|
89
|
-
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
89
|
+
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
90
90
|
WaybackArchiver.logger.info "Crawling #{url}"
|
91
|
-
Archive.crawl(url, concurrency: concurrency, limit: limit)
|
91
|
+
Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
|
92
92
|
end
|
93
93
|
|
94
94
|
# Get URLs from sitemap and send found URLs to the Wayback Machine.
|
@@ -102,9 +102,9 @@ module WaybackArchiver
|
|
102
102
|
# @example Get example.com sitemap archive max 100 URLs
|
103
103
|
# WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
|
104
104
|
# @see http://www.sitemaps.org
|
105
|
-
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
105
|
+
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
106
106
|
WaybackArchiver.logger.info "Fetching Sitemap"
|
107
|
-
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
|
107
|
+
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
|
108
108
|
end
|
109
109
|
|
110
110
|
# Send URL to the Wayback Machine.
|
@@ -117,8 +117,8 @@ module WaybackArchiver
|
|
117
117
|
# WaybackArchiver.urls(%w(example.com google.com))
|
118
118
|
# @example Archive example.com, max 100 URLs
|
119
119
|
# WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
|
120
|
-
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
121
|
-
Archive.post(Array(urls), concurrency: concurrency)
|
120
|
+
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
|
121
|
+
Archive.post(Array(urls), concurrency: concurrency, &block)
|
122
122
|
end
|
123
123
|
|
124
124
|
# Set logger
|
@@ -180,4 +180,21 @@ module WaybackArchiver
|
|
180
180
|
def self.max_limit
|
181
181
|
@max_limit ||= DEFAULT_MAX_LIMIT
|
182
182
|
end
|
183
|
+
|
184
|
+
# Sets the adapter
|
185
|
+
# @return [Object, #call] the configured adapter
|
186
|
+
# @param [Object, #call] adapter the adapter to use (must respond to #call)
|
187
|
+
def self.adapter=(adapter)
|
188
|
+
unless adapter.respond_to?(:call)
|
189
|
+
raise(ArgumentError, 'adapter must implement #call')
|
190
|
+
end
|
191
|
+
|
192
|
+
@adapter = adapter
|
193
|
+
end
|
194
|
+
|
195
|
+
# Returns the configured adapter
|
196
|
+
# @return [Object, #call] the configured or the default adapter
|
197
|
+
def self.adapter
|
198
|
+
@adapter ||= WaybackMachine
|
199
|
+
end
|
183
200
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'wayback_archiver/archive_result'
|
2
|
+
require 'wayback_archiver/request'
|
3
|
+
|
4
|
+
module WaybackArchiver
|
5
|
+
# WaybackMachine adapter
|
6
|
+
class WaybackMachine
|
7
|
+
# Wayback Machine base URL.
|
8
|
+
BASE_URL = 'https://web.archive.org/save/'.freeze
|
9
|
+
|
10
|
+
# Send URL to Wayback Machine.
|
11
|
+
# @return [ArchiveResult] the sent URL.
|
12
|
+
# @param [String] url to send.
|
13
|
+
# @example Archive example.com, with default options
|
14
|
+
# WaybackMachine.call('http://example.com')
|
15
|
+
def self.call(url)
|
16
|
+
request_url = "#{BASE_URL}#{url}"
|
17
|
+
response = Request.get(request_url, follow_redirects: false)
|
18
|
+
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
19
|
+
ArchiveResult.new(
|
20
|
+
url,
|
21
|
+
code: response.code,
|
22
|
+
request_url: response.uri,
|
23
|
+
response_error: response.error
|
24
|
+
)
|
25
|
+
rescue Request::Error => e
|
26
|
+
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
27
|
+
ArchiveResult.new(url, error: e)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,15 +1,11 @@
|
|
1
1
|
require 'concurrent'
|
2
2
|
|
3
|
-
require 'wayback_archiver/archive_result'
|
4
3
|
require 'wayback_archiver/thread_pool'
|
5
|
-
require 'wayback_archiver/request'
|
4
|
+
require 'wayback_archiver/adapters/wayback_machine'
|
6
5
|
|
7
6
|
module WaybackArchiver
|
8
7
|
# Post URL(s) to Wayback Machine
|
9
8
|
class Archive
|
10
|
-
# Wayback Machine base URL.
|
11
|
-
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
12
|
-
|
13
9
|
# Send URLs to Wayback Machine.
|
14
10
|
# @return [Array<ArchiveResult>] with sent URLs.
|
15
11
|
# @param [Array<String>] urls to send to the Wayback Machine.
|
@@ -97,18 +93,7 @@ module WaybackArchiver
|
|
97
93
|
# @example Archive example.com, with default options
|
98
94
|
# Archive.post_url('http://example.com')
|
99
95
|
def self.post_url(url)
|
100
|
-
|
101
|
-
response = Request.get(request_url, follow_redirects: false)
|
102
|
-
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
103
|
-
ArchiveResult.new(
|
104
|
-
url,
|
105
|
-
code: response.code,
|
106
|
-
request_url: response.uri,
|
107
|
-
response_error: response.error
|
108
|
-
)
|
109
|
-
rescue Request::Error => e
|
110
|
-
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
111
|
-
ArchiveResult.new(url, error: e)
|
96
|
+
WaybackArchiver.adapter.call(url)
|
112
97
|
end
|
113
98
|
end
|
114
99
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -175,6 +175,7 @@ extra_rdoc_files: []
|
|
175
175
|
files:
|
176
176
|
- bin/wayback_archiver
|
177
177
|
- lib/wayback_archiver.rb
|
178
|
+
- lib/wayback_archiver/adapters/wayback_machine.rb
|
178
179
|
- lib/wayback_archiver/archive.rb
|
179
180
|
- lib/wayback_archiver/archive_result.rb
|
180
181
|
- lib/wayback_archiver/http_code.rb
|