wayback_archiver 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_archiver +2 -0
- data/lib/wayback_archiver/archive.rb +31 -15
- data/lib/wayback_archiver/null_logger.rb +9 -0
- data/lib/wayback_archiver/url_collector.rb +1 -1
- data/lib/wayback_archiver/version.rb +1 -1
- data/lib/wayback_archiver.rb +24 -2
- metadata +22 -7
- data/lib/wayback_archiver/process_queue.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
|
4
|
+
data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
|
7
|
+
data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
|
data/bin/wayback_archiver
CHANGED
data/lib/wayback_archiver/archive.rb
CHANGED
@@ -4,29 +4,45 @@ module WaybackArchiver
|
|
4
4
|
# Wayback Machine base URL.
|
5
5
|
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
6
6
|
# Default concurrency for archiving URLs
|
7
|
-
DEFAULT_CONCURRENCY =
|
7
|
+
DEFAULT_CONCURRENCY = 5
|
8
8
|
# Send URLs to Wayback Machine.
|
9
9
|
# @return [Array] with sent URLs.
|
10
10
|
# @param [Array] urls URLs to send.
|
11
11
|
# @param [Hash] options
|
12
|
-
# @example Archive
|
12
|
+
# @example Archive urls, asynchronously
|
13
13
|
# Archive.post(['http://example.com'])
|
14
|
-
# @example Archive
|
14
|
+
# @example Archive urls, using only 1 thread
|
15
15
|
# Archive.post(['http://example.com'], concurrency: 1)
|
16
|
-
def self.post(urls,
|
17
|
-
|
18
|
-
concurrency
|
19
|
-
|
20
|
-
puts "=== WAYBACK ARCHIVER ==="
|
21
|
-
puts "Request are sent with up to #{concurrency} parallel threads"
|
22
|
-
puts "Total urls to be sent: #{urls.length}"
|
16
|
+
def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
|
17
|
+
WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
|
18
|
+
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
19
|
+
WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"
|
23
20
|
|
24
|
-
|
21
|
+
pool = Concurrent::FixedThreadPool.new(concurrency)
|
22
|
+
urls.each do |url|
|
23
|
+
pool.post { Archive.post_url(url) }
|
24
|
+
end
|
25
25
|
|
26
|
-
|
26
|
+
WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
|
27
27
|
urls
|
28
28
|
end
|
29
29
|
|
30
|
+
# Send URLs to Wayback Machine by crawling the site.
|
31
|
+
# @return [Array] with URLs sent to the Wayback Machine.
|
32
|
+
# @param [String] source for URL to crawl.
|
33
|
+
# @param [Integer] concurrency (default is 5).
|
34
|
+
# @example Crawl example.com and send all URLs of the same domain
|
35
|
+
# WaybackArchiver.crawl('example.com')
|
36
|
+
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
37
|
+
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
38
|
+
def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
|
39
|
+
pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
|
40
|
+
|
41
|
+
UrlCollector.crawl(source) do |url|
|
42
|
+
pool.post { Archive.post_url(url) }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
30
46
|
# Send URL to Wayback Machine.
|
31
47
|
# @return [String] the sent URL.
|
32
48
|
# @param [String] url to send.
|
@@ -35,11 +51,11 @@ module WaybackArchiver
|
|
35
51
|
def self.post_url(url)
|
36
52
|
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
37
53
|
response = Request.response(request_url)
|
38
|
-
|
54
|
+
WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
|
39
55
|
url
|
40
56
|
rescue Exception => e
|
41
|
-
|
42
|
-
|
57
|
+
WaybackArchiver.logger.error "Error message: #{e.message}"
|
58
|
+
WaybackArchiver.logger.error "Failed to archive: #{url}"
|
43
59
|
end
|
44
60
|
end
|
45
61
|
end
|
data/lib/wayback_archiver.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
require 'uri'
|
2
2
|
require 'net/http'
|
3
3
|
|
4
|
+
require 'concurrent'
|
5
|
+
|
6
|
+
require 'wayback_archiver/null_logger'
|
4
7
|
require 'wayback_archiver/version'
|
5
8
|
require 'wayback_archiver/url_collector'
|
6
9
|
require 'wayback_archiver/archive'
|
7
10
|
require 'wayback_archiver/request'
|
8
|
-
require 'wayback_archiver/process_queue'
|
9
11
|
|
10
12
|
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
11
13
|
module WaybackArchiver
|
@@ -28,11 +30,31 @@ module WaybackArchiver
|
|
28
30
|
def self.archive(source, type = :crawl)
|
29
31
|
case type.to_s
|
30
32
|
when 'file' then Archive.post(UrlCollector.file(source))
|
31
|
-
when 'crawl' then
|
33
|
+
when 'crawl' then crawl(source)
|
32
34
|
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
33
35
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
34
36
|
else
|
35
37
|
raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
|
36
38
|
end
|
37
39
|
end
|
40
|
+
|
41
|
+
# Crawl site for URLs to send to the Wayback Machine.
|
42
|
+
# @return [Array] with URLs sent to the Wayback Machine.
|
43
|
+
# @param [String] source for URL(s).
|
44
|
+
# @param [Integer] concurrency.
|
45
|
+
# @example Crawl example.com and send all URLs of the same domain
|
46
|
+
# WaybackArchiver.crawl('example.com') # Default concurrency is 5
|
47
|
+
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
48
|
+
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
49
|
+
def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
|
50
|
+
Archive.crawl(source, concurrency: concurrency)
|
51
|
+
end
|
52
|
+
|
53
|
+
def self.logger=(logger)
|
54
|
+
@logger = logger
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.logger
|
58
|
+
@logger ||= NullLogger.new
|
59
|
+
end
|
38
60
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-07-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.1'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: concurrent-ruby
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: bundler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,7 +164,8 @@ dependencies:
|
|
150
164
|
- - ">"
|
151
165
|
- !ruby/object:Gem::Version
|
152
166
|
version: '0'
|
153
|
-
description: Send URLs to Wayback Machine. By crawling, sitemap,
|
167
|
+
description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
|
168
|
+
file or single URL.
|
154
169
|
email:
|
155
170
|
- burenstam@gmail.com
|
156
171
|
executables:
|
@@ -161,7 +176,7 @@ files:
|
|
161
176
|
- bin/wayback_archiver
|
162
177
|
- lib/wayback_archiver.rb
|
163
178
|
- lib/wayback_archiver/archive.rb
|
164
|
-
- lib/wayback_archiver/process_queue.rb
|
179
|
+
- lib/wayback_archiver/null_logger.rb
|
165
180
|
- lib/wayback_archiver/request.rb
|
166
181
|
- lib/wayback_archiver/url_collector.rb
|
167
182
|
- lib/wayback_archiver/version.rb
|
@@ -177,7 +192,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
177
192
|
requirements:
|
178
193
|
- - ">="
|
179
194
|
- !ruby/object:Gem::Version
|
180
|
-
version:
|
195
|
+
version: 2.0.0
|
181
196
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
182
197
|
requirements:
|
183
198
|
- - ">="
|
@@ -185,8 +200,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
185
200
|
version: '0'
|
186
201
|
requirements: []
|
187
202
|
rubyforge_project:
|
188
|
-
rubygems_version: 2.6.
|
203
|
+
rubygems_version: 2.6.11
|
189
204
|
signing_key:
|
190
205
|
specification_version: 4
|
191
|
-
summary: Send URLs to Wayback Machine
|
206
|
+
summary: Send URLs to Wayback Machine (Internet Archive)
|
192
207
|
test_files: []
|
data/lib/wayback_archiver/process_queue.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
require 'thread'
|
2
|
-
|
3
|
-
module WaybackArchiver
|
4
|
-
class ProcessQueue
|
5
|
-
# Process enumerable data in parallel.
|
6
|
-
# @return [Array] of URLs defined found during crawl.
|
7
|
-
# @param [Object] Enumberable object
|
8
|
-
# @example Print list of names in parallel
|
9
|
-
# ProcessQueue.process(%w(jacob peter eva)) { |v| puts n }
|
10
|
-
# @example Print list of names using 2 threads
|
11
|
-
# ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts n }
|
12
|
-
def self.process(data_array, threads_count: 5)
|
13
|
-
queue = Queue.new
|
14
|
-
data_array.each { |data| queue.push(data) }
|
15
|
-
workers = threads_count.times.map do
|
16
|
-
Thread.new do
|
17
|
-
begin
|
18
|
-
while data = queue.pop(true)
|
19
|
-
yield(data)
|
20
|
-
end
|
21
|
-
rescue ThreadError
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
workers.map(&:join)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|