wayback_archiver 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_archiver +71 -7
- data/lib/wayback_archiver.rb +120 -25
- data/lib/wayback_archiver/archive.rb +45 -23
- data/lib/wayback_archiver/http_code.rb +49 -0
- data/lib/wayback_archiver/null_logger.rb +7 -4
- data/lib/wayback_archiver/request.rb +214 -53
- data/lib/wayback_archiver/sitemap.rb +79 -0
- data/lib/wayback_archiver/sitemapper.rb +75 -0
- data/lib/wayback_archiver/thread_pool.rb +26 -0
- data/lib/wayback_archiver/url_collector.rb +16 -25
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +25 -21
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
+  data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
+  data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
data/bin/wayback_archiver
CHANGED

@@ -1,14 +1,78 @@
 #!/usr/bin/env ruby
 
+require 'optparse'
 require 'wayback_archiver'
 
-
-
+# Default values
+urls = nil
+strategy = 'auto'
+log = STDOUT
+log_level = Logger::INFO
+concurrency = WaybackArchiver.concurrency
 
-
+optparse = OptionParser.new do |parser|
+  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
 
-
-
-
-
+  parser.on('--auto', 'Auto (default)') do |value|
+    strategy = 'auto'
+  end
+
+  parser.on('--crawl', 'Crawl') do |value|
+    strategy = 'crawl'
+  end
+
+  parser.on('--sitemap', 'Sitemap') do |value|
+    strategy = 'sitemap'
+  end
+
+  parser.on('--urls', '--url', 'URL(s)') do |value|
+    strategy = 'urls'
+  end
+
+  parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+    concurrency = value
+  end
+
+  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
+    log = path
+  end
+
+  parser.on('--[no-]verbose', 'Verbose logs') do |value|
+    log_level = value ? Logger::DEBUG : Logger::WARN
+  end
+
+  parser.on('-h', '--help', 'How to use') do
+    puts parser
+    exit
+  end
+
+  # No argument, shows at tail. This will print an options summary.
+  parser.on_tail('-h', '--help', 'Show this message') do
+    puts parser
+    exit
+  end
+
+  parser.on_tail('--version', 'Show version') do
+    puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
+    exit
+  end
+end
+
+optparse.parse!
+
+urls = ARGV.map(&:strip).reject(&:empty?)
+if urls.empty?
+  puts optparse.help
+  raise ArgumentError, "[<url>] is required"
+end
+
+WaybackArchiver.logger = Logger.new(log).tap do |logger|
+  logger.progname = 'WaybackArchiver'
+  logger.level = log_level
+end
+
+# If no strategy has explicitly been given, then default to 'auto'
+strategy ||= 'auto'
+urls.each do |url|
+  WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
 end
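
For orientation, a few sample invocations of the rewritten CLI above; the domain, paths and flag values are placeholders:

# Auto strategy: try Sitemap discovery, fall back to crawling
wayback_archiver example.com

# Force crawling, 10 parallel threads, log to a file
wayback_archiver example.com --crawl --concurrency=10 --log=output.log

# Archive an explicit list of URLs
wayback_archiver https://example.com/about https://example.com/contact --urls
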
data/lib/wayback_archiver.rb
CHANGED

@@ -1,60 +1,155 @@
-require '
-require 'net/http'
-
-require 'concurrent'
-
+require 'wayback_archiver/thread_pool'
 require 'wayback_archiver/null_logger'
 require 'wayback_archiver/version'
 require 'wayback_archiver/url_collector'
 require 'wayback_archiver/archive'
-require 'wayback_archiver/
+require 'wayback_archiver/sitemapper'
 
-# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap
+# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap or by passing a list of URLs.
 module WaybackArchiver
   # Link to gem on rubygems.org, part of the sent User-Agent
   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
   # WaybackArchiver User-Agent
   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
 
+  # Default concurrency for archiving URLs
+  DEFAULT_CONCURRENCY = 5
+
   # Send URLs to Wayback Machine.
-  # @return [Array]
-  # @param [String] source for URL(s).
-  # @param [String/Symbol]
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String/Array<String>] source for URL(s).
+  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
   # @example Crawl example.com and send all URLs of the same domain
-  #   WaybackArchiver.archive('example.com') # Default
+  #   WaybackArchiver.archive('example.com') # Default strategy is :auto
+  #   WaybackArchiver.archive('example.com', strategy: :auto)
+  #   WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
+  #   WaybackArchiver.archive('example.com', :auto)
+  # @example Crawl example.com and send all URLs of the same domain
+  #   WaybackArchiver.archive('example.com', strategy: :crawl)
+  #   WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
   #   WaybackArchiver.archive('example.com', :crawl)
+  # @example Send example.com Sitemap URLs
+  #   WaybackArchiver.archive('example.com', strategy: :sitemap)
+  #   WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
+  #   WaybackArchiver.archive('example.com', :sitemap)
   # @example Send only example.com
+  #   WaybackArchiver.archive('example.com', strategy: :url)
+  #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
   #   WaybackArchiver.archive('example.com', :url)
-
-
-
-  case
-  when '
-  when '
-  when 'sitemap' then
-  when '
+  def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
+    strategy = legacy_strategy || strategy
+
+    case strategy.to_s
+    when 'crawl' then crawl(source, concurrency: concurrency)
+    when 'auto' then auto(source, concurrency: concurrency)
+    when 'sitemap' then sitemap(source, concurrency: concurrency)
+    when 'urls' then urls(source, concurrency: concurrency)
+    when 'url' then urls(source, concurrency: concurrency)
     else
-      raise ArgumentError, "Unknown
+      raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
     end
   end
 
+  # Look for Sitemap(s) and if nothing is found fallback to crawling.
+  # Then send found URLs to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] source (must be a valid URL).
+  # @param concurrency [Integer]
+  # @example Auto archive example.com
+  #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+  # @example Auto archive example.com with low concurrency
+  #   WaybackArchiver.auto('example.com', concurrency: 1)
+  # @see http://www.sitemaps.org
+  def self.auto(source, concurrency: WaybackArchiver.concurrency)
+    urls = Sitemapper.autodiscover(source)
+    return urls(urls, concurrency: concurrency) if urls.any?
+
+    crawl(source, concurrency: concurrency)
+  end
+
   # Crawl site for URLs to send to the Wayback Machine.
-  # @return [Array]
-  # @param [String]
-  # @param [Integer]
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] url to start crawling from.
+  # @param concurrency [Integer]
   # @example Crawl example.com and send all URLs of the same domain
   #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
   # @example Crawl example.com and send all URLs of the same domain with low concurrency
   #   WaybackArchiver.crawl('example.com', concurrency: 1)
-  def self.crawl(
-
+  def self.crawl(url, concurrency: WaybackArchiver.concurrency)
+    WaybackArchiver.logger.info "Crawling #{url}"
+    Archive.crawl(url, concurrency: concurrency)
+  end
+
+  # Get URLs from sitemap and send found URLs to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] url to the sitemap.
+  # @param concurrency [Integer]
+  # @example Get example.com sitemap and archive all found URLs
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+  # @example Get example.com sitemap and archive all found URLs with low concurrency
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
+  # @see http://www.sitemaps.org
+  def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
+    WaybackArchiver.logger.info "Fetching Sitemap"
+    Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
+  end
+
+  # Send URL to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [Array<String>/String] urls or url.
+  # @param concurrency [Integer]
+  # @example Archive example.com
+  #   WaybackArchiver.urls('example.com')
+  # @example Archive example.com and google.com
+  #   WaybackArchiver.urls(%w(example.com google.com))
+  def self.urls(urls, concurrency: WaybackArchiver.concurrency)
+    Archive.post(Array(urls), concurrency: concurrency)
   end
 
+  # Set logger
+  # @return [Object] the set logger
+  # @param [Object] logger an object that quacks like a Logger
+  # @example Set a logger that prints to standard out (STDOUT)
+  #   WaybackArchiver.logger = Logger.new(STDOUT)
   def self.logger=(logger)
     @logger = logger
   end
 
+  # Returns the current logger
+  # @return [Object] the current logger instance
   def self.logger
     @logger ||= NullLogger.new
   end
+
+  # Resets the logger to the default
+  # @return [NullLogger] a new instance of NullLogger
+  def self.default_logger!
+    @logger = NullLogger.new
+  end
+
+  # Sets the user agent
+  # @return [String] the configured user agent
+  # @param [String] user_agent the desired user agent
+  def self.user_agent=(user_agent)
+    @user_agent = user_agent
+  end
+
+  # Returns the configured user agent
+  # @return [String] the configured or the default user agent
+  def self.user_agent
+    @user_agent ||= USER_AGENT
+  end
+
+  # Sets the default concurrency
+  # @return [Integer] the desired default concurrency
+  # @param [Integer] concurrency the desired default concurrency
+  def self.concurrency=(concurrency)
+    @concurrency = concurrency
+  end
+
+  # Returns the default concurrency
+  # @return [Integer] the configured or the default concurrency
+  def self.concurrency
+    @concurrency ||= DEFAULT_CONCURRENCY
+  end
 end
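
A minimal usage sketch of the 1.0.0 module API above, assuming the gem is installed; the URL is a placeholder and both settings are optional:

require 'wayback_archiver'

WaybackArchiver.logger = Logger.new(STDOUT) # default is the silent NullLogger
WaybackArchiver.concurrency = 10            # default is DEFAULT_CONCURRENCY (5)

WaybackArchiver.archive('example.com', strategy: :sitemap)
WaybackArchiver.archive('example.com', :crawl) # 0.x positional form still works
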
data/lib/wayback_archiver/archive.rb
CHANGED

@@ -1,46 +1,68 @@
+require 'concurrent'
+
+require 'wayback_archiver/thread_pool'
+require 'wayback_archiver/request'
+
 module WaybackArchiver
   # Post URL(s) to Wayback Machine
   class Archive
     # Wayback Machine base URL.
     WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
-
-    DEFAULT_CONCURRENCY = 5
+
     # Send URLs to Wayback Machine.
-    # @return [Array] with sent URLs.
-    # @param [Array] urls
-    # @param [
+    # @return [Array<String>] with sent URLs.
+    # @param [Array<String>] urls to send to the Wayback Machine.
+    # @param concurrency [Integer] the default is 5
     # @example Archive urls, asynchronously
     #   Archive.post(['http://example.com'])
     # @example Archive urls, using only 1 thread
     #   Archive.post(['http://example.com'], concurrency: 1)
-    def self.post(urls, concurrency:
-      WaybackArchiver.logger.info "
+    def self.post(urls, concurrency: WaybackArchiver.concurrency)
+      WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
-      WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"
 
-
+      posted_urls = Concurrent::Array.new
+      pool = ThreadPool.build(concurrency)
       urls.each do |url|
-        pool.post
+        pool.post do
+          posted_url = post_url(url)
+          posted_urls << posted_url if posted_url
+        end
       end
 
-
-
+      pool.shutdown
+      pool.wait_for_termination
+
+      WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+      posted_urls
     end
 
     # Send URLs to Wayback Machine by crawling the site.
-    # @return [Array] with URLs sent to the Wayback Machine.
+    # @return [Array<String>] with URLs sent to the Wayback Machine.
     # @param [String] source for URL to crawl.
-    # @param [Integer]
+    # @param concurrency [Integer] the default is 5
    # @example Crawl example.com and send all URLs of the same domain
     #   WaybackArchiver.crawl('example.com')
     # @example Crawl example.com and send all URLs of the same domain with low concurrency
     #   WaybackArchiver.crawl('example.com', concurrency: 1)
-    def self.crawl(source, concurrency:
-
+    def self.crawl(source, concurrency: WaybackArchiver.concurrency)
+      WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
+
+      posted_urls = Concurrent::Array.new
+      pool = ThreadPool.build(concurrency)
 
-
-      pool.post
+      found_urls = URLCollector.crawl(source) do |url|
+        pool.post do
+          posted_url = post_url(url)
+          posted_urls << posted_url if posted_url
+        end
       end
+      WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
+      pool.shutdown
+      pool.wait_for_termination
+
+      WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+      posted_urls
     end
 
     # Send URL to Wayback Machine.
@@ -50,12 +72,12 @@ module WaybackArchiver
     #   Archive.post_url('http://example.com')
     def self.post_url(url)
       request_url = "#{WAYBACK_BASE_URL}#{url}"
-      response = Request.
-      WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
+      response = Request.get(request_url, follow_redirects: false)
+      WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
       url
-    rescue
-      WaybackArchiver.logger.error "
-
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
+      nil
     end
   end
 end
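
Archive.post now runs each submission on a shared thread pool and collects successes in a Concurrent::Array. A rough sketch of calling it directly; the URLs are placeholders:

require 'wayback_archiver'

posted = WaybackArchiver::Archive.post(
  ['http://example.com', 'http://example.com/about'],
  concurrency: 2
)
# post_url returns nil on Request::Error, so `posted` contains only the
# URLs that were actually submitted
puts "archived #{posted.length} URL(s)"
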
data/lib/wayback_archiver/http_code.rb
ADDED

@@ -0,0 +1,49 @@
+module WaybackArchiver
+  # Convenience class for HTTP response codes
+  class HTTPCode
+    # Type of code as symbol
+    # @return [Symbol] code type
+    # @param [String/Integer] code the response code
+    # @example
+    #   HTTPCode.type('200')
+    def self.type(code)
+      code = code.to_s
+      return :success if success?(code)
+      return :redirect if redirect?(code)
+      return :error if error?(code)
+
+      :unknown
+    end
+
+    # Whether the code is a success type
+    # @return [Boolean] is success or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.success?('200') # => true
+    # @example
+    #   HTTPCode.success?(200) # => true
+    # @example
+    #   HTTPCode.success?(nil) # => false
+    def self.success?(code)
+      code.to_s.match?(/2\d\d/)
+    end
+
+    # Whether the code is a redirect type
+    # @return [Boolean] is redirect or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.redirect?('301')
+    def self.redirect?(code)
+      code.to_s.match?(/3\d\d/)
+    end
+
+    # Whether the code is an error type
+    # @return [Boolean] is error or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.error?('500')
+    def self.error?(code)
+      code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
+    end
+  end
+end
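
The new HTTPCode helper buckets raw status codes (strings or integers) into coarse categories, for example:

WaybackArchiver::HTTPCode.type('200')  # => :success
WaybackArchiver::HTTPCode.type(302)    # => :redirect
WaybackArchiver::HTTPCode.type('503')  # => :error
WaybackArchiver::HTTPCode.type('abc')  # => :unknown
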
data/lib/wayback_archiver/null_logger.rb
CHANGED

@@ -1,9 +1,12 @@
 require 'logger'
 
-
-
-
+module WaybackArchiver
+  # Don't log anything / Send the logs to the abyss
+  class NullLogger < Logger
+    # Allow any and all params
+    def initialize(*args); end
 
-
+    # Allow any and all params and don't do anything
+    def add(*args, &block); end
   end
 end
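
NullLogger subclasses Logger but discards every message, which is what keeps logging opt-in for library users; e.g.:

WaybackArchiver.logger.info 'discarded'     # NullLogger: no output
WaybackArchiver.logger = Logger.new(STDOUT)
WaybackArchiver.logger.info 'now visible'
WaybackArchiver.default_logger!             # back to silence
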
data/lib/wayback_archiver/request.rb
CHANGED

@@ -1,62 +1,223 @@
-require '
+require 'net/http'
+require 'openssl'
+require 'timeout'
+require 'uri'
+require 'zlib'
+
+require 'wayback_archiver/http_code'
 
 module WaybackArchiver
-  #
+  # Make HTTP requests
   class Request
-    #
-
-    #
-
-    #
-
-    #
-
-
+    # General error, something went wrong
+    class Error < StandardError; end
+    # Client error, something went wrong on the local machine
+    class ClientError < Error; end
+    # Server error, the remote server did something wrong
+    class ServerError < Error; end
+    # Remote server responded with a HTTP error
+    class HTTPError < ServerError; end
+    # Remote server error
+    class ResponseError < ServerError; end
+    # Max redirects reached error
+    class MaxRedirectError < ServerError; end
+    # Remote server responded with an invalid redirect
+    class InvalidRedirectError < ServerError; end
+    # Remote server responded with an unknown HTTP code
+    class UnknownResponseCodeError < ServerError; end
+
+    # GET response wrapper
+    GETStruct = Struct.new(:response, :error)
+
+    # Max number of redirects before an error is raised
+    MAX_REDIRECTS = 10
+
+    # Response data struct
+    Response = Struct.new(:code, :message, :body, :uri, :error)
+    class Response
+      # Returns true if a successful response
+      # @example Check if Response was successful
+      #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
+      #   response.success? # => true
+      def success?
+        HTTPCode.success?(code)
+      end
    end
 
     # Get response.
-    # @return [
-    # @param [String]
-    # @param [
-    # @
-    #
-    #
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # @return [Response] the http response representation.
+    # @param [String, URI] uri to retrieve.
+    # @param max_redirects [Integer] max redirects (default: 10).
+    # @param follow_redirects [Boolean] follow redirects (default: true).
+    # @example Get example.com
+    #   Request.get('example.com')
+    # @example Get http://example.com and follow max 3 redirects
+    #   Request.get('http://example.com', max_redirects: 3)
+    # @example Get http://example.com and don't follow redirects
+    #   Request.get('http://example.com', follow_redirects: false)
+    # @raise [Error] super class of all exceptions that this method can raise
+    # @raise [ServerError] all server errors
+    # @raise [ClientError] all client errors
+    # @raise [HTTPError] all HTTP errors
+    # @raise [MaxRedirectError] too many redirects, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    def self.get(
+      uri,
+      max_redirects: MAX_REDIRECTS,
+      raise_on_http_error: false,
+      follow_redirects: true
+    )
+      uri = build_uri(uri)
+
+      redirect_count = 0
+      until redirect_count > max_redirects
+        WaybackArchiver.logger.debug "Requesting #{uri}"
+
+        http = Net::HTTP.new(uri.host, uri.port)
+        if uri.scheme == 'https'
+          http.use_ssl = true
+          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        end
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = WaybackArchiver.user_agent
+
+        result = perform_request(uri, http, request)
+        response = result.response
+        error = result.error
+
+        raise error if error
+
+        code = response.code
+        WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
+
+        case HTTPCode.type(code)
+        when :success
+          return build_response(uri, response)
+        when :redirect
+          return build_response(uri, response) unless follow_redirects
+
+          uri = build_redirect_uri(uri, response)
+          redirect_count += 1
+          next
+        when :error
+          if raise_on_http_error
+            raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
+          end
+
+          return build_response(uri, response)
+        else
+          raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
+        end
+      end
+
+      raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
+    end
+
+    # Builds a Response object.
+    # @return [Response]
+    # @param [URI] uri that was requested.
+    # @param [Net::HTTPResponse] response the server response.
+    # @example Build Response object for example.com
+    #   Request.build_response(uri, net_http_response)
+    def self.build_response(uri, response)
+      Response.new(
+        response.code,
+        response.message,
+        parse_body(response.body),
+        uri.to_s
+      )
+    end
+
+    # Builds an URI for a redirect response.
+    # @return [URI] to redirect to.
+    # @param [URI] uri that was requested.
+    # @param [Net::HTTPResponse] response the server response.
+    # @example Build redirect URI for example.com (let's pretend it will redirect..)
+    #   Request.build_redirect_uri('http://example.com', net_http_response)
+    def self.build_redirect_uri(uri, response)
+      location_header = response.header.fetch('location') do
+        raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
+      end
+
+      location = URI.parse(location_header)
+      return build_uri(uri) + location_header if location.relative?
+
+      location
+    end
+
+    # Build URI.
+    # @return [URI] uri to redirect to.
+    # @param [URI, String] uri to build.
+    # @example Build URI for example.com
+    #   Request.build_uri('http://example.com')
+    # @example Build URI for #<URI::HTTP http://example.com>
+    #   uri = URI.parse('http://example.com')
+    #   Request.build_uri(uri)
+    def self.build_uri(uri)
+      return uri if uri.is_a?(URI)
+
+      uri = "http://#{uri}" unless uri =~ %r{^https?://}
+      URI.parse(uri)
+    end
+
+    # Parse response body, handles regular and gzipped response bodies.
+    # @return [String] the response body.
+    # @param [String] response_body the server response body.
+    # @example Return response body for response.
+    #   Request.parse_body(net_http_response.body)
+    def self.parse_body(response_body)
+      return '' unless response_body
+
+      Zlib::GzipReader.new(StringIO.new(response_body)).read
+    rescue Zlib::GzipFile::Error => _e
+      response_body
+    end
+
+    # Return whether a value is blank or not.
+    # @return [Boolean] whether the value is blank or not.
+    # @param [Object] value the value to check if it's blank or not.
+    # @example Returns true for nil.
+    #   Request.blank?(nil)
+    # @example Returns true for empty string.
+    #   Request.blank?('')
+    # @example Returns true for string with only spaces.
+    #   Request.blank?(' ')
+    def self.blank?(value)
+      return true unless value
+      return true if value.strip.empty?
+
+      false
+    end
+
+    private
+
+    def self.perform_request(uri, http, request)
+      # TODO: Consider retrying failed requests
+      response = http.request(request)
+      GETStruct.new(response)
+    rescue Timeout::Error,
+           OpenSSL::SSL::SSLError,
+           Net::HTTPBadResponse,
+           Zlib::Error => e
+
+      build_request_error(uri, e, ServerError)
+    rescue SystemCallError,
+           SocketError,
+           IOError => e
+
+      build_request_error(uri, e, ClientError)
+    end
+
+    def self.build_request_error(uri, error, error_wrapper_klass)
+      WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
+
+      GETStruct.new(
+        Response.new,
+        error_wrapper_klass.new("#{error.class}, #{error.message}")
+      )
    end
   end
 end
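
A sketch of calling the rewritten Request API, assuming network access; every failure Request raises is a subclass of Request::Error, so a single rescue covers client, server and redirect errors:

begin
  response = WaybackArchiver::Request.get('example.com', max_redirects: 3)
  puts "#{response.code} #{response.message}" if response.success?
rescue WaybackArchiver::Request::Error => e
  warn "request failed: #{e.class}: #{e.message}"
end
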
data/lib/wayback_archiver/sitemap.rb
ADDED

@@ -0,0 +1,79 @@
+require 'rexml/document'
+
+module WaybackArchiver
+  # Parse Sitemaps, https://www.sitemaps.org
+  class Sitemap
+    attr_reader :document
+
+    def initialize(xml, strict: false)
+      @document = REXML::Document.new(xml)
+    rescue REXML::ParseException => _e
+      raise if strict
+
+      @document = REXML::Document.new('')
+    end
+
+    # Return all URLs defined in Sitemap.
+    # @return [Array<String>] of URLs defined in Sitemap.
+    # @example Get URLs defined in Sitemap
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.urls
+    def urls
+      @urls ||= extract_urls('url')
+    end
+
+    # Return all sitemap URLs defined in Sitemap.
+    # @return [Array<String>] of Sitemap URLs defined in Sitemap.
+    # @example Get Sitemap URLs defined in Sitemap
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.sitemaps
+    def sitemaps
+      @sitemaps ||= extract_urls('sitemap')
+    end
+
+    # Check if sitemap is a plain file
+    # @return [Boolean] whether document is plain
+    def plain_document?
+      document.elements.empty?
+    end
+
+    # Return the name of the document (if there is one)
+    # @return [String] the document root name
+    def root_name
+      return unless document.root
+
+      document.root.name
+    end
+
+    # Returns true if Sitemap is a Sitemap index
+    # @return [Boolean] whether the Sitemap is a Sitemap index or not
+    # @example Check if Sitemap is a sitemap index
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.sitemap_index?
+    def sitemap_index?
+      root_name == 'sitemapindex'
+    end
+
+    # Returns true if Sitemap lists regular URLs
+    # @return [Boolean] whether the Sitemap is a regular URL list
+    # @example Check if Sitemap is a regular URL list
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.urlset?
+    def urlset?
+      root_name == 'urlset'
+    end
+
+    private
+
+    # Extract URLs from Sitemap
+    def extract_urls(node_name)
+      return document.to_s.each_line.map(&:strip) if plain_document?
+
+      urls = []
+      document.root.elements.each("#{node_name}/loc") do |element|
+        urls << element.text
+      end
+      urls
+    end
+  end
+end
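
The parser accepts sitemap XML and, via plain_document?, falls back to treating the input as a plain URL-per-line file. A small sketch with a stripped-down urlset (namespace attributes omitted for brevity):

xml = <<~XML
  <urlset>
    <url><loc>http://example.com/</loc></url>
    <url><loc>http://example.com/about</loc></url>
  </urlset>
XML

sitemap = WaybackArchiver::Sitemap.new(xml)
sitemap.urlset?        # => true
sitemap.sitemap_index? # => false
sitemap.urls           # => ["http://example.com/", "http://example.com/about"]
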
data/lib/wayback_archiver/sitemapper.rb
ADDED

@@ -0,0 +1,75 @@
+require 'robots'
+
+require 'wayback_archiver/sitemap'
+require 'wayback_archiver/request'
+
+module WaybackArchiver
+  # Fetch and parse sitemaps recursively
+  class Sitemapper
+    # Common locations for Sitemap(s)
+    COMMON_SITEMAP_LOCATIONS = %w[
+      sitemap_index.xml.gz
+      sitemap-index.xml.gz
+      sitemap_index.xml
+      sitemap-index.xml
+      sitemap.xml.gz
+      sitemap.xml
+    ].freeze
+
+    # Autodiscover the location of the Sitemap, then fetch and parse recursively.
+    # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
+    # @return [Array<String>] of URLs defined in Sitemap(s).
+    # @param [URI] url to domain.
+    # @example Get URLs defined in Sitemap for google.com
+    #   Sitemapper.autodiscover('https://google.com/')
+    # @see http://www.sitemaps.org
+    def self.autodiscover(url)
+      WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
+      robots = Robots.new(WaybackArchiver.user_agent)
+      sitemaps = robots.other_values(url)['Sitemap']
+      if sitemaps
+        return sitemaps.flat_map do |sitemap|
+          WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
+          urls(url: sitemap)
+        end
+      end
+
+      COMMON_SITEMAP_LOCATIONS.each do |path|
+        WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
+        sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
+        response = Request.get(sitemap_url, raise_on_http_error: false)
+        return urls(xml: response.body) if response.success?
+      end
+
+      WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
+      urls(url: url)
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+      []
+    end
+
+    # Fetch and parse sitemaps recursively.
+    # @return [Array<String>] of URLs defined in Sitemap(s).
+    # @param url [String] URL to Sitemap.
+    # @param xml [String] Sitemap XML.
+    # @example Get URLs defined in Sitemap for google.com
+    #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
+    # @example Get URLs defined in Sitemap
+    #   Sitemapper.urls(xml: xml)
+    # @see http://www.sitemaps.org
+    def self.urls(url: nil, xml: nil)
+      xml = Request.get(url).body unless xml
+      sitemap = Sitemap.new(xml)
+
+      if sitemap.sitemap_index?
+        sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
+      else
+        sitemap.urls
+      end
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+
+      []
+    end
+  end
+end
data/lib/wayback_archiver/thread_pool.rb
ADDED

@@ -0,0 +1,26 @@
+require 'concurrent'
+
+module WaybackArchiver
+  # Thread pool
+  class ThreadPool
+    # Build a thread pool
+    # @return [Concurrent::FixedThreadPool/Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
+    # @param [Integer] concurrency the desired concurrency
+    # @example Build a thread pool with 10 as the desired concurrency
+    #   pool = ThreadPool.build(10)
+    #   pool.post { some_work } # Returns a Concurrent::FixedThreadPool
+    # @example Build a thread pool with 1 as the desired concurrency
+    #   pool = ThreadPool.build(1)
+    #   pool.post { some_work } # Returns a Concurrent::ImmediateExecutor
+    # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
+    def self.build(concurrency)
+      if concurrency == 1
+        Concurrent::ImmediateExecutor.new
+      elsif concurrency > 1
+        Concurrent::FixedThreadPool.new(concurrency)
+      else
+        raise ArgumentError, 'concurrency must be one or greater'
+      end
+    end
+  end
+end
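
Both executor types share one interface, which is why Archive can post work, shut down and wait without branching on concurrency. A quick sketch (some_work is a placeholder):

pool = WaybackArchiver::ThreadPool.build(5) # Concurrent::FixedThreadPool
pool.post { some_work }
pool.shutdown
pool.wait_for_termination

WaybackArchiver::ThreadPool.build(1)        # Concurrent::ImmediateExecutor: runs blocks inline
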
data/lib/wayback_archiver/url_collector.rb
CHANGED

@@ -1,51 +1,42 @@
 require 'spidr'
 require 'robots'
 
+require 'wayback_archiver/sitemapper'
+require 'wayback_archiver/request'
+
 module WaybackArchiver
   # Retrieve URLs from different sources
-  class
+  class URLCollector
     # Retrieve URLs from Sitemap.
-    # @return [Array] of URLs defined in Sitemap.
+    # @return [Array<String>] of URLs defined in Sitemap.
     # @param [String] url domain to retrieve Sitemap from.
     # @example Get URLs defined in Sitemap for google.com
-    #
+    #   URLCollector.sitemap('https://google.com/sitemap.xml')
     def self.sitemap(url)
-
-      sitemap = Request.document(resolved)
-      sitemap.css('loc').map(&:text)
+      Sitemapper.urls(url: Request.build_uri(url))
     end
 
     # Retrieve URLs by crawling.
-    # @return [Array] of URLs defined found during crawl.
+    # @return [Array<String>] of URLs found during crawl.
     # @param [String] url domain to crawl URLs from.
     # @example Crawl URLs defined on example.com
-    #
+    #   URLCollector.crawl('http://example.com')
     def self.crawl(url)
       urls = []
-
-
+      start_at_url = Request.build_uri(url).to_s
+      options = {
+        robots: true,
+        user_agent: WaybackArchiver.user_agent
+      }
+      Spidr.site(start_at_url, options) do |spider|
        spider.every_html_page do |page|
           page_url = page.url.to_s
           urls << page_url
-          WaybackArchiver.logger.
+          WaybackArchiver.logger.debug "Found: #{page_url}"
           yield(page_url) if block_given?
         end
       end
       urls
     end
-
-    # Retrieve URLs listed in file.
-    # @return [Array] of URLs defined in file.
-    # @param [String] path to get URLs from.
-    # @example Get URLs defined in /path/to/file
-    #   UrlCollector.file('/path/to/file')
-    def self.file(path)
-      raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-      urls = []
-      File.open(path).read
-        .gsub(/\r\n?/, "\n")
-        .each_line { |line| urls << line.delete("\n").strip }
-      urls.reject(&:empty?)
-    end
   end
 end
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-
+date: 2017-08-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -38,20 +38,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
        version: '0.1'
-- !ruby/object:Gem::Dependency
-  name: url_resolver
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.1'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.1'
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
   requirement: !ruby/object:Gem::Requirement
@@ -128,14 +114,14 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
 - !ruby/object:Gem::Dependency
   name: redcarpet
   requirement: !ruby/object:Gem::Requirement
@@ -150,6 +136,20 @@ dependencies:
     - - "~>"
      - !ruby/object:Gem::Version
        version: '3.2'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
@@ -164,8 +164,8 @@ dependencies:
     - - ">"
      - !ruby/object:Gem::Version
        version: '0'
-description:
-
+description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
+  Sitemap(s) or a list of URLs.
 email:
 - burenstam@gmail.com
 executables:
@@ -176,8 +176,12 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver.rb
 - lib/wayback_archiver/archive.rb
+- lib/wayback_archiver/http_code.rb
 - lib/wayback_archiver/null_logger.rb
 - lib/wayback_archiver/request.rb
+- lib/wayback_archiver/sitemap.rb
+- lib/wayback_archiver/sitemapper.rb
+- lib/wayback_archiver/thread_pool.rb
 - lib/wayback_archiver/url_collector.rb
 - lib/wayback_archiver/version.rb
 homepage: https://github.com/buren/wayback_archiver
@@ -203,5 +207,5 @@ rubyforge_project:
 rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
-summary:
+summary: Post URLs to Wayback Machine (Internet Archive)
 test_files: []