wayback_archiver 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_archiver +71 -7
- data/lib/wayback_archiver.rb +120 -25
- data/lib/wayback_archiver/archive.rb +45 -23
- data/lib/wayback_archiver/http_code.rb +49 -0
- data/lib/wayback_archiver/null_logger.rb +7 -4
- data/lib/wayback_archiver/request.rb +214 -53
- data/lib/wayback_archiver/sitemap.rb +79 -0
- data/lib/wayback_archiver/sitemapper.rb +75 -0
- data/lib/wayback_archiver/thread_pool.rb +26 -0
- data/lib/wayback_archiver/url_collector.rb +16 -25
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +25 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
|
4
|
+
data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
|
7
|
+
data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
|
data/bin/wayback_archiver
CHANGED
@@ -1,14 +1,78 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'optparse'
|
3
4
|
require 'wayback_archiver'
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
# Default values
|
7
|
+
urls = nil
|
8
|
+
strategy = 'auto'
|
9
|
+
log = STDOUT
|
10
|
+
log_level = Logger::INFO
|
11
|
+
concurrency = WaybackArchiver.concurrency
|
7
12
|
|
8
|
-
|
13
|
+
optparse = OptionParser.new do |parser|
|
14
|
+
parser.banner = 'Usage: wayback_archiver [<url>] [options]'
|
9
15
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
16
|
+
parser.on('--auto', 'Auto (default)') do |value|
|
17
|
+
strategy = 'auto'
|
18
|
+
end
|
19
|
+
|
20
|
+
parser.on('--crawl', 'Crawl') do |value|
|
21
|
+
strategy = 'crawl'
|
22
|
+
end
|
23
|
+
|
24
|
+
parser.on('--sitemap', 'Sitemap') do |value|
|
25
|
+
strategy = 'sitemap'
|
26
|
+
end
|
27
|
+
|
28
|
+
parser.on('--urls', '--url', 'URL(s)') do |value|
|
29
|
+
strategy = 'urls'
|
30
|
+
end
|
31
|
+
|
32
|
+
parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
|
33
|
+
concurrency = value
|
34
|
+
end
|
35
|
+
|
36
|
+
parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
|
37
|
+
log = path
|
38
|
+
end
|
39
|
+
|
40
|
+
parser.on('--[no-]verbose', 'Verboes logs') do |value|
|
41
|
+
log_level = value ? Logger::DEBUG : Logger::WARN
|
42
|
+
end
|
43
|
+
|
44
|
+
parser.on('-h', '--help', 'How to use') do
|
45
|
+
puts parser
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
|
49
|
+
# No argument, shows at tail. This will print an options summary.
|
50
|
+
parser.on_tail('-h', '--help', 'Show this message') do
|
51
|
+
puts parser
|
52
|
+
exit
|
53
|
+
end
|
54
|
+
|
55
|
+
parser.on_tail('--version', 'Show version') do
|
56
|
+
puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
|
57
|
+
exit
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
optparse.parse!
|
62
|
+
|
63
|
+
urls = ARGV.map(&:strip).reject(&:empty?)
|
64
|
+
if urls.empty?
|
65
|
+
puts optparse.help
|
66
|
+
raise ArgumentError, "[<url>] is required"
|
67
|
+
end
|
68
|
+
|
69
|
+
WaybackArchiver.logger = Logger.new(log).tap do |logger|
|
70
|
+
logger.progname = 'WaybackArchiver'
|
71
|
+
logger.level = log_level
|
72
|
+
end
|
73
|
+
|
74
|
+
# If no strategy has explicitly been given, then default to 'auto'
|
75
|
+
strategy ||= 'auto'
|
76
|
+
urls.each do |url|
|
77
|
+
WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
|
14
78
|
end
|
data/lib/wayback_archiver.rb
CHANGED
@@ -1,60 +1,155 @@
|
|
1
|
-
require '
|
2
|
-
require 'net/http'
|
3
|
-
|
4
|
-
require 'concurrent'
|
5
|
-
|
1
|
+
require 'wayback_archiver/thread_pool'
|
6
2
|
require 'wayback_archiver/null_logger'
|
7
3
|
require 'wayback_archiver/version'
|
8
4
|
require 'wayback_archiver/url_collector'
|
9
5
|
require 'wayback_archiver/archive'
|
10
|
-
require 'wayback_archiver/
|
6
|
+
require 'wayback_archiver/sitemapper'
|
11
7
|
|
12
|
-
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap
|
8
|
+
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap or by passing a list of URLs.
|
13
9
|
module WaybackArchiver
|
14
10
|
# Link to gem on rubygems.org, part of the sent User-Agent
|
15
11
|
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
|
16
12
|
# WaybackArchiver User-Agent
|
17
13
|
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
|
18
14
|
|
15
|
+
# Default concurrency for archiving URLs
|
16
|
+
DEFAULT_CONCURRENCY = 5
|
17
|
+
|
19
18
|
# Send URLs to Wayback Machine.
|
20
|
-
# @return [Array]
|
21
|
-
# @param [String] source for URL(s).
|
22
|
-
# @param [String/Symbol]
|
19
|
+
# @return [Array<String>] of URLs sent to the Wayback Machine.
|
20
|
+
# @param [String/Array<String>] source for URL(s).
|
21
|
+
# @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
|
23
22
|
# @example Crawl example.com and send all URLs of the same domain
|
24
|
-
# WaybackArchiver.archive('example.com') # Default
|
23
|
+
# WaybackArchiver.archive('example.com') # Default strategy is :auto
|
24
|
+
# WaybackArchiver.archive('example.com', strategy: :auto)
|
25
|
+
# WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
|
26
|
+
# WaybackArchiver.archive('example.com', :auto)
|
27
|
+
# @example Crawl example.com and send all URLs of the same domain
|
28
|
+
# WaybackArchiver.archive('example.com', strategy: :crawl)
|
29
|
+
# WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
|
25
30
|
# WaybackArchiver.archive('example.com', :crawl)
|
31
|
+
# @example Send example.com Sitemap URLs
|
32
|
+
# WaybackArchiver.archive('example.com', strategy: :sitemap)
|
33
|
+
# WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
|
34
|
+
# WaybackArchiver.archive('example.com', :sitemap)
|
26
35
|
# @example Send only example.com
|
36
|
+
# WaybackArchiver.archive('example.com', strategy: :url)
|
37
|
+
# WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
|
27
38
|
# WaybackArchiver.archive('example.com', :url)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
case
|
32
|
-
when '
|
33
|
-
when '
|
34
|
-
when 'sitemap' then
|
35
|
-
when '
|
39
|
+
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
|
40
|
+
strategy = legacy_strategy || strategy
|
41
|
+
|
42
|
+
case strategy.to_s
|
43
|
+
when 'crawl' then crawl(source, concurrency: concurrency)
|
44
|
+
when 'auto' then auto(source, concurrency: concurrency)
|
45
|
+
when 'sitemap' then sitemap(source, concurrency: concurrency)
|
46
|
+
when 'urls' then urls(source, concurrency: concurrency)
|
47
|
+
when 'url' then urls(source, concurrency: concurrency)
|
36
48
|
else
|
37
|
-
raise ArgumentError, "Unknown
|
49
|
+
raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
|
38
50
|
end
|
39
51
|
end
|
40
52
|
|
53
|
+
# Look for Sitemap(s) and if nothing is found fallback to crawling.
|
54
|
+
# Then send found URLs to the Wayback Machine.
|
55
|
+
# @return [Array<String>] of URLs sent to the Wayback Machine.
|
56
|
+
# @param [String] source (must be a valid URL).
|
57
|
+
# @param concurrency [Integer]
|
58
|
+
# @example Auto archive example.com
|
59
|
+
# WaybackArchiver.auto('example.com') # Default concurrency is 5
|
60
|
+
# @example Auto archive example.com with low concurrency
|
61
|
+
# WaybackArchiver.auto('example.com', concurrency: 1)
|
62
|
+
# @see http://www.sitemaps.org
|
63
|
+
def self.auto(source, concurrency: WaybackArchiver.concurrency)
|
64
|
+
urls = Sitemapper.autodiscover(source)
|
65
|
+
return urls(urls, concurrency: concurrency) if urls.any?
|
66
|
+
|
67
|
+
crawl(source, concurrency: concurrency)
|
68
|
+
end
|
69
|
+
|
41
70
|
# Crawl site for URLs to send to the Wayback Machine.
|
42
|
-
# @return [Array]
|
43
|
-
# @param [String]
|
44
|
-
# @param [Integer]
|
71
|
+
# @return [Array<String>] of URLs sent to the Wayback Machine.
|
72
|
+
# @param [String] url to start crawling from.
|
73
|
+
# @param concurrency [Integer]
|
45
74
|
# @example Crawl example.com and send all URLs of the same domain
|
46
75
|
# WaybackArchiver.crawl('example.com') # Default concurrency is 5
|
47
76
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
48
77
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
49
|
-
def self.crawl(
|
50
|
-
|
78
|
+
def self.crawl(url, concurrency: WaybackArchiver.concurrency)
|
79
|
+
WaybackArchiver.logger.info "Crawling #{url}"
|
80
|
+
Archive.crawl(url, concurrency: concurrency)
|
81
|
+
end
|
82
|
+
|
83
|
+
# Get URLs from sitemap and send found URLs to the Wayback Machine.
|
84
|
+
# @return [Array<String>] of URLs sent to the Wayback Machine.
|
85
|
+
# @param [String] url to the sitemap.
|
86
|
+
# @param concurrency [Integer]
|
87
|
+
# @example Get example.com sitemap and archive all found URLs
|
88
|
+
# WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
|
89
|
+
# @example Get example.com sitemap and archive all found URLs with low concurrency
|
90
|
+
# WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
|
91
|
+
# @see http://www.sitemaps.org
|
92
|
+
def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
|
93
|
+
WaybackArchiver.logger.info "Fetching Sitemap"
|
94
|
+
Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
|
95
|
+
end
|
96
|
+
|
97
|
+
# Send URL to the Wayback Machine.
|
98
|
+
# @return [Array<String>] of URLs sent to the Wayback Machine.
|
99
|
+
# @param [Array<String>/String] urls or url.
|
100
|
+
# @param concurrency [Integer]
|
101
|
+
# @example Archive example.com
|
102
|
+
# WaybackArchiver.urls('example.com')
|
103
|
+
# @example Archive example.com and google.com
|
104
|
+
# WaybackArchiver.urls(%w(example.com google.com))
|
105
|
+
def self.urls(urls, concurrency: WaybackArchiver.concurrency)
|
106
|
+
Archive.post(Array(urls), concurrency: concurrency)
|
51
107
|
end
|
52
108
|
|
109
|
+
# Set logger
|
110
|
+
# @return [Object] the set logger
|
111
|
+
# @param [Object] logger an object that quacks like a Logger
|
112
|
+
# @example set a logger that prints to standard out (STDOUT)
|
113
|
+
# WaybackArchiver.logger = Logger.new(STDOUT)
|
53
114
|
def self.logger=(logger)
|
54
115
|
@logger = logger
|
55
116
|
end
|
56
117
|
|
118
|
+
# Returns the current logger
|
119
|
+
# @return [Object] the current logger instance
|
57
120
|
def self.logger
|
58
121
|
@logger ||= NullLogger.new
|
59
122
|
end
|
123
|
+
|
124
|
+
# Resets the logger to the default
|
125
|
+
# @return [NullLogger] a new instance of NullLogger
|
126
|
+
def self.default_logger!
|
127
|
+
@logger = NullLogger.new
|
128
|
+
end
|
129
|
+
|
130
|
+
# Sets the user agent
|
131
|
+
# @return [String] the configured user agent
|
132
|
+
# @param [String] user_agent the desired user agent
|
133
|
+
def self.user_agent=(user_agent)
|
134
|
+
@user_agent = user_agent
|
135
|
+
end
|
136
|
+
|
137
|
+
# Returns the configured user agent
|
138
|
+
# @return [String] the configured or the default user agent
|
139
|
+
def self.user_agent
|
140
|
+
@user_agent ||= USER_AGENT
|
141
|
+
end
|
142
|
+
|
143
|
+
# Sets the default concurrency
|
144
|
+
# @return [Integer] the desired default concurrency
|
145
|
+
# @param [Integer] concurrency the desired default concurrency
|
146
|
+
def self.concurrency=(concurrency)
|
147
|
+
@concurrency = concurrency
|
148
|
+
end
|
149
|
+
|
150
|
+
# Returns the default concurrency
|
151
|
+
# @return [Integer] the configured or the default concurrency
|
152
|
+
def self.concurrency
|
153
|
+
@concurrency ||= DEFAULT_CONCURRENCY
|
154
|
+
end
|
60
155
|
end
|
@@ -1,46 +1,68 @@
|
|
1
|
+
require 'concurrent'
|
2
|
+
|
3
|
+
require 'wayback_archiver/thread_pool'
|
4
|
+
require 'wayback_archiver/request'
|
5
|
+
|
1
6
|
module WaybackArchiver
|
2
7
|
# Post URL(s) to Wayback Machine
|
3
8
|
class Archive
|
4
9
|
# Wayback Machine base URL.
|
5
10
|
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
6
|
-
|
7
|
-
DEFAULT_CONCURRENCY = 5
|
11
|
+
|
8
12
|
# Send URLs to Wayback Machine.
|
9
|
-
# @return [Array] with sent URLs.
|
10
|
-
# @param [Array] urls
|
11
|
-
# @param [
|
13
|
+
# @return [Array<String>] with sent URLs.
|
14
|
+
# @param [Array<String>] urls to send to the Wayback Machine.
|
15
|
+
# @param concurrency [Integer] the default is 5
|
12
16
|
# @example Archive urls, asynchronously
|
13
17
|
# Archive.post(['http://example.com'])
|
14
18
|
# @example Archive urls, using only 1 thread
|
15
19
|
# Archive.post(['http://example.com'], concurrency: 1)
|
16
|
-
def self.post(urls, concurrency:
|
17
|
-
WaybackArchiver.logger.info "
|
20
|
+
def self.post(urls, concurrency: WaybackArchiver.concurrency)
|
21
|
+
WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
|
18
22
|
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
19
|
-
WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"
|
20
23
|
|
21
|
-
|
24
|
+
posted_urls = Concurrent::Array.new
|
25
|
+
pool = ThreadPool.build(concurrency)
|
22
26
|
urls.each do |url|
|
23
|
-
pool.post
|
27
|
+
pool.post do
|
28
|
+
posted_url = post_url(url)
|
29
|
+
posted_urls << posted_url if posted_url
|
30
|
+
end
|
24
31
|
end
|
25
32
|
|
26
|
-
|
27
|
-
|
33
|
+
pool.shutdown
|
34
|
+
pool.wait_for_termination
|
35
|
+
|
36
|
+
WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
|
37
|
+
posted_urls
|
28
38
|
end
|
29
39
|
|
30
40
|
# Send URLs to Wayback Machine by crawling the site.
|
31
|
-
# @return [Array] with URLs sent to the Wayback Machine.
|
41
|
+
# @return [Array<String>] with URLs sent to the Wayback Machine.
|
32
42
|
# @param [String] source for URL to crawl.
|
33
|
-
# @param [Integer]
|
43
|
+
# @param concurrency [Integer] the default is 5
|
34
44
|
# @example Crawl example.com and send all URLs of the same domain
|
35
45
|
# WaybackArchiver.crawl('example.com')
|
36
46
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
37
47
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
38
|
-
def self.crawl(source, concurrency:
|
39
|
-
|
48
|
+
def self.crawl(source, concurrency: WaybackArchiver.concurrency)
|
49
|
+
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
50
|
+
|
51
|
+
posted_urls = Concurrent::Array.new
|
52
|
+
pool = ThreadPool.build(concurrency)
|
40
53
|
|
41
|
-
|
42
|
-
pool.post
|
54
|
+
found_urls = URLCollector.crawl(source) do |url|
|
55
|
+
pool.post do
|
56
|
+
posted_url = post_url(url)
|
57
|
+
posted_urls << posted_url if posted_url
|
58
|
+
end
|
43
59
|
end
|
60
|
+
WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
|
61
|
+
pool.shutdown
|
62
|
+
pool.wait_for_termination
|
63
|
+
|
64
|
+
WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
|
65
|
+
posted_urls
|
44
66
|
end
|
45
67
|
|
46
68
|
# Send URL to Wayback Machine.
|
@@ -50,12 +72,12 @@ module WaybackArchiver
|
|
50
72
|
# Archive.post_url('http://example.com')
|
51
73
|
def self.post_url(url)
|
52
74
|
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
53
|
-
response = Request.
|
54
|
-
WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
|
75
|
+
response = Request.get(request_url, follow_redirects: false)
|
76
|
+
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
55
77
|
url
|
56
|
-
rescue
|
57
|
-
WaybackArchiver.logger.error "
|
58
|
-
|
78
|
+
rescue Request::Error => e
|
79
|
+
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
80
|
+
nil
|
59
81
|
end
|
60
82
|
end
|
61
83
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Convenience class for HTTP response codes
|
3
|
+
class HTTPCode
|
4
|
+
# Type of code as symbol
|
5
|
+
# @return [Symbol] code type
|
6
|
+
# @param [String/Integer] code the response code
|
7
|
+
# @example
|
8
|
+
# HTTPCode.type('200')
|
9
|
+
def self.type(code)
|
10
|
+
code = code.to_s
|
11
|
+
return :success if success?(code)
|
12
|
+
return :redirect if redirect?(code)
|
13
|
+
return :error if error?(code)
|
14
|
+
|
15
|
+
:unknown
|
16
|
+
end
|
17
|
+
|
18
|
+
# Whether the code is a success type
|
19
|
+
# @return [Boolean] is success or not
|
20
|
+
# @param [String] code the response code
|
21
|
+
# @example
|
22
|
+
# HTTPCode.success?('200') # => true
|
23
|
+
# @example
|
24
|
+
# HTTPCode.success?(200) # => true
|
25
|
+
# @example
|
26
|
+
# HTTPCode.success?(nil) # => false
|
27
|
+
def self.success?(code)
|
28
|
+
code.to_s.match?(/2\d\d/)
|
29
|
+
end
|
30
|
+
|
31
|
+
# Whether the code is a redirect type
|
32
|
+
# @return [Boolean] is redirect or not
|
33
|
+
# @param [String] code the response code
|
34
|
+
# @example
|
35
|
+
# HTTPCode.redirect?('301')
|
36
|
+
def self.redirect?(code)
|
37
|
+
code.to_s.match?(/3\d\d/)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Whether the code is an error type
|
41
|
+
# @return [Boolean] is error or not
|
42
|
+
# @param [String] code the response code
|
43
|
+
# @example
|
44
|
+
# HTTPCode.error?('404')
|
45
|
+
def self.error?(code)
|
46
|
+
code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -1,9 +1,12 @@
|
|
1
1
|
require 'logger'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
module WaybackArchiver
|
4
|
+
# Don't log anything / Send the logs to the abyss
|
5
|
+
class NullLogger < Logger
|
6
|
+
# Allow any and all params
|
7
|
+
def initialize(*args); end
|
6
8
|
|
7
|
-
|
9
|
+
# Allow any and all params and don't do anything
|
10
|
+
def add(*args, &block); end
|
8
11
|
end
|
9
12
|
end
|
@@ -1,62 +1,223 @@
|
|
1
|
-
require '
|
1
|
+
require 'net/http'
|
2
|
+
require 'openssl'
|
3
|
+
require 'timeout'
|
4
|
+
require 'uri'
|
5
|
+
require 'zlib'
|
6
|
+
|
7
|
+
require 'wayback_archiver/http_code'
|
2
8
|
|
3
9
|
module WaybackArchiver
|
4
|
-
#
|
10
|
+
# Make HTTP requests
|
5
11
|
class Request
|
6
|
-
#
|
7
|
-
|
8
|
-
#
|
9
|
-
|
10
|
-
#
|
11
|
-
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
# General error, something went wrong
|
13
|
+
class Error < StandardError; end
|
14
|
+
# Client error, something went wrong on the local machine
|
15
|
+
class ClientError < Error; end
|
16
|
+
# Server error, the remote server did something wrong
|
17
|
+
class ServerError < Error; end
|
18
|
+
# Remote server responded with a HTTP error
|
19
|
+
class HTTPError < ServerError; end
|
20
|
+
# Remote server error
|
21
|
+
class ResponseError < ServerError; end
|
22
|
+
# Max redirects reached error
|
23
|
+
class MaxRedirectError < ServerError; end
|
24
|
+
# Remote server responded with an invalid redirect
|
25
|
+
class InvalidRedirectError < ServerError; end
|
26
|
+
# Remote server responded with an unknown HTTP code
|
27
|
+
class UnknownResponseCodeError < ServerError; end
|
28
|
+
|
29
|
+
# GET response wrapper
|
30
|
+
GETStruct = Struct.new(:response, :error)
|
31
|
+
|
32
|
+
# Max number of redirects before an error is raised
|
33
|
+
MAX_REDIRECTS = 10
|
34
|
+
|
35
|
+
# Response data struct
|
36
|
+
Response = Struct.new(:code, :message, :body, :uri, :error)
|
37
|
+
class Response
|
38
|
+
# Returns true if the response was successful
|
39
|
+
# @example check if Response was successful
|
40
|
+
# response = Response.new('200', 'OK', 'buren', 'http://example.com')
|
41
|
+
# response.success? # => true
|
42
|
+
def success?
|
43
|
+
HTTPCode.success?(code)
|
44
|
+
end
|
16
45
|
end
|
17
46
|
|
18
47
|
# Get response.
|
19
|
-
# @return [
|
20
|
-
# @param [String]
|
21
|
-
# @param [
|
22
|
-
# @
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
48
|
+
# @return [Response] the http response representation.
|
49
|
+
# @param [String, URI] uri to retrieve.
|
50
|
+
# @param max_redirects [Integer] max redirects (default: 10).
|
51
|
+
# @param follow_redirects [Boolean] follow redirects (default: true).
|
52
|
+
# @example Get example.com
|
53
|
+
# Request.get('example.com')
|
54
|
+
# @example Get http://example.com and follow max 3 redirects
|
55
|
+
# Request.get('http://example.com', max_redirects: 3)
|
56
|
+
# @example Get http://example.com and don't follow redirects
|
57
|
+
# Request.get('http://example.com', follow_redirects: false)
|
58
|
+
# @raise [Error] super class of all exceptions that this method can raise
|
59
|
+
# @raise [ServerError] all server errors
|
60
|
+
# @raise [ClientError] all client errors
|
61
|
+
# @raise [HTTPError] all HTTP errors
|
62
|
+
# @raise [MaxRedirectError] too many redirects, subclass of ServerError
|
63
|
+
# @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if raise_on_http_error flag is true)
|
64
|
+
# @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError
|
65
|
+
# @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError
|
66
|
+
def self.get(
|
67
|
+
uri,
|
68
|
+
max_redirects: MAX_REDIRECTS,
|
69
|
+
raise_on_http_error: false,
|
70
|
+
follow_redirects: true
|
71
|
+
)
|
72
|
+
uri = build_uri(uri)
|
73
|
+
|
74
|
+
redirect_count = 0
|
75
|
+
until redirect_count > max_redirects
|
76
|
+
WaybackArchiver.logger.debug "Requesting #{uri}"
|
77
|
+
|
78
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
79
|
+
if uri.scheme == 'https'
|
80
|
+
http.use_ssl = true
|
81
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
82
|
+
end
|
83
|
+
|
84
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
85
|
+
request['User-Agent'] = WaybackArchiver.user_agent
|
86
|
+
|
87
|
+
result = perform_request(uri, http, request)
|
88
|
+
response = result.response
|
89
|
+
error = result.error
|
90
|
+
|
91
|
+
raise error if error
|
92
|
+
|
93
|
+
code = response.code
|
94
|
+
WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
|
95
|
+
|
96
|
+
case HTTPCode.type(code)
|
97
|
+
when :success
|
98
|
+
return build_response(uri, response)
|
99
|
+
when :redirect
|
100
|
+
return build_response(uri, response) unless follow_redirects
|
101
|
+
|
102
|
+
uri = build_redirect_uri(uri, response)
|
103
|
+
redirect_count += 1
|
104
|
+
next
|
105
|
+
when :error
|
106
|
+
if raise_on_http_error
|
107
|
+
raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
|
108
|
+
end
|
109
|
+
|
110
|
+
return build_response(uri, response)
|
111
|
+
else
|
112
|
+
raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
|
117
|
+
end
|
118
|
+
|
119
|
+
# Builds a Response object.
|
120
|
+
# @return [Response]
|
121
|
+
# @param [URI] uri that was requested.
|
122
|
+
# @param [Net::HTTPResponse] response the server response.
|
123
|
+
# @example Build Response object for example.com
|
124
|
+
# Request.build_response(uri, net_http_response)
|
125
|
+
def self.build_response(uri, response)
|
126
|
+
Response.new(
|
127
|
+
response.code,
|
128
|
+
response.message,
|
129
|
+
parse_body(response.body),
|
130
|
+
uri.to_s
|
131
|
+
)
|
132
|
+
end
|
133
|
+
|
134
|
+
# Builds an URI for a redirect response.
|
135
|
+
# @return [URI] to redirect to.
|
136
|
+
# @param [URI] uri that was requested.
|
137
|
+
# @param [Net::HTTPResponse] response the server response.
|
138
|
+
# @example Build redirect URI for example.com (lets pretend it will redirect..)
|
139
|
+
# Request.build_redirect_uri('http://example.com', net_http_response)
|
140
|
+
def self.build_redirect_uri(uri, response)
|
141
|
+
location_header = response.header.fetch('location') do
|
142
|
+
raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
|
143
|
+
end
|
144
|
+
|
145
|
+
location = URI.parse(location_header)
|
146
|
+
return build_uri(uri) + location_header if location.relative?
|
147
|
+
|
148
|
+
location
|
149
|
+
end
|
150
|
+
|
151
|
+
# Build URI.
|
152
|
+
# @return [URI] the built URI.
|
153
|
+
# @param [URI, String] uri to build.
|
154
|
+
# @example Build URI for example.com
|
155
|
+
# Request.build_uri('http://example.com')
|
156
|
+
# @example Build URI for #<URI::HTTP http://example.com>
|
157
|
+
# uri = URI.parse('http://example.com')
|
158
|
+
# Request.build_uri(uri)
|
159
|
+
def self.build_uri(uri)
|
160
|
+
return uri if uri.is_a?(URI)
|
161
|
+
|
162
|
+
uri = "http://#{uri}" unless uri =~ %r{^https?://}
|
163
|
+
URI.parse(uri)
|
164
|
+
end
|
165
|
+
|
166
|
+
# Parse response body, handles regular and gzipped response bodies.
|
167
|
+
# @return [String] the response body.
|
168
|
+
# @param [String] response_body the server response body.
|
169
|
+
# @example Return response body for response.
|
170
|
+
# Request.parse_body(net_http_response.body)
|
171
|
+
def self.parse_body(response_body)
|
172
|
+
return '' unless response_body
|
173
|
+
|
174
|
+
Zlib::GzipReader.new(StringIO.new(response_body)).read
|
175
|
+
rescue Zlib::GzipFile::Error => _e
|
176
|
+
response_body
|
177
|
+
end
|
178
|
+
|
179
|
+
# Return whether a value is blank or not.
|
180
|
+
# @return [Boolean] whether the value is blank or not.
|
181
|
+
# @param [Object] value the value to check if its blank or not.
|
182
|
+
# @example Returns true for nil.
|
183
|
+
# Request.blank?(nil)
|
184
|
+
# @example Returns true for empty string.
|
185
|
+
# Request.blank?('')
|
186
|
+
# @example Returns true for string with only spaces.
|
187
|
+
# Request.blank?(' ')
|
188
|
+
def self.blank?(value)
|
189
|
+
return true unless value
|
190
|
+
return true if value.strip.empty?
|
191
|
+
|
192
|
+
false
|
193
|
+
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
def self.perform_request(uri, http, request)
|
198
|
+
# TODO: Consider retrying failed requests
|
199
|
+
response = http.request(request)
|
200
|
+
GETStruct.new(response)
|
201
|
+
rescue Timeout::Error,
|
202
|
+
OpenSSL::SSL::SSLError,
|
203
|
+
Net::HTTPBadResponse,
|
204
|
+
Zlib::Error => e
|
205
|
+
|
206
|
+
build_request_error(uri, e, ServerError)
|
207
|
+
rescue SystemCallError,
|
208
|
+
SocketError,
|
209
|
+
IOError => e
|
210
|
+
|
211
|
+
build_request_error(uri, e, ClientError)
|
212
|
+
end
|
213
|
+
|
214
|
+
def self.build_request_error(uri, error, error_wrapper_klass)
|
215
|
+
WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
|
216
|
+
|
217
|
+
GETStruct.new(
|
218
|
+
Response.new,
|
219
|
+
error_wrapper_klass.new("#{error.class}, #{error.message}")
|
220
|
+
)
|
60
221
|
end
|
61
222
|
end
|
62
223
|
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module WaybackArchiver
|
4
|
+
# Parse Sitemaps, https://www.sitemaps.org
|
5
|
+
class Sitemap
|
6
|
+
attr_reader :document
|
7
|
+
|
8
|
+
def initialize(xml, strict: false)
|
9
|
+
@document = REXML::Document.new(xml)
|
10
|
+
rescue REXML::ParseException => _e
|
11
|
+
raise if strict
|
12
|
+
|
13
|
+
@document = REXML::Document.new('')
|
14
|
+
end
|
15
|
+
|
16
|
+
# Return all URLs defined in Sitemap.
|
17
|
+
# @return [Array<String>] of URLs defined in Sitemap.
|
18
|
+
# @example Get URLs defined in Sitemap
|
19
|
+
# sitemap = Sitemap.new(xml)
|
20
|
+
# sitemap.urls
|
21
|
+
def urls
|
22
|
+
@urls ||= extract_urls('url')
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return all sitemap URLs defined in Sitemap.
|
26
|
+
# @return [Array<String>] of Sitemap URLs defined in Sitemap.
|
27
|
+
# @example Get Sitemap URLs defined in Sitemap
|
28
|
+
# sitemap = Sitemap.new(xml)
|
29
|
+
# sitemap.sitemaps
|
30
|
+
def sitemaps
|
31
|
+
@sitemaps ||= extract_urls('sitemap')
|
32
|
+
end
|
33
|
+
|
34
|
+
# Check if sitemap is a plain file
|
35
|
+
# @return [Boolean] whether document is plain
|
36
|
+
def plain_document?
|
37
|
+
document.elements.empty?
|
38
|
+
end
|
39
|
+
|
40
|
+
# Return the name of the document (if there is one)
|
41
|
+
# @return [String] the document root name
|
42
|
+
def root_name
|
43
|
+
return unless document.root
|
44
|
+
|
45
|
+
document.root.name
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns true if Sitemap is a Sitemap index
|
49
|
+
# @return [Boolean] whether the Sitemap is a Sitemap index or not
|
50
|
+
# @example Check if Sitemap is a sitemap index
|
51
|
+
# sitemap = Sitemap.new(xml)
|
52
|
+
# sitemap.sitemap_index?
|
53
|
+
def sitemap_index?
|
54
|
+
root_name == 'sitemapindex'
|
55
|
+
end
|
56
|
+
|
57
|
+
# Returns true if Sitemap lists regular URLs
|
58
|
+
# @return [Boolean] whether the Sitemap is a regular URL list
|
59
|
+
# @example Check if Sitemap is a regular URL list
|
60
|
+
# sitemap = Sitemap.new(xml)
|
61
|
+
# sitemap.urlset?
|
62
|
+
def urlset?
|
63
|
+
root_name == 'urlset'
|
64
|
+
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
# Extract URLs from Sitemap
|
69
|
+
def extract_urls(node_name)
|
70
|
+
return document.to_s.each_line.map(&:strip) if plain_document?
|
71
|
+
|
72
|
+
urls = []
|
73
|
+
document.root.elements.each("#{node_name}/loc") do |element|
|
74
|
+
urls << element.text
|
75
|
+
end
|
76
|
+
urls
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'robots'
|
2
|
+
|
3
|
+
require 'wayback_archiver/sitemap'
|
4
|
+
require 'wayback_archiver/request'
|
5
|
+
|
6
|
+
module WaybackArchiver
|
7
|
+
# Fetch and parse sitemaps recursively
|
8
|
+
class Sitemapper
|
9
|
+
# Common locations for Sitemap(s)
|
10
|
+
COMMON_SITEMAP_LOCATIONS = %w[
|
11
|
+
sitemap_index.xml.gz
|
12
|
+
sitemap-index.xml.gz
|
13
|
+
sitemap_index.xml
|
14
|
+
sitemap-index.xml
|
15
|
+
sitemap.xml.gz
|
16
|
+
sitemap.xml
|
17
|
+
].freeze
|
18
|
+
|
19
|
+
# Autodiscover the location of the Sitemap, then fetch and parse recursively.
|
20
|
+
# First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
|
21
|
+
# @return [Array<String>] of URLs defined in Sitemap(s).
|
22
|
+
# @param [String] url to domain.
|
23
|
+
# @example Get URLs defined in Sitemap for google.com
|
24
|
+
# Sitemapper.autodiscover('https://google.com/')
|
25
|
+
# @see http://www.sitemaps.org
|
26
|
+
def self.autodiscover(url)
|
27
|
+
WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
|
28
|
+
robots = Robots.new(WaybackArchiver.user_agent)
|
29
|
+
sitemaps = robots.other_values(url)['Sitemap']
|
30
|
+
if sitemaps
|
31
|
+
return sitemaps.flat_map do |sitemap|
|
32
|
+
WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
|
33
|
+
urls(url: sitemap)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
COMMON_SITEMAP_LOCATIONS.each do |path|
|
38
|
+
WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
|
39
|
+
sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
|
40
|
+
response = Request.get(sitemap_url, raise_on_http_error: false)
|
41
|
+
return urls(xml: response.body) if response.success?
|
42
|
+
end
|
43
|
+
|
44
|
+
WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
|
45
|
+
urls(url: url)
|
46
|
+
rescue Request::Error => e
|
47
|
+
WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
|
48
|
+
[]
|
49
|
+
end
|
50
|
+
|
51
|
+
# Fetch and parse Sitemaps recursively, following Sitemap index files.
# A Sitemap location is fetched at most once per top-level call, so an index
# that references itself (directly or through another index) cannot recurse forever.
# @return [Array<String>] of URLs defined in Sitemap(s), empty when a Request::Error is raised.
# @param url [String, URI] URL to Sitemap, only fetched when no xml is given.
# @param xml [String] Sitemap XML.
# @param visited [Hash] internal bookkeeping of already fetched Sitemap locations,
#   callers normally leave this out.
# @example Get URLs defined in Sitemap for google.com
#    Sitemapper.urls(url: 'https://google.com/sitemap.xml')
# @example Get URLs defined in Sitemap
#    Sitemapper.urls(xml: xml)
# @see http://www.sitemaps.org
def self.urls(url: nil, xml: nil, visited: {})
  unless xml
    location = url.to_s
    if visited.key?(location)
      WaybackArchiver.logger.info "Skipping already fetched Sitemap at #{location}"
      return []
    end
    visited[location] = true
    xml = Request.get(url).body
  end

  sitemap = Sitemap.new(xml)
  return sitemap.urls unless sitemap.sitemap_index?

  # Share the same bookkeeping with every nested Sitemap.
  sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url, visited: visited) }
rescue Request::Error => e
  WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"

  []
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'concurrent'
|
2
|
+
|
3
|
+
module WaybackArchiver
  # Thread pool
  class ThreadPool
    # Build a thread pool
    # @return [Concurrent::FixedThreadPool/Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
    # @param [Integer] concurrency the desired concurrency
    # @raise [ArgumentError] if concurrency is not a number or is less than one
    # @example Build a thread pool with 10 as the desired concurrency
    #    pool = ThreadPool.build(10)
    #    pool.post { some_work } # Returns a Concurrent::FixedThreadPool
    # @example Build a thread pool with 1 as the desired concurrency
    #    pool = ThreadPool.build(1)
    #    pool.post { some_work } # Returns a Concurrent::ImmediateExecutor
    # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
    def self.build(concurrency)
      # Validate first so nil or non-numeric input raises the same ArgumentError
      # as a too small number, instead of failing inside the comparison.
      unless concurrency.is_a?(Numeric) && concurrency >= 1
        raise ArgumentError, 'concurrency must be one or greater'
      end

      return Concurrent::ImmediateExecutor.new if concurrency == 1

      Concurrent::FixedThreadPool.new(concurrency)
    end
  end
end
|
@@ -1,51 +1,42 @@
|
|
1
1
|
require 'spidr'
|
2
2
|
require 'robots'
|
3
3
|
|
4
|
+
require 'wayback_archiver/sitemapper'
|
5
|
+
require 'wayback_archiver/request'
|
6
|
+
|
4
7
|
module WaybackArchiver
  # Retrieve URLs from different sources
  class URLCollector
    # Collect the URLs listed in a Sitemap.
    # @return [Array<String>] of URLs defined in Sitemap.
    # @param [String] url domain to retrieve Sitemap from.
    # @example Get URLs defined in Sitemap for google.com
    #    URLCollector.sitemap('https://google.com/sitemap.xml')
    def self.sitemap(url)
      sitemap_uri = Request.build_uri(url)
      Sitemapper.urls(url: sitemap_uri)
    end

    # Collect URLs by crawling a site, yielding each one to the block if given.
    # @return [Array<String>] of URLs found during crawl.
    # @param [String] url domain to crawl URLs from.
    # @example Crawl URLs defined on example.com
    #    URLCollector.crawl('http://example.com')
    def self.crawl(url)
      collected = []
      entry_point = Request.build_uri(url).to_s
      spider_options = { robots: true, user_agent: WaybackArchiver.user_agent }

      Spidr.site(entry_point, spider_options) do |spider|
        spider.every_html_page do |page|
          location = page.url.to_s
          collected.push(location)
          WaybackArchiver.logger.debug "Found: #{location}"
          yield(location) if block_given?
        end
      end

      collected
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0.1'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: url_resolver
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0.1'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0.1'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: concurrent-ruby
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,14 +114,14 @@ dependencies:
|
|
128
114
|
requirements:
|
129
115
|
- - "~>"
|
130
116
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
117
|
+
version: '0.8'
|
132
118
|
type: :development
|
133
119
|
prerelease: false
|
134
120
|
version_requirements: !ruby/object:Gem::Requirement
|
135
121
|
requirements:
|
136
122
|
- - "~>"
|
137
123
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
124
|
+
version: '0.8'
|
139
125
|
- !ruby/object:Gem::Dependency
|
140
126
|
name: redcarpet
|
141
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,6 +136,20 @@ dependencies:
|
|
150
136
|
- - "~>"
|
151
137
|
- !ruby/object:Gem::Version
|
152
138
|
version: '3.2'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: webmock
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '3.0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '3.0'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: byebug
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,8 +164,8 @@ dependencies:
|
|
164
164
|
- - ">"
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
-
description:
|
168
|
-
|
167
|
+
description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
|
168
|
+
Sitemap(s) or a list of URLs.
|
169
169
|
email:
|
170
170
|
- burenstam@gmail.com
|
171
171
|
executables:
|
@@ -176,8 +176,12 @@ files:
|
|
176
176
|
- bin/wayback_archiver
|
177
177
|
- lib/wayback_archiver.rb
|
178
178
|
- lib/wayback_archiver/archive.rb
|
179
|
+
- lib/wayback_archiver/http_code.rb
|
179
180
|
- lib/wayback_archiver/null_logger.rb
|
180
181
|
- lib/wayback_archiver/request.rb
|
182
|
+
- lib/wayback_archiver/sitemap.rb
|
183
|
+
- lib/wayback_archiver/sitemapper.rb
|
184
|
+
- lib/wayback_archiver/thread_pool.rb
|
181
185
|
- lib/wayback_archiver/url_collector.rb
|
182
186
|
- lib/wayback_archiver/version.rb
|
183
187
|
homepage: https://github.com/buren/wayback_archiver
|
@@ -203,5 +207,5 @@ rubyforge_project:
|
|
203
207
|
rubygems_version: 2.6.11
|
204
208
|
signing_key:
|
205
209
|
specification_version: 4
|
206
|
-
summary:
|
210
|
+
summary: Post URLs to Wayback Machine (Internet Archive)
|
207
211
|
test_files: []
|