wayback_archiver 1.3.0 → 1.5.0
- checksums.yaml +4 -4
- data/bin/wayback_archiver +7 -1
- data/lib/robots.rb +162 -0
- data/lib/wayback_archiver/adapters/wayback_machine.rb +1 -1
- data/lib/wayback_archiver/archive.rb +13 -4
- data/lib/wayback_archiver/sitemap.rb +16 -3
- data/lib/wayback_archiver/sitemapper.rb +1 -1
- data/lib/wayback_archiver/url_collector.rb +13 -3
- data/lib/wayback_archiver/version.rb +1 -1
- data/lib/wayback_archiver.rb +41 -8
- metadata +23 -23
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 405a39b07682a9c07ea9cf4689af2f110db21515c8ee192a90092c021fa8cd7b
+  data.tar.gz: 61c8500dc285c5e0975f95bd6ebc86b42bffed5daec93899f90a3ac5782046a0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cb74bad72fe7a33f8d45ee44afc3beb45424281ad05ca778222f1497eb3e92f4ae4cdde60669bd544436fc64dbd2b7acfdafa403e0757bcbeaf4df6aed9695cc
+  data.tar.gz: 0224a4bee7755dd25d791a2aa83b91960df395ffec1b3db687b73f32541b92ad9a726949926fb744062415903354eea3e4672e7235855d17f362da94fbcb3302
data/bin/wayback_archiver
CHANGED
@@ -10,6 +10,7 @@ log = STDOUT
 log_level = Logger::INFO
 concurrency = WaybackArchiver.concurrency
 limit = WaybackArchiver.max_limit
+hosts = []

 optparse = OptionParser.new do |parser|
   parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -30,7 +31,11 @@ optparse = OptionParser.new do |parser|
     strategy = 'urls'
   end

-  parser.on('--
+  parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
+    hosts = value.map { |v| Regexp.new(v) } if value
+  end
+
+  parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
     concurrency = value
   end

@@ -81,6 +86,7 @@ strategy ||= 'auto'
 urls.each do |url|
   WaybackArchiver.archive(
     url,
+    hosts: hosts,
     strategy: strategy,
     concurrency: concurrency,
     limit: limit
data/lib/robots.rb
ADDED
@@ -0,0 +1,162 @@
+#
+# Copyright (c) 2008 Kyle Maxwell, contributors
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+require "open-uri"
+require "uri"
+require "rubygems"
+require "timeout"
+
+class Robots
+
+  DEFAULT_TIMEOUT = 3
+
+  class ParsedRobots
+
+    def initialize(uri, user_agent)
+      @last_accessed = Time.at(1)
+
+      io = Robots.get_robots_txt(uri, user_agent)
+
+      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+        io = StringIO.new("User-agent: *\nAllow: /\n")
+      end
+
+      @other = {}
+      @disallows = {}
+      @allows = {}
+      @delays = {} # added delays to make it work
+      agent = /.*/
+      io.each do |line|
+        next if line =~ /^\s*(#.*|$)/
+        arr = line.split(":")
+        key = arr.shift
+        value = arr.join(":").strip
+        value.strip!
+        case key
+        when "User-agent"
+          agent = to_regex(value)
+        when "Allow"
+          @allows[agent] ||= []
+          @allows[agent] << to_regex(value)
+        when "Disallow"
+          @disallows[agent] ||= []
+          @disallows[agent] << to_regex(value)
+        when "Crawl-delay"
+          @delays[agent] = value.to_i
+        else
+          @other[key] ||= []
+          @other[key] << value
+        end
+      end
+
+      @parsed = true
+    end
+
+    def allowed?(uri, user_agent)
+      return true unless @parsed
+      allowed = true
+      path = uri.request_uri
+
+      @disallows.each do |key, value|
+        if user_agent =~ key
+          value.each do |rule|
+            if path =~ rule
+              allowed = false
+            end
+          end
+        end
+      end
+
+      @allows.each do |key, value|
+        unless allowed
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = true
+              end
+            end
+          end
+        end
+      end
+
+      if allowed && @delays[user_agent]
+        sleep @delays[user_agent] - (Time.now - @last_accessed)
+        @last_accessed = Time.now
+      end
+
+      return allowed
+    end
+
+    def other_values
+      @other
+    end
+
+    protected
+
+    def to_regex(pattern)
+      return /should-not-match-anything-123456789/ if pattern.strip.empty?
+      pattern = Regexp.escape(pattern)
+      pattern.gsub!(Regexp.escape("*"), ".*")
+      Regexp.compile("^#{pattern}")
+    end
+  end
+
+  def self.get_robots_txt(uri, user_agent)
+    begin
+      Timeout::timeout(Robots.timeout) do
+        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+      end
+    rescue Timeout::Error
+      STDERR.puts "robots.txt request timed out"
+    end
+  end
+
+  def self.timeout=(t)
+    @timeout = t
+  end
+
+  def self.timeout
+    @timeout || DEFAULT_TIMEOUT
+  end
+
+  def initialize(user_agent)
+    @user_agent = user_agent
+    @parsed = {}
+  end
+
+  def allowed?(uri)
+    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+    host = uri.host
+    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+    @parsed[host].allowed?(uri, @user_agent)
+  end
+
+  def other_values(uri)
+    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+    host = uri.host
+    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+    @parsed[host].other_values
+  end
+end
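For orientation, a minimal, hedged sketch of how this vendored Robots class is typically used; the user agent string and URL below are placeholders, not taken from the gem:

require 'robots'

robots = Robots.new('WaybackArchiver (example user agent)')

# Fetches and caches /robots.txt for the host, then checks the request path.
if robots.allowed?('https://example.com/some/page')
  puts 'crawling is allowed'
else
  puts 'disallowed by robots.txt'
end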
data/lib/wayback_archiver/adapters/wayback_machine.rb
CHANGED
@@ -13,7 +13,7 @@ module WaybackArchiver
     # @example Archive example.com, with default options
     #    WaybackMachine.call('http://example.com')
     def self.call(url)
-      request_url = "#{BASE_URL}#{url}"
+      request_url = "#{BASE_URL}#{url&.strip}"
       response = Request.get(request_url, follow_redirects: false)
       WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
       ArchiveResult.new(
data/lib/wayback_archiver/archive.rb
CHANGED
@@ -9,7 +9,7 @@ module WaybackArchiver
     # Send URLs to Wayback Machine.
     # @return [Array<ArchiveResult>] with sent URLs.
     # @param [Array<String>] urls to send to the Wayback Machine.
-    # @param concurrency [Integer] the default is
+    # @param concurrency [Integer] the default is 1
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Archive urls, asynchronously
@@ -54,7 +54,8 @@ module WaybackArchiver
     # Send URLs to Wayback Machine by crawling the site.
     # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
     # @param [String] source for URL to crawl.
-    # @param concurrency [Integer] the default is
+    # @param concurrency [Integer] the default is 1
+    # @param [Array<String, Regexp>] hosts to crawl
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Crawl example.com and send all URLs of the same domain
@@ -66,13 +67,21 @@ module WaybackArchiver
     #    Archiver.crawl('example.com', concurrency: 1)
     # @example Stop after archiving 100 links
     #    Archiver.crawl('example.com', limit: 100)
-
+    # @example Crawl multiple hosts
+    #    URLCollector.crawl(
+    #      'http://example.com',
+    #      hosts: [
+    #        'example.com',
+    #        /host[\d]+\.example\.com/
+    #      ]
+    #    )
+    def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
       WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"

       posted_urls = Concurrent::Array.new
       pool = ThreadPool.build(concurrency)

-      found_urls = URLCollector.crawl(source, limit: limit) do |url|
+      found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
         pool.post do
           result = post_url(url)
           yield(result) if block_given?
data/lib/wayback_archiver/sitemap.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'uri'
 require 'rexml/document'

 module WaybackArchiver
@@ -5,8 +6,9 @@ module WaybackArchiver
   class Sitemap
     attr_reader :document

-    def initialize(
-      @
+    def initialize(xml_or_string, strict: false)
+      @contents = xml_or_string
+      @document = REXML::Document.new(xml_or_string)
     rescue REXML::ParseException => _e
       raise if strict

@@ -65,9 +67,20 @@ module WaybackArchiver

     private

+    def valid_url?(url)
+      uri = URI.parse(url)
+      uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+    rescue URI::InvalidURIError
+      false
+    end
+
     # Extract URLs from Sitemap
     def extract_urls(node_name)
-
+      if plain_document?
+        return @contents.to_s
+          .each_line.map(&:strip)
+          .select(&method(:valid_url?))
+      end

       urls = []
       document.root.elements.each("#{node_name}/loc") do |element|
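A hedged sketch of what the plain-text branch above enables; the constructor arguments follow the new initialize(xml_or_string, strict: false), while plain_document? and the public urls reader are referenced by the diff but not shown here, so treat the exact output as an assumption:

require 'wayback_archiver'

# A newline-separated list of URLs can be fed through the same Sitemap class;
# lines that are not http(s) URLs are filtered out by valid_url?.
plain = "https://example.com/\nhttps://example.com/about\nnot-a-url\n"

sitemap = WaybackArchiver::Sitemap.new(plain)
sitemap.urls # expected: ["https://example.com/", "https://example.com/about"]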
data/lib/wayback_archiver/sitemapper.rb
CHANGED
@@ -79,7 +79,7 @@ module WaybackArchiver
           urls(url: sitemap_url, visited: visited)
         end
       else
-        sitemap.urls
+        sitemap.urls.map { |url| url&.strip }
       end
     rescue Request::Error => e
       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
data/lib/wayback_archiver/url_collector.rb
CHANGED
@@ -19,22 +19,32 @@ module WaybackArchiver
     # Retrieve URLs by crawling.
     # @return [Array<String>] of URLs defined found during crawl.
     # @param [String] url domain to crawl URLs from.
+    # @param [Array<String, Regexp>] hosts to crawl.
     # @example Crawl URLs defined on example.com
     #    URLCollector.crawl('http://example.com')
     # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
     #    URLCollector.crawl('http://example.com', limit: 100)
     # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages to 100
     #    URLCollector.crawl('http://example.com', limit: -1)
-
+    # @example Crawl multiple hosts
+    #    URLCollector.crawl(
+    #      'http://example.com',
+    #      hosts: [
+    #        'example.com',
+    #        /host[\d]+\.example\.com/
+    #      ]
+    #    )
+    def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
       urls = []
       start_at_url = Request.build_uri(url).to_s
       options = {
-        robots:
+        robots: WaybackArchiver.respect_robots_txt,
+        hosts: hosts,
         user_agent: WaybackArchiver.user_agent
       }
       options[:limit] = limit unless limit == -1

-      Spidr.site(start_at_url, options) do |spider|
+      Spidr.site(start_at_url, **options) do |spider|
         spider.every_page do |page|
           page_url = page.url.to_s
           urls << page_url
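For context, a hedged sketch of calling the collector directly with mixed host patterns, mirroring the @example block added above (hostnames are placeholders; the block form follows how Archive.crawl consumes the yielded URLs):

require 'wayback_archiver'

urls = WaybackArchiver::URLCollector.crawl(
  'http://example.com',
  hosts: [
    'example.com',
    /host[\d]+\.example\.com/
  ],
  limit: 100
) { |url| puts "found #{url}" }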
data/lib/wayback_archiver.rb
CHANGED
@@ -11,9 +11,11 @@ module WaybackArchiver
   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
   # WaybackArchiver User-Agent
   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
+  # Default for whether to respect robots txt files
+  DEFAULT_RESPECT_ROBOTS_TXT = false

   # Default concurrency for archiving URLs
-  DEFAULT_CONCURRENCY =
+  DEFAULT_CONCURRENCY = 1

   # Maxmium number of links posted (-1 is no limit)
   DEFAULT_MAX_LIMIT = -1
@@ -22,6 +24,7 @@ module WaybackArchiver
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String/Array<String>] source for URL(s).
   # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
+  # @param [Array<String, Regexp>] hosts to crawl.
   # @example Crawl example.com and send all URLs of the same domain
   #    WaybackArchiver.archive('example.com') # Default strategy is :auto
   #    WaybackArchiver.archive('example.com', strategy: :auto)
@@ -43,11 +46,19 @@ module WaybackArchiver
   #    WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
   #    WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
   #    WaybackArchiver.archive('example.com', :url)
-
+  # @example Crawl multiple hosts
+  #    WaybackArchiver.archive(
+  #      'http://example.com',
+  #      hosts: [
+  #        'example.com',
+  #        /host[\d]+\.example\.com/
+  #      ]
+  #    )
+  def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     strategy = legacy_strategy || strategy

     case strategy.to_s
-    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
+    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
     when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
     when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
     when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
@@ -63,7 +74,7 @@ module WaybackArchiver
   # @param [String] source (must be a valid URL).
   # @param concurrency [Integer]
   # @example Auto archive example.com
-  #    WaybackArchiver.auto('example.com') # Default concurrency is
+  #    WaybackArchiver.auto('example.com') # Default concurrency is 1
   # @example Auto archive example.com with low concurrency
   #    WaybackArchiver.auto('example.com', concurrency: 1)
   # @example Auto archive example.com and archive max 100 URLs
@@ -79,16 +90,25 @@ module WaybackArchiver
   # Crawl site for URLs to send to the Wayback Machine.
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String] url to start crawling from.
+  # @param [Array<String, Regexp>] hosts to crawl
   # @param concurrency [Integer]
   # @example Crawl example.com and send all URLs of the same domain
-  #    WaybackArchiver.crawl('example.com') # Default concurrency is
+  #    WaybackArchiver.crawl('example.com') # Default concurrency is 1
   # @example Crawl example.com and send all URLs of the same domain with low concurrency
   #    WaybackArchiver.crawl('example.com', concurrency: 1)
   # @example Crawl example.com and archive max 100 URLs
   #    WaybackArchiver.crawl('example.com', limit: 100)
-
+  # @example Crawl multiple hosts
+  #    URLCollector.crawl(
+  #      'http://example.com',
+  #      hosts: [
+  #        'example.com',
+  #        /host[\d]+\.example\.com/
+  #      ]
+  #    )
+  def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     WaybackArchiver.logger.info "Crawling #{url}"
-    Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
+    Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
   end

   # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -96,7 +116,7 @@ module WaybackArchiver
   # @param [String] url to the sitemap.
   # @param concurrency [Integer]
   # @example Get example.com sitemap and archive all found URLs
-  #    WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is
+  #    WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
   # @example Get example.com sitemap and archive all found URLs with low concurrency
   #    WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
   # @example Get example.com sitemap archive max 100 URLs
@@ -155,6 +175,19 @@ module WaybackArchiver
     @user_agent ||= USER_AGENT
   end

+  # Sets the default respect_robots_txt
+  # @return [Boolean] the desired default for respect_robots_txt
+  # @param [Boolean] respect_robots_txt the desired default
+  def self.respect_robots_txt=(respect_robots_txt)
+    @respect_robots_txt = respect_robots_txt
+  end
+
+  # Returns the default respect_robots_txt
+  # @return [Boolean] the configured or the default respect_robots_txt
+  def self.respect_robots_txt
+    @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT
+  end
+
   # Sets the default concurrency
   # @return [Integer] the desired default concurrency
   # @param [Integer] concurrency the desired default concurrency
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.5.0
 platform: ruby
 authors:
 - Jacob Burenstam
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2024-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -16,70 +16,70 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.
+        version: 0.7.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.
+        version: 0.7.1
 - !ruby/object:Gem::Dependency
-  name:
+  name: concurrent-ruby
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.3'
 - !ruby/object:Gem::Dependency
-  name:
+  name: rexml
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version:
+        version: 3.3.9
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version:
+        version: 3.3.9
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1
+        version: '2.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -168,16 +168,16 @@ dependencies:
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 11.1.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 11.1.3
 description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
   Sitemap(s) or a list of URLs.
 email:
@@ -188,6 +188,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - bin/wayback_archiver
+- lib/robots.rb
 - lib/wayback_archiver.rb
 - lib/wayback_archiver/adapters/wayback_machine.rb
 - lib/wayback_archiver/archive.rb
@@ -205,7 +206,7 @@ homepage: https://github.com/buren/wayback_archiver
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -220,9 +221,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-
-signing_key:
+rubygems_version: 3.5.3
+signing_key:
 specification_version: 4
 summary: Post URLs to Wayback Machine (Internet Archive)
 test_files: []