wayback_archiver 1.3.0 → 1.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 4dba94d5ac29b57a1df0a6e98f5d9ea29c1905b9ffdef7ee9bf9c344448362fe
- data.tar.gz: 79af5b2af8660dd5f0cafae521e7b73be36e528da7c8ce2f62502f76b434959b
+ metadata.gz: 405a39b07682a9c07ea9cf4689af2f110db21515c8ee192a90092c021fa8cd7b
+ data.tar.gz: 61c8500dc285c5e0975f95bd6ebc86b42bffed5daec93899f90a3ac5782046a0
  SHA512:
- metadata.gz: d02a18809ebdc880bdcdb9a7d0d522da445c2a2329959080c897d1f7033f6a687c268a626abfa05a45791c5e1404a4e3e8025ea172494bca0b430033e4fc20b0
- data.tar.gz: 316e8fe4e922b0aeb5047191fa6ddd8e242fa7bfd1ce8eb2b2a2e08d980999dd895c071f82f949a2690c4259bc5b3de00049e691b540040db79d002ffdf8a4f0
+ metadata.gz: cb74bad72fe7a33f8d45ee44afc3beb45424281ad05ca778222f1497eb3e92f4ae4cdde60669bd544436fc64dbd2b7acfdafa403e0757bcbeaf4df6aed9695cc
+ data.tar.gz: 0224a4bee7755dd25d791a2aa83b91960df395ffec1b3db687b73f32541b92ad9a726949926fb744062415903354eea3e4672e7235855d17f362da94fbcb3302
data/bin/wayback_archiver CHANGED
@@ -10,6 +10,7 @@ log = STDOUT
  log_level = Logger::INFO
  concurrency = WaybackArchiver.concurrency
  limit = WaybackArchiver.max_limit
+ hosts = []
 
  optparse = OptionParser.new do |parser|
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -30,7 +31,11 @@ optparse = OptionParser.new do |parser|
  strategy = 'urls'
  end
 
- parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+ parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
+ hosts = value.map { |v| Regexp.new(v) } if value
+ end
+
+ parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
  concurrency = value
  end
 
@@ -81,6 +86,7 @@ strategy ||= 'auto'
  urls.each do |url|
  WaybackArchiver.archive(
  url,
+ hosts: hosts,
  strategy: strategy,
  concurrency: concurrency,
  limit: limit
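
For reference, a minimal sketch of what the new --hosts option does with its argument (the host names are illustrative): OptionParser's Array type splits the comma-separated value, and each entry is compiled to a Regexp that the crawler later matches hosts against.

    # e.g. wayback_archiver example.com --hosts=example.com,blog.example.com
    hosts = 'example.com,blog.example.com'.split(',').map { |v| Regexp.new(v) }
    # hosts => [/example.com/, /blog.example.com/]
    # The bin script then passes hosts: hosts on to WaybackArchiver.archive, as above.
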
data/lib/robots.rb ADDED
@@ -0,0 +1,162 @@
+ #
+ # Copyright (c) 2008 Kyle Maxwell, contributors
+ #
+ # Permission is hereby granted, free of charge, to any person
+ # obtaining a copy of this software and associated documentation
+ # files (the "Software"), to deal in the Software without
+ # restriction, including without limitation the rights to use,
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the
+ # Software is furnished to do so, subject to the following
+ # conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ # OTHER DEALINGS IN THE SOFTWARE.
+ #
+
+ require "open-uri"
+ require "uri"
+ require "rubygems"
+ require "timeout"
+
+ class Robots
+
+   DEFAULT_TIMEOUT = 3
+
+   class ParsedRobots
+
+     def initialize(uri, user_agent)
+       @last_accessed = Time.at(1)
+
+       io = Robots.get_robots_txt(uri, user_agent)
+
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @other = {}
+       @disallows = {}
+       @allows = {}
+       @delays = {} # added delays to make it work
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":").strip
+         value.strip!
+         case key
+         when "User-agent"
+           agent = to_regex(value)
+         when "Allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "Disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "Crawl-delay"
+           @delays[agent] = value.to_i
+         else
+           @other[key] ||= []
+           @other[key] << value
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       path = uri.request_uri
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       if allowed && @delays[user_agent]
+         sleep @delays[user_agent] - (Time.now - @last_accessed)
+         @last_accessed = Time.now
+       end
+
+       return allowed
+     end
+
+     def other_values
+       @other
+     end
+
+     protected
+
+     def to_regex(pattern)
+       return /should-not-match-anything-123456789/ if pattern.strip.empty?
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def self.get_robots_txt(uri, user_agent)
+     begin
+       Timeout::timeout(Robots.timeout) do
+         io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+       end
+     rescue Timeout::Error
+       STDERR.puts "robots.txt request timed out"
+     end
+   end
+
+   def self.timeout=(t)
+     @timeout = t
+   end
+
+   def self.timeout
+     @timeout || DEFAULT_TIMEOUT
+   end
+
+   def initialize(user_agent)
+     @user_agent = user_agent
+     @parsed = {}
+   end
+
+   def allowed?(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].allowed?(uri, @user_agent)
+   end
+
+   def other_values(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].other_values
+   end
+ end
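
A minimal usage sketch of this vendored Robots class (the user agent and URL are illustrative); one instance caches a parsed robots.txt per host and answers allow/deny queries:

    require 'robots'

    robots = Robots.new('WaybackArchiver/1.5.0 (+https://rubygems.org/gems/wayback_archiver)')
    if robots.allowed?('http://example.com/some/page')
      puts 'robots.txt allows this URL'
    else
      puts 'robots.txt disallows this URL'
    end
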
@@ -13,7 +13,7 @@ module WaybackArchiver
  # @example Archive example.com, with default options
  # WaybackMachine.call('http://example.com')
  def self.call(url)
- request_url = "#{BASE_URL}#{url}"
+ request_url = "#{BASE_URL}#{url&.strip}"
  response = Request.get(request_url, follow_redirects: false)
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
  ArchiveResult.new(
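
A small sketch of what the url&.strip change guards against, following the @example above (URL illustrative): surrounding whitespace, for instance from a plain-text sitemap line, is now stripped before the request URL is built, and the safe navigation operator keeps a nil url from raising on strip.

    WaybackMachine.call(" http://example.com \n") # builds the same request as WaybackMachine.call('http://example.com')
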
@@ -9,7 +9,7 @@ module WaybackArchiver
  # Send URLs to Wayback Machine.
  # @return [Array<ArchiveResult>] with sent URLs.
  # @param [Array<String>] urls to send to the Wayback Machine.
- # @param concurrency [Integer] the default is 5
+ # @param concurrency [Integer] the default is 1
  # @yield [archive_result] If a block is given, each result will be yielded
  # @yieldparam [ArchiveResult] archive_result
  # @example Archive urls, asynchronously
@@ -54,7 +54,8 @@ module WaybackArchiver
  # Send URLs to Wayback Machine by crawling the site.
  # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
  # @param [String] source for URL to crawl.
- # @param concurrency [Integer] the default is 5
+ # @param concurrency [Integer] the default is 1
+ # @param [Array<String, Regexp>] hosts to crawl
  # @yield [archive_result] If a block is given, each result will be yielded
  # @yieldparam [ArchiveResult] archive_result
  # @example Crawl example.com and send all URLs of the same domain
@@ -66,13 +67,21 @@ module WaybackArchiver
  # Archiver.crawl('example.com', concurrency: 1)
  # @example Stop after archiving 100 links
  # Archiver.crawl('example.com', limit: 100)
- def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ # 'http://example.com',
+ # hosts: [
+ # 'example.com',
+ # /host[\d]+\.example\.com/
+ # ]
+ # )
+ def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
 
  posted_urls = Concurrent::Array.new
  pool = ThreadPool.build(concurrency)
 
- found_urls = URLCollector.crawl(source, limit: limit) do |url|
+ found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
  pool.post do
  result = post_url(url)
  yield(result) if block_given?
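
Based on the updated signature and the @yieldparam documentation above, a hedged sketch of calling Archive.crawl (it is invoked via WaybackArchiver.crawl later in this diff) with the new hosts: keyword; the host patterns are illustrative:

    results = Archive.crawl(
      'http://example.com',
      hosts: ['example.com', /host\d+\.example\.com/],
      concurrency: 1,
      limit: 100
    ) do |archive_result|
      # Each ArchiveResult is yielded as its URL is posted to the Wayback Machine.
      puts archive_result.inspect
    end
    # results is an Array<ArchiveResult> of the URLs that were sent.
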
@@ -1,3 +1,4 @@
+ require 'uri'
  require 'rexml/document'
 
  module WaybackArchiver
@@ -5,8 +6,9 @@ module WaybackArchiver
  class Sitemap
  attr_reader :document
 
- def initialize(xml, strict: false)
- @document = REXML::Document.new(xml)
+ def initialize(xml_or_string, strict: false)
+ @contents = xml_or_string
+ @document = REXML::Document.new(xml_or_string)
  rescue REXML::ParseException => _e
  raise if strict
 
@@ -65,9 +67,20 @@ module WaybackArchiver
 
  private
 
+ def valid_url?(url)
+ uri = URI.parse(url)
+ uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+ rescue URI::InvalidURIError
+ false
+ end
+
  # Extract URLs from Sitemap
  def extract_urls(node_name)
- return document.to_s.each_line.map(&:strip) if plain_document?
+ if plain_document?
+ return @contents.to_s
+ .each_line.map(&:strip)
+ .select(&method(:valid_url?))
+ end
 
  urls = []
  document.root.elements.each("#{node_name}/loc") do |element|
@@ -79,7 +79,7 @@ module WaybackArchiver
  urls(url: sitemap_url, visited: visited)
  end
  else
- sitemap.urls
+ sitemap.urls.map { |url| url&.strip }
  end
  rescue Request::Error => e
  WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
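
A hedged sketch of what the new valid_url? filter means for plain-text sitemaps, assuming Sitemap#urls reaches extract_urls as the sitemap.urls call above suggests; the contents are illustrative:

    plain = <<~LIST
      https://example.com/
      https://example.com/about
      not-a-url
    LIST

    sitemap = Sitemap.new(plain)
    # Lines that do not parse as HTTP(S) URLs (such as "not-a-url") are now
    # dropped instead of being forwarded to the Wayback Machine.
    sitemap.urls # expected: ["https://example.com/", "https://example.com/about"]
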
@@ -19,22 +19,32 @@ module WaybackArchiver
  # Retrieve URLs by crawling.
  # @return [Array<String>] of URLs defined found during crawl.
  # @param [String] url domain to crawl URLs from.
+ # @param [Array<String, Regexp>] hosts to crawl.
  # @example Crawl URLs defined on example.com
  # URLCollector.crawl('http://example.com')
  # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
  # URLCollector.crawl('http://example.com', limit: 100)
  # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages to 100
  # URLCollector.crawl('http://example.com', limit: -1)
- def self.crawl(url, limit: WaybackArchiver.max_limit)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ # 'http://example.com',
+ # hosts: [
+ # 'example.com',
+ # /host[\d]+\.example\.com/
+ # ]
+ # )
+ def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
  urls = []
  start_at_url = Request.build_uri(url).to_s
  options = {
- robots: true,
+ robots: WaybackArchiver.respect_robots_txt,
+ hosts: hosts,
  user_agent: WaybackArchiver.user_agent
  }
  options[:limit] = limit unless limit == -1
 
- Spidr.site(start_at_url, options) do |spider|
+ Spidr.site(start_at_url, **options) do |spider|
  spider.every_page do |page|
  page_url = page.url.to_s
  urls << page_url
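
The crawl options above are now driven by configuration rather than a hard-coded robots: true; a short sketch (host patterns illustrative):

    # robots.txt handling is now opt-in (see DEFAULT_RESPECT_ROBOTS_TXT below).
    WaybackArchiver.respect_robots_txt = true

    urls = URLCollector.crawl(
      'http://example.com',
      hosts: ['example.com', /host\d+\.example\.com/],
      limit: 100
    )
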
@@ -1,4 +1,4 @@
  module WaybackArchiver
  # Gem version
- VERSION = '1.3.0'.freeze
+ VERSION = '1.5.0'.freeze
  end
@@ -11,9 +11,11 @@ module WaybackArchiver
  INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
  # WaybackArchiver User-Agent
  USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
+ # Default for whether to respect robots txt files
+ DEFAULT_RESPECT_ROBOTS_TXT = false
 
  # Default concurrency for archiving URLs
- DEFAULT_CONCURRENCY = 5
+ DEFAULT_CONCURRENCY = 1
 
  # Maxmium number of links posted (-1 is no limit)
  DEFAULT_MAX_LIMIT = -1
@@ -22,6 +24,7 @@ module WaybackArchiver
  # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
  # @param [String/Array<String>] source for URL(s).
  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
+ # @param [Array<String, Regexp>] hosts to crawl.
  # @example Crawl example.com and send all URLs of the same domain
  # WaybackArchiver.archive('example.com') # Default strategy is :auto
  # WaybackArchiver.archive('example.com', strategy: :auto)
@@ -43,11 +46,19 @@ module WaybackArchiver
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
  # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
  # WaybackArchiver.archive('example.com', :url)
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+ # @example Crawl multiple hosts
+ # WaybackArchiver.archive(
+ # 'http://example.com',
+ # hosts: [
+ # 'example.com',
+ # /host[\d]+\.example\.com/
+ # ]
+ # )
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
  strategy = legacy_strategy || strategy
 
  case strategy.to_s
- when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
  when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
  when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
  when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
@@ -63,7 +74,7 @@ module WaybackArchiver
  # @param [String] source (must be a valid URL).
  # @param concurrency [Integer]
  # @example Auto archive example.com
- # WaybackArchiver.auto('example.com') # Default concurrency is 5
+ # WaybackArchiver.auto('example.com') # Default concurrency is 1
  # @example Auto archive example.com with low concurrency
  # WaybackArchiver.auto('example.com', concurrency: 1)
  # @example Auto archive example.com and archive max 100 URLs
@@ -79,16 +90,25 @@ module WaybackArchiver
  # Crawl site for URLs to send to the Wayback Machine.
  # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
  # @param [String] url to start crawling from.
+ # @param [Array<String, Regexp>] hosts to crawl
  # @param concurrency [Integer]
  # @example Crawl example.com and send all URLs of the same domain
- # WaybackArchiver.crawl('example.com') # Default concurrency is 5
+ # WaybackArchiver.crawl('example.com') # Default concurrency is 1
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
  # WaybackArchiver.crawl('example.com', concurrency: 1)
  # @example Crawl example.com and archive max 100 URLs
  # WaybackArchiver.crawl('example.com', limit: 100)
- def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ # 'http://example.com',
+ # hosts: [
+ # 'example.com',
+ # /host[\d]+\.example\.com/
+ # ]
+ # )
+ def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
  WaybackArchiver.logger.info "Crawling #{url}"
- Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
+ Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
  end
 
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -96,7 +116,7 @@ module WaybackArchiver
  # @param [String] url to the sitemap.
  # @param concurrency [Integer]
  # @example Get example.com sitemap and archive all found URLs
- # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+ # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
  # @example Get example.com sitemap and archive all found URLs with low concurrency
  # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
  # @example Get example.com sitemap archive max 100 URLs
@@ -155,6 +175,19 @@ module WaybackArchiver
  @user_agent ||= USER_AGENT
  end
 
+ # Sets the default respect_robots_txt
+ # @return [Boolean] the desired default for respect_robots_txt
+ # @param [Boolean] respect_robots_txt the desired default
+ def self.respect_robots_txt=(respect_robots_txt)
+ @respect_robots_txt = respect_robots_txt
+ end
+
+ # Returns the default respect_robots_txt
+ # @return [Boolean] the configured or the default respect_robots_txt
+ def self.respect_robots_txt
+ @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT
+ end
+
  # Sets the default concurrency
  # @return [Integer] the desired default concurrency
  # @param [Integer] concurrency the desired default concurrency
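
Taken together, a hedged sketch of the new module-level configuration introduced in this release (values illustrative):

    require 'wayback_archiver'

    WaybackArchiver.respect_robots_txt = true # new setting, defaults to false
    WaybackArchiver.concurrency = 5           # default dropped from 5 to 1

    WaybackArchiver.archive(
      'http://example.com',
      strategy: :crawl,
      hosts: ['example.com', /host\d+\.example\.com/]
    )
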
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
- version: 1.3.0
+ version: 1.5.0
  platform: ruby
  authors:
  - Jacob Burenstam
- autorequire:
+ autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-01-24 00:00:00.000000000 Z
+ date: 2024-12-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: spidr
@@ -16,70 +16,70 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.7.1
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.7.1
  - !ruby/object:Gem::Dependency
- name: robots
+ name: concurrent-ruby
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.1'
+ version: '1.3'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.1'
+ version: '1.3'
  - !ruby/object:Gem::Dependency
- name: concurrent-ruby
+ name: rexml
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.0'
+ version: 3.3.9
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.0'
+ version: 3.3.9
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '2.1'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '2.1'
  - !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.3'
+ version: '12.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.3'
+ version: '12.3'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
@@ -168,16 +168,16 @@ dependencies:
  name: byebug
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 11.1.3
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 11.1.3
  description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
  Sitemap(s) or a list of URLs.
  email:
@@ -188,6 +188,7 @@ extensions: []
  extra_rdoc_files: []
  files:
  - bin/wayback_archiver
+ - lib/robots.rb
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/adapters/wayback_machine.rb
  - lib/wayback_archiver/archive.rb
@@ -205,7 +206,7 @@ homepage: https://github.com/buren/wayback_archiver
  licenses:
  - MIT
  metadata: {}
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -220,9 +221,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6
- signing_key:
+ rubygems_version: 3.5.3
+ signing_key:
  specification_version: 4
  summary: Post URLs to Wayback Machine (Internet Archive)
  test_files: []