wayback_archiver 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 4dba94d5ac29b57a1df0a6e98f5d9ea29c1905b9ffdef7ee9bf9c344448362fe
- data.tar.gz: 79af5b2af8660dd5f0cafae521e7b73be36e528da7c8ce2f62502f76b434959b
+ metadata.gz: 405a39b07682a9c07ea9cf4689af2f110db21515c8ee192a90092c021fa8cd7b
+ data.tar.gz: 61c8500dc285c5e0975f95bd6ebc86b42bffed5daec93899f90a3ac5782046a0
  SHA512:
- metadata.gz: d02a18809ebdc880bdcdb9a7d0d522da445c2a2329959080c897d1f7033f6a687c268a626abfa05a45791c5e1404a4e3e8025ea172494bca0b430033e4fc20b0
- data.tar.gz: 316e8fe4e922b0aeb5047191fa6ddd8e242fa7bfd1ce8eb2b2a2e08d980999dd895c071f82f949a2690c4259bc5b3de00049e691b540040db79d002ffdf8a4f0
+ metadata.gz: cb74bad72fe7a33f8d45ee44afc3beb45424281ad05ca778222f1497eb3e92f4ae4cdde60669bd544436fc64dbd2b7acfdafa403e0757bcbeaf4df6aed9695cc
+ data.tar.gz: 0224a4bee7755dd25d791a2aa83b91960df395ffec1b3db687b73f32541b92ad9a726949926fb744062415903354eea3e4672e7235855d17f362da94fbcb3302
data/bin/wayback_archiver CHANGED
@@ -10,6 +10,7 @@ log = STDOUT
  log_level = Logger::INFO
  concurrency = WaybackArchiver.concurrency
  limit = WaybackArchiver.max_limit
+ hosts = []

  optparse = OptionParser.new do |parser|
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -30,7 +31,11 @@ optparse = OptionParser.new do |parser|
  strategy = 'urls'
  end

- parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+ parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
+   hosts = value.map { |v| Regexp.new(v) } if value
+ end
+
+ parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
  concurrency = value
  end

@@ -81,6 +86,7 @@ strategy ||= 'auto'
  urls.each do |url|
  WaybackArchiver.archive(
  url,
+ hosts: hosts,
  strategy: strategy,
  concurrency: concurrency,
  limit: limit
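
Note on the new flag: --hosts is declared with OptionParser's Array type, so it takes comma-separated values, and each value is passed through Regexp.new — entries are patterns rather than literal hostnames (an unescaped dot matches any character). The advertised --concurrency default also drops from 5 to 1. A hypothetical invocation (URL and hostnames are placeholders):

  wayback_archiver https://example.com --hosts=example.com,blog.example.com --concurrency=1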
data/lib/robots.rb ADDED
@@ -0,0 +1,162 @@
+ #
+ # Copyright (c) 2008 Kyle Maxwell, contributors
+ #
+ # Permission is hereby granted, free of charge, to any person
+ # obtaining a copy of this software and associated documentation
+ # files (the "Software"), to deal in the Software without
+ # restriction, including without limitation the rights to use,
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the
+ # Software is furnished to do so, subject to the following
+ # conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ # OTHER DEALINGS IN THE SOFTWARE.
+ #
+
+ require "open-uri"
+ require "uri"
+ require "rubygems"
+ require "timeout"
+
+ class Robots
+
+   DEFAULT_TIMEOUT = 3
+
+   class ParsedRobots
+
+     def initialize(uri, user_agent)
+       @last_accessed = Time.at(1)
+
+       io = Robots.get_robots_txt(uri, user_agent)
+
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @other = {}
+       @disallows = {}
+       @allows = {}
+       @delays = {} # added delays to make it work
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":").strip
+         value.strip!
+         case key
+         when "User-agent"
+           agent = to_regex(value)
+         when "Allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "Disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "Crawl-delay"
+           @delays[agent] = value.to_i
+         else
+           @other[key] ||= []
+           @other[key] << value
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       path = uri.request_uri
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       if allowed && @delays[user_agent]
+         sleep @delays[user_agent] - (Time.now - @last_accessed)
+         @last_accessed = Time.now
+       end
+
+       return allowed
+     end
+
+     def other_values
+       @other
+     end
+
+     protected
+
+     def to_regex(pattern)
+       return /should-not-match-anything-123456789/ if pattern.strip.empty?
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def self.get_robots_txt(uri, user_agent)
+     begin
+       Timeout::timeout(Robots.timeout) do
+         io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+       end
+     rescue Timeout::Error
+       STDERR.puts "robots.txt request timed out"
+     end
+   end
+
+   def self.timeout=(t)
+     @timeout = t
+   end
+
+   def self.timeout
+     @timeout || DEFAULT_TIMEOUT
+   end
+
+   def initialize(user_agent)
+     @user_agent = user_agent
+     @parsed = {}
+   end
+
+   def allowed?(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].allowed?(uri, @user_agent)
+   end
+
+   def other_values(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].other_values
+   end
+ end
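
Note: this file vendors Kyle Maxwell's robots gem, which is simultaneously dropped as a runtime dependency in the gemspec below. Parsed robots.txt files are cached per host in @parsed, and the fetch uses a 3-second default timeout. A minimal usage sketch based on the API shown above (the user-agent string and URL are placeholders; assumes lib/ is on the load path so require 'robots' resolves to this file):

  require 'robots'

  Robots.timeout = 5 # optional: raise the DEFAULT_TIMEOUT of 3 seconds
  robots = Robots.new('WaybackArchiver/1.5.0 (+https://rubygems.org/gems/wayback_archiver)')
  robots.allowed?('https://example.com/private/page') # fetches, parses and caches example.com/robots.txt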
data/lib/wayback_archiver/adapters/wayback_machine.rb CHANGED
@@ -13,7 +13,7 @@ module WaybackArchiver
  # @example Archive example.com, with default options
  # WaybackMachine.call('http://example.com')
  def self.call(url)
- request_url = "#{BASE_URL}#{url}"
+ request_url = "#{BASE_URL}#{url&.strip}"
  response = Request.get(request_url, follow_redirects: false)
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
  ArchiveResult.new(
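
Note: this change only hardens URL handling. URLs sourced from sitemaps or plain lists can carry trailing newlines, and a bare .strip would raise NoMethodError on a nil url (plain interpolation of nil yields an empty string), hence the safe-navigation operator. Illustration:

  BASE_URL = 'https://web.archive.org/save/' # assumption: the adapter's endpoint is not shown in this diff
  url = "https://example.com/page\n"
  "#{BASE_URL}#{url&.strip}" # trailing newline no longer leaks into the request URL
  "#{BASE_URL}#{nil&.strip}" # == BASE_URL; nil.strip would raise NoMethodError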
data/lib/wayback_archiver/archive.rb CHANGED
@@ -9,7 +9,7 @@ module WaybackArchiver
  # Send URLs to Wayback Machine.
  # @return [Array<ArchiveResult>] with sent URLs.
  # @param [Array<String>] urls to send to the Wayback Machine.
- # @param concurrency [Integer] the default is 5
+ # @param concurrency [Integer] the default is 1
  # @yield [archive_result] If a block is given, each result will be yielded
  # @yieldparam [ArchiveResult] archive_result
  # @example Archive urls, asynchronously
@@ -54,7 +54,8 @@ module WaybackArchiver
  # Send URLs to Wayback Machine by crawling the site.
  # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
  # @param [String] source for URL to crawl.
- # @param concurrency [Integer] the default is 5
+ # @param concurrency [Integer] the default is 1
+ # @param [Array<String, Regexp>] hosts to crawl
  # @yield [archive_result] If a block is given, each result will be yielded
  # @yieldparam [ArchiveResult] archive_result
  # @example Crawl example.com and send all URLs of the same domain
@@ -66,13 +67,21 @@ module WaybackArchiver
  # Archiver.crawl('example.com', concurrency: 1)
  # @example Stop after archiving 100 links
  # Archiver.crawl('example.com', limit: 100)
- def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ #   'http://example.com',
+ #   hosts: [
+ #     'example.com',
+ #     /host[\d]+\.example\.com/
+ #   ]
+ # )
+ def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"

  posted_urls = Concurrent::Array.new
  pool = ThreadPool.build(concurrency)

- found_urls = URLCollector.crawl(source, limit: limit) do |url|
+ found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
  pool.post do
  result = post_url(url)
  yield(result) if block_given?
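
Note: Archive.crawl now threads hosts: through to URLCollector.crawl while the posting pipeline (a Concurrent::Array of posted URLs plus a ThreadPool) is unchanged, and the documented concurrency default drops from 5 to 1. A sketch of the resulting call shape (ArchiveResult's fields are not shown in this diff, so inspect is used):

  require 'wayback_archiver'

  results = WaybackArchiver::Archive.crawl(
    'https://example.com',
    hosts: ['example.com', /host\d+\.example\.com/],
    concurrency: 1
  ) { |result| puts result.inspect } # yielded as each URL is posted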
data/lib/wayback_archiver/sitemap.rb CHANGED
@@ -1,3 +1,4 @@
+ require 'uri'
  require 'rexml/document'

  module WaybackArchiver
@@ -5,8 +6,9 @@ module WaybackArchiver
  class Sitemap
  attr_reader :document

- def initialize(xml, strict: false)
- @document = REXML::Document.new(xml)
+ def initialize(xml_or_string, strict: false)
+   @contents = xml_or_string
+   @document = REXML::Document.new(xml_or_string)
  rescue REXML::ParseException => _e
  raise if strict

@@ -65,9 +67,20 @@ module WaybackArchiver

  private

+ def valid_url?(url)
+   uri = URI.parse(url)
+   uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+ rescue URI::InvalidURIError
+   false
+ end
+
  # Extract URLs from Sitemap
  def extract_urls(node_name)
- return document.to_s.each_line.map(&:strip) if plain_document?
+ if plain_document?
+   return @contents.to_s
+     .each_line.map(&:strip)
+     .select(&method(:valid_url?))
+ end

  urls = []
  document.root.elements.each("#{node_name}/loc") do |element|
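
Note: Sitemap now keeps the raw payload in @contents so that a body that fails XML parsing (the plain_document? path) can be treated as a newline-separated URL list, with valid_url? dropping every line that does not parse as http(s). Since URI::HTTPS subclasses URI::HTTP, the second is_a? check is technically redundant but harmless. The filtering logic in isolation (a standalone sketch, not the gem's public API):

  require 'uri'

  def valid_url?(url)
    uri = URI.parse(url)
    uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
  rescue URI::InvalidURIError
    false
  end

  "https://example.com/a\nnot a url\nftp://example.com/b\n"
    .each_line.map(&:strip)
    .select { |line| valid_url?(line) }
  # => ["https://example.com/a"]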
data/lib/wayback_archiver/sitemapper.rb CHANGED
@@ -79,7 +79,7 @@ module WaybackArchiver
  urls(url: sitemap_url, visited: visited)
  end
  else
- sitemap.urls
+ sitemap.urls.map { |url| url&.strip }
  end
  rescue Request::Error => e
  WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
data/lib/wayback_archiver/url_collector.rb CHANGED
@@ -19,22 +19,32 @@ module WaybackArchiver
  # Retrieve URLs by crawling.
  # @return [Array<String>] of URLs defined found during crawl.
  # @param [String] url domain to crawl URLs from.
+ # @param [Array<String, Regexp>] hosts to crawl.
  # @example Crawl URLs defined on example.com
  # URLCollector.crawl('http://example.com')
  # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
  # URLCollector.crawl('http://example.com', limit: 100)
  # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages to 100
  # URLCollector.crawl('http://example.com', limit: -1)
- def self.crawl(url, limit: WaybackArchiver.max_limit)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ #   'http://example.com',
+ #   hosts: [
+ #     'example.com',
+ #     /host[\d]+\.example\.com/
+ #   ]
+ # )
+ def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
  urls = []
  start_at_url = Request.build_uri(url).to_s
  options = {
- robots: true,
+ robots: WaybackArchiver.respect_robots_txt,
+ hosts: hosts,
  user_agent: WaybackArchiver.user_agent
  }
  options[:limit] = limit unless limit == -1

- Spidr.site(start_at_url, options) do |spider|
+ Spidr.site(start_at_url, **options) do |spider|
  spider.every_page do |page|
  page_url = page.url.to_s
  urls << page_url
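
Note: two behavioral changes land here. robots.txt enforcement, previously hard-coded on, now follows the new WaybackArchiver.respect_robots_txt setting (default false, see below), and the options hash is now splatted with ** so it is forwarded as keyword arguments, which Ruby 3 requires and which matches the spidr ~> 0.7.1 bump in the gemspec. The hosts: array is handed to Spidr to restrict which hosts are visited; the examples in this gem pass both strings and regexps. A usage sketch (URL and hosts are placeholders):

  require 'wayback_archiver'

  WaybackArchiver.respect_robots_txt = true # opt back in to the pre-1.5 behavior

  urls = WaybackArchiver::URLCollector.crawl(
    'https://example.com',
    hosts: ['example.com', /host\d+\.example\.com/],
    limit: 100
  ) { |url| puts url } # each page URL is yielded as it is visited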
data/lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
  # Gem version
- VERSION = '1.3.0'.freeze
+ VERSION = '1.5.0'.freeze
  end
data/lib/wayback_archiver.rb CHANGED
@@ -11,9 +11,11 @@ module WaybackArchiver
  INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
  # WaybackArchiver User-Agent
  USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
+ # Default for whether to respect robots txt files
+ DEFAULT_RESPECT_ROBOTS_TXT = false

  # Default concurrency for archiving URLs
- DEFAULT_CONCURRENCY = 5
+ DEFAULT_CONCURRENCY = 1

  # Maxmium number of links posted (-1 is no limit)
  DEFAULT_MAX_LIMIT = -1
@@ -22,6 +24,7 @@ module WaybackArchiver
  # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
  # @param [String/Array<String>] source for URL(s).
  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
+ # @param [Array<String, Regexp>] hosts to crawl.
  # @example Crawl example.com and send all URLs of the same domain
  # WaybackArchiver.archive('example.com') # Default strategy is :auto
  # WaybackArchiver.archive('example.com', strategy: :auto)
@@ -43,11 +46,19 @@ module WaybackArchiver
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
  # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
  # WaybackArchiver.archive('example.com', :url)
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+ # @example Crawl multiple hosts
+ # WaybackArchiver.archive(
+ #   'http://example.com',
+ #   hosts: [
+ #     'example.com',
+ #     /host[\d]+\.example\.com/
+ #   ]
+ # )
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
  strategy = legacy_strategy || strategy

  case strategy.to_s
- when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
  when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
  when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
  when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
@@ -63,7 +74,7 @@ module WaybackArchiver
  # @param [String] source (must be a valid URL).
  # @param concurrency [Integer]
  # @example Auto archive example.com
- # WaybackArchiver.auto('example.com') # Default concurrency is 5
+ # WaybackArchiver.auto('example.com') # Default concurrency is 1
  # @example Auto archive example.com with low concurrency
  # WaybackArchiver.auto('example.com', concurrency: 1)
  # @example Auto archive example.com and archive max 100 URLs
@@ -79,16 +90,25 @@ module WaybackArchiver
  # Crawl site for URLs to send to the Wayback Machine.
  # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
  # @param [String] url to start crawling from.
+ # @param [Array<String, Regexp>] hosts to crawl
  # @param concurrency [Integer]
  # @example Crawl example.com and send all URLs of the same domain
- # WaybackArchiver.crawl('example.com') # Default concurrency is 5
+ # WaybackArchiver.crawl('example.com') # Default concurrency is 1
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
  # WaybackArchiver.crawl('example.com', concurrency: 1)
  # @example Crawl example.com and archive max 100 URLs
  # WaybackArchiver.crawl('example.com', limit: 100)
- def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+ # @example Crawl multiple hosts
+ # URLCollector.crawl(
+ #   'http://example.com',
+ #   hosts: [
+ #     'example.com',
+ #     /host[\d]+\.example\.com/
+ #   ]
+ # )
+ def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
  WaybackArchiver.logger.info "Crawling #{url}"
- Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
+ Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
  end

  # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -96,7 +116,7 @@ module WaybackArchiver
  # @param [String] url to the sitemap.
  # @param concurrency [Integer]
  # @example Get example.com sitemap and archive all found URLs
- # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+ # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
  # @example Get example.com sitemap and archive all found URLs with low concurrency
  # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
  # @example Get example.com sitemap archive max 100 URLs
@@ -155,6 +175,19 @@ module WaybackArchiver
  @user_agent ||= USER_AGENT
  end

+ # Sets the default respect_robots_txt
+ # @return [Boolean] the desired default for respect_robots_txt
+ # @param [Boolean] respect_robots_txt the desired default
+ def self.respect_robots_txt=(respect_robots_txt)
+   @respect_robots_txt = respect_robots_txt
+ end
+
+ # Returns the default respect_robots_txt
+ # @return [Boolean] the configured or the default respect_robots_txt
+ def self.respect_robots_txt
+   @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT
+ end
+
  # Sets the default concurrency
  # @return [Integer] the desired default concurrency
  # @param [Integer] concurrency the desired default concurrency
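
Note: the module now carries respect_robots_txt as a process-wide default next to the existing concurrency and user_agent accessors, and DEFAULT_CONCURRENCY drops from 5 to 1. Two details worth flagging: the ||= in the reader would mask an explicit false if the default were ever changed to true (harmless while it is false), and the archive dispatch above only forwards hosts: for the crawl strategy, so auto/sitemap/urls ignore it in this release. A configuration sketch (the concurrency= setter is implied by the surrounding comments rather than shown in full):

  require 'wayback_archiver'

  WaybackArchiver.respect_robots_txt = true
  WaybackArchiver.concurrency = 5 # restore the 1.3.0 default

  WaybackArchiver.archive(
    'https://example.com',
    strategy: :crawl, # hosts: only takes effect with the crawl strategy
    hosts: ['example.com', /host\d+\.example\.com/]
  )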
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
- version: 1.3.0
+ version: 1.5.0
  platform: ruby
  authors:
  - Jacob Burenstam
- autorequire:
+ autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-01-24 00:00:00.000000000 Z
+ date: 2024-12-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: spidr
@@ -16,70 +16,70 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.7.1
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.7.1
  - !ruby/object:Gem::Dependency
- name: robots
+ name: concurrent-ruby
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.1'
+ version: '1.3'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.1'
+ version: '1.3'
  - !ruby/object:Gem::Dependency
- name: concurrent-ruby
+ name: rexml
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.0'
+ version: 3.3.9
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.0'
+ version: 3.3.9
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '2.1'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '1.3'
+ version: '2.1'
  - !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.3'
+ version: '12.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.3'
+ version: '12.3'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
@@ -168,16 +168,16 @@ dependencies:
  name: byebug
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 11.1.3
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 11.1.3
  description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
  Sitemap(s) or a list of URLs.
  email:
@@ -188,6 +188,7 @@ extensions: []
  extra_rdoc_files: []
  files:
  - bin/wayback_archiver
+ - lib/robots.rb
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/adapters/wayback_machine.rb
  - lib/wayback_archiver/archive.rb
@@ -205,7 +206,7 @@ homepage: https://github.com/buren/wayback_archiver
  licenses:
  - MIT
  metadata: {}
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -220,9 +221,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6
- signing_key:
+ rubygems_version: 3.5.3
+ signing_key:
  specification_version: 4
  summary: Post URLs to Wayback Machine (Internet Archive)
  test_files: []