wayback_archiver 1.4.0 → 1.5.0

This diff shows the changes between publicly available package versions as they were released to one of the supported registries. The information is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e5f39f42fe6d5a4f6fbded5ab460cd91a2c9411af28645a41b987bf01a31ea14
4
- data.tar.gz: 8cc1f5dbdc7d55fb9a1ec358c354fd8db3bacd856a824e9090005445a966de39
3
+ metadata.gz: 405a39b07682a9c07ea9cf4689af2f110db21515c8ee192a90092c021fa8cd7b
4
+ data.tar.gz: 61c8500dc285c5e0975f95bd6ebc86b42bffed5daec93899f90a3ac5782046a0
5
5
  SHA512:
6
- metadata.gz: 6e3edf351b7cda562d39120df2dff564a8248c41ecedae54deed2017c11cbc6c56311507a7dda87ce350b17ac9bdf3d265523dde3a79de751426d206a2f51741
7
- data.tar.gz: 020cb49d6dfc204de93a035853a26448b20901dc962d772554c4cb9074ff14682c570afffe3c2cbc05984c2298a7a05ffe662f8aaccde80e510b97450dd270cc
6
+ metadata.gz: cb74bad72fe7a33f8d45ee44afc3beb45424281ad05ca778222f1497eb3e92f4ae4cdde60669bd544436fc64dbd2b7acfdafa403e0757bcbeaf4df6aed9695cc
7
+ data.tar.gz: 0224a4bee7755dd25d791a2aa83b91960df395ffec1b3db687b73f32541b92ad9a726949926fb744062415903354eea3e4672e7235855d17f362da94fbcb3302
data/lib/robots.rb ADDED
@@ -0,0 +1,162 @@
1
+ #
2
+ # Copyright (c) 2008 Kyle Maxwell, contributors
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person
5
+ # obtaining a copy of this software and associated documentation
6
+ # files (the "Software"), to deal in the Software without
7
+ # restriction, including without limitation the rights to use,
8
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following
11
+ # conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
+ # OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+
26
+ require "open-uri"
27
+ require "uri"
28
+ require "rubygems"
29
+ require "timeout"
30
+
31
+ class Robots
32
+
33
+ DEFAULT_TIMEOUT = 3
34
+
35
+ class ParsedRobots
36
+
37
+ def initialize(uri, user_agent)
38
+ @last_accessed = Time.at(1)
39
+
40
+ io = Robots.get_robots_txt(uri, user_agent)
41
+
42
+ if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
43
+ io = StringIO.new("User-agent: *\nAllow: /\n")
44
+ end
45
+
46
+ @other = {}
47
+ @disallows = {}
48
+ @allows = {}
49
+ @delays = {} # added delays to make it work
50
+ agent = /.*/
51
+ io.each do |line|
52
+ next if line =~ /^\s*(#.*|$)/
53
+ arr = line.split(":")
54
+ key = arr.shift
55
+ value = arr.join(":").strip
56
+ value.strip!
57
+ case key
58
+ when "User-agent"
59
+ agent = to_regex(value)
60
+ when "Allow"
61
+ @allows[agent] ||= []
62
+ @allows[agent] << to_regex(value)
63
+ when "Disallow"
64
+ @disallows[agent] ||= []
65
+ @disallows[agent] << to_regex(value)
66
+ when "Crawl-delay"
67
+ @delays[agent] = value.to_i
68
+ else
69
+ @other[key] ||= []
70
+ @other[key] << value
71
+ end
72
+ end
73
+
74
+ @parsed = true
75
+ end
76
+
77
+ def allowed?(uri, user_agent)
78
+ return true unless @parsed
79
+ allowed = true
80
+ path = uri.request_uri
81
+
82
+ @disallows.each do |key, value|
83
+ if user_agent =~ key
84
+ value.each do |rule|
85
+ if path =~ rule
86
+ allowed = false
87
+ end
88
+ end
89
+ end
90
+ end
91
+
92
+ @allows.each do |key, value|
93
+ unless allowed
94
+ if user_agent =~ key
95
+ value.each do |rule|
96
+ if path =~ rule
97
+ allowed = true
98
+ end
99
+ end
100
+ end
101
+ end
102
+ end
103
+
104
+ if allowed && @delays[user_agent]
105
+ sleep @delays[user_agent] - (Time.now - @last_accessed)
106
+ @last_accessed = Time.now
107
+ end
108
+
109
+ return allowed
110
+ end
111
+
112
+ def other_values
113
+ @other
114
+ end
115
+
116
+ protected
117
+
118
+ def to_regex(pattern)
119
+ return /should-not-match-anything-123456789/ if pattern.strip.empty?
120
+ pattern = Regexp.escape(pattern)
121
+ pattern.gsub!(Regexp.escape("*"), ".*")
122
+ Regexp.compile("^#{pattern}")
123
+ end
124
+ end
125
+
126
+ def self.get_robots_txt(uri, user_agent)
127
+ begin
128
+ Timeout::timeout(Robots.timeout) do
129
+ io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
130
+ end
131
+ rescue Timeout::Error
132
+ STDERR.puts "robots.txt request timed out"
133
+ end
134
+ end
135
+
136
+ def self.timeout=(t)
137
+ @timeout = t
138
+ end
139
+
140
+ def self.timeout
141
+ @timeout || DEFAULT_TIMEOUT
142
+ end
143
+
144
+ def initialize(user_agent)
145
+ @user_agent = user_agent
146
+ @parsed = {}
147
+ end
148
+
149
+ def allowed?(uri)
150
+ uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
151
+ host = uri.host
152
+ @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
153
+ @parsed[host].allowed?(uri, @user_agent)
154
+ end
155
+
156
+ def other_values(uri)
157
+ uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
158
+ host = uri.host
159
+ @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
160
+ @parsed[host].other_values
161
+ end
162
+ end
@@ -13,7 +13,7 @@ module WaybackArchiver
13
13
  # @example Archive example.com, with default options
14
14
  # WaybackMachine.call('http://example.com')
15
15
  def self.call(url)
16
- request_url = "#{BASE_URL}#{url}"
16
+ request_url = "#{BASE_URL}#{url&.strip}"
17
17
  response = Request.get(request_url, follow_redirects: false)
18
18
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
19
19
  ArchiveResult.new(
@@ -1,3 +1,4 @@
1
+ require 'uri'
1
2
  require 'rexml/document'
2
3
 
3
4
  module WaybackArchiver
@@ -5,8 +6,9 @@ module WaybackArchiver
5
6
  class Sitemap
6
7
  attr_reader :document
7
8
 
8
- def initialize(xml, strict: false)
9
- @document = REXML::Document.new(xml)
9
+ def initialize(xml_or_string, strict: false)
10
+ @contents = xml_or_string
11
+ @document = REXML::Document.new(xml_or_string)
10
12
  rescue REXML::ParseException => _e
11
13
  raise if strict
12
14
 
@@ -65,9 +67,20 @@ module WaybackArchiver
65
67
 
66
68
  private
67
69
 
70
+ def valid_url?(url)
71
+ uri = URI.parse(url)
72
+ uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
73
+ rescue URI::InvalidURIError
74
+ false
75
+ end
76
+
68
77
  # Extract URLs from Sitemap
69
78
  def extract_urls(node_name)
70
- return document.to_s.each_line.map(&:strip) if plain_document?
79
+ if plain_document?
80
+ return @contents.to_s
81
+ .each_line.map(&:strip)
82
+ .select(&method(:valid_url?))
83
+ end
71
84
 
72
85
  urls = []
73
86
  document.root.elements.each("#{node_name}/loc") do |element|
@@ -79,7 +79,7 @@ module WaybackArchiver
79
79
  urls(url: sitemap_url, visited: visited)
80
80
  end
81
81
  else
82
- sitemap.urls
82
+ sitemap.urls.map { |url| url&.strip }
83
83
  end
84
84
  rescue Request::Error => e
85
85
  WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
@@ -44,7 +44,7 @@ module WaybackArchiver
44
44
  }
45
45
  options[:limit] = limit unless limit == -1
46
46
 
47
- Spidr.site(start_at_url, options) do |spider|
47
+ Spidr.site(start_at_url, **options) do |spider|
48
48
  spider.every_page do |page|
49
49
  page_url = page.url.to_s
50
50
  urls << page_url
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.4.0'.freeze
3
+ VERSION = '1.5.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-23 00:00:00.000000000 Z
11
+ date: 2024-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -16,42 +16,42 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 0.6.1
19
+ version: 0.7.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 0.6.1
26
+ version: 0.7.1
27
27
  - !ruby/object:Gem::Dependency
28
- name: robots
28
+ name: concurrent-ruby
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0.1'
33
+ version: '1.3'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0.1'
40
+ version: '1.3'
41
41
  - !ruby/object:Gem::Dependency
42
- name: concurrent-ruby
42
+ name: rexml
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.0'
47
+ version: 3.3.9
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.0'
54
+ version: 3.3.9
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: bundler
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -168,16 +168,16 @@ dependencies:
168
168
  name: byebug
169
169
  requirement: !ruby/object:Gem::Requirement
170
170
  requirements:
171
- - - ">"
171
+ - - "~>"
172
172
  - !ruby/object:Gem::Version
173
- version: '0'
173
+ version: 11.1.3
174
174
  type: :development
175
175
  prerelease: false
176
176
  version_requirements: !ruby/object:Gem::Requirement
177
177
  requirements:
178
- - - ">"
178
+ - - "~>"
179
179
  - !ruby/object:Gem::Version
180
- version: '0'
180
+ version: 11.1.3
181
181
  description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
182
182
  Sitemap(s) or a list of URLs.
183
183
  email:
@@ -188,6 +188,7 @@ extensions: []
188
188
  extra_rdoc_files: []
189
189
  files:
190
190
  - bin/wayback_archiver
191
+ - lib/robots.rb
191
192
  - lib/wayback_archiver.rb
192
193
  - lib/wayback_archiver/adapters/wayback_machine.rb
193
194
  - lib/wayback_archiver/archive.rb
@@ -205,7 +206,7 @@ homepage: https://github.com/buren/wayback_archiver
205
206
  licenses:
206
207
  - MIT
207
208
  metadata: {}
208
- post_install_message:
209
+ post_install_message:
209
210
  rdoc_options: []
210
211
  require_paths:
211
212
  - lib
@@ -220,8 +221,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
220
221
  - !ruby/object:Gem::Version
221
222
  version: '0'
222
223
  requirements: []
223
- rubygems_version: 3.1.4
224
- signing_key:
224
+ rubygems_version: 3.5.3
225
+ signing_key:
225
226
  specification_version: 4
226
227
  summary: Post URLs to Wayback Machine (Internet Archive)
227
228
  test_files: []