wayback_archiver 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver/archive.rb +6 -4
- data/lib/wayback_archiver/collector.rb +7 -7
- data/lib/wayback_archiver/crawl_url.rb +69 -0
- data/lib/wayback_archiver/crawler.rb +32 -22
- data/lib/wayback_archiver/request.rb +15 -8
- data/lib/wayback_archiver/version.rb +1 -1
- data/lib/wayback_archiver.rb +4 -4
- metadata +58 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ddbffea2e55297390c66201d287b85fb6336d864
|
4
|
+
data.tar.gz: b419745edba1f8dcf9d6e83ce5b74cd70c9abd0f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5bb23d2bab242cc55d1a9e851e5fd719431371f2149a2640ce34ede4be817f881ace982d39cff04691d435b140bbc54419bf90affb28d0621261cb9ee7d34a69
|
7
|
+
data.tar.gz: 91c6651a5cbeb1333a9f24ab3596ee0b284c54e4aa7c375b158bf6c3bbb54892a6c78c930a1942a3dcc9f4e0ab937c60a08e3f60a8339f0e9342cea6f0959c5f
|
@@ -3,7 +3,7 @@ module WaybackArchiver
|
|
3
3
|
MAX_THREAD_COUNT = 8
|
4
4
|
|
5
5
|
def self.post(all_urls)
|
6
|
-
puts "Request
|
6
|
+
puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
|
7
7
|
|
8
8
|
puts "Total urls to be sent: #{all_urls.length}"
|
9
9
|
threads = []
|
@@ -11,18 +11,20 @@ module WaybackArchiver
|
|
11
11
|
all_urls.each_slice(group_size).to_a.each do |urls|
|
12
12
|
threads << Thread.new do
|
13
13
|
urls.each_with_index do |url, index|
|
14
|
-
|
14
|
+
resolved_url = Request.resolve_url(url)
|
15
|
+
request_url = "#{BASE_URL}#{resolved_url}"
|
15
16
|
begin
|
16
17
|
res = Request.get_response(request_url)
|
17
|
-
|
18
|
+
puts "[#{res.code}, #{res.message}] #{resolved_url}"
|
18
19
|
rescue Exception => e
|
19
20
|
puts "Error message: #{e.message}"
|
20
|
-
puts "Failed to archive: #{
|
21
|
+
puts "Failed to archive: #{resolved_url}"
|
21
22
|
end
|
22
23
|
end
|
23
24
|
end
|
24
25
|
end
|
25
26
|
threads.each(&:join)
|
27
|
+
puts "#{all_urls.length} URLs sent to Internet archive"
|
26
28
|
all_urls
|
27
29
|
end
|
28
30
|
end
|
@@ -1,27 +1,27 @@
|
|
1
1
|
module WaybackArchiver
|
2
2
|
class Collector
|
3
|
-
|
4
3
|
class << self
|
5
|
-
|
6
4
|
def urls_from_sitemap(url)
|
7
|
-
urls =
|
8
|
-
xml_data = Request.get_response(url).body
|
5
|
+
urls = []
|
6
|
+
xml_data = Request.get_response(Request.resolve_url(url)).body
|
9
7
|
document = REXML::Document.new(xml_data)
|
10
8
|
|
11
9
|
document.elements.each('urlset/url/loc') { |element| urls << element.text }
|
12
10
|
urls
|
13
11
|
end
|
14
12
|
|
13
|
+
def urls_from_crawl(url)
|
14
|
+
Crawler.collect_urls(url)
|
15
|
+
end
|
16
|
+
|
15
17
|
def urls_from_file(path)
|
16
18
|
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
17
|
-
urls =
|
19
|
+
urls = []
|
18
20
|
text = File.open(path).read
|
19
21
|
text.gsub!(/\r\n?/, "\n") # Normalize line endings
|
20
22
|
text.each_line { |line| urls << line.gsub(/\n/, '').strip }
|
21
23
|
urls.reject(&:empty?)
|
22
24
|
end
|
23
|
-
|
24
25
|
end
|
25
|
-
|
26
26
|
end
|
27
27
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
class CrawlUrl
|
3
|
+
attr_reader :resolved_base_url, :base_hostname
|
4
|
+
|
5
|
+
def initialize(base_url)
|
6
|
+
@resolved_base_url = Request.resolve_url(base_url)
|
7
|
+
@base_hostname = URI.parse(@resolved_base_url).host
|
8
|
+
end
|
9
|
+
|
10
|
+
def absolute_url_from(raw_url, get_url)
|
11
|
+
return nil unless eligible_url?(raw_url)
|
12
|
+
parsed_url = URI.parse(raw_url) rescue URI.parse('')
|
13
|
+
if parsed_url.relative?
|
14
|
+
url_from_relative(raw_url, get_url)
|
15
|
+
elsif base_hostname.eql?(parsed_url.hostname)
|
16
|
+
raw_url
|
17
|
+
else
|
18
|
+
nil
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def url_from_relative(url, current_page_url)
|
25
|
+
if url.start_with?('/')
|
26
|
+
"#{without_path_suffix(resolved_base_url)}#{url}"
|
27
|
+
elsif url.start_with?('../')
|
28
|
+
"#{url_from_dotted_url(url, current_page_url)}"
|
29
|
+
else
|
30
|
+
"#{with_path_suffix(resolved_base_url)}#{url}"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def url_from_dotted_url(url, current_page_url)
|
35
|
+
absolute_url = with_path_suffix(current_page_url.dup)
|
36
|
+
found_dots = without_path_suffix(url).scan('../').length
|
37
|
+
removed_dots = 0
|
38
|
+
max_levels = 4
|
39
|
+
while found_dots >= removed_dots && max_levels > removed_dots
|
40
|
+
index = absolute_url.rindex('/') or break
|
41
|
+
absolute_url = absolute_url[0..(index - 1)]
|
42
|
+
removed_dots += 1
|
43
|
+
end
|
44
|
+
"#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
|
45
|
+
end
|
46
|
+
|
47
|
+
def with_path_suffix(passed_url)
|
48
|
+
url = passed_url.dup
|
49
|
+
url.end_with?('/') ? url : url << '/'
|
50
|
+
end
|
51
|
+
|
52
|
+
def without_path_suffix(passed_url)
|
53
|
+
url = passed_url.dup
|
54
|
+
url.end_with?('/') ? url[0...(url.length - 1)] : url
|
55
|
+
end
|
56
|
+
|
57
|
+
def eligible_url?(href)
|
58
|
+
return false if href.nil? || href.empty?
|
59
|
+
dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
|
60
|
+
dont_include = %w(/email-protection#)
|
61
|
+
dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
|
62
|
+
|
63
|
+
dont_start.each { |pattern| return false if href.start_with?(pattern) }
|
64
|
+
dont_include.each { |pattern| return false if href.include?(pattern) }
|
65
|
+
dont_end.each { |pattern| return false if href.end_with?(pattern) }
|
66
|
+
true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -1,22 +1,27 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'nokogiri'
|
3
|
-
require 'open-uri'
|
4
3
|
|
5
4
|
module WaybackArchiver
|
6
5
|
class Crawler
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
7
|
+
HEADERS_HASH = {
|
8
|
+
'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
|
9
|
+
}
|
10
|
+
|
11
|
+
def initialize(url, resolve: false)
|
12
|
+
base_url = Request.resolve_url(url)
|
13
|
+
@options = { resolve: resolve }
|
14
|
+
@crawl_url = CrawlUrl.new(base_url)
|
10
15
|
@fetch_queue = Set.new
|
11
16
|
@procesed = Set.new
|
12
|
-
@fetch_queue << @
|
17
|
+
@fetch_queue << @crawl_url.resolved_base_url
|
13
18
|
end
|
14
19
|
|
15
20
|
def self.collect_urls(base_url)
|
16
21
|
new(base_url).collect_urls
|
17
22
|
end
|
18
23
|
|
19
|
-
def collect_urls
|
24
|
+
def collect_urls
|
20
25
|
until @fetch_queue.empty?
|
21
26
|
url = @fetch_queue.first
|
22
27
|
@fetch_queue.delete(@fetch_queue.first)
|
@@ -24,27 +29,32 @@ module WaybackArchiver
|
|
24
29
|
end
|
25
30
|
puts "Crawling finished, #{@procesed.length} links found"
|
26
31
|
@procesed.to_a
|
32
|
+
rescue Interrupt, IRB::Abort
|
33
|
+
puts 'Crawl interrupted.'
|
34
|
+
@fetch_queue.to_a
|
27
35
|
end
|
28
36
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
@
|
33
|
-
link_elements.
|
34
|
-
|
35
|
-
|
37
|
+
private
|
38
|
+
|
39
|
+
def page_links(get_url)
|
40
|
+
puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
|
41
|
+
link_elements = get_page(get_url).css('a') rescue []
|
42
|
+
@procesed << get_url
|
43
|
+
link_elements.each do |page_link|
|
44
|
+
absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
|
45
|
+
if absolute_url
|
46
|
+
resolved_url = resolve(absolute_url)
|
47
|
+
@fetch_queue << resolved_url if !@procesed.include?(resolved_url)
|
48
|
+
end
|
36
49
|
end
|
37
50
|
end
|
38
51
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
else
|
46
|
-
raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
|
47
|
-
end
|
52
|
+
def get_page(url)
|
53
|
+
Nokogiri::HTML(Request.get_response(url).body)
|
54
|
+
end
|
55
|
+
|
56
|
+
def resolve(url)
|
57
|
+
@options[:resolve] ? Request.resolve_url(url) : url
|
48
58
|
end
|
49
59
|
end
|
50
60
|
end
|
@@ -1,16 +1,23 @@
|
|
1
|
+
require 'url_resolver' # TODO: Allow users to use any resolver
|
2
|
+
|
1
3
|
module WaybackArchiver
|
2
4
|
class Request
|
5
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
6
|
+
USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
|
3
7
|
|
4
|
-
def self.get_response(url)
|
5
|
-
|
6
|
-
|
7
|
-
http
|
8
|
-
http.use_ssl = true if
|
8
|
+
def self.get_response(url, resolve: false)
|
9
|
+
resolved_url = resolve ? resolve_url(url) : url
|
10
|
+
uri = URI.parse(resolved_url)
|
11
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
12
|
+
http.use_ssl = true if resolved_url.include?('https://')
|
9
13
|
|
10
|
-
request
|
11
|
-
|
12
|
-
|
14
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
15
|
+
request['User-Agent'] = USER_AGENT
|
16
|
+
http.request(request)
|
13
17
|
end
|
14
18
|
|
19
|
+
def self.resolve_url(url)
|
20
|
+
UrlResolver.resolve(url)
|
21
|
+
end
|
15
22
|
end
|
16
23
|
end
|
data/lib/wayback_archiver.rb
CHANGED
@@ -6,6 +6,7 @@ require 'wayback_archiver/collector'
|
|
6
6
|
require 'wayback_archiver/archive'
|
7
7
|
require 'wayback_archiver/request'
|
8
8
|
require 'wayback_archiver/crawler'
|
9
|
+
require 'wayback_archiver/crawl_url'
|
9
10
|
|
10
11
|
module WaybackArchiver
|
11
12
|
BASE_URL = 'https://web.archive.org/save/'
|
@@ -15,15 +16,14 @@ module WaybackArchiver
|
|
15
16
|
when 'sitemap'
|
16
17
|
Collector.urls_from_sitemap("#{source}/sitemap.xml")
|
17
18
|
when 'url'
|
18
|
-
|
19
|
+
[Request.resolve_url(source)]
|
19
20
|
when 'file'
|
20
21
|
Collector.urls_from_file(source)
|
21
22
|
when 'crawl', 'crawler'
|
22
|
-
|
23
|
+
Collector.urls_from_crawl(source)
|
23
24
|
else
|
24
|
-
raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file,
|
25
|
+
raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
|
25
26
|
end
|
26
27
|
Archive.post(urls)
|
27
28
|
end
|
28
|
-
|
29
29
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: url_resolver
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +66,48 @@ dependencies:
|
|
52
66
|
- - '>='
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: coveralls
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
55
111
|
description: 'Send URLs to Wayback Machine. From: sitemap, file or single URL.'
|
56
112
|
email:
|
57
113
|
- burenstam@gmail.com
|
@@ -63,6 +119,7 @@ files:
|
|
63
119
|
- bin/wayback_archiver
|
64
120
|
- lib/wayback_archiver/archive.rb
|
65
121
|
- lib/wayback_archiver/collector.rb
|
122
|
+
- lib/wayback_archiver/crawl_url.rb
|
66
123
|
- lib/wayback_archiver/crawler.rb
|
67
124
|
- lib/wayback_archiver/request.rb
|
68
125
|
- lib/wayback_archiver/version.rb
|