wayback_archiver 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver/archive.rb +20 -24
- data/lib/wayback_archiver/collector.rb +3 -6
- data/lib/wayback_archiver/crawl_url.rb +7 -2
- data/lib/wayback_archiver/crawler.rb +10 -14
- data/lib/wayback_archiver/request.rb +23 -11
- data/lib/wayback_archiver/version.rb +1 -1
- data/lib/wayback_archiver.rb +5 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b64e289cd025ebddc9af5472a3c5038af4e98535
+  data.tar.gz: 903a6e3f3bfb2f6ab81db471b8633b27f5f8c419
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cc1853305301920afbcdd7a79f592cd6863f27fdb57c22d634424346526441d8ddd788bee8b7e23f938f334fbbd7a2d162b397041ddc6fe0f1193d816d4fac59
+  data.tar.gz: e64370ece062319402c773464493fb767148712e115ae3ce5aebf3b66361dae1f94177a9d50b6aeaf0770d8ddcb86d3e7b779c170bfc6294633fab4283e711c4
data/lib/wayback_archiver/archive.rb
CHANGED
@@ -1,31 +1,27 @@
 module WaybackArchiver
   class Archive
-
+    WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+    MAX_THREAD_COUNT = 10
 
-    def self.post(
+    def self.post(urls)
       puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
-      end
-      threads.each(&:join)
-      puts "#{all_urls.length} URLs sent to Internet archive"
-      all_urls
+      puts "Total urls to be sent: #{urls.length}"
+      group_size = (urls.length / MAX_THREAD_COUNT) + 1
+      urls.each_slice(group_size).to_a.map do |archive_urls|
+        Thread.new { archive_urls.each { |url| post_url(url) } }
+      end.each(&:join)
+      puts "#{urls.length} URLs sent to Internet archive"
+      urls
+    end
+
+    def self.post_url(archive_url)
+      resolved_url = Request.resolve_url(archive_url)
+      request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
+      response = Request.get_response(request_url)
+      puts "[#{response.code}, #{response.message}] #{resolved_url}"
+    rescue Exception => e
+      puts "Error message: #{e.message}"
+      puts "Failed to archive: #{resolved_url}"
     end
   end
 end
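For context, a minimal sketch of the thread grouping introduced in the new Archive.post. The URLs and counts below are made up for illustration: 25 URLs with MAX_THREAD_COUNT = 10 give a group size of 3, so at most nine threads each post up to three URLs.

  # Sketch of the new grouping logic; the URL list is a placeholder, not part of the gem.
  urls = (1..25).map { |i| "http://example.com/page/#{i}" }
  max_thread_count = 10

  group_size = (urls.length / max_thread_count) + 1 # => 3 for 25 URLs

  threads = urls.each_slice(group_size).map do |slice|
    Thread.new { slice.each { |url| puts "would archive #{url}" } }
  end
  threads.each(&:join) # wait for every group to finish, as post does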
data/lib/wayback_archiver/collector.rb
CHANGED
@@ -2,12 +2,9 @@ module WaybackArchiver
   class Collector
     class << self
      def urls_from_sitemap(url)
-
-
-
-
-        document.elements.each('urlset/url/loc') { |element| urls << element.text }
-        urls
+        resolved = Request.resolve_url(url)
+        sitemap = Request.get_page(resolved)
+        sitemap.css('loc').map { |element| element.text }
      end
 
      def urls_from_crawl(url)
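Sitemap parsing now goes through Nokogiri instead of REXML. A rough sketch of the new 'loc' extraction, using a made-up sitemap snippet; since Request.get_page defaults to Nokogiri's HTML parser, which ignores the sitemap namespace, a plain css('loc') selector matches the entries.

  require 'nokogiri'

  # Hypothetical sitemap body, for illustration only.
  xml = <<-XML
  <?xml version="1.0" encoding="UTF-8"?>
  <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url><loc>http://example.com/</loc></url>
    <url><loc>http://example.com/about</loc></url>
  </urlset>
  XML

  Nokogiri::HTML(xml).css('loc').map { |element| element.text }
  # => ["http://example.com/", "http://example.com/about"]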
data/lib/wayback_archiver/crawl_url.rb
CHANGED
@@ -4,7 +4,8 @@ module WaybackArchiver
 
    def initialize(base_url)
      @resolved_base_url = Request.resolve_url(base_url)
-      @base_hostname = URI.parse(@resolved_base_url).
+      @base_hostname = URI.parse(@resolved_base_url).hostname
+      @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
    end
 
    def absolute_url_from(raw_url, get_url)
@@ -12,7 +13,7 @@ module WaybackArchiver
      parsed_url = URI.parse(raw_url) rescue URI.parse('')
      if parsed_url.relative?
        url_from_relative(raw_url, get_url)
-      elsif
+      elsif same_domain?(raw_url, @resolved_base_url)
        raw_url
      else
        nil
@@ -65,5 +66,9 @@ module WaybackArchiver
      dont_end.each { |pattern| return false if href.end_with?(pattern) }
      true
    end
+
+    def same_domain?(first, second)
+      first.include?(second)
+    end
  end
 end
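Worth noting: the new same_domain? helper is a plain substring test rather than a host comparison, so it returns true whenever the resolved base URL appears anywhere inside the candidate URL. A tiny illustration with placeholder URLs:

  def same_domain?(first, second)
    first.include?(second)
  end

  same_domain?('http://example.com/about', 'http://example.com') # => true
  same_domain?('http://other.org/page',    'http://example.com') # => false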
data/lib/wayback_archiver/crawler.rb
CHANGED
@@ -1,19 +1,19 @@
 require 'set'
-require 'nokogiri'
+require 'nokogiri'
 
 module WaybackArchiver
  class Crawler
    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
    HEADERS_HASH = {
-      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+      'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
    }
 
-    def initialize(url, resolve
+    def initialize(url, resolve = false)
      base_url = Request.resolve_url(url)
      @options = { resolve: resolve }
      @crawl_url = CrawlUrl.new(base_url)
      @fetch_queue = Set.new
-      @
+      @processed = Set.new
      @fetch_queue << @crawl_url.resolved_base_url
    end
 
@@ -21,14 +21,14 @@ module WaybackArchiver
      new(base_url).collect_urls
    end
 
-    def collect_urls
+    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(@fetch_queue.first)
        page_links(url)
      end
-      puts "Crawling finished, #{@
-      @
+      puts "Crawling finished, #{@processed.length} links found"
+      @processed.to_a
    rescue Interrupt, IRB::Abort
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
@@ -38,21 +38,17 @@ module WaybackArchiver
 
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
-      link_elements = get_page(get_url).css('a') rescue []
-      @
+      link_elements = Request.get_page(get_url).css('a') rescue []
+      @processed << get_url
      link_elements.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        if absolute_url
          resolved_url = resolve(absolute_url)
-          @fetch_queue << resolved_url if !@
+          @fetch_queue << resolved_url if !@processed.include?(resolved_url)
        end
      end
    end
 
-    def get_page(url)
-      Nokogiri::HTML(Request.get_response(url).body)
-    end
-
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
data/lib/wayback_archiver/request.rb
CHANGED
@@ -5,19 +5,31 @@ module WaybackArchiver
    INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
    USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
 
-
-
-
-
-
+    class << self
+      def get_page(url, document_type = :html)
+        response = Request.get_response(url).body
+        case document_type
+        when :xml
+          Nokogiri::XML(response)
+        else
+          Nokogiri::HTML(response)
+        end
+      end
 
-
-
-
-
+      def get_response(url, resolve = false)
+        resolved_url = resolve ? resolve_url(url) : url
+        uri = URI.parse(resolved_url)
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = true if resolved_url.include?('https://')
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = USER_AGENT
+        http.request(request)
+      end
 
-
-
+      def resolve_url(url)
+        UrlResolver.resolve(url)
+      end
    end
  end
 end
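A rough usage sketch of the Request helpers added here (example.com is a placeholder; per the diff, resolve_url simply delegates to UrlResolver.resolve):

  require 'wayback_archiver'

  page = WaybackArchiver::Request.get_page('http://example.com')        # Nokogiri::HTML document
  feed = WaybackArchiver::Request.get_page('http://example.com', :xml)  # Nokogiri::XML document
  resp = WaybackArchiver::Request.get_response('https://example.com')   # Net::HTTPResponse
  puts "#{resp.code} #{resp.message}"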
data/lib/wayback_archiver.rb
CHANGED
@@ -1,6 +1,5 @@
 require 'uri'
 require 'net/http'
-require 'rexml/document'
 
 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
@@ -9,18 +8,12 @@ require 'wayback_archiver/crawler'
 require 'wayback_archiver/crawl_url'
 
 module WaybackArchiver
-
-
-  def self.archive(source, from = :sitemap)
+  def self.archive(source, from = :crawl)
    urls = case from.to_s
-      when 'sitemap'
-
-      when '
-
-      when 'file'
-        Collector.urls_from_file(source)
-      when 'crawl', 'crawler'
-        Collector.urls_from_crawl(source)
+      when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
+      when 'url' then [Request.resolve_url(source)]
+      when 'file' then Collector.urls_from_file(source)
+      when 'crawl' then Collector.urls_from_crawl(source)
      else
        raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
      end
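Putting the new entry point together, a short usage sketch (the sources below are placeholders): the default strategy is now :crawl instead of :sitemap, 'crawler' is no longer accepted as an alias, and the sitemap strategy appends /sitemap.xml to the source.

  require 'wayback_archiver'

  WaybackArchiver.archive('example.com')              # default: crawl the site
  WaybackArchiver.archive('example.com', :sitemap)    # fetches example.com/sitemap.xml
  WaybackArchiver.archive('http://example.com', :url) # archive a single URL
  WaybackArchiver.archive('urls.txt', :file)          # read URLs from a local file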
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri