wayback_archiver 0.0.7 → 0.0.8
- checksums.yaml +4 -4
- data/lib/wayback_archiver/archive.rb +20 -24
- data/lib/wayback_archiver/collector.rb +3 -6
- data/lib/wayback_archiver/crawl_url.rb +7 -2
- data/lib/wayback_archiver/crawler.rb +10 -14
- data/lib/wayback_archiver/request.rb +23 -11
- data/lib/wayback_archiver/version.rb +1 -1
- data/lib/wayback_archiver.rb +5 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b64e289cd025ebddc9af5472a3c5038af4e98535
+  data.tar.gz: 903a6e3f3bfb2f6ab81db471b8633b27f5f8c419
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cc1853305301920afbcdd7a79f592cd6863f27fdb57c22d634424346526441d8ddd788bee8b7e23f938f334fbbd7a2d162b397041ddc6fe0f1193d816d4fac59
+  data.tar.gz: e64370ece062319402c773464493fb767148712e115ae3ce5aebf3b66361dae1f94177a9d50b6aeaf0770d8ddcb86d3e7b779c170bfc6294633fab4283e711c4
data/lib/wayback_archiver/archive.rb
CHANGED
@@ -1,31 +1,27 @@
 module WaybackArchiver
   class Archive
-
+    WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+    MAX_THREAD_COUNT = 10
 
-    def self.post(
+    def self.post(urls)
       puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
-      end
-      threads.each(&:join)
-      puts "#{all_urls.length} URLs sent to Internet archive"
-      all_urls
+      puts "Total urls to be sent: #{urls.length}"
+      group_size = (urls.length / MAX_THREAD_COUNT) + 1
+      urls.each_slice(group_size).to_a.map do |archive_urls|
+        Thread.new { archive_urls.each { |url| post_url(url) } }
+      end.each(&:join)
+      puts "#{urls.length} URLs sent to Internet archive"
+      urls
+    end
+
+    def self.post_url(archive_url)
+      resolved_url = Request.resolve_url(archive_url)
+      request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
+      response = Request.get_response(request_url)
+      puts "[#{response.code}, #{response.message}] #{resolved_url}"
+    rescue Exception => e
+      puts "Error message: #{e.message}"
+      puts "Failed to archive: #{resolved_url}"
     end
   end
 end
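For context, the new post caps concurrency by slicing the URL list into at most MAX_THREAD_COUNT groups and giving each slice its own thread. A minimal sketch of that arithmetic, with illustrative values not taken from the gem:

    urls = (1..25).map { |i| "http://example.com/page-#{i}" }
    group_size = (urls.length / 10) + 1        # integer division: 25 / 10 + 1 => 3
    slices = urls.each_slice(group_size).to_a  # 9 slices of <= 3 URLs, so never more than 10 threads
    threads = slices.map do |slice|
      Thread.new { slice.each { |url| puts url } }  # stand-in for post_url(url)
    end
    threads.each(&:join)  # wait for all slices, as end.each(&:join) does above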
data/lib/wayback_archiver/collector.rb
CHANGED
@@ -2,12 +2,9 @@ module WaybackArchiver
   class Collector
     class << self
       def urls_from_sitemap(url)
-
-
-
-
-        document.elements.each('urlset/url/loc') { |element| urls << element.text }
-        urls
+        resolved = Request.resolve_url(url)
+        sitemap = Request.get_page(resolved)
+        sitemap.css('loc').map { |element| element.text }
       end
 
       def urls_from_crawl(url)
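The rewritten collector swaps REXML for Nokogiri; since Request.get_page defaults to lenient HTML parsing, the bare css('loc') selector matches every loc element without namespace handling. A small self-contained illustration, with sample XML that is not from the gem:

    require 'nokogiri'

    xml = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' \
          '<url><loc>http://example.com/</loc></url>' \
          '<url><loc>http://example.com/about</loc></url></urlset>'

    # HTML-mode parsing ignores the sitemap namespace, so 'loc' matches directly:
    Nokogiri::HTML(xml).css('loc').map { |element| element.text }
    # => ["http://example.com/", "http://example.com/about"]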
data/lib/wayback_archiver/crawl_url.rb
CHANGED
@@ -4,7 +4,8 @@ module WaybackArchiver
 
     def initialize(base_url)
       @resolved_base_url = Request.resolve_url(base_url)
-      @base_hostname = URI.parse(@resolved_base_url).
+      @base_hostname = URI.parse(@resolved_base_url).hostname
+      @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
     end
 
     def absolute_url_from(raw_url, get_url)
@@ -12,7 +13,7 @@ module WaybackArchiver
       parsed_url = URI.parse(raw_url) rescue URI.parse('')
       if parsed_url.relative?
         url_from_relative(raw_url, get_url)
-      elsif
+      elsif same_domain?(raw_url, @resolved_base_url)
         raw_url
       else
         nil
@@ -65,5 +66,9 @@ module WaybackArchiver
       dont_end.each { |pattern| return false if href.end_with?(pattern) }
       true
     end
+
+    def same_domain?(first, second)
+      first.include?(second)
+    end
   end
 end
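Note that the new same_domain? is a plain substring test, not a hostname comparison. Its behaviour, shown with hypothetical URLs:

    'http://example.com/about'.include?('http://example.com')  # => true  (kept)
    'http://other.org/page'.include?('http://example.com')     # => false (skipped)
    # Edge case: any URL that merely embeds the base URL as a substring also passes:
    'http://other.org/?u=http://example.com'.include?('http://example.com')  # => true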
data/lib/wayback_archiver/crawler.rb
CHANGED
@@ -1,19 +1,19 @@
 require 'set'
-require 'nokogiri'
+require 'nokogiri'
 
 module WaybackArchiver
   class Crawler
     CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     HEADERS_HASH = {
-      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+      'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
     }
 
-    def initialize(url, resolve
+    def initialize(url, resolve = false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = Set.new
-      @
+      @processed = Set.new
       @fetch_queue << @crawl_url.resolved_base_url
     end
 
@@ -21,14 +21,14 @@ module WaybackArchiver
       new(base_url).collect_urls
     end
 
-    def collect_urls
+    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(@fetch_queue.first)
        page_links(url)
      end
-      puts "Crawling finished, #{@
-      @
+      puts "Crawling finished, #{@processed.length} links found"
+      @processed.to_a
    rescue Interrupt, IRB::Abort
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
@@ -38,21 +38,17 @@ module WaybackArchiver
 
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
-      link_elements = get_page(get_url).css('a') rescue []
-      @
+      link_elements = Request.get_page(get_url).css('a') rescue []
+      @processed << get_url
      link_elements.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        if absolute_url
          resolved_url = resolve(absolute_url)
-          @fetch_queue << resolved_url if !@
+          @fetch_queue << resolved_url if !@processed.include?(resolved_url)
        end
      end
    end
 
-    def get_page(url)
-      Nokogiri::HTML(Request.get_response(url).body)
-    end
-
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
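The context lines at the top of the second hunk (new(base_url).collect_urls inside a class-level helper) suggest usage stays a one-liner; a hedged sketch, assuming that helper is self.collect_urls(base_url) and the host is reachable:

    urls = WaybackArchiver::Crawler.collect_urls('http://example.com')
    puts "Collected #{urls.length} URLs"

Dropping the private get_page here (third hunk) in favour of Request.get_page leaves Request as the single place that builds Nokogiri documents.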
data/lib/wayback_archiver/request.rb
CHANGED
@@ -5,19 +5,31 @@ module WaybackArchiver
     INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
 
-
-
-
-
-
+    class << self
+      def get_page(url, document_type = :html)
+        response = Request.get_response(url).body
+        case document_type
+        when :xml
+          Nokogiri::XML(response)
+        else
+          Nokogiri::HTML(response)
+        end
+      end
 
-
-
-
-
+      def get_response(url, resolve = false)
+        resolved_url = resolve ? resolve_url(url) : url
+        uri = URI.parse(resolved_url)
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = true if resolved_url.include?('https://')
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = USER_AGENT
+        http.request(request)
+      end
 
-
-
+      def resolve_url(url)
+        UrlResolver.resolve(url)
+      end
     end
   end
 end
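As read from the diff, Request now exposes three class methods; a usage sketch with placeholder URLs:

    page = WaybackArchiver::Request.get_page('http://example.com')                   # Nokogiri::HTML document
    xml  = WaybackArchiver::Request.get_page('http://example.com/sitemap.xml', :xml) # Nokogiri::XML document
    res  = WaybackArchiver::Request.get_response('https://example.com')              # Net::HTTPResponse
    res.code  # => "200" on success

One caveat visible above: SSL is switched on by the substring check resolved_url.include?('https://') rather than by uri.scheme == 'https', so a plain-HTTP URL that happens to embed the string 'https://' would also enable SSL.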
data/lib/wayback_archiver.rb
CHANGED
@@ -1,6 +1,5 @@
 require 'uri'
 require 'net/http'
-require 'rexml/document'
 
 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
@@ -9,18 +8,12 @@ require 'wayback_archiver/crawler'
 require 'wayback_archiver/crawl_url'
 
 module WaybackArchiver
-
-
-  def self.archive(source, from = :sitemap)
+  def self.archive(source, from = :crawl)
     urls = case from.to_s
-    when 'sitemap'
-
-    when '
-
-    when 'file'
-      Collector.urls_from_file(source)
-    when 'crawl', 'crawler'
-      Collector.urls_from_crawl(source)
+    when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
+    when 'url' then [Request.resolve_url(source)]
+    when 'file' then Collector.urls_from_file(source)
+    when 'crawl' then Collector.urls_from_crawl(source)
     else
       raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
     end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri