wayback_archiver 0.0.4 → 0.0.5
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +8 -5
- data/lib/wayback_archiver/archive.rb +12 -4
- data/lib/wayback_archiver/crawler.rb +50 -0
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 87e286b40825e15468832af25c42c98b59170b21
+  data.tar.gz: 9a7eb7e6fc6db882cf23878718626db04847b33a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 562641a2749e3a3e644ad7056f365b8d26566809b66606d1c6b95bf5c2db158039dd4a5b1c33677817c5f61d06786a63ec227150534a4dd5d1f958ccd1171dc9
+  data.tar.gz: d635ae3f69ab3e585bf0519389d9335edfe4848b4c6a41f271f957413bb663d930974f2fe386445fef6f7a5bb7ffc9d0d5c3f3bdf8849c0cc66081b7806d61a8
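These digests cover the two members of the packaged .gem file, which is itself a plain tar archive. A minimal sketch for recomputing them locally, assuming the gem has already been unpacked (e.g. with: tar xf wayback_archiver-0.0.5.gem):

require 'digest'

# Recompute the SHA1 and SHA512 digests recorded in checksums.yaml
# for the two archive members extracted from the .gem tarball.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}:"
  puts "  SHA1:   #{Digest::SHA1.file(name).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(name).hexdigest}"
end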
data/lib/wayback_archiver.rb
CHANGED

@@ -5,20 +5,23 @@ require 'rexml/document'
 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
 require 'wayback_archiver/request'
+require 'wayback_archiver/crawler'
 
 module WaybackArchiver
   BASE_URL = 'https://web.archive.org/save/'
 
   def self.archive(source, from = :sitemap)
-    urls = case from
-    when
+    urls = case from.to_s
+    when 'sitemap'
       Collector.urls_from_sitemap("#{source}/sitemap.xml")
-    when
+    when 'url'
       Array(source)
-    when
+    when 'file'
       Collector.urls_from_file(source)
+    when 'crawl', 'crawler'
+      Crawler.collect_urls(source)
     else
-      raise ArgumentError, "Unknown type: '#{from}'. Allowed types:
+      raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
     end
     Archive.post(urls)
   end
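Because the selector is now normalized with to_s, the source type can be passed as a symbol or a string, and 'crawl' (or 'crawler') selects the new link crawler. A minimal usage sketch (example.com and urls.txt are placeholders):

require 'wayback_archiver'

WaybackArchiver.archive('http://example.com')             # default: fetches <source>/sitemap.xml
WaybackArchiver.archive('http://example.com', :crawl)     # crawls the site for links
WaybackArchiver.archive('http://example.com/page', :url)  # archives a single URL
WaybackArchiver.archive('urls.txt', :file)                # URLs read from a local file

Each call posts the collected URLs to https://web.archive.org/save/ via Archive.post.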
data/lib/wayback_archiver/archive.rb
CHANGED

@@ -6,9 +6,8 @@ module WaybackArchiver
       puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
 
       puts "Total urls to be sent: #{all_urls.length}"
-      threads =
+      threads = []
       group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
-
       all_urls.each_slice(group_size).to_a.each do |urls|
         threads << Thread.new do
           urls.each_with_index do |url, index|
@@ -23,8 +22,17 @@ module WaybackArchiver
           end
         end
       end
-      threads.
+      threads.each_with_index do |thread, index|
+        print_index = index + 1
+        progress = '['
+        progress << '#' * print_index
+        progress << ' ' * (threads.length - print_index)
+        progress << ']'
+        procent = ((print_index.to_f/threads.length.to_f) * 100).round(0)
+        puts "[PROGRESS] #{progress} #{procent}% (#{print_index}/#{threads.length})"
+        thread.join
+      end
+      all_urls
     end
-
   end
 end
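The join loop now doubles as a progress indicator: each joined thread extends the bar by one '#' and recomputes the percentage. For example, with five threads, joining the second one prints

[PROGRESS] [##   ] 40% (2/5)

since print_index is 2, the bar holds two '#' marks plus three padding spaces, and ((2.0 / 5.0) * 100).round(0) is 40.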
data/lib/wayback_archiver/crawler.rb
ADDED

@@ -0,0 +1,50 @@
+require 'set'
+require 'nokogiri'
+require 'open-uri'
+
+module WaybackArchiver
+  class Crawler
+    def initialize(base_url)
+      @base_url = base_url
+      @hostname = URI.parse(@base_url).host
+      @fetch_queue = Set.new
+      @procesed = Set.new
+      @fetch_queue << @base_url
+    end
+
+    def self.collect_urls(base_url)
+      new(base_url).collect_urls
+    end
+
+    def collect_urls
+      until @fetch_queue.empty?
+        url = @fetch_queue.first
+        @fetch_queue.delete(@fetch_queue.first)
+        page_links(url)
+      end
+      puts "Crawling finished, #{@procesed.length} links found"
+      @procesed.to_a
+    end
+
+    def page_links(url)
+      puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
+      link_elements = Nokogiri::HTML(open(url)).css('a') rescue []
+      @procesed << url
+      link_elements.each do |link|
+        href = sanitize_url(link.attr('href'))
+        @fetch_queue << href if href && !@procesed.include?(href)
+      end
+    end
+
+    def sanitize_url(raw_url)
+      url = URI.parse(raw_url) rescue URI.parse('')
+      if url.host.nil?
+        sanitized_url = "#{@base_url}#{url.path}"
+        sanitized_url += "?#{url.query}" unless url.query.nil?
+        sanitized_url
+      else
+        raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
+      end
+    end
+  end
+end
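A minimal standalone usage sketch of the new class (example.com is a placeholder):

require 'wayback_archiver'

# Crawls outward from the base URL, following only same-host links, and
# returns an Array of the unique URLs that were visited.
urls = WaybackArchiver::Crawler.collect_urls('http://example.com')
puts "#{urls.length} URLs collected"

sanitize_url keeps the crawl on one site: relative hrefs are prefixed with the base URL, while absolute links survive only when their host matches the start host. Note that open(url) comes from open-uri, and the rescue modifiers swallow fetch and parse failures so a single broken link does not abort the crawl.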
metadata
CHANGED

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-10-20 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -49,6 +63,7 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver/archive.rb
 - lib/wayback_archiver/collector.rb
+- lib/wayback_archiver/crawler.rb
 - lib/wayback_archiver/request.rb
 - lib/wayback_archiver/version.rb
 - lib/wayback_archiver.rb
@@ -77,3 +92,4 @@ signing_key:
 specification_version: 4
 summary: Send URLs to Wayback Machine
 test_files: []
+has_rdoc:
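The nokogiri entry above (used by the new crawler for HTML parsing) maps back to a single declaration in the gemspec. The exact gemspec wording is an assumption here; only the generated metadata above is from the source:

# wayback_archiver.gemspec (sketch)
Gem::Specification.new do |spec|
  # ... name, version, authors, files as before ...
  spec.add_runtime_dependency 'nokogiri'  # no version constraint, hence '>= 0'
end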