wayback_archiver 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +8 -5
- data/lib/wayback_archiver/archive.rb +12 -4
- data/lib/wayback_archiver/crawler.rb +50 -0
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87e286b40825e15468832af25c42c98b59170b21
|
4
|
+
data.tar.gz: 9a7eb7e6fc6db882cf23878718626db04847b33a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 562641a2749e3a3e644ad7056f365b8d26566809b66606d1c6b95bf5c2db158039dd4a5b1c33677817c5f61d06786a63ec227150534a4dd5d1f958ccd1171dc9
|
7
|
+
data.tar.gz: d635ae3f69ab3e585bf0519389d9335edfe4848b4c6a41f271f957413bb663d930974f2fe386445fef6f7a5bb7ffc9d0d5c3f3bdf8849c0cc66081b7806d61a8
|
data/lib/wayback_archiver.rb
CHANGED
@@ -5,20 +5,23 @@ require 'rexml/document'
|
|
5
5
|
require 'wayback_archiver/collector'
|
6
6
|
require 'wayback_archiver/archive'
|
7
7
|
require 'wayback_archiver/request'
|
8
|
+
require 'wayback_archiver/crawler'
|
8
9
|
|
9
10
|
module WaybackArchiver
|
10
11
|
BASE_URL = 'https://web.archive.org/save/'
|
11
12
|
|
12
13
|
def self.archive(source, from = :sitemap)
|
13
|
-
urls = case from
|
14
|
-
when
|
14
|
+
urls = case from.to_s
|
15
|
+
when 'sitemap'
|
15
16
|
Collector.urls_from_sitemap("#{source}/sitemap.xml")
|
16
|
-
when
|
17
|
+
when 'url'
|
17
18
|
Array(source)
|
18
|
-
when
|
19
|
+
when 'file'
|
19
20
|
Collector.urls_from_file(source)
|
21
|
+
when 'crawl', 'crawler'
|
22
|
+
Crawler.collect_urls(source)
|
20
23
|
else
|
21
|
-
raise ArgumentError, "Unknown type: '#{from}'. Allowed types:
|
24
|
+
raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
|
22
25
|
end
|
23
26
|
Archive.post(urls)
|
24
27
|
end
|
@@ -6,9 +6,8 @@ module WaybackArchiver
|
|
6
6
|
puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
|
7
7
|
|
8
8
|
puts "Total urls to be sent: #{all_urls.length}"
|
9
|
-
threads =
|
9
|
+
threads = []
|
10
10
|
group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
|
11
|
-
|
12
11
|
all_urls.each_slice(group_size).to_a.each do |urls|
|
13
12
|
threads << Thread.new do
|
14
13
|
urls.each_with_index do |url, index|
|
@@ -23,8 +22,17 @@ module WaybackArchiver
|
|
23
22
|
end
|
24
23
|
end
|
25
24
|
end
|
26
|
-
threads.
|
25
|
+
threads.each_with_index do |thread, index|
|
26
|
+
print_index = index + 1
|
27
|
+
progress = '['
|
28
|
+
progress << '#' * print_index
|
29
|
+
progress << ' ' * (threads.length - print_index)
|
30
|
+
progress << ']'
|
31
|
+
procent = ((print_index.to_f/threads.length.to_f) * 100).round(0)
|
32
|
+
puts "[PROGRESS] #{progress} #{procent}% (#{print_index}/#{threads.length})"
|
33
|
+
thread.join
|
34
|
+
end
|
35
|
+
all_urls
|
27
36
|
end
|
28
|
-
|
29
37
|
end
|
30
38
|
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module WaybackArchiver
  # Collects the URLs of a website by following the links found on its pages,
  # starting from a base URL. Only links on the same host and located under
  # the base URL are followed.
  class Crawler
    # @param base_url [String] absolute URL where the crawl starts; it also
    #   defines the scope (host and URL prefix) of the crawl.
    def initialize(base_url)
      @base_url    = base_url
      @hostname    = URI.parse(@base_url).host
      @fetch_queue = Set.new
      @processed   = Set.new
      @fetch_queue << @base_url
    end

    # Convenience wrapper: builds a crawler for +base_url+ and runs it.
    #
    # @param base_url [String] absolute URL where the crawl starts.
    # @return [Array<String>] every URL that was visited.
    def self.collect_urls(base_url)
      new(base_url).collect_urls
    end

    # Processes queued URLs until the queue is empty.
    #
    # @return [Array<String>] every URL that was visited.
    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    end

    # Fetches +url+, marks it as processed and queues every in-scope link
    # found on the page that has not been processed yet.
    #
    # @param url [String] absolute URL of the page to fetch.
    def page_links(url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
      link_elements = fetch_link_elements(url)
      @processed << url
      link_elements.each do |link|
        # Relative links are resolved against the page they appear on.
        href = sanitize_url(link.attr('href'), url)
        @fetch_queue << href if href && !@processed.include?(href)
      end
    end

    # Turns a raw href into an absolute, fragment-free URL, or rejects it.
    #
    # @param raw_url [String, nil] href value as found in the document.
    # @param page_url [String] absolute URL used to resolve relative links;
    #   defaults to the base URL of the crawl.
    # @return [String, nil] the absolute URL when it is on the crawled host
    #   and starts with the base URL; nil for blank, unparsable, non-hierarchical
    #   (mailto:, javascript:, ...) or out-of-scope links.
    def sanitize_url(raw_url, page_url = @base_url)
      href = raw_url.to_s.strip
      return nil if href.empty?

      # URI.join applies standard relative-reference resolution, so
      # "about.html", "../x", "?q=1" and "//host/x" are all handled and no
      # duplicate slashes are produced when the base URL ends with "/".
      resolved = URI.join(page_url, href)
      return nil unless @hostname.eql?(resolved.hostname)

      # Fragments address a position inside a page, not a separate page.
      resolved.fragment = nil
      sanitized_url = resolved.to_s
      sanitized_url if sanitized_url.start_with?(@base_url)
    rescue URI::Error
      nil
    end

    private

    # Downloads +url+ and returns the anchor elements of the document.
    # Crawling is best effort: any failure to fetch or parse the page yields
    # an empty list so that the crawl can continue with the remaining queue.
    #
    # @param url [String] absolute URL of the page to fetch.
    # @return [Enumerable] anchor elements of the page, or [] on failure.
    def fetch_link_elements(url)
      extract_links = ->(io) { Nokogiri::HTML(io).css('a') }
      # URI.open is public on Ruby >= 2.5 and is the only open-uri entry point
      # on Ruby >= 3.0; older rubies only provide the patched Kernel#open.
      # The block form closes the underlying handle once parsing is done.
      if URI.respond_to?(:open)
        URI.open(url, &extract_links)
      else
        open(url, &extract_links)
      end
    rescue StandardError
      []
    end
  end
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -49,6 +63,7 @@ files:
|
|
49
63
|
- bin/wayback_archiver
|
50
64
|
- lib/wayback_archiver/archive.rb
|
51
65
|
- lib/wayback_archiver/collector.rb
|
66
|
+
- lib/wayback_archiver/crawler.rb
|
52
67
|
- lib/wayback_archiver/request.rb
|
53
68
|
- lib/wayback_archiver/version.rb
|
54
69
|
- lib/wayback_archiver.rb
|
@@ -77,3 +92,4 @@ signing_key:
|
|
77
92
|
specification_version: 4
|
78
93
|
summary: Send URLs to Wayback Machine
|
79
94
|
test_files: []
|
95
|
+
has_rdoc:
|