wayback_archiver 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +7 -7
- data/lib/wayback_archiver/archive.rb +1 -1
- data/lib/wayback_archiver/collector.rb +4 -4
- data/lib/wayback_archiver/request.rb +1 -1
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +27 -29
- data/lib/wayback_archiver/crawl_url.rb +0 -74
- data/lib/wayback_archiver/crawler.rb +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3df67fcb7faa06087ad72e6c57ff985d9dbe4fe9
+  data.tar.gz: 5c21d0bf3353dd2ce732cef3b59cd3ba807acc8a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d21d169fb9fa143b777d771a8cdc5ba8c3c4186f6472aa8f2ff2806034becf18e3cc260c7ce0a9b97d1148a2d1781d9ddc0daef08d7abe1d2b13140125513231
+  data.tar.gz: 15a413ab5bd0f51c40640cd0be764142ca0380315f97fc335cf1bf19f4645b327dc0716400c6c10af7a79b8ceb898b6e0f89a21114865af7f84da1e2bf3d383c
data/lib/wayback_archiver.rb
CHANGED
@@ -1,22 +1,22 @@
+require 'site_mapper'
+
 require 'uri'
 require 'net/http'
 
+require 'wayback_archiver/version'
 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
 require 'wayback_archiver/request'
-require 'wayback_archiver/crawler'
-require 'wayback_archiver/crawl_url'
 
 module WaybackArchiver
   def self.archive(source, from = :crawl)
     urls = case from.to_s
-    when '
-    when '
-    when '
-    when '
+    when 'file' then Archive.post(Collector.urls_from_file(source))
+    when 'crawl' then Collector.urls_from_crawl(source) { |url| Archive.post_url(url) }
+    when 'sitemap' then Archive.post(Collector.urls_from_sitemap("#{source}/sitemap.xml"))
+    when 'url' then Archive.post_url(Request.resolve_url(source))
     else
       raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
     end
-    Archive.post(urls)
   end
 end
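The rewritten entry point dispatches directly on the `from` argument instead of building a URL list and posting it afterwards. A minimal usage sketch of the new `WaybackArchiver.archive` (the host and file name below are placeholders, not values taken from the gem):

require 'wayback_archiver'

# Crawl the site (the default) and post each discovered URL as it is found.
WaybackArchiver.archive('example.com')

# The other source types map to the branches above:
WaybackArchiver.archive('example.com', :sitemap)          # fetches example.com/sitemap.xml
WaybackArchiver.archive('urls.txt', :file)                # one URL per line
WaybackArchiver.archive('http://example.com/page', :url)  # a single resolved URL

# Any other type raises ArgumentError ("Unknown type: ...").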
data/lib/wayback_archiver/archive.rb
CHANGED
@@ -7,7 +7,7 @@ module WaybackArchiver
     puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
     puts "Total urls to be sent: #{urls.length}"
     group_size = (urls.length / MAX_THREAD_COUNT) + 1
-    urls.each_slice(group_size).to_a.map do |archive_urls|
+    urls.each_slice(group_size).to_a.map! do |archive_urls|
       Thread.new { archive_urls.each { |url| post_url(url) } }
     end.each(&:join)
     puts "#{urls.length} URLs sent to Internet archive"
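For context, `Archive.post` splits the URL list into `MAX_THREAD_COUNT` roughly equal slices and posts each slice on its own thread; the change above only swaps `map` for `map!`. A self-contained sketch of the same fan-out pattern (the thread count, URL list and the `puts` stand-in for `post_url` are illustrative, not the gem's own values):

MAX_THREAD_COUNT = 10
urls = Array.new(95) { |i| "http://example.com/page/#{i}" }

group_size = (urls.length / MAX_THREAD_COUNT) + 1
urls.each_slice(group_size).to_a.map! do |archive_urls|
  Thread.new { archive_urls.each { |url| puts "posting #{url}" } } # post_url(url) in the gem
end.each(&:join)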
data/lib/wayback_archiver/collector.rb
CHANGED
@@ -4,19 +4,19 @@ module WaybackArchiver
     def urls_from_sitemap(url)
       resolved = Request.resolve_url(url)
       sitemap = Request.get_page(resolved)
-      sitemap.css('loc').map { |element| element.text }
+      sitemap.css('loc').map! { |element| element.text }
     end
 
     def urls_from_crawl(url)
-
+      SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
     end
 
     def urls_from_file(path)
       raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
       urls = []
       text = File.open(path).read
-      text.gsub!(/\r\n?/, "\n")
-
+      text.gsub!(/\r\n?/, "\n")
+        .each_line { |line| urls << line.gsub!(/\n/, '').strip }
       urls.reject(&:empty?)
     end
   end
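`urls_from_crawl` now delegates the crawl to the site_mapper gem and yields every URL it finds, which is what lets the `:crawl` branch in wayback_archiver.rb post URLs while the crawl is still in progress. A sketch of the call shape used above (the host is a placeholder; assumes the site_mapper gem is installed):

require 'site_mapper'

SiteMapper.map('example.com') do |url|
  puts url # WaybackArchiver passes each yielded URL to Archive.post_url here
end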
data/lib/wayback_archiver/request.rb
CHANGED
@@ -3,7 +3,7 @@ require 'url_resolver' # TODO: Allow users to use any resolver
 module WaybackArchiver
   class Request
     INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
-    USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
+    USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
 
     class << self
       def get_page(url, document_type = :html)
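The only change here is that the version constant is referenced through its full namespace, so the User-Agent string is unambiguous about where `VERSION` comes from. With this release the constant evaluates to roughly the following (a sketch derived from the code above, not separate documentation):

require 'wayback_archiver'

# => "WaybackArchiver/0.0.10 (+https://rubygems.org/gems/wayback_archiver)"
puts WaybackArchiver::Request::USER_AGENT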
metadata
CHANGED
@@ -1,43 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.10
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name:
+  name: site_mapper
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: url_resolver
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.1'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.1'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement

@@ -56,59 +56,59 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '10.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '10.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '3.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.8'
 - !ruby/object:Gem::Dependency
   name: coveralls
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.7'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
-description:
+        version: '0.7'
+description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
 email:
 - burenstam@gmail.com
 executables:

@@ -119,8 +119,6 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver/archive.rb
 - lib/wayback_archiver/collector.rb
-- lib/wayback_archiver/crawl_url.rb
-- lib/wayback_archiver/crawler.rb
 - lib/wayback_archiver/request.rb
 - lib/wayback_archiver/version.rb
 - lib/wayback_archiver.rb

@@ -136,7 +134,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - '>='
     - !ruby/object:Gem::Version
-      version:
+      version: 1.9.3
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - '>='
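The dependency updates pin versions with pessimistic (`~>`) constraints where 0.0.9's requirements were looser, add site_mapper as a runtime dependency, and raise the minimum Ruby to 1.9.3. Expressed as gemspec declarations, the new requirements correspond roughly to the following sketch (not the gem's actual .gemspec, which is not part of this diff):

Gem::Specification.new do |spec|
  spec.add_dependency 'site_mapper', '~> 0'
  spec.add_dependency 'url_resolver', '~> 0.1'
  spec.add_development_dependency 'rake', '~> 10.3'
  spec.add_development_dependency 'rspec', '~> 3.1'
  spec.add_development_dependency 'yard', '~> 0.8'
  spec.add_development_dependency 'coveralls', '~> 0.7'
  spec.required_ruby_version = '>= 1.9.3'
end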
data/lib/wayback_archiver/crawl_url.rb
DELETED
@@ -1,74 +0,0 @@
-module WaybackArchiver
-  class CrawlUrl
-    attr_reader :resolved_base_url, :base_hostname
-
-    def initialize(base_url)
-      @resolved_base_url = Request.resolve_url(base_url)
-      @base_hostname = URI.parse(@resolved_base_url).hostname
-      @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
-    end
-
-    def absolute_url_from(raw_url, get_url)
-      return nil unless eligible_url?(raw_url)
-      parsed_url = URI.parse(raw_url) rescue URI.parse('')
-      if parsed_url.relative?
-        url_from_relative(raw_url, get_url)
-      elsif same_domain?(raw_url, @resolved_base_url)
-        raw_url
-      else
-        nil
-      end
-    end
-
-    private
-
-    def url_from_relative(url, current_page_url)
-      if url.start_with?('/')
-        "#{without_path_suffix(resolved_base_url)}#{url}"
-      elsif url.start_with?('../')
-        "#{url_from_dotted_url(url, current_page_url)}"
-      else
-        "#{with_path_suffix(resolved_base_url)}#{url}"
-      end
-    end
-
-    def url_from_dotted_url(url, current_page_url)
-      absolute_url = with_path_suffix(current_page_url.dup)
-      found_dots = without_path_suffix(url).scan('../').length
-      removed_dots = 0
-      max_levels = 4
-      while found_dots >= removed_dots && max_levels > removed_dots
-        index = absolute_url.rindex('/') or break
-        absolute_url = absolute_url[0..(index - 1)]
-        removed_dots += 1
-      end
-      "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
-    end
-
-    def with_path_suffix(passed_url)
-      url = passed_url.dup
-      url.end_with?('/') ? url : url << '/'
-    end
-
-    def without_path_suffix(passed_url)
-      url = passed_url.dup
-      url.end_with?('/') ? url[0...(url.length - 1)] : url
-    end
-
-    def eligible_url?(href)
-      return false if href.nil? || href.empty?
-      dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
-      dont_include = %w(/email-protection#)
-      dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
-
-      dont_start.each { |pattern| return false if href.start_with?(pattern) }
-      dont_include.each { |pattern| return false if href.include?(pattern) }
-      dont_end.each { |pattern| return false if href.end_with?(pattern) }
-      true
-    end
-
-    def same_domain?(first, second)
-      first.include?(second)
-    end
-  end
-end
data/lib/wayback_archiver/crawler.rb
DELETED
@@ -1,56 +0,0 @@
-require 'set'
-require 'nokogiri'
-
-module WaybackArchiver
-  class Crawler
-    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
-    HEADERS_HASH = {
-      'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
-    }
-
-    def initialize(url, resolve = false)
-      base_url = Request.resolve_url(url)
-      @options = { resolve: resolve }
-      @crawl_url = CrawlUrl.new(base_url)
-      @fetch_queue = Set.new
-      @processed = Set.new
-      @fetch_queue << @crawl_url.resolved_base_url
-    end
-
-    def self.collect_urls(base_url)
-      new(base_url).collect_urls
-    end
-
-    def collect_urls
-      until @fetch_queue.empty?
-        url = @fetch_queue.first
-        @fetch_queue.delete(@fetch_queue.first)
-        page_links(url)
-      end
-      puts "Crawling finished, #{@processed.length} links found"
-      @processed.to_a
-    rescue Interrupt, IRB::Abort
-      puts 'Crawl interrupted.'
-      @fetch_queue.to_a
-    end
-
-    private
-
-    def page_links(get_url)
-      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
-      link_elements = Request.get_page(get_url).css('a') rescue []
-      @processed << get_url
-      link_elements.each do |page_link|
-        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
-        if absolute_url
-          resolved_url = resolve(absolute_url)
-          @fetch_queue << resolved_url if !@processed.include?(resolved_url)
-        end
-      end
-    end
-
-    def resolve(url)
-      @options[:resolve] ? Request.resolve_url(url) : url
-    end
-  end
-end