wayback_archiver 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +24 -7
- data/lib/wayback_archiver/archive.rb +41 -21
- data/lib/wayback_archiver/request.rb +46 -15
- data/lib/wayback_archiver/url_collector.rb +40 -0
- data/lib/wayback_archiver/version.rb +2 -1
- metadata +35 -21
- data/lib/wayback_archiver/collector.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
|
4
|
+
data.tar.gz: 629963ef30283123820418a714ae73160fa20293
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
|
7
|
+
data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
|
data/lib/wayback_archiver.rb
CHANGED
@@ -4,19 +4,36 @@ require 'uri'
|
|
4
4
|
require 'net/http'
|
5
5
|
|
6
6
|
require 'wayback_archiver/version'
|
7
|
-
require 'wayback_archiver/
|
7
|
+
require 'wayback_archiver/url_collector'
|
8
8
|
require 'wayback_archiver/archive'
|
9
9
|
require 'wayback_archiver/request'
|
10
10
|
|
11
|
+
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
11
12
|
module WaybackArchiver
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
# Link to gem on rubygems.org, part of the sent User-Agent
|
14
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
15
|
+
# WaybackArchiver User-Agent
|
16
|
+
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
17
|
+
|
18
|
+
# Send URLs to Wayback Machine.
|
19
|
+
# @return [Array] with URLs sent to the Wayback Machine.
|
20
|
+
# @param [String] source for URL(s).
|
21
|
+
# @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
|
22
|
+
# @example Crawl example.com and send all URLs of the same domain
|
23
|
+
# WaybackArchiver.archive('example.com') # Default type is :crawl
|
24
|
+
# WaybackArchiver.archive('example.com', :crawl)
|
25
|
+
# @example Send only example.com
|
26
|
+
# WaybackArchiver.archive('example.com', :url)
|
27
|
+
# @example Send URL on each line in specified file
|
28
|
+
# WaybackArchiver.archive('/path/to/file', :file)
|
29
|
+
def self.archive(source, type = :crawl)
|
30
|
+
case type.to_s
|
31
|
+
when 'file' then Archive.post(UrlCollector.file(source))
|
32
|
+
when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
|
33
|
+
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
17
34
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
18
35
|
else
|
19
|
-
raise ArgumentError, "Unknown type: '#{
|
36
|
+
raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
|
20
37
|
end
|
21
38
|
end
|
22
39
|
end
|
@@ -1,27 +1,47 @@
|
|
1
1
|
module WaybackArchiver
|
2
|
+
# Post URL(s) to Wayback Machine
|
2
3
|
class Archive
|
3
|
-
|
4
|
-
|
4
|
+
# Wayback Machine base URL.
|
5
|
+
WAYBACK_BASE_URL = 'https://web.archive.org/save/'
|
6
|
+
# Default concurrency for archiving URLs
|
7
|
+
DEFAULT_CONCURRENCY = 10
|
8
|
+
class << self
|
9
|
+
# Send URLs to Wayback Machine.
|
10
|
+
# @return [Array] with sent URLs.
|
11
|
+
# @param [Array] urls URLs to send.
|
12
|
+
# @param [Hash] options
|
13
|
+
# @example Archive example.com, with default options
|
14
|
+
# Archive.post(['http://example.com'])
|
15
|
+
# @example Archive example.com, using only 1 thread
|
16
|
+
# Archive.post(['http://example.com'], concurrency: 1)
|
17
|
+
def post(urls, options = {})
|
18
|
+
options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
|
19
|
+
concurrency = options[:concurrency]
|
20
|
+
puts "Request are sent with up to #{concurrency} parallel threads"
|
21
|
+
puts "Total urls to be sent: #{urls.length}"
|
22
|
+
group_size = (urls.length / concurrency) + 1
|
23
|
+
urls.each_slice(group_size).to_a.map! do |archive_urls|
|
24
|
+
Thread.new { archive_urls.each { |url| post_url(url) } }
|
25
|
+
end.each(&:join)
|
26
|
+
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
+
urls
|
28
|
+
end
|
5
29
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
puts "[#{response.code}, #{response.message}] #{resolved_url}"
|
22
|
-
rescue Exception => e
|
23
|
-
puts "Error message: #{e.message}"
|
24
|
-
puts "Failed to archive: #{resolved_url}"
|
30
|
+
# Send URL to Wayback Machine.
|
31
|
+
# @return [String] the sent URL.
|
32
|
+
# @param [String] url to send.
|
33
|
+
# @example Archive example.com, with default options
|
34
|
+
# Archive.post_url('http://example.com')
|
35
|
+
def post_url(url)
|
36
|
+
resolved_url = Request.resolve_url(url)
|
37
|
+
request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
|
38
|
+
response = Request.response(request_url)
|
39
|
+
puts "[#{response.code}, #{response.message}] #{resolved_url}"
|
40
|
+
resolved_url
|
41
|
+
rescue Exception => e
|
42
|
+
puts "Error message: #{e.message}"
|
43
|
+
puts "Failed to archive: #{resolved_url}"
|
44
|
+
end
|
25
45
|
end
|
26
46
|
end
|
27
47
|
end
|
@@ -1,34 +1,65 @@
|
|
1
1
|
require 'url_resolver' # TODO: Allow users to use any resolver
|
2
2
|
|
3
3
|
module WaybackArchiver
|
4
|
+
# Request and parse HTML & XML documents
|
4
5
|
class Request
|
5
|
-
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
6
|
-
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
7
|
-
|
8
6
|
class << self
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
7
|
+
# Get and parse HTML & XML documents.
|
8
|
+
# @return [Nokogiri::HTML::Document] the parsed document.
|
9
|
+
# @param [String] url to retrieve and parse.
|
10
|
+
# @example Request and parse example.com
|
11
|
+
# Request.document('example.com')
|
12
|
+
# @example Request and parse google.com/sitemap.xml
|
13
|
+
# Request.document('google.com/sitemap.xml')
|
14
|
+
def document(url)
|
15
|
+
response_body = Request.response(url).body
|
16
|
+
Nokogiri::HTML(response_body)
|
17
17
|
end
|
18
18
|
|
19
|
-
|
19
|
+
# Get response.
|
20
|
+
# @return [Net::HTTPResponse] the HTTP response.
|
21
|
+
# @param [String] url URL to retrieve.
|
22
|
+
# @param [Boolean] resolve whether to resolve the URL.
|
23
|
+
# @example Resolve example.com and request
|
24
|
+
# Request.response('example.com', true)
|
25
|
+
# @example Request http://example.com
|
26
|
+
# Request.response('http://example.com', false)
|
27
|
+
def response(url, resolve = true)
|
20
28
|
resolved_url = resolve ? resolve_url(url) : url
|
21
29
|
uri = URI.parse(resolved_url)
|
22
30
|
http = Net::HTTP.new(uri.host, uri.port)
|
23
|
-
http.use_ssl = true if resolved_url.
|
31
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
24
32
|
|
25
33
|
request = Net::HTTP::Get.new(uri.request_uri)
|
26
|
-
request['User-Agent'] = USER_AGENT
|
34
|
+
request['User-Agent'] = WaybackArchiver::USER_AGENT
|
27
35
|
http.request(request)
|
28
36
|
end
|
29
37
|
|
38
|
+
# Resolve the URL, follows redirects.
|
39
|
+
# @return [String] the resolved URL.
|
40
|
+
# @param [String] url to retrieve.
|
41
|
+
# @example Resolve example.com
|
42
|
+
# Request.resolve_url('example.com')
|
30
43
|
def resolve_url(url)
|
31
|
-
UrlResolver.resolve(url)
|
44
|
+
resolved = UrlResolver.resolve(url)
|
45
|
+
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
46
|
+
resolved
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# Check whether the URL string starts with an http:// or https:// protocol.
|
52
|
+
# @return [Boolean] true if string includes protocol.
|
53
|
+
# @param [String] url to check.
|
54
|
+
# @example Check if string includes protocol
|
55
|
+
# Request.has_protocol?('example.com')
|
56
|
+
# # => false
|
57
|
+
# Request.has_protocol?('https://example.com')
|
58
|
+
# # => true
|
59
|
+
# Request.has_protocol?('http://example.com')
|
60
|
+
# # => true
|
61
|
+
def has_protocol?(url)
|
62
|
+
url.start_with?('http://') || url.start_with?('https://')
|
32
63
|
end
|
33
64
|
end
|
34
65
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Retrieve URLs from different sources
|
3
|
+
class UrlCollector
|
4
|
+
class << self
|
5
|
+
# Retrieve URLs from Sitemap.
|
6
|
+
# @return [Array] of URLs defined in Sitemap.
|
7
|
+
# @param [String] url domain to retrieve Sitemap from.
|
8
|
+
# @example Get URLs defined in Sitemap for google.com
|
9
|
+
# UrlCollector.sitemap('https://google.com')
|
10
|
+
def sitemap(url)
|
11
|
+
resolved = Request.resolve_url("#{url}/sitemap.xml")
|
12
|
+
sitemap = Request.document(resolved)
|
13
|
+
sitemap.css('loc').map { |element| element.text }
|
14
|
+
end
|
15
|
+
|
16
|
+
# Retrieve URLs by crawling.
|
17
|
+
# @return [Array] of URLs found during crawl.
|
18
|
+
# @param [String] url domain to crawl URLs from.
|
19
|
+
# @example Crawl URLs defined on example.com
|
20
|
+
# UrlCollector.crawl('http://example.com')
|
21
|
+
def crawl(url)
|
22
|
+
SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Retrieve URLs listed in file.
|
26
|
+
# @return [Array] of URLs defined in file.
|
27
|
+
# @param [String] path to get URLs from.
|
28
|
+
# @example Get URLs defined in /path/to/file
|
29
|
+
# UrlCollector.file('/path/to/file')
|
30
|
+
def file(path)
|
31
|
+
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
32
|
+
urls = []
|
33
|
+
File.open(path).read
|
34
|
+
.gsub(/\r\n?/, "\n")
|
35
|
+
.each_line { |line| urls << line.gsub(/\n/, '').strip }
|
36
|
+
urls.reject(&:empty?)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
metadata
CHANGED
@@ -1,113 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: site_mapper
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: url_resolver
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: bundler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.3'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '10.3'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '10.3'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - ~>
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '3.1'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - ~>
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.1'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: yard
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0.8'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - ~>
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.8'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: coveralls
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - ~>
|
101
|
+
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '0.7'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - ~>
|
108
|
+
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0.7'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: redcarpet
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '3.2'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '3.2'
|
111
125
|
description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
112
126
|
email:
|
113
127
|
- burenstam@gmail.com
|
@@ -117,11 +131,11 @@ extensions: []
|
|
117
131
|
extra_rdoc_files: []
|
118
132
|
files:
|
119
133
|
- bin/wayback_archiver
|
134
|
+
- lib/wayback_archiver.rb
|
120
135
|
- lib/wayback_archiver/archive.rb
|
121
|
-
- lib/wayback_archiver/collector.rb
|
122
136
|
- lib/wayback_archiver/request.rb
|
137
|
+
- lib/wayback_archiver/url_collector.rb
|
123
138
|
- lib/wayback_archiver/version.rb
|
124
|
-
- lib/wayback_archiver.rb
|
125
139
|
homepage: https://github.com/buren/wayback_archiver
|
126
140
|
licenses:
|
127
141
|
- MIT
|
@@ -132,17 +146,17 @@ require_paths:
|
|
132
146
|
- lib
|
133
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
134
148
|
requirements:
|
135
|
-
- -
|
149
|
+
- - ">="
|
136
150
|
- !ruby/object:Gem::Version
|
137
151
|
version: 1.9.3
|
138
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
153
|
requirements:
|
140
|
-
- -
|
154
|
+
- - ">="
|
141
155
|
- !ruby/object:Gem::Version
|
142
156
|
version: '0'
|
143
157
|
requirements: []
|
144
158
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.
|
159
|
+
rubygems_version: 2.2.2
|
146
160
|
signing_key:
|
147
161
|
specification_version: 4
|
148
162
|
summary: Send URLs to Wayback Machine
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module WaybackArchiver
|
2
|
-
class Collector
|
3
|
-
class << self
|
4
|
-
def urls_from_sitemap(url)
|
5
|
-
resolved = Request.resolve_url(url)
|
6
|
-
sitemap = Request.get_page(resolved)
|
7
|
-
sitemap.css('loc').map! { |element| element.text }
|
8
|
-
end
|
9
|
-
|
10
|
-
def urls_from_crawl(url)
|
11
|
-
SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
|
12
|
-
end
|
13
|
-
|
14
|
-
def urls_from_file(path)
|
15
|
-
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
16
|
-
urls = []
|
17
|
-
text = File.open(path).read
|
18
|
-
text.gsub!(/\r\n?/, "\n")
|
19
|
-
.each_line { |line| urls << line.gsub!(/\n/, '').strip }
|
20
|
-
urls.reject(&:empty?)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|