wayback_archiver 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +24 -7
- data/lib/wayback_archiver/archive.rb +41 -21
- data/lib/wayback_archiver/request.rb +46 -15
- data/lib/wayback_archiver/url_collector.rb +40 -0
- data/lib/wayback_archiver/version.rb +2 -1
- metadata +35 -21
- data/lib/wayback_archiver/collector.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
|
4
|
+
data.tar.gz: 629963ef30283123820418a714ae73160fa20293
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
|
7
|
+
data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
|
data/lib/wayback_archiver.rb
CHANGED
@@ -4,19 +4,36 @@ require 'uri'
|
|
4
4
|
require 'net/http'
|
5
5
|
|
6
6
|
require 'wayback_archiver/version'
|
7
|
-
require 'wayback_archiver/
|
7
|
+
require 'wayback_archiver/url_collector'
|
8
8
|
require 'wayback_archiver/archive'
|
9
9
|
require 'wayback_archiver/request'
|
10
10
|
|
11
|
+
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
11
12
|
module WaybackArchiver
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
# Link to gem on rubygems.org, part of the sent User-Agent
|
14
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
15
|
+
# WaybackArchiver User-Agent
|
16
|
+
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
17
|
+
|
18
|
+
# Send URLs to Wayback Machine.
|
19
|
+
# @return [Array] with URLs sent to the Wayback Machine.
|
20
|
+
# @param [String] source for URL(s).
|
21
|
+
# @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
|
22
|
+
# @example Crawl example.com and send all URLs of the same domain
|
23
|
+
# WaybackArchiver.archive('example.com') # Default type is :crawl
|
24
|
+
# WaybackArchiver.archive('example.com', :crawl)
|
25
|
+
# @example Send only example.com
|
26
|
+
# WaybackArchiver.archive('example.com', :url)
|
27
|
+
# @example Send URL on each line in specified file
|
28
|
+
# WaybackArchiver.archive('/path/to/file', :file)
|
29
|
+
def self.archive(source, type = :crawl)
|
30
|
+
case type.to_s
|
31
|
+
when 'file' then Archive.post(UrlCollector.file(source))
|
32
|
+
when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
|
33
|
+
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
17
34
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
18
35
|
else
|
19
|
-
raise ArgumentError, "Unknown type: '#{
|
36
|
+
raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
|
20
37
|
end
|
21
38
|
end
|
22
39
|
end
|
@@ -1,27 +1,47 @@
|
|
1
1
|
module WaybackArchiver
|
2
|
+
# Post URL(s) to Wayback Machine
|
2
3
|
class Archive
|
3
|
-
|
4
|
-
|
4
|
+
# Wayback Machine base URL.
|
5
|
+
WAYBACK_BASE_URL = 'https://web.archive.org/save/'
|
6
|
+
# Default concurrency for archiving URLs
|
7
|
+
DEFAULT_CONCURRENCY = 10
|
8
|
+
class << self
|
9
|
+
# Send URLs to Wayback Machine.
|
10
|
+
# @return [Array] with sent URLs.
|
11
|
+
# @param [Array] urls URLs to send.
|
12
|
+
# @param [Hash] options
|
13
|
+
# @example Archive example.com, with default options
|
14
|
+
# Archive.post(['http://example.com'])
|
15
|
+
# @example Archive example.com, using only 1 thread
|
16
|
+
# Archive.post(['http://example.com'], concurrency: 1)
|
17
|
+
def post(urls, options = {})
|
18
|
+
options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
|
19
|
+
concurrency = options[:concurrency]
|
20
|
+
puts "Request are sent with up to #{concurrency} parallel threads"
|
21
|
+
puts "Total urls to be sent: #{urls.length}"
|
22
|
+
group_size = (urls.length / concurrency) + 1
|
23
|
+
urls.each_slice(group_size).to_a.map! do |archive_urls|
|
24
|
+
Thread.new { archive_urls.each { |url| post_url(url) } }
|
25
|
+
end.each(&:join)
|
26
|
+
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
+
urls
|
28
|
+
end
|
5
29
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
puts "[#{response.code}, #{response.message}] #{resolved_url}"
|
22
|
-
rescue Exception => e
|
23
|
-
puts "Error message: #{e.message}"
|
24
|
-
puts "Failed to archive: #{resolved_url}"
|
30
|
+
# Send URL to Wayback Machine.
|
31
|
+
# @return [String] the sent URL.
|
32
|
+
# @param [String] url to send.
|
33
|
+
# @example Archive example.com, with default options
|
34
|
+
# Archive.post_url('http://example.com')
|
35
|
+
def post_url(url)
|
36
|
+
resolved_url = Request.resolve_url(url)
|
37
|
+
request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
|
38
|
+
response = Request.response(request_url)
|
39
|
+
puts "[#{response.code}, #{response.message}] #{resolved_url}"
|
40
|
+
resolved_url
|
41
|
+
rescue Exception => e
|
42
|
+
puts "Error message: #{e.message}"
|
43
|
+
puts "Failed to archive: #{resolved_url}"
|
44
|
+
end
|
25
45
|
end
|
26
46
|
end
|
27
47
|
end
|
@@ -1,34 +1,65 @@
|
|
1
1
|
require 'url_resolver' # TODO: Allow users to use any resolver
|
2
2
|
|
3
3
|
module WaybackArchiver
|
4
|
+
# Request and parse HTML & XML documents
|
4
5
|
class Request
|
5
|
-
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
6
|
-
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
7
|
-
|
8
6
|
class << self
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
7
|
+
# Get and parse HTML & XML documents.
|
8
|
+
# @return [Nokogiri::HTML::Document] the parsed document.
|
9
|
+
# @param [String] url to retrieve and parse.
|
10
|
+
# @example Request and parse example.com
|
11
|
+
# Request.document('example.com')
|
12
|
+
# @example Request and parse google.com/sitemap.xml
|
13
|
+
# Request.document('google.com/sitemap.xml')
|
14
|
+
def document(url)
|
15
|
+
response_body = Request.response(url).body
|
16
|
+
Nokogiri::HTML(response_body)
|
17
17
|
end
|
18
18
|
|
19
|
-
|
19
|
+
# Get response.
|
20
|
+
# @return [Net::HTTP*] the http response.
|
21
|
+
# @param [String] url URL to retrieve.
|
22
|
+
# @param [Boolean] resolve whether to resolve the URL.
|
23
|
+
# @example Resolve example.com and request
|
24
|
+
# Request.response('example.com', true)
|
25
|
+
# @example Request http://example.com
|
26
|
+
# Request.response('http://example.com', false)
|
27
|
+
def response(url, resolve = true)
|
20
28
|
resolved_url = resolve ? resolve_url(url) : url
|
21
29
|
uri = URI.parse(resolved_url)
|
22
30
|
http = Net::HTTP.new(uri.host, uri.port)
|
23
|
-
http.use_ssl = true if resolved_url.
|
31
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
24
32
|
|
25
33
|
request = Net::HTTP::Get.new(uri.request_uri)
|
26
|
-
request['User-Agent'] = USER_AGENT
|
34
|
+
request['User-Agent'] = WaybackArchiver::USER_AGENT
|
27
35
|
http.request(request)
|
28
36
|
end
|
29
37
|
|
38
|
+
# Resolve the URL, follows redirects.
|
39
|
+
# @return [String] the resolved URL.
|
40
|
+
# @param [String] url to retrieve.
|
41
|
+
# @example Resolve example.com
|
42
|
+
# Request.resolve_url('example.com')
|
30
43
|
def resolve_url(url)
|
31
|
-
UrlResolver.resolve(url)
|
44
|
+
resolved = UrlResolver.resolve(url)
|
45
|
+
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
46
|
+
resolved
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
# Check whether the URL starts with a protocol (http:// or https://).
|
52
|
+
# @return [Boolean] true if string includes protocol.
|
53
|
+
# @param [String] url to check.
|
54
|
+
# @example Check if string includes protocol
|
55
|
+
# Request.has_protocol?('example.com')
|
56
|
+
# # => false
|
57
|
+
# Request.has_protocol?('https://example.com')
|
58
|
+
# # => true
|
59
|
+
# Request.has_protocol?('http://example.com')
|
60
|
+
# # => true
|
61
|
+
def has_protocol?(url)
|
62
|
+
url.start_with?('http://') || url.start_with?('https://')
|
32
63
|
end
|
33
64
|
end
|
34
65
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Retrieve URLs from different sources
|
3
|
+
class UrlCollector
|
4
|
+
class << self
|
5
|
+
# Retrieve URLs from Sitemap.
|
6
|
+
# @return [Array] of URLs defined in Sitemap.
|
7
|
+
# @param [String] url domain to retrieve Sitemap from.
|
8
|
+
# @example Get URLs defined in Sitemap for google.com
|
9
|
+
# UrlCollector.sitemap('https://google.com')
|
10
|
+
def sitemap(url)
|
11
|
+
resolved = Request.resolve_url("#{url}/sitemap.xml")
|
12
|
+
sitemap = Request.document(resolved)
|
13
|
+
sitemap.css('loc').map { |element| element.text }
|
14
|
+
end
|
15
|
+
|
16
|
+
# Retrieve URLs by crawling.
|
17
|
+
# @return [Array] of URLs found during crawl.
|
18
|
+
# @param [String] url domain to crawl URLs from.
|
19
|
+
# @example Crawl URLs defined on example.com
|
20
|
+
# UrlCollector.crawl('http://example.com')
|
21
|
+
def crawl(url)
|
22
|
+
SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Retrieve URLs listed in file.
|
26
|
+
# @return [Array] of URLs defined in file.
|
27
|
+
# @param [String] path to get URLs from.
|
28
|
+
# @example Get URLs defined in /path/to/file
|
29
|
+
# UrlCollector.file('/path/to/file')
|
30
|
+
def file(path)
|
31
|
+
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
32
|
+
urls = []
|
33
|
+
File.open(path).read
|
34
|
+
.gsub(/\r\n?/, "\n")
|
35
|
+
.each_line { |line| urls << line.gsub(/\n/, '').strip }
|
36
|
+
urls.reject(&:empty?)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
metadata
CHANGED
@@ -1,113 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: site_mapper
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: url_resolver
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: bundler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.3'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '10.3'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '10.3'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - ~>
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '3.1'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - ~>
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.1'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: yard
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0.8'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - ~>
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.8'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: coveralls
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - ~>
|
101
|
+
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '0.7'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - ~>
|
108
|
+
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0.7'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: redcarpet
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '3.2'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '3.2'
|
111
125
|
description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
112
126
|
email:
|
113
127
|
- burenstam@gmail.com
|
@@ -117,11 +131,11 @@ extensions: []
|
|
117
131
|
extra_rdoc_files: []
|
118
132
|
files:
|
119
133
|
- bin/wayback_archiver
|
134
|
+
- lib/wayback_archiver.rb
|
120
135
|
- lib/wayback_archiver/archive.rb
|
121
|
-
- lib/wayback_archiver/collector.rb
|
122
136
|
- lib/wayback_archiver/request.rb
|
137
|
+
- lib/wayback_archiver/url_collector.rb
|
123
138
|
- lib/wayback_archiver/version.rb
|
124
|
-
- lib/wayback_archiver.rb
|
125
139
|
homepage: https://github.com/buren/wayback_archiver
|
126
140
|
licenses:
|
127
141
|
- MIT
|
@@ -132,17 +146,17 @@ require_paths:
|
|
132
146
|
- lib
|
133
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
134
148
|
requirements:
|
135
|
-
- -
|
149
|
+
- - ">="
|
136
150
|
- !ruby/object:Gem::Version
|
137
151
|
version: 1.9.3
|
138
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
153
|
requirements:
|
140
|
-
- -
|
154
|
+
- - ">="
|
141
155
|
- !ruby/object:Gem::Version
|
142
156
|
version: '0'
|
143
157
|
requirements: []
|
144
158
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.
|
159
|
+
rubygems_version: 2.2.2
|
146
160
|
signing_key:
|
147
161
|
specification_version: 4
|
148
162
|
summary: Send URLs to Wayback Machine
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module WaybackArchiver
|
2
|
-
class Collector
|
3
|
-
class << self
|
4
|
-
def urls_from_sitemap(url)
|
5
|
-
resolved = Request.resolve_url(url)
|
6
|
-
sitemap = Request.get_page(resolved)
|
7
|
-
sitemap.css('loc').map! { |element| element.text }
|
8
|
-
end
|
9
|
-
|
10
|
-
def urls_from_crawl(url)
|
11
|
-
SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
|
12
|
-
end
|
13
|
-
|
14
|
-
def urls_from_file(path)
|
15
|
-
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
16
|
-
urls = []
|
17
|
-
text = File.open(path).read
|
18
|
-
text.gsub!(/\r\n?/, "\n")
|
19
|
-
.each_line { |line| urls << line.gsub!(/\n/, '').strip }
|
20
|
-
urls.reject(&:empty?)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|