wayback_archiver 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +4 -5
- data/lib/wayback_archiver/archive.rb +35 -37
- data/lib/wayback_archiver/process_queue.rb +28 -0
- data/lib/wayback_archiver/request.rb +50 -54
- data/lib/wayback_archiver/url_collector.rb +41 -30
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +35 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd01b7c39f7432bba6bb0abf29cff5cced700e90
|
4
|
+
data.tar.gz: 887582fb5f46f8e42a126b09d7c31b3d005bb1cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f66455e9ddfdd2cb191910bacfe784a3822e31b0e7ef7bce395fd60614887c9c700b262e958b6acfff9fa59941068c023a0729d09cf00116f44b359c01b2213
|
7
|
+
data.tar.gz: 8c8f98adfeb91808296d8c1bc7e01d219918074b5680d2afccc22ac00cb02b6d6ff18b29129d3650fa660238b023e8948a851aeb7ce36b42c34b9be7bb913386
|
data/lib/wayback_archiver.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'site_mapper'
|
2
|
-
|
3
1
|
require 'uri'
|
4
2
|
require 'net/http'
|
5
3
|
|
@@ -7,13 +5,14 @@ require 'wayback_archiver/version'
|
|
7
5
|
require 'wayback_archiver/url_collector'
|
8
6
|
require 'wayback_archiver/archive'
|
9
7
|
require 'wayback_archiver/request'
|
8
|
+
require 'wayback_archiver/process_queue'
|
10
9
|
|
11
10
|
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
12
11
|
module WaybackArchiver
|
13
12
|
# Link to gem on rubygems.org, part of the sent User-Agent
|
14
|
-
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
13
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
|
15
14
|
# WaybackArchiver User-Agent
|
16
|
-
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
15
|
+
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
|
17
16
|
|
18
17
|
# Send URLs to Wayback Machine.
|
19
18
|
# @return [Array] with URLs sent to the Wayback Machine.
|
@@ -29,7 +28,7 @@ module WaybackArchiver
|
|
29
28
|
def self.archive(source, type = :crawl)
|
30
29
|
case type.to_s
|
31
30
|
when 'file' then Archive.post(UrlCollector.file(source))
|
32
|
-
when 'crawl' then UrlCollector.crawl(source)
|
31
|
+
when 'crawl' then Archive.post(UrlCollector.crawl(source))
|
33
32
|
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
34
33
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
35
34
|
else
|
@@ -2,46 +2,44 @@ module WaybackArchiver
|
|
2
2
|
# Post URL(s) to Wayback Machine
|
3
3
|
class Archive
|
4
4
|
# Wayback Machine base URL.
|
5
|
-
WAYBACK_BASE_URL = 'https://web.archive.org/save/'
|
5
|
+
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
6
6
|
# Default concurrency for archiving URLs
|
7
7
|
DEFAULT_CONCURRENCY = 10
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
concurrency = options[:concurrency]
|
20
|
-
puts "Request are sent with up to #{concurrency} parallel threads"
|
21
|
-
puts "Total urls to be sent: #{urls.length}"
|
22
|
-
group_size = (urls.length / concurrency) + 1
|
23
|
-
urls.each_slice(group_size).to_a.map! do |archive_urls|
|
24
|
-
Thread.new { archive_urls.each { |url| post_url(url) } }
|
25
|
-
end.each(&:join)
|
26
|
-
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
-
urls
|
28
|
-
end
|
8
|
+
# Send URLs to Wayback Machine.
|
9
|
+
# @return [Array] with sent URLs.
|
10
|
+
# @param [Array] urls URLs to send.
|
11
|
+
# @param [Hash] options
|
12
|
+
# @example Archive example.com, with default options
|
13
|
+
# Archive.post(['http://example.com'])
|
14
|
+
# @example Archive example.com, using only 1 thread
|
15
|
+
# Archive.post(['http://example.com'], concurrency: 1)
|
16
|
+
def self.post(urls, options = {})
|
17
|
+
options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
|
18
|
+
concurrency = options[:concurrency]
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
20
|
+
puts "=== WAYBACK ARCHIVER ==="
|
21
|
+
puts "Request are sent with up to #{concurrency} parallel threads"
|
22
|
+
puts "Total urls to be sent: #{urls.length}"
|
23
|
+
|
24
|
+
ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
|
25
|
+
|
26
|
+
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
+
urls
|
28
|
+
end
|
29
|
+
|
30
|
+
# Send URL to Wayback Machine.
|
31
|
+
# @return [String] the sent URL.
|
32
|
+
# @param [String] url to send.
|
33
|
+
# @example Archive example.com, with default options
|
34
|
+
# Archive.post_url('http://example.com')
|
35
|
+
def self.post_url(url)
|
36
|
+
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
37
|
+
response = Request.response(request_url)
|
38
|
+
puts "[#{response.code}, #{response.message}] #{url}"
|
39
|
+
url
|
40
|
+
rescue Exception => e
|
41
|
+
puts "Error message: #{e.message}"
|
42
|
+
puts "Failed to archive: #{url}"
|
45
43
|
end
|
46
44
|
end
|
47
45
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module WaybackArchiver
|
4
|
+
class ProcessQueue
|
5
|
+
# Process enumerable data in parallel.
|
6
|
+
# @return [Array<Thread>] the worker threads, after each has been joined.
|
7
|
+
# @param [Enumerable] data_array enumerable object whose items are processed.
|
8
|
+
# @example Print list of names in parallel
|
9
|
+
# ProcessQueue.process(%w(jacob peter eva)) { |v| puts v }
|
10
|
+
# @example Print list of names using 2 threads
|
11
|
+
# ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts v }
|
12
|
+
def self.process(data_array, threads_count: 5)
|
13
|
+
queue = Queue.new
|
14
|
+
data_array.each { |data| queue.push(data) }
|
15
|
+
workers = threads_count.times.map do
|
16
|
+
Thread.new do
|
17
|
+
begin
|
18
|
+
while data = queue.pop(true)
|
19
|
+
yield(data)
|
20
|
+
end
|
21
|
+
rescue ThreadError
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
workers.map(&:join)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -3,64 +3,60 @@ require 'url_resolver' # TODO: Allow users to use any resolver
|
|
3
3
|
module WaybackArchiver
|
4
4
|
# Request and parse HTML & XML documents
|
5
5
|
class Request
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
# Get response.
|
20
|
-
# @return [Net::HTTP*] the http response.
|
21
|
-
# @param [String] url URL to retrieve.
|
22
|
-
# @param [Boolean] resolve whether to resolve the URL.
|
23
|
-
# @example Resolve example.com and request
|
24
|
-
# Request.response('example.com', true)
|
25
|
-
# @example Request http://example.com
|
26
|
-
# Request.response('http://example.com', false)
|
27
|
-
def response(url, resolve = true)
|
28
|
-
resolved_url = resolve ? resolve_url(url) : url
|
29
|
-
uri = URI.parse(resolved_url)
|
30
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
31
|
-
http.use_ssl = true if resolved_url.start_with?('https://')
|
6
|
+
# Get and parse HTML & XML documents.
|
7
|
+
# @return [Nokogiri::HTML::Document] the parsed document.
|
8
|
+
# @param [String] url to retrieve and parse.
|
9
|
+
# @example Request and parse example.com
|
10
|
+
# Request.document('example.com')
|
11
|
+
# @example Request and parse google.com/sitemap.xml
|
12
|
+
# Request.document('google.com/sitemap.xml')
|
13
|
+
def self.document(url)
|
14
|
+
response_body = Request.response(url).body
|
15
|
+
Nokogiri::HTML(response_body)
|
16
|
+
end
|
32
17
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
18
|
+
# Get response.
|
19
|
+
# @return [Net::HTTP*] the http response.
|
20
|
+
# @param [String] url URL to retrieve.
|
21
|
+
# @param [Boolean] resolve whether to resolve the URL.
|
22
|
+
# @example Resolve example.com and request
|
23
|
+
# Request.response('example.com', true)
|
24
|
+
# @example Request http://example.com
|
25
|
+
# Request.response('http://example.com', false)
|
26
|
+
def self.response(url, resolve = true)
|
27
|
+
resolved_url = resolve ? resolve_url(url) : url
|
28
|
+
uri = URI.parse(resolved_url)
|
29
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
30
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
# Request.resolve_url('example.com')
|
43
|
-
def resolve_url(url)
|
44
|
-
resolved = UrlResolver.resolve(url)
|
45
|
-
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
46
|
-
resolved
|
47
|
-
end
|
32
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
33
|
+
request['User-Agent'] = WaybackArchiver::USER_AGENT
|
34
|
+
http.request(request)
|
35
|
+
end
|
48
36
|
|
49
|
-
|
37
|
+
# Resolve the URL, follows redirects.
|
38
|
+
# @return [String] the resolved URL.
|
39
|
+
# @param [String] url to retrieve.
|
40
|
+
# @example Resolve example.com and request
|
41
|
+
# Request.resolve_url('example.com')
|
42
|
+
def self.resolve_url(url)
|
43
|
+
resolved = UrlResolver.resolve(url)
|
44
|
+
resolved = resolved.prepend('http://') unless protocol?(resolved)
|
45
|
+
resolved
|
46
|
+
end
|
50
47
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
48
|
+
# Check whether the string starts with an HTTP or HTTPS protocol.
|
49
|
+
# @return [Boolean] true if string includes protocol.
|
50
|
+
# @param [String] url to check.
|
51
|
+
# @example Check if string includes protocol
|
52
|
+
# Request.protocol?('example.com')
|
53
|
+
# # => false
|
54
|
+
# Request.protocol?('https://example.com')
|
55
|
+
# # => true
|
56
|
+
# Request.protocol?('http://example.com')
|
57
|
+
# # => true
|
58
|
+
def self.protocol?(url)
|
59
|
+
url.start_with?('http://') || url.start_with?('https://')
|
64
60
|
end
|
65
61
|
end
|
66
62
|
end
|
@@ -1,40 +1,51 @@
|
|
1
|
+
require 'spidr'
|
2
|
+
require 'robots'
|
3
|
+
|
1
4
|
module WaybackArchiver
|
2
5
|
# Retrieve URLs from different sources
|
3
6
|
class UrlCollector
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
end
|
7
|
+
# Retrieve URLs from Sitemap.
|
8
|
+
# @return [Array] of URLs defined in Sitemap.
|
9
|
+
# @param [String] url domain to retrieve Sitemap from.
|
10
|
+
# @example Get URLs defined in Sitemap for google.com
|
11
|
+
# UrlCollector.sitemap('https://google.com')
|
12
|
+
def self.sitemap(url)
|
13
|
+
resolved = Request.resolve_url("#{url}/sitemap.xml")
|
14
|
+
sitemap = Request.document(resolved)
|
15
|
+
sitemap.css('loc').map(&:text)
|
16
|
+
end
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
18
|
+
# Retrieve URLs by crawling.
|
19
|
+
# @return [Array] of URLs found during crawl.
|
20
|
+
# @param [String] url domain to crawl URLs from.
|
21
|
+
# @example Crawl URLs defined on example.com
|
22
|
+
# UrlCollector.crawl('http://example.com')
|
23
|
+
def self.crawl(url)
|
24
|
+
urls = []
|
25
|
+
resolved_url = Request.resolve_url(url)
|
26
|
+
Spidr.site(resolved_url, robots: true) do |spider|
|
27
|
+
spider.every_html_page do |page|
|
28
|
+
page_url = page.url.to_s
|
29
|
+
urls << page_url
|
30
|
+
puts "Found: #{page_url}"
|
31
|
+
yield(page_url) if block_given?
|
32
|
+
end
|
23
33
|
end
|
34
|
+
urls
|
35
|
+
end
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# Retrieve URLs listed in file.
|
38
|
+
# @return [Array] of URLs defined in file.
|
39
|
+
# @param [String] path to get URLs from.
|
40
|
+
# @example Get URLs defined in /path/to/file
|
41
|
+
# UrlCollector.file('/path/to/file')
|
42
|
+
def self.file(path)
|
43
|
+
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
44
|
+
urls = []
|
45
|
+
File.open(path).read
|
34
46
|
.gsub(/\r\n?/, "\n")
|
35
|
-
.each_line { |line| urls << line.
|
36
|
-
|
37
|
-
end
|
47
|
+
.each_line { |line| urls << line.delete("\n").strip }
|
48
|
+
urls.reject(&:empty?)
|
38
49
|
end
|
39
50
|
end
|
40
51
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: spidr
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: robots
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.1'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: url_resolver
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,20 @@ dependencies:
|
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '3.2'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: byebug
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
126
154
|
email:
|
127
155
|
- burenstam@gmail.com
|
@@ -133,6 +161,7 @@ files:
|
|
133
161
|
- bin/wayback_archiver
|
134
162
|
- lib/wayback_archiver.rb
|
135
163
|
- lib/wayback_archiver/archive.rb
|
164
|
+
- lib/wayback_archiver/process_queue.rb
|
136
165
|
- lib/wayback_archiver/request.rb
|
137
166
|
- lib/wayback_archiver/url_collector.rb
|
138
167
|
- lib/wayback_archiver/version.rb
|
@@ -156,9 +185,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
185
|
version: '0'
|
157
186
|
requirements: []
|
158
187
|
rubyforge_project:
|
159
|
-
rubygems_version: 2.
|
188
|
+
rubygems_version: 2.5.2
|
160
189
|
signing_key:
|
161
190
|
specification_version: 4
|
162
191
|
summary: Send URLs to Wayback Machine
|
163
192
|
test_files: []
|
164
|
-
has_rdoc:
|