wayback_archiver 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +4 -5
- data/lib/wayback_archiver/archive.rb +35 -37
- data/lib/wayback_archiver/process_queue.rb +28 -0
- data/lib/wayback_archiver/request.rb +50 -54
- data/lib/wayback_archiver/url_collector.rb +41 -30
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +35 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd01b7c39f7432bba6bb0abf29cff5cced700e90
|
4
|
+
data.tar.gz: 887582fb5f46f8e42a126b09d7c31b3d005bb1cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f66455e9ddfdd2cb191910bacfe784a3822e31b0e7ef7bce395fd60614887c9c700b262e958b6acfff9fa59941068c023a0729d09cf00116f44b359c01b2213
|
7
|
+
data.tar.gz: 8c8f98adfeb91808296d8c1bc7e01d219918074b5680d2afccc22ac00cb02b6d6ff18b29129d3650fa660238b023e8948a851aeb7ce36b42c34b9be7bb913386
|
data/lib/wayback_archiver.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'site_mapper'
|
2
|
-
|
3
1
|
require 'uri'
|
4
2
|
require 'net/http'
|
5
3
|
|
@@ -7,13 +5,14 @@ require 'wayback_archiver/version'
|
|
7
5
|
require 'wayback_archiver/url_collector'
|
8
6
|
require 'wayback_archiver/archive'
|
9
7
|
require 'wayback_archiver/request'
|
8
|
+
require 'wayback_archiver/process_queue'
|
10
9
|
|
11
10
|
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
12
11
|
module WaybackArchiver
|
13
12
|
# Link to gem on rubygems.org, part of the sent User-Agent
|
14
|
-
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
13
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
|
15
14
|
# WaybackArchiver User-Agent
|
16
|
-
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
15
|
+
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
|
17
16
|
|
18
17
|
# Send URLs to Wayback Machine.
|
19
18
|
# @return [Array] with URLs sent to the Wayback Machine.
|
@@ -29,7 +28,7 @@ module WaybackArchiver
|
|
29
28
|
def self.archive(source, type = :crawl)
|
30
29
|
case type.to_s
|
31
30
|
when 'file' then Archive.post(UrlCollector.file(source))
|
32
|
-
when 'crawl' then UrlCollector.crawl(source)
|
31
|
+
when 'crawl' then Archive.post(UrlCollector.crawl(source))
|
33
32
|
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
34
33
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
35
34
|
else
|
@@ -2,46 +2,44 @@ module WaybackArchiver
|
|
2
2
|
# Post URL(s) to Wayback Machine
|
3
3
|
class Archive
|
4
4
|
# Wayback Machine base URL.
|
5
|
-
WAYBACK_BASE_URL = 'https://web.archive.org/save/'
|
5
|
+
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
6
6
|
# Default concurrency for archiving URLs
|
7
7
|
DEFAULT_CONCURRENCY = 10
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
concurrency = options[:concurrency]
|
20
|
-
puts "Request are sent with up to #{concurrency} parallel threads"
|
21
|
-
puts "Total urls to be sent: #{urls.length}"
|
22
|
-
group_size = (urls.length / concurrency) + 1
|
23
|
-
urls.each_slice(group_size).to_a.map! do |archive_urls|
|
24
|
-
Thread.new { archive_urls.each { |url| post_url(url) } }
|
25
|
-
end.each(&:join)
|
26
|
-
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
-
urls
|
28
|
-
end
|
8
|
+
# Send URLs to Wayback Machine.
|
9
|
+
# @return [Array] with sent URLs.
|
10
|
+
# @param [Array] urls URLs to send.
|
11
|
+
# @param [Hash] options
|
12
|
+
# @example Archive example.com, with default options
|
13
|
+
# Archive.post(['http://example.com'])
|
14
|
+
# @example Archive example.com, using only 1 thread
|
15
|
+
# Archive.post(['http://example.com'], concurrency: 1)
|
16
|
+
def self.post(urls, options = {})
|
17
|
+
options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
|
18
|
+
concurrency = options[:concurrency]
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
20
|
+
puts "=== WAYBACK ARCHIVER ==="
|
21
|
+
puts "Request are sent with up to #{concurrency} parallel threads"
|
22
|
+
puts "Total urls to be sent: #{urls.length}"
|
23
|
+
|
24
|
+
ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
|
25
|
+
|
26
|
+
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
+
urls
|
28
|
+
end
|
29
|
+
|
30
|
+
# Send URL to Wayback Machine.
|
31
|
+
# @return [String] the sent URL.
|
32
|
+
# @param [String] url to send.
|
33
|
+
# @example Archive example.com, with default options
|
34
|
+
# Archive.post_url('http://example.com')
|
35
|
+
def self.post_url(url)
|
36
|
+
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
37
|
+
response = Request.response(request_url)
|
38
|
+
puts "[#{response.code}, #{response.message}] #{url}"
|
39
|
+
url
|
40
|
+
rescue Exception => e
|
41
|
+
puts "Error message: #{e.message}"
|
42
|
+
puts "Failed to archive: #{url}"
|
45
43
|
end
|
46
44
|
end
|
47
45
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module WaybackArchiver
|
4
|
+
class ProcessQueue
|
5
|
+
# Process enumerable data in parallel.
|
6
|
+
# @return [Array] the worker threads, after all have finished processing.
|
7
|
+
# @param [Object] data_array Enumerable object
|
8
|
+
# @example Print list of names in parallel
|
9
|
+
#    ProcessQueue.process(%w(jacob peter eva)) { |n| puts n }
|
10
|
+
# @example Print list of names using 2 threads
|
11
|
+
#    ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |n| puts n }
|
12
|
+
def self.process(data_array, threads_count: 5)
|
13
|
+
queue = Queue.new
|
14
|
+
data_array.each { |data| queue.push(data) }
|
15
|
+
workers = threads_count.times.map do
|
16
|
+
Thread.new do
|
17
|
+
begin
|
18
|
+
while data = queue.pop(true)
|
19
|
+
yield(data)
|
20
|
+
end
|
21
|
+
rescue ThreadError
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
workers.map(&:join)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -3,64 +3,60 @@ require 'url_resolver' # TODO: Allow users to use any resolver
|
|
3
3
|
module WaybackArchiver
|
4
4
|
# Request and parse HTML & XML documents
|
5
5
|
class Request
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
# Get response.
|
20
|
-
# @return [Net::HTTP*] the http response.
|
21
|
-
# @param [String] url URL to retrieve.
|
22
|
-
# @param [Boolean] resolve whether to resolve the URL.
|
23
|
-
# @example Resolve example.com and request
|
24
|
-
# Request.response('example.com', true)
|
25
|
-
# @example Request http://example.com
|
26
|
-
# Request.response('http://example.com', false)
|
27
|
-
def response(url, resolve = true)
|
28
|
-
resolved_url = resolve ? resolve_url(url) : url
|
29
|
-
uri = URI.parse(resolved_url)
|
30
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
31
|
-
http.use_ssl = true if resolved_url.start_with?('https://')
|
6
|
+
# Get and parse HTML & XML documents.
|
7
|
+
# @return [Nokogiri::HTML::Document] the parsed document.
|
8
|
+
# @param [String] url to retrieve and parse.
|
9
|
+
# @example Request and parse example.com
|
10
|
+
# Request.document('example.com')
|
11
|
+
# @example Request and parse google.com/sitemap.xml
|
12
|
+
# Request.document('google.com/sitemap.xml')
|
13
|
+
def self.document(url)
|
14
|
+
response_body = Request.response(url).body
|
15
|
+
Nokogiri::HTML(response_body)
|
16
|
+
end
|
32
17
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
18
|
+
# Get response.
|
19
|
+
# @return [Net::HTTP*] the http response.
|
20
|
+
# @param [String] url URL to retrieve.
|
21
|
+
# @param [Boolean] resolve whether to resolve the URL.
|
22
|
+
# @example Resolve example.com and request
|
23
|
+
# Request.response('example.com', true)
|
24
|
+
# @example Request http://example.com
|
25
|
+
# Request.response('http://example.com', false)
|
26
|
+
def self.response(url, resolve = true)
|
27
|
+
resolved_url = resolve ? resolve_url(url) : url
|
28
|
+
uri = URI.parse(resolved_url)
|
29
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
30
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
# Request.resolve_url('example.com')
|
43
|
-
def resolve_url(url)
|
44
|
-
resolved = UrlResolver.resolve(url)
|
45
|
-
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
46
|
-
resolved
|
47
|
-
end
|
32
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
33
|
+
request['User-Agent'] = WaybackArchiver::USER_AGENT
|
34
|
+
http.request(request)
|
35
|
+
end
|
48
36
|
|
49
|
-
|
37
|
+
# Resolve the URL, follows redirects.
|
38
|
+
# @return [String] the resolved URL.
|
39
|
+
# @param [String] url to retrieve.
|
40
|
+
# @example Resolve example.com and request
|
41
|
+
# Request.resolve_url('example.com')
|
42
|
+
def self.resolve_url(url)
|
43
|
+
resolved = UrlResolver.resolve(url)
|
44
|
+
resolved = resolved.prepend('http://') unless protocol?(resolved)
|
45
|
+
resolved
|
46
|
+
end
|
50
47
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
48
|
+
# Check whether a URL string includes an HTTP(S) protocol.
|
49
|
+
# @return [Boolean] true if string includes protocol.
|
50
|
+
# @param [String] url to check.
|
51
|
+
# @example Check if string includes protocol
|
52
|
+
# Request.protocol?('example.com')
|
53
|
+
# # => false
|
54
|
+
# Request.protocol?('https://example.com')
|
55
|
+
# # => true
|
56
|
+
# Request.protocol?('http://example.com')
|
57
|
+
# # => true
|
58
|
+
def self.protocol?(url)
|
59
|
+
url.start_with?('http://') || url.start_with?('https://')
|
64
60
|
end
|
65
61
|
end
|
66
62
|
end
|
@@ -1,40 +1,51 @@
|
|
1
|
+
require 'spidr'
|
2
|
+
require 'robots'
|
3
|
+
|
1
4
|
module WaybackArchiver
|
2
5
|
# Retrieve URLs from different sources
|
3
6
|
class UrlCollector
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
end
|
7
|
+
# Retrieve URLs from Sitemap.
|
8
|
+
# @return [Array] of URLs defined in Sitemap.
|
9
|
+
# @param [String] url domain to retrieve Sitemap from.
|
10
|
+
# @example Get URLs defined in Sitemap for google.com
|
11
|
+
# UrlCollector.sitemap('https://google.com')
|
12
|
+
def self.sitemap(url)
|
13
|
+
resolved = Request.resolve_url("#{url}/sitemap.xml")
|
14
|
+
sitemap = Request.document(resolved)
|
15
|
+
sitemap.css('loc').map(&:text)
|
16
|
+
end
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
18
|
+
# Retrieve URLs by crawling.
|
19
|
+
# @return [Array] of URLs found during crawl.
|
20
|
+
# @param [String] url domain to crawl URLs from.
|
21
|
+
# @example Crawl URLs defined on example.com
|
22
|
+
# UrlCollector.crawl('http://example.com')
|
23
|
+
def self.crawl(url)
|
24
|
+
urls = []
|
25
|
+
resolved_url = Request.resolve_url(url)
|
26
|
+
Spidr.site(resolved_url, robots: true) do |spider|
|
27
|
+
spider.every_html_page do |page|
|
28
|
+
page_url = page.url.to_s
|
29
|
+
urls << page_url
|
30
|
+
puts "Found: #{page_url}"
|
31
|
+
yield(page_url) if block_given?
|
32
|
+
end
|
23
33
|
end
|
34
|
+
urls
|
35
|
+
end
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# Retrieve URLs listed in file.
|
38
|
+
# @return [Array] of URLs defined in file.
|
39
|
+
# @param [String] path to get URLs from.
|
40
|
+
# @example Get URLs defined in /path/to/file
|
41
|
+
# UrlCollector.file('/path/to/file')
|
42
|
+
def self.file(path)
|
43
|
+
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
44
|
+
urls = []
|
45
|
+
File.open(path).read
|
34
46
|
.gsub(/\r\n?/, "\n")
|
35
|
-
.each_line { |line| urls << line.
|
36
|
-
|
37
|
-
end
|
47
|
+
.each_line { |line| urls << line.delete("\n").strip }
|
48
|
+
urls.reject(&:empty?)
|
38
49
|
end
|
39
50
|
end
|
40
51
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: spidr
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: robots
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.1'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: url_resolver
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,20 @@ dependencies:
|
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '3.2'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: byebug
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
126
154
|
email:
|
127
155
|
- burenstam@gmail.com
|
@@ -133,6 +161,7 @@ files:
|
|
133
161
|
- bin/wayback_archiver
|
134
162
|
- lib/wayback_archiver.rb
|
135
163
|
- lib/wayback_archiver/archive.rb
|
164
|
+
- lib/wayback_archiver/process_queue.rb
|
136
165
|
- lib/wayback_archiver/request.rb
|
137
166
|
- lib/wayback_archiver/url_collector.rb
|
138
167
|
- lib/wayback_archiver/version.rb
|
@@ -156,9 +185,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
185
|
version: '0'
|
157
186
|
requirements: []
|
158
187
|
rubyforge_project:
|
159
|
-
rubygems_version: 2.
|
188
|
+
rubygems_version: 2.5.2
|
160
189
|
signing_key:
|
161
190
|
specification_version: 4
|
162
191
|
summary: Send URLs to Wayback Machine
|
163
192
|
test_files: []
|
164
|
-
has_rdoc:
|