wayback_archiver 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_archiver +11 -1
- data/lib/wayback_archiver.rb +45 -17
- data/lib/wayback_archiver/archive.rb +42 -16
- data/lib/wayback_archiver/archive_result.rb +32 -0
- data/lib/wayback_archiver/http_code.rb +3 -3
- data/lib/wayback_archiver/request.rb +1 -12
- data/lib/wayback_archiver/response.rb +13 -0
- data/lib/wayback_archiver/sitemapper.rb +5 -1
- data/lib/wayback_archiver/url_collector.rb +7 -1
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4ba8820f2974f5a506c3fd125be8b6d6b429e3d
|
4
|
+
data.tar.gz: 00e101acf27d03d0fccc48b3031cca01aa4792ae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c0ec54efa365cdbf9ea9847d0289d67407fcc85b41100559fc768af6ea6bb3d1ab90594bd0578cc02c503fb5b2394a67e77e6cd624783314ec226aee3948424
|
7
|
+
data.tar.gz: b206a4c9e45fa42159d6cb4013149aa003f51dd70a41b6ddbbdb77978b8bdb09fe91852d3c67b5f56bff7eab9f9305f82b16dadae1487301e2344bf335f77600
|
data/bin/wayback_archiver
CHANGED
@@ -9,6 +9,7 @@ strategy = 'auto'
|
|
9
9
|
log = STDOUT
|
10
10
|
log_level = Logger::INFO
|
11
11
|
concurrency = WaybackArchiver.concurrency
|
12
|
+
limit = WaybackArchiver.max_limit
|
12
13
|
|
13
14
|
optparse = OptionParser.new do |parser|
|
14
15
|
parser.banner = 'Usage: wayback_archiver [<url>] [options]'
|
@@ -33,6 +34,10 @@ optparse = OptionParser.new do |parser|
|
|
33
34
|
concurrency = value
|
34
35
|
end
|
35
36
|
|
37
|
+
parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
|
38
|
+
limit = value
|
39
|
+
end
|
40
|
+
|
36
41
|
parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
|
37
42
|
log = path
|
38
43
|
end
|
@@ -74,5 +79,10 @@ end
|
|
74
79
|
# If no strategy has explicitly been given, then default to 'auto'
|
75
80
|
strategy ||= 'auto'
|
76
81
|
urls.each do |url|
|
77
|
-
WaybackArchiver.archive(
|
82
|
+
WaybackArchiver.archive(
|
83
|
+
url,
|
84
|
+
strategy: strategy,
|
85
|
+
concurrency: concurrency,
|
86
|
+
limit: limit
|
87
|
+
)
|
78
88
|
end
|
data/lib/wayback_archiver.rb
CHANGED
@@ -15,36 +15,43 @@ module WaybackArchiver
|
|
15
15
|
# Default concurrency for archiving URLs
|
16
16
|
DEFAULT_CONCURRENCY = 5
|
17
17
|
|
18
|
+
# Maximum number of links posted (-1 is no limit)
|
19
|
+
DEFAULT_MAX_LIMIT = -1
|
20
|
+
|
18
21
|
# Send URLs to Wayback Machine.
|
19
|
-
# @return [Array<
|
22
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
20
23
|
# @param [String/Array<String>] source for URL(s).
|
21
24
|
# @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
|
22
25
|
# @example Crawl example.com and send all URLs of the same domain
|
23
26
|
# WaybackArchiver.archive('example.com') # Default strategy is :auto
|
24
27
|
# WaybackArchiver.archive('example.com', strategy: :auto)
|
25
28
|
# WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
|
29
|
+
# WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
|
26
30
|
# WaybackArchiver.archive('example.com', :auto)
|
27
31
|
# @example Crawl example.com and send all URLs of the same domain
|
28
32
|
# WaybackArchiver.archive('example.com', strategy: :crawl)
|
29
33
|
# WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
|
34
|
+
# WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
|
30
35
|
# WaybackArchiver.archive('example.com', :crawl)
|
31
36
|
# @example Send example.com Sitemap URLs
|
32
37
|
# WaybackArchiver.archive('example.com', strategy: :sitemap)
|
33
38
|
# WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
|
39
|
+
# WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
|
34
40
|
# WaybackArchiver.archive('example.com', :sitemap)
|
35
41
|
# @example Send only example.com
|
36
42
|
# WaybackArchiver.archive('example.com', strategy: :url)
|
37
43
|
# WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
|
44
|
+
# WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
|
38
45
|
# WaybackArchiver.archive('example.com', :url)
|
39
|
-
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
|
46
|
+
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
40
47
|
strategy = legacy_strategy || strategy
|
41
48
|
|
42
49
|
case strategy.to_s
|
43
|
-
when 'crawl' then crawl(source, concurrency: concurrency)
|
44
|
-
when 'auto' then auto(source, concurrency: concurrency)
|
45
|
-
when 'sitemap' then sitemap(source, concurrency: concurrency)
|
46
|
-
when 'urls' then urls(source, concurrency: concurrency)
|
47
|
-
when 'url' then urls(source, concurrency: concurrency)
|
50
|
+
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
|
51
|
+
when 'auto' then auto(source, concurrency: concurrency, limit: limit)
|
52
|
+
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
|
53
|
+
when 'urls' then urls(source, concurrency: concurrency, limit: limit)
|
54
|
+
when 'url' then urls(source, concurrency: concurrency, limit: limit)
|
48
55
|
else
|
49
56
|
raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
|
50
57
|
end
|
@@ -52,15 +59,17 @@ module WaybackArchiver
|
|
52
59
|
|
53
60
|
# Look for Sitemap(s) and if nothing is found fallback to crawling.
|
54
61
|
# Then send found URLs to the Wayback Machine.
|
55
|
-
# @return [Array<
|
62
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
56
63
|
# @param [String] source (must be a valid URL).
|
57
64
|
# @param concurrency [Integer]
|
58
65
|
# @example Auto archive example.com
|
59
66
|
# WaybackArchiver.auto('example.com') # Default concurrency is 5
|
60
67
|
# @example Auto archive example.com with low concurrency
|
61
68
|
# WaybackArchiver.auto('example.com', concurrency: 1)
|
69
|
+
# @example Auto archive example.com and archive max 100 URLs
|
70
|
+
# WaybackArchiver.auto('example.com', limit: 100)
|
62
71
|
# @see http://www.sitemaps.org
|
63
|
-
def self.auto(source, concurrency: WaybackArchiver.concurrency)
|
72
|
+
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
64
73
|
urls = Sitemapper.autodiscover(source)
|
65
74
|
return urls(urls, concurrency: concurrency) if urls.any?
|
66
75
|
|
@@ -68,41 +77,47 @@ module WaybackArchiver
|
|
68
77
|
end
|
69
78
|
|
70
79
|
# Crawl site for URLs to send to the Wayback Machine.
|
71
|
-
# @return [Array<
|
80
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
72
81
|
# @param [String] url to start crawling from.
|
73
82
|
# @param concurrency [Integer]
|
74
83
|
# @example Crawl example.com and send all URLs of the same domain
|
75
84
|
# WaybackArchiver.crawl('example.com') # Default concurrency is 5
|
76
85
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
77
86
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
78
|
-
|
87
|
+
# @example Crawl example.com and archive max 100 URLs
|
88
|
+
# WaybackArchiver.crawl('example.com', limit: 100)
|
89
|
+
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
79
90
|
WaybackArchiver.logger.info "Crawling #{url}"
|
80
|
-
Archive.crawl(url, concurrency: concurrency)
|
91
|
+
Archive.crawl(url, concurrency: concurrency, limit: limit)
|
81
92
|
end
|
82
93
|
|
83
94
|
# Get URLs from sitemap and send found URLs to the Wayback Machine.
|
84
|
-
# @return [Array<
|
95
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
85
96
|
# @param [String] url to the sitemap.
|
86
97
|
# @param concurrency [Integer]
|
87
98
|
# @example Get example.com sitemap and archive all found URLs
|
88
99
|
# WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
|
89
100
|
# @example Get example.com sitemap and archive all found URLs with low concurrency
|
90
101
|
# WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
|
102
|
+
# @example Get example.com sitemap and archive max 100 URLs
|
103
|
+
# WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
|
91
104
|
# @see http://www.sitemaps.org
|
92
|
-
def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
|
105
|
+
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
93
106
|
WaybackArchiver.logger.info "Fetching Sitemap"
|
94
|
-
Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
|
107
|
+
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
|
95
108
|
end
|
96
109
|
|
97
110
|
# Send URL to the Wayback Machine.
|
98
|
-
# @return [Array<
|
111
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
99
112
|
# @param [Array<String>/String] urls or url.
|
100
113
|
# @param concurrency [Integer]
|
101
114
|
# @example Archive example.com
|
102
115
|
# WaybackArchiver.urls('example.com')
|
103
116
|
# @example Archive example.com and google.com
|
104
117
|
# WaybackArchiver.urls(%w(example.com google.com))
|
105
|
-
|
118
|
+
# @example Archive example.com, max 100 URLs
|
119
|
+
# WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
|
120
|
+
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
106
121
|
Archive.post(Array(urls), concurrency: concurrency)
|
107
122
|
end
|
108
123
|
|
@@ -152,4 +167,17 @@ module WaybackArchiver
|
|
152
167
|
def self.concurrency
|
153
168
|
@concurrency ||= DEFAULT_CONCURRENCY
|
154
169
|
end
|
170
|
+
|
171
|
+
# Sets the default max_limit
|
172
|
+
# @return [Integer] the desired default max_limit
|
173
|
+
# @param [Integer] max_limit the desired default max_limit
|
174
|
+
def self.max_limit=(max_limit)
|
175
|
+
@max_limit = max_limit
|
176
|
+
end
|
177
|
+
|
178
|
+
# Returns the default max_limit
|
179
|
+
# @return [Integer] the configured or the default max_limit
|
180
|
+
def self.max_limit
|
181
|
+
@max_limit ||= DEFAULT_MAX_LIMIT
|
182
|
+
end
|
155
183
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'concurrent'
|
2
2
|
|
3
|
+
require 'wayback_archiver/archive_result'
|
3
4
|
require 'wayback_archiver/thread_pool'
|
4
5
|
require 'wayback_archiver/request'
|
5
6
|
|
@@ -10,23 +11,40 @@ module WaybackArchiver
|
|
10
11
|
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
11
12
|
|
12
13
|
# Send URLs to Wayback Machine.
|
13
|
-
# @return [Array<
|
14
|
+
# @return [Array<ArchiveResult>] with sent URLs.
|
14
15
|
# @param [Array<String>] urls to send to the Wayback Machine.
|
15
16
|
# @param concurrency [Integer] the default is 5
|
17
|
+
# @yield [archive_result] If a block is given, each result will be yielded
|
18
|
+
# @yieldparam [ArchiveResult] archive_result
|
16
19
|
# @example Archive urls, asynchronously
|
17
20
|
# Archive.post(['http://example.com'])
|
21
|
+
# Archive.post(['http://example.com']) do |result|
|
22
|
+
# puts [result.code || 'error', result.url] # print response status and URL
|
23
|
+
# end
|
18
24
|
# @example Archive urls, using only 1 thread
|
19
25
|
# Archive.post(['http://example.com'], concurrency: 1)
|
20
|
-
|
26
|
+
# @example Stop after archiving 100 links
|
27
|
+
# Archive.post(['http://example.com'], limit: 100)
|
28
|
+
# @example Explicitly set no limit on how many links are posted
|
29
|
+
# Archive.post(['http://example.com'], limit: -1)
|
30
|
+
def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
21
31
|
WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
|
22
32
|
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
23
33
|
|
34
|
+
urls_queue = if limit == -1
|
35
|
+
urls
|
36
|
+
else
|
37
|
+
urls[0...limit]
|
38
|
+
end
|
39
|
+
|
24
40
|
posted_urls = Concurrent::Array.new
|
25
41
|
pool = ThreadPool.build(concurrency)
|
26
|
-
|
42
|
+
|
43
|
+
urls_queue.each do |url|
|
27
44
|
pool.post do
|
28
|
-
|
29
|
-
|
45
|
+
result = post_url(url)
|
46
|
+
yield(result) if block_given?
|
47
|
+
posted_urls << result unless result.errored?
|
30
48
|
end
|
31
49
|
end
|
32
50
|
|
@@ -38,23 +56,31 @@ module WaybackArchiver
|
|
38
56
|
end
|
39
57
|
|
40
58
|
# Send URLs to Wayback Machine by crawling the site.
|
41
|
-
# @return [Array<
|
59
|
+
# @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
|
42
60
|
# @param [String] source for URL to crawl.
|
43
61
|
# @param concurrency [Integer] the default is 5
|
62
|
+
# @yield [archive_result] If a block is given, each result will be yielded
|
63
|
+
# @yieldparam [ArchiveResult] archive_result
|
44
64
|
# @example Crawl example.com and send all URLs of the same domain
|
45
|
-
#
|
65
|
+
# Archive.crawl('example.com')
|
66
|
+
# Archive.crawl('example.com') do |result|
|
67
|
+
# puts [result.code || 'error', result.url] # print response status and URL
|
68
|
+
# end
|
46
69
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
47
|
-
#
|
48
|
-
|
70
|
+
# Archive.crawl('example.com', concurrency: 1)
|
71
|
+
# @example Stop after archiving 100 links
|
72
|
+
# Archive.crawl('example.com', limit: 100)
|
73
|
+
def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
49
74
|
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
50
75
|
|
51
76
|
posted_urls = Concurrent::Array.new
|
52
77
|
pool = ThreadPool.build(concurrency)
|
53
78
|
|
54
|
-
found_urls = URLCollector.crawl(source) do |url|
|
79
|
+
found_urls = URLCollector.crawl(source, limit: limit) do |url|
|
55
80
|
pool.post do
|
56
|
-
|
57
|
-
|
81
|
+
result = post_url(url)
|
82
|
+
yield(result) if block_given?
|
83
|
+
posted_urls << result unless result.errored?
|
58
84
|
end
|
59
85
|
end
|
60
86
|
WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
|
@@ -66,18 +92,18 @@ module WaybackArchiver
|
|
66
92
|
end
|
67
93
|
|
68
94
|
# Send URL to Wayback Machine.
|
69
|
-
# @return [
|
95
|
+
# @return [ArchiveResult] the sent URL.
|
70
96
|
# @param [String] url to send.
|
71
97
|
# @example Archive example.com, with default options
|
72
98
|
# Archive.post_url('http://example.com')
|
73
99
|
def self.post_url(url)
|
74
100
|
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
75
|
-
response
|
101
|
+
response = Request.get(request_url, follow_redirects: false)
|
76
102
|
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
77
|
-
url
|
103
|
+
ArchiveResult.new(url, response)
|
78
104
|
rescue Request::Error => e
|
79
105
|
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
80
|
-
nil
|
106
|
+
ArchiveResult.new(url, nil, e)
|
81
107
|
end
|
82
108
|
end
|
83
109
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Result data for posting URL to archive
|
3
|
+
ArchiveResult = Struct.new(:uri, :response, :error)
|
4
|
+
class ArchiveResult
|
5
|
+
# @return [String] the URL that was archived
|
6
|
+
def archived_url
|
7
|
+
uri
|
8
|
+
end
|
9
|
+
|
10
|
+
# @return [String] the requested URL
|
11
|
+
def request_url
|
12
|
+
return unless response?
|
13
|
+
response.uri
|
14
|
+
end
|
15
|
+
|
16
|
+
# @return [String] The HTTP status code if any
|
17
|
+
def code
|
18
|
+
return unless response?
|
19
|
+
response.code
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [Boolean] true if errored
|
23
|
+
def errored?
|
24
|
+
!!error
|
25
|
+
end
|
26
|
+
|
27
|
+
# @return [Boolean] true if response is present
|
28
|
+
def response?
|
29
|
+
!!response
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -25,7 +25,7 @@ module WaybackArchiver
|
|
25
25
|
# @example
|
26
26
|
# HttpCode.success?(nil) # => false
|
27
27
|
def self.success?(code)
|
28
|
-
code.to_s.match
|
28
|
+
!!code.to_s.match(/2\d\d/)
|
29
29
|
end
|
30
30
|
|
31
31
|
# Whether the code is a redirect type
|
@@ -34,7 +34,7 @@ module WaybackArchiver
|
|
34
34
|
# @example
|
35
35
|
# HttpCode.redirect?('301')
|
36
36
|
def self.redirect?(code)
|
37
|
-
code.to_s.match
|
37
|
+
!!code.to_s.match(/3\d\d/)
|
38
38
|
end
|
39
39
|
|
40
40
|
# Whether the code is an error type
|
@@ -43,7 +43,7 @@ module WaybackArchiver
|
|
43
43
|
# @example
|
44
44
|
# HttpCode.error?('301')
|
45
45
|
def self.error?(code)
|
46
|
-
code.to_s.match
|
46
|
+
!!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
@@ -5,6 +5,7 @@ require 'uri'
|
|
5
5
|
require 'zlib'
|
6
6
|
|
7
7
|
require 'wayback_archiver/http_code'
|
8
|
+
require 'wayback_archiver/response'
|
8
9
|
|
9
10
|
module WaybackArchiver
|
10
11
|
# Make HTTP requests
|
@@ -32,18 +33,6 @@ module WaybackArchiver
|
|
32
33
|
# Max number of redirects before an error is raised
|
33
34
|
MAX_REDIRECTS = 10
|
34
35
|
|
35
|
-
# Response data struct
|
36
|
-
Response = Struct.new(:code, :message, :body, :uri, :error)
|
37
|
-
class Response
|
38
|
-
# Returns true if a successfull response
|
39
|
-
# @example check if Response was successfull
|
40
|
-
# response = Response.new('200', 'OK', 'buren', 'http://example.com')
|
41
|
-
# response.success? # => true
|
42
|
-
def success?
|
43
|
-
HTTPCode.success?(code)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
36
|
# Get response.
|
48
37
|
# @return [Response] the http response representation.
|
49
38
|
# @param [String, URI] uri to retrieve.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Response data struct
|
3
|
+
Response = Struct.new(:code, :message, :body, :uri, :error)
|
4
|
+
class Response
|
5
|
+
# Returns true if a successful response
|
6
|
+
# @example check if Response was successful
|
7
|
+
# response = Response.new('200', 'OK', 'buren', 'http://example.com')
|
8
|
+
# response.success? # => true
|
9
|
+
def success?
|
10
|
+
HTTPCode.success?(code)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -38,7 +38,11 @@ module WaybackArchiver
|
|
38
38
|
WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
|
39
39
|
sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
|
40
40
|
response = Request.get(sitemap_url, raise_on_http_error: false)
|
41
|
-
|
41
|
+
|
42
|
+
if response.success?
|
43
|
+
WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
|
44
|
+
return urls(xml: response.body)
|
45
|
+
end
|
42
46
|
end
|
43
47
|
|
44
48
|
WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
|
@@ -21,13 +21,19 @@ module WaybackArchiver
|
|
21
21
|
# @param [String] url domain to crawl URLs from.
|
22
22
|
# @example Crawl URLs defined on example.com
|
23
23
|
# URLCollector.crawl('http://example.com')
|
24
|
-
|
24
|
+
# @example Crawl URLs defined on example.com and limit the number of visited pages to 100
|
25
|
+
# URLCollector.crawl('http://example.com', limit: 100)
|
26
|
+
# @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
|
27
|
+
# URLCollector.crawl('http://example.com', limit: -1)
|
28
|
+
def self.crawl(url, limit: WaybackArchiver.max_limit)
|
25
29
|
urls = []
|
26
30
|
start_at_url = Request.build_uri(url).to_s
|
27
31
|
options = {
|
28
32
|
robots: true,
|
29
33
|
user_agent: WaybackArchiver.user_agent
|
30
34
|
}
|
35
|
+
options[:limit] = limit unless limit == -1
|
36
|
+
|
31
37
|
Spidr.site(start_at_url, options) do |spider|
|
32
38
|
spider.every_html_page do |page|
|
33
39
|
page_url = page.url.to_s
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -176,9 +176,11 @@ files:
|
|
176
176
|
- bin/wayback_archiver
|
177
177
|
- lib/wayback_archiver.rb
|
178
178
|
- lib/wayback_archiver/archive.rb
|
179
|
+
- lib/wayback_archiver/archive_result.rb
|
179
180
|
- lib/wayback_archiver/http_code.rb
|
180
181
|
- lib/wayback_archiver/null_logger.rb
|
181
182
|
- lib/wayback_archiver/request.rb
|
183
|
+
- lib/wayback_archiver/response.rb
|
182
184
|
- lib/wayback_archiver/sitemap.rb
|
183
185
|
- lib/wayback_archiver/sitemapper.rb
|
184
186
|
- lib/wayback_archiver/thread_pool.rb
|