wayback_archiver 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_archiver +11 -1
- data/lib/wayback_archiver.rb +45 -17
- data/lib/wayback_archiver/archive.rb +42 -16
- data/lib/wayback_archiver/archive_result.rb +32 -0
- data/lib/wayback_archiver/http_code.rb +3 -3
- data/lib/wayback_archiver/request.rb +1 -12
- data/lib/wayback_archiver/response.rb +13 -0
- data/lib/wayback_archiver/sitemapper.rb +5 -1
- data/lib/wayback_archiver/url_collector.rb +7 -1
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4ba8820f2974f5a506c3fd125be8b6d6b429e3d
|
4
|
+
data.tar.gz: 00e101acf27d03d0fccc48b3031cca01aa4792ae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c0ec54efa365cdbf9ea9847d0289d67407fcc85b41100559fc768af6ea6bb3d1ab90594bd0578cc02c503fb5b2394a67e77e6cd624783314ec226aee3948424
|
7
|
+
data.tar.gz: b206a4c9e45fa42159d6cb4013149aa003f51dd70a41b6ddbbdb77978b8bdb09fe91852d3c67b5f56bff7eab9f9305f82b16dadae1487301e2344bf335f77600
|
data/bin/wayback_archiver
CHANGED
@@ -9,6 +9,7 @@ strategy = 'auto'
|
|
9
9
|
log = STDOUT
|
10
10
|
log_level = Logger::INFO
|
11
11
|
concurrency = WaybackArchiver.concurrency
|
12
|
+
limit = WaybackArchiver.max_limit
|
12
13
|
|
13
14
|
optparse = OptionParser.new do |parser|
|
14
15
|
parser.banner = 'Usage: wayback_archiver [<url>] [options]'
|
@@ -33,6 +34,10 @@ optparse = OptionParser.new do |parser|
|
|
33
34
|
concurrency = value
|
34
35
|
end
|
35
36
|
|
37
|
+
parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
|
38
|
+
limit = value
|
39
|
+
end
|
40
|
+
|
36
41
|
parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
|
37
42
|
log = path
|
38
43
|
end
|
@@ -74,5 +79,10 @@ end
|
|
74
79
|
# If no strategy has explicitly been given, then default to 'auto'
|
75
80
|
strategy ||= 'auto'
|
76
81
|
urls.each do |url|
|
77
|
-
WaybackArchiver.archive(
|
82
|
+
WaybackArchiver.archive(
|
83
|
+
url,
|
84
|
+
strategy: strategy,
|
85
|
+
concurrency: concurrency,
|
86
|
+
limit: limit
|
87
|
+
)
|
78
88
|
end
|
data/lib/wayback_archiver.rb
CHANGED
@@ -15,36 +15,43 @@ module WaybackArchiver
|
|
15
15
|
# Default concurrency for archiving URLs
|
16
16
|
DEFAULT_CONCURRENCY = 5
|
17
17
|
|
18
|
+
# Maximum number of links posted (-1 is no limit)
|
19
|
+
DEFAULT_MAX_LIMIT = -1
|
20
|
+
|
18
21
|
# Send URLs to Wayback Machine.
|
19
|
-
# @return [Array<
|
22
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
20
23
|
# @param [String/Array<String>] source for URL(s).
|
21
24
|
# @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
|
22
25
|
# @example Crawl example.com and send all URLs of the same domain
|
23
26
|
# WaybackArchiver.archive('example.com') # Default strategy is :auto
|
24
27
|
# WaybackArchiver.archive('example.com', strategy: :auto)
|
25
28
|
# WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
|
29
|
+
# WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
|
26
30
|
# WaybackArchiver.archive('example.com', :auto)
|
27
31
|
# @example Crawl example.com and send all URLs of the same domain
|
28
32
|
# WaybackArchiver.archive('example.com', strategy: :crawl)
|
29
33
|
# WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
|
34
|
+
# WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
|
30
35
|
# WaybackArchiver.archive('example.com', :crawl)
|
31
36
|
# @example Send example.com Sitemap URLs
|
32
37
|
# WaybackArchiver.archive('example.com', strategy: :sitemap)
|
33
38
|
# WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
|
39
|
+
# WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
|
34
40
|
# WaybackArchiver.archive('example.com', :sitemap)
|
35
41
|
# @example Send only example.com
|
36
42
|
# WaybackArchiver.archive('example.com', strategy: :url)
|
37
43
|
# WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
|
44
|
+
# WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
|
38
45
|
# WaybackArchiver.archive('example.com', :url)
|
39
|
-
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
|
46
|
+
def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
40
47
|
strategy = legacy_strategy || strategy
|
41
48
|
|
42
49
|
case strategy.to_s
|
43
|
-
when 'crawl' then crawl(source, concurrency: concurrency)
|
44
|
-
when 'auto' then auto(source, concurrency: concurrency)
|
45
|
-
when 'sitemap' then sitemap(source, concurrency: concurrency)
|
46
|
-
when 'urls' then urls(source, concurrency: concurrency)
|
47
|
-
when 'url' then urls(source, concurrency: concurrency)
|
50
|
+
when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
|
51
|
+
when 'auto' then auto(source, concurrency: concurrency, limit: limit)
|
52
|
+
when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
|
53
|
+
when 'urls' then urls(source, concurrency: concurrency, limit: limit)
|
54
|
+
when 'url' then urls(source, concurrency: concurrency, limit: limit)
|
48
55
|
else
|
49
56
|
raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
|
50
57
|
end
|
@@ -52,15 +59,17 @@ module WaybackArchiver
|
|
52
59
|
|
53
60
|
# Look for Sitemap(s) and if nothing is found fallback to crawling.
|
54
61
|
# Then send found URLs to the Wayback Machine.
|
55
|
-
# @return [Array<
|
62
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
56
63
|
# @param [String] source (must be a valid URL).
|
57
64
|
# @param concurrency [Integer]
|
58
65
|
# @example Auto archive example.com
|
59
66
|
# WaybackArchiver.auto('example.com') # Default concurrency is 5
|
60
67
|
# @example Auto archive example.com with low concurrency
|
61
68
|
# WaybackArchiver.auto('example.com', concurrency: 1)
|
69
|
+
# @example Auto archive example.com and archive max 100 URLs
|
70
|
+
# WaybackArchiver.auto('example.com', limit: 100)
|
62
71
|
# @see http://www.sitemaps.org
|
63
|
-
def self.auto(source, concurrency: WaybackArchiver.concurrency)
|
72
|
+
def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
64
73
|
urls = Sitemapper.autodiscover(source)
|
65
74
|
return urls(urls, concurrency: concurrency) if urls.any?
|
66
75
|
|
@@ -68,41 +77,47 @@ module WaybackArchiver
|
|
68
77
|
end
|
69
78
|
|
70
79
|
# Crawl site for URLs to send to the Wayback Machine.
|
71
|
-
# @return [Array<
|
80
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
72
81
|
# @param [String] url to start crawling from.
|
73
82
|
# @param concurrency [Integer]
|
74
83
|
# @example Crawl example.com and send all URLs of the same domain
|
75
84
|
# WaybackArchiver.crawl('example.com') # Default concurrency is 5
|
76
85
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
77
86
|
# WaybackArchiver.crawl('example.com', concurrency: 1)
|
78
|
-
|
87
|
+
# @example Crawl example.com and archive max 100 URLs
|
88
|
+
# WaybackArchiver.crawl('example.com', limit: 100)
|
89
|
+
def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
79
90
|
WaybackArchiver.logger.info "Crawling #{url}"
|
80
|
-
Archive.crawl(url, concurrency: concurrency)
|
91
|
+
Archive.crawl(url, concurrency: concurrency, limit: limit)
|
81
92
|
end
|
82
93
|
|
83
94
|
# Get URLs from sitemap and send found URLs to the Wayback Machine.
|
84
|
-
# @return [Array<
|
95
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
85
96
|
# @param [String] url to the sitemap.
|
86
97
|
# @param concurrency [Integer]
|
87
98
|
# @example Get example.com sitemap and archive all found URLs
|
88
99
|
# WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
|
89
100
|
# @example Get example.com sitemap and archive all found URLs with low concurrency
|
90
101
|
# WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
|
102
|
+
# @example Get example.com sitemap and archive max 100 URLs
|
103
|
+
# WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
|
91
104
|
# @see http://www.sitemaps.org
|
92
|
-
def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
|
105
|
+
def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
93
106
|
WaybackArchiver.logger.info "Fetching Sitemap"
|
94
|
-
Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
|
107
|
+
Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
|
95
108
|
end
|
96
109
|
|
97
110
|
# Send URL to the Wayback Machine.
|
98
|
-
# @return [Array<
|
111
|
+
# @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
|
99
112
|
# @param [Array<String>/String] urls or url.
|
100
113
|
# @param concurrency [Integer]
|
101
114
|
# @example Archive example.com
|
102
115
|
# WaybackArchiver.urls('example.com')
|
103
116
|
# @example Archive example.com and google.com
|
104
117
|
# WaybackArchiver.urls(%w(example.com google.com))
|
105
|
-
|
118
|
+
# @example Archive example.com, max 100 URLs
|
119
|
+
# WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
|
120
|
+
def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
106
121
|
Archive.post(Array(urls), concurrency: concurrency)
|
107
122
|
end
|
108
123
|
|
@@ -152,4 +167,17 @@ module WaybackArchiver
|
|
152
167
|
def self.concurrency
|
153
168
|
@concurrency ||= DEFAULT_CONCURRENCY
|
154
169
|
end
|
170
|
+
|
171
|
+
# Sets the default max_limit
|
172
|
+
# @return [Integer] the desired default max_limit
|
173
|
+
# @param [Integer] max_limit the desired default max_limit
|
174
|
+
def self.max_limit=(max_limit)
|
175
|
+
@max_limit = max_limit
|
176
|
+
end
|
177
|
+
|
178
|
+
# Returns the default max_limit
|
179
|
+
# @return [Integer] the configured or the default max_limit
|
180
|
+
def self.max_limit
|
181
|
+
@max_limit ||= DEFAULT_MAX_LIMIT
|
182
|
+
end
|
155
183
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'concurrent'
|
2
2
|
|
3
|
+
require 'wayback_archiver/archive_result'
|
3
4
|
require 'wayback_archiver/thread_pool'
|
4
5
|
require 'wayback_archiver/request'
|
5
6
|
|
@@ -10,23 +11,40 @@ module WaybackArchiver
|
|
10
11
|
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
11
12
|
|
12
13
|
# Send URLs to Wayback Machine.
|
13
|
-
# @return [Array<
|
14
|
+
# @return [Array<ArchiveResult>] with sent URLs.
|
14
15
|
# @param [Array<String>] urls to send to the Wayback Machine.
|
15
16
|
# @param concurrency [Integer] the default is 5
|
17
|
+
# @yield [archive_result] If a block is given, each result will be yielded
|
18
|
+
# @yieldparam [ArchiveResult] archive_result
|
16
19
|
# @example Archive urls, asynchronously
|
17
20
|
# Archive.post(['http://example.com'])
|
21
|
+
# Archiver.post(['http://example.com']) do |result|
|
22
|
+
# puts [result.code || 'error', result.url] # print response status and URL
|
23
|
+
# end
|
18
24
|
# @example Archive urls, using only 1 thread
|
19
25
|
# Archive.post(['http://example.com'], concurrency: 1)
|
20
|
-
|
26
|
+
# @example Stop after archiving 100 links
|
27
|
+
# Archive.post(['http://example.com'], limit: 100)
|
28
|
+
# @example Explicitly set no limit on how many links are posted
|
29
|
+
# Archive.post(['http://example.com'], limit: -1)
|
30
|
+
def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
21
31
|
WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
|
22
32
|
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
23
33
|
|
34
|
+
urls_queue = if limit == -1
|
35
|
+
urls
|
36
|
+
else
|
37
|
+
urls[0...limit]
|
38
|
+
end
|
39
|
+
|
24
40
|
posted_urls = Concurrent::Array.new
|
25
41
|
pool = ThreadPool.build(concurrency)
|
26
|
-
|
42
|
+
|
43
|
+
urls_queue.each do |url|
|
27
44
|
pool.post do
|
28
|
-
|
29
|
-
|
45
|
+
result = post_url(url)
|
46
|
+
yield(result) if block_given?
|
47
|
+
posted_urls << result unless result.errored?
|
30
48
|
end
|
31
49
|
end
|
32
50
|
|
@@ -38,23 +56,31 @@ module WaybackArchiver
|
|
38
56
|
end
|
39
57
|
|
40
58
|
# Send URLs to Wayback Machine by crawling the site.
|
41
|
-
# @return [Array<
|
59
|
+
# @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
|
42
60
|
# @param [String] source for URL to crawl.
|
43
61
|
# @param concurrency [Integer] the default is 5
|
62
|
+
# @yield [archive_result] If a block is given, each result will be yielded
|
63
|
+
# @yieldparam [ArchiveResult] archive_result
|
44
64
|
# @example Crawl example.com and send all URLs of the same domain
|
45
|
-
#
|
65
|
+
# Archiver.crawl('example.com')
|
66
|
+
# Archiver.crawl('example.com') do |result|
|
67
|
+
# puts [result.code || 'error', result.url] # print response status and URL
|
68
|
+
# end
|
46
69
|
# @example Crawl example.com and send all URLs of the same domain with low concurrency
|
47
|
-
#
|
48
|
-
|
70
|
+
# Archiver.crawl('example.com', concurrency: 1)
|
71
|
+
# @example Stop after archiving 100 links
|
72
|
+
# Archiver.crawl('example.com', limit: 100)
|
73
|
+
def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
|
49
74
|
WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
|
50
75
|
|
51
76
|
posted_urls = Concurrent::Array.new
|
52
77
|
pool = ThreadPool.build(concurrency)
|
53
78
|
|
54
|
-
found_urls = URLCollector.crawl(source) do |url|
|
79
|
+
found_urls = URLCollector.crawl(source, limit: limit) do |url|
|
55
80
|
pool.post do
|
56
|
-
|
57
|
-
|
81
|
+
result = post_url(url)
|
82
|
+
yield(result) if block_given?
|
83
|
+
posted_urls << result unless result.errored?
|
58
84
|
end
|
59
85
|
end
|
60
86
|
WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
|
@@ -66,18 +92,18 @@ module WaybackArchiver
|
|
66
92
|
end
|
67
93
|
|
68
94
|
# Send URL to Wayback Machine.
|
69
|
-
# @return [
|
95
|
+
# @return [ArchiveResult] the sent URL.
|
70
96
|
# @param [String] url to send.
|
71
97
|
# @example Archive example.com, with default options
|
72
98
|
# Archive.post_url('http://example.com')
|
73
99
|
def self.post_url(url)
|
74
100
|
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
75
|
-
response
|
101
|
+
response = Request.get(request_url, follow_redirects: false)
|
76
102
|
WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
|
77
|
-
url
|
103
|
+
ArchiveResult.new(url, response)
|
78
104
|
rescue Request::Error => e
|
79
105
|
WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
|
80
|
-
nil
|
106
|
+
ArchiveResult.new(url, nil, e)
|
81
107
|
end
|
82
108
|
end
|
83
109
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Result data for posting URL to archive
|
3
|
+
ArchiveResult = Struct.new(:uri, :response, :error)
|
4
|
+
class ArchiveResult
|
5
|
+
# @return [String] the URL that was archived
|
6
|
+
def archived_url
|
7
|
+
uri
|
8
|
+
end
|
9
|
+
|
10
|
+
# @return [String] the requested URL
|
11
|
+
def request_url
|
12
|
+
return unless response?
|
13
|
+
response.uri
|
14
|
+
end
|
15
|
+
|
16
|
+
# @return [String] The HTTP status code if any
|
17
|
+
def code
|
18
|
+
return unless response?
|
19
|
+
response.code
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [Boolean] true if errored
|
23
|
+
def errored?
|
24
|
+
!!error
|
25
|
+
end
|
26
|
+
|
27
|
+
# @return [Boolean] true if response is present
|
28
|
+
def response?
|
29
|
+
!!response
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -25,7 +25,7 @@ module WaybackArchiver
|
|
25
25
|
# @example
|
26
26
|
# HttpCode.success?(nil) # => false
|
27
27
|
def self.success?(code)
|
28
|
-
code.to_s.match
|
28
|
+
!!code.to_s.match(/2\d\d/)
|
29
29
|
end
|
30
30
|
|
31
31
|
# Whether the code is a redirect type
|
@@ -34,7 +34,7 @@ module WaybackArchiver
|
|
34
34
|
# @example
|
35
35
|
# HttpCode.redirect?('301')
|
36
36
|
def self.redirect?(code)
|
37
|
-
code.to_s.match
|
37
|
+
!!code.to_s.match(/3\d\d/)
|
38
38
|
end
|
39
39
|
|
40
40
|
# Whether the code is a error type
|
@@ -43,7 +43,7 @@ module WaybackArchiver
|
|
43
43
|
# @example
|
44
44
|
# HttpCode.error?('301')
|
45
45
|
def self.error?(code)
|
46
|
-
code.to_s.match
|
46
|
+
!!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
@@ -5,6 +5,7 @@ require 'uri'
|
|
5
5
|
require 'zlib'
|
6
6
|
|
7
7
|
require 'wayback_archiver/http_code'
|
8
|
+
require 'wayback_archiver/response'
|
8
9
|
|
9
10
|
module WaybackArchiver
|
10
11
|
# Make HTTP requests
|
@@ -32,18 +33,6 @@ module WaybackArchiver
|
|
32
33
|
# Max number of redirects before an error is raised
|
33
34
|
MAX_REDIRECTS = 10
|
34
35
|
|
35
|
-
# Response data struct
|
36
|
-
Response = Struct.new(:code, :message, :body, :uri, :error)
|
37
|
-
class Response
|
38
|
-
# Returns true if a successfull response
|
39
|
-
# @example check if Response was successfull
|
40
|
-
# response = Response.new('200', 'OK', 'buren', 'http://example.com')
|
41
|
-
# response.success? # => true
|
42
|
-
def success?
|
43
|
-
HTTPCode.success?(code)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
36
|
# Get response.
|
48
37
|
# @return [Response] the http response representation.
|
49
38
|
# @param [String, URI] uri to retrieve.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module WaybackArchiver
|
2
|
+
# Response data struct
|
3
|
+
Response = Struct.new(:code, :message, :body, :uri, :error)
|
4
|
+
class Response
|
5
|
+
# Returns true if a successful response
|
6
|
+
# @example check if Response was successful
|
7
|
+
# response = Response.new('200', 'OK', 'buren', 'http://example.com')
|
8
|
+
# response.success? # => true
|
9
|
+
def success?
|
10
|
+
HTTPCode.success?(code)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -38,7 +38,11 @@ module WaybackArchiver
|
|
38
38
|
WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
|
39
39
|
sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
|
40
40
|
response = Request.get(sitemap_url, raise_on_http_error: false)
|
41
|
-
|
41
|
+
|
42
|
+
if response.success?
|
43
|
+
WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
|
44
|
+
return urls(xml: response.body)
|
45
|
+
end
|
42
46
|
end
|
43
47
|
|
44
48
|
WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
|
@@ -21,13 +21,19 @@ module WaybackArchiver
|
|
21
21
|
# @param [String] url domain to crawl URLs from.
|
22
22
|
# @example Crawl URLs defined on example.com
|
23
23
|
# URLCollector.crawl('http://example.com')
|
24
|
-
|
24
|
+
# @example Crawl URLs defined on example.com and limit the number of visited pages to 100
|
25
|
+
# URLCollector.crawl('http://example.com', limit: 100)
|
26
|
+
# @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
|
27
|
+
# URLCollector.crawl('http://example.com', limit: -1)
|
28
|
+
def self.crawl(url, limit: WaybackArchiver.max_limit)
|
25
29
|
urls = []
|
26
30
|
start_at_url = Request.build_uri(url).to_s
|
27
31
|
options = {
|
28
32
|
robots: true,
|
29
33
|
user_agent: WaybackArchiver.user_agent
|
30
34
|
}
|
35
|
+
options[:limit] = limit unless limit == -1
|
36
|
+
|
31
37
|
Spidr.site(start_at_url, options) do |spider|
|
32
38
|
spider.every_html_page do |page|
|
33
39
|
page_url = page.url.to_s
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -176,9 +176,11 @@ files:
|
|
176
176
|
- bin/wayback_archiver
|
177
177
|
- lib/wayback_archiver.rb
|
178
178
|
- lib/wayback_archiver/archive.rb
|
179
|
+
- lib/wayback_archiver/archive_result.rb
|
179
180
|
- lib/wayback_archiver/http_code.rb
|
180
181
|
- lib/wayback_archiver/null_logger.rb
|
181
182
|
- lib/wayback_archiver/request.rb
|
183
|
+
- lib/wayback_archiver/response.rb
|
182
184
|
- lib/wayback_archiver/sitemap.rb
|
183
185
|
- lib/wayback_archiver/sitemapper.rb
|
184
186
|
- lib/wayback_archiver/thread_pool.rb
|