wayback_archiver 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
4
- data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
3
+ metadata.gz: a4ba8820f2974f5a506c3fd125be8b6d6b429e3d
4
+ data.tar.gz: 00e101acf27d03d0fccc48b3031cca01aa4792ae
5
5
  SHA512:
6
- metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
7
- data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
6
+ metadata.gz: 8c0ec54efa365cdbf9ea9847d0289d67407fcc85b41100559fc768af6ea6bb3d1ab90594bd0578cc02c503fb5b2394a67e77e6cd624783314ec226aee3948424
7
+ data.tar.gz: b206a4c9e45fa42159d6cb4013149aa003f51dd70a41b6ddbbdb77978b8bdb09fe91852d3c67b5f56bff7eab9f9305f82b16dadae1487301e2344bf335f77600
@@ -9,6 +9,7 @@ strategy = 'auto'
9
9
  log = STDOUT
10
10
  log_level = Logger::INFO
11
11
  concurrency = WaybackArchiver.concurrency
12
+ limit = WaybackArchiver.max_limit
12
13
 
13
14
  optparse = OptionParser.new do |parser|
14
15
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -33,6 +34,10 @@ optparse = OptionParser.new do |parser|
33
34
  concurrency = value
34
35
  end
35
36
 
37
+ parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
38
+ limit = value
39
+ end
40
+
36
41
  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
37
42
  log = path
38
43
  end
@@ -74,5 +79,10 @@ end
74
79
  # If no strategy has explicitly been given, then default to 'auto'
75
80
  strategy ||= 'auto'
76
81
  urls.each do |url|
77
- WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
82
+ WaybackArchiver.archive(
83
+ url,
84
+ strategy: strategy,
85
+ concurrency: concurrency,
86
+ limit: limit
87
+ )
78
88
  end
@@ -15,36 +15,43 @@ module WaybackArchiver
15
15
  # Default concurrency for archiving URLs
16
16
  DEFAULT_CONCURRENCY = 5
17
17
 
18
+ # Maximum number of links posted (-1 is no limit)
19
+ DEFAULT_MAX_LIMIT = -1
20
+
18
21
  # Send URLs to Wayback Machine.
19
- # @return [Array<String>] of URLs sent to the Wayback Machine.
22
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
20
23
  # @param [String/Array<String>] source for URL(s).
21
24
  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
22
25
  # @example Crawl example.com and send all URLs of the same domain
23
26
  # WaybackArchiver.archive('example.com') # Default strategy is :auto
24
27
  # WaybackArchiver.archive('example.com', strategy: :auto)
25
28
  # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
29
+ # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
26
30
  # WaybackArchiver.archive('example.com', :auto)
27
31
  # @example Crawl example.com and send all URLs of the same domain
28
32
  # WaybackArchiver.archive('example.com', strategy: :crawl)
29
33
  # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
34
+ # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
30
35
  # WaybackArchiver.archive('example.com', :crawl)
31
36
  # @example Send example.com Sitemap URLs
32
37
  # WaybackArchiver.archive('example.com', strategy: :sitemap)
33
38
  # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
39
+ # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
34
40
  # WaybackArchiver.archive('example.com', :sitemap)
35
41
  # @example Send only example.com
36
42
  # WaybackArchiver.archive('example.com', strategy: :url)
37
43
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
44
+ # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
38
45
  # WaybackArchiver.archive('example.com', :url)
39
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
46
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
40
47
  strategy = legacy_strategy || strategy
41
48
 
42
49
  case strategy.to_s
43
- when 'crawl' then crawl(source, concurrency: concurrency)
44
- when 'auto' then auto(source, concurrency: concurrency)
45
- when 'sitemap' then sitemap(source, concurrency: concurrency)
46
- when 'urls' then urls(source, concurrency: concurrency)
47
- when 'url' then urls(source, concurrency: concurrency)
50
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
51
+ when 'auto' then auto(source, concurrency: concurrency, limit: limit)
52
+ when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
53
+ when 'urls' then urls(source, concurrency: concurrency, limit: limit)
54
+ when 'url' then urls(source, concurrency: concurrency, limit: limit)
48
55
  else
49
56
  raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
50
57
  end
@@ -52,15 +59,17 @@ module WaybackArchiver
52
59
 
53
60
  # Look for Sitemap(s) and if nothing is found fallback to crawling.
54
61
  # Then send found URLs to the Wayback Machine.
55
- # @return [Array<String>] of URLs sent to the Wayback Machine.
62
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
56
63
  # @param [String] source (must be a valid URL).
57
64
  # @param concurrency [Integer]
58
65
  # @example Auto archive example.com
59
66
  # WaybackArchiver.auto('example.com') # Default concurrency is 5
60
67
  # @example Auto archive example.com with low concurrency
61
68
  # WaybackArchiver.auto('example.com', concurrency: 1)
69
+ # @example Auto archive example.com and archive max 100 URLs
70
+ # WaybackArchiver.auto('example.com', limit: 100)
62
71
  # @see http://www.sitemaps.org
63
- def self.auto(source, concurrency: WaybackArchiver.concurrency)
72
+ def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
64
73
  urls = Sitemapper.autodiscover(source)
65
74
  return urls(urls, concurrency: concurrency) if urls.any?
66
75
 
@@ -68,41 +77,47 @@ module WaybackArchiver
68
77
  end
69
78
 
70
79
  # Crawl site for URLs to send to the Wayback Machine.
71
- # @return [Array<String>] of URLs sent to the Wayback Machine.
80
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
72
81
  # @param [String] url to start crawling from.
73
82
  # @param concurrency [Integer]
74
83
  # @example Crawl example.com and send all URLs of the same domain
75
84
  # WaybackArchiver.crawl('example.com') # Default concurrency is 5
76
85
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
77
86
  # WaybackArchiver.crawl('example.com', concurrency: 1)
78
- def self.crawl(url, concurrency: WaybackArchiver.concurrency)
87
+ # @example Crawl example.com and archive max 100 URLs
88
+ # WaybackArchiver.crawl('example.com', limit: 100)
89
+ def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
79
90
  WaybackArchiver.logger.info "Crawling #{url}"
80
- Archive.crawl(url, concurrency: concurrency)
91
+ Archive.crawl(url, concurrency: concurrency, limit: limit)
81
92
  end
82
93
 
83
94
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
84
- # @return [Array<String>] of URLs sent to the Wayback Machine.
95
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
85
96
  # @param [String] url to the sitemap.
86
97
  # @param concurrency [Integer]
87
98
  # @example Get example.com sitemap and archive all found URLs
88
99
  # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
89
100
  # @example Get example.com sitemap and archive all found URLs with low concurrency
90
101
  # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
102
+ # @example Get example.com sitemap and archive max 100 URLs
103
+ # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
91
104
  # @see http://www.sitemaps.org
92
- def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
105
+ def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
93
106
  WaybackArchiver.logger.info "Fetching Sitemap"
94
- Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
107
+ Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
95
108
  end
96
109
 
97
110
  # Send URL to the Wayback Machine.
98
- # @return [Array<String>] of URLs sent to the Wayback Machine.
111
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
99
112
  # @param [Array<String>/String] urls or url.
100
113
  # @param concurrency [Integer]
101
114
  # @example Archive example.com
102
115
  # WaybackArchiver.urls('example.com')
103
116
  # @example Archive example.com and google.com
104
117
  # WaybackArchiver.urls(%w(example.com google.com))
105
- def self.urls(urls, concurrency: WaybackArchiver.concurrency)
118
+ # @example Archive example.com, max 100 URLs
119
+ # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
120
+ def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
106
121
  Archive.post(Array(urls), concurrency: concurrency)
107
122
  end
108
123
 
@@ -152,4 +167,17 @@ module WaybackArchiver
152
167
  def self.concurrency
153
168
  @concurrency ||= DEFAULT_CONCURRENCY
154
169
  end
170
+
171
+ # Sets the default max_limit
172
+ # @return [Integer] the desired default max_limit
173
+ # @param [Integer] max_limit the desired default max_limit
174
+ def self.max_limit=(max_limit)
175
+ @max_limit = max_limit
176
+ end
177
+
178
+ # Returns the default max_limit
179
+ # @return [Integer] the configured or the default max_limit
180
+ def self.max_limit
181
+ @max_limit ||= DEFAULT_MAX_LIMIT
182
+ end
155
183
  end
@@ -1,5 +1,6 @@
1
1
  require 'concurrent'
2
2
 
3
+ require 'wayback_archiver/archive_result'
3
4
  require 'wayback_archiver/thread_pool'
4
5
  require 'wayback_archiver/request'
5
6
 
@@ -10,23 +11,40 @@ module WaybackArchiver
10
11
  WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
11
12
 
12
13
  # Send URLs to Wayback Machine.
13
- # @return [Array<String>] with sent URLs.
14
+ # @return [Array<ArchiveResult>] with sent URLs.
14
15
  # @param [Array<String>] urls to send to the Wayback Machine.
15
16
  # @param concurrency [Integer] the default is 5
17
+ # @yield [archive_result] If a block is given, each result will be yielded
18
+ # @yieldparam [ArchiveResult] archive_result
16
19
  # @example Archive urls, asynchronously
17
20
  # Archive.post(['http://example.com'])
21
+ # Archive.post(['http://example.com']) do |result|
22
+ # puts [result.code || 'error', result.url] # print response status and URL
23
+ # end
18
24
  # @example Archive urls, using only 1 thread
19
25
  # Archive.post(['http://example.com'], concurrency: 1)
20
- def self.post(urls, concurrency: WaybackArchiver.concurrency)
26
+ # @example Stop after archiving 100 links
27
+ # Archive.post(['http://example.com'], limit: 100)
28
+ # @example Explicitly set no limit on how many links are posted
29
+ # Archive.post(['http://example.com'], limit: -1)
30
+ def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
21
31
  WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
22
32
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
23
33
 
34
+ urls_queue = if limit == -1
35
+ urls
36
+ else
37
+ urls[0...limit]
38
+ end
39
+
24
40
  posted_urls = Concurrent::Array.new
25
41
  pool = ThreadPool.build(concurrency)
26
- urls.each do |url|
42
+
43
+ urls_queue.each do |url|
27
44
  pool.post do
28
- posted_url = post_url(url)
29
- posted_urls << posted_url if posted_url
45
+ result = post_url(url)
46
+ yield(result) if block_given?
47
+ posted_urls << result unless result.errored?
30
48
  end
31
49
  end
32
50
 
@@ -38,23 +56,31 @@ module WaybackArchiver
38
56
  end
39
57
 
40
58
  # Send URLs to Wayback Machine by crawling the site.
41
- # @return [Array<String>] with URLs sent to the Wayback Machine.
59
+ # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
42
60
  # @param [String] source for URL to crawl.
43
61
  # @param concurrency [Integer] the default is 5
62
+ # @yield [archive_result] If a block is given, each result will be yielded
63
+ # @yieldparam [ArchiveResult] archive_result
44
64
  # @example Crawl example.com and send all URLs of the same domain
45
- # WaybackArchiver.crawl('example.com')
65
+ # Archive.crawl('example.com')
66
+ # Archive.crawl('example.com') do |result|
67
+ # puts [result.code || 'error', result.url] # print response status and URL
68
+ # end
46
69
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
47
- # WaybackArchiver.crawl('example.com', concurrency: 1)
48
- def self.crawl(source, concurrency: WaybackArchiver.concurrency)
70
+ # Archive.crawl('example.com', concurrency: 1)
71
+ # @example Stop after archiving 100 links
72
+ # Archive.crawl('example.com', limit: 100)
73
+ def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
49
74
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
50
75
 
51
76
  posted_urls = Concurrent::Array.new
52
77
  pool = ThreadPool.build(concurrency)
53
78
 
54
- found_urls = URLCollector.crawl(source) do |url|
79
+ found_urls = URLCollector.crawl(source, limit: limit) do |url|
55
80
  pool.post do
56
- posted_url = post_url(url)
57
- posted_urls << posted_url if posted_url
81
+ result = post_url(url)
82
+ yield(result) if block_given?
83
+ posted_urls << result unless result.errored?
58
84
  end
59
85
  end
60
86
  WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
@@ -66,18 +92,18 @@ module WaybackArchiver
66
92
  end
67
93
 
68
94
  # Send URL to Wayback Machine.
69
- # @return [String] the sent URL.
95
+ # @return [ArchiveResult] the sent URL.
70
96
  # @param [String] url to send.
71
97
  # @example Archive example.com, with default options
72
98
  # Archive.post_url('http://example.com')
73
99
  def self.post_url(url)
74
100
  request_url = "#{WAYBACK_BASE_URL}#{url}"
75
- response = Request.get(request_url, follow_redirects: false)
101
+ response = Request.get(request_url, follow_redirects: false)
76
102
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
77
- url
103
+ ArchiveResult.new(url, response)
78
104
  rescue Request::Error => e
79
105
  WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
80
- nil
106
+ ArchiveResult.new(url, nil, e)
81
107
  end
82
108
  end
83
109
  end
@@ -0,0 +1,32 @@
1
+ module WaybackArchiver
2
+ # Result data for posting URL to archive
3
+ ArchiveResult = Struct.new(:uri, :response, :error)
4
+ class ArchiveResult
5
+ # @return [String] the URL that was archived
6
+ def archived_url
7
+ uri
8
+ end
9
+
10
+ # @return [String] the requested URL
11
+ def request_url
12
+ return unless response?
13
+ response.uri
14
+ end
15
+
16
+ # @return [String] The HTTP status code if any
17
+ def code
18
+ return unless response?
19
+ response.code
20
+ end
21
+
22
+ # @return [Boolean] true if errored
23
+ def errored?
24
+ !!error
25
+ end
26
+
27
+ # @return [Boolean] true if response is present
28
+ def response?
29
+ !!response
30
+ end
31
+ end
32
+ end
@@ -25,7 +25,7 @@ module WaybackArchiver
25
25
  # @example
26
26
  # HttpCode.success?(nil) # => false
27
27
  def self.success?(code)
28
- code.to_s.match?(/2\d\d/)
28
+ !!code.to_s.match(/2\d\d/)
29
29
  end
30
30
 
31
31
  # Whether the code is a redirect type
@@ -34,7 +34,7 @@ module WaybackArchiver
34
34
  # @example
35
35
  # HttpCode.redirect?('301')
36
36
  def self.redirect?(code)
37
- code.to_s.match?(/3\d\d/)
37
+ !!code.to_s.match(/3\d\d/)
38
38
  end
39
39
 
40
40
  # Whether the code is a error type
@@ -43,7 +43,7 @@ module WaybackArchiver
43
43
  # @example
44
44
  # HttpCode.error?('301')
45
45
  def self.error?(code)
46
- code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
46
+ !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
47
47
  end
48
48
  end
49
49
  end
@@ -5,6 +5,7 @@ require 'uri'
5
5
  require 'zlib'
6
6
 
7
7
  require 'wayback_archiver/http_code'
8
+ require 'wayback_archiver/response'
8
9
 
9
10
  module WaybackArchiver
10
11
  # Make HTTP requests
@@ -32,18 +33,6 @@ module WaybackArchiver
32
33
  # Max number of redirects before an error is raised
33
34
  MAX_REDIRECTS = 10
34
35
 
35
- # Response data struct
36
- Response = Struct.new(:code, :message, :body, :uri, :error)
37
- class Response
38
- # Returns true if a successfull response
39
- # @example check if Response was successfull
40
- # response = Response.new('200', 'OK', 'buren', 'http://example.com')
41
- # response.success? # => true
42
- def success?
43
- HTTPCode.success?(code)
44
- end
45
- end
46
-
47
36
  # Get reponse.
48
37
  # @return [Response] the http response representation.
49
38
  # @param [String, URI] uri to retrieve.
@@ -0,0 +1,13 @@
1
+ module WaybackArchiver
2
+ # Response data struct
3
+ Response = Struct.new(:code, :message, :body, :uri, :error)
4
+ class Response
5
+ # Returns true if a successful response
6
+ # @example check if Response was successful
7
+ # response = Response.new('200', 'OK', 'buren', 'http://example.com')
8
+ # response.success? # => true
9
+ def success?
10
+ HTTPCode.success?(code)
11
+ end
12
+ end
13
+ end
@@ -38,7 +38,11 @@ module WaybackArchiver
38
38
  WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
39
39
  sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
40
40
  response = Request.get(sitemap_url, raise_on_http_error: false)
41
- return urls(xml: response.body) if response.success?
41
+
42
+ if response.success?
43
+ WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
44
+ return urls(xml: response.body)
45
+ end
42
46
  end
43
47
 
44
48
  WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
@@ -21,13 +21,19 @@ module WaybackArchiver
21
21
  # @param [String] url domain to crawl URLs from.
22
22
  # @example Crawl URLs defined on example.com
23
23
  # URLCollector.crawl('http://example.com')
24
- def self.crawl(url)
24
+ # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
25
+ # URLCollector.crawl('http://example.com', limit: 100)
26
+ # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
27
+ # URLCollector.crawl('http://example.com', limit: -1)
28
+ def self.crawl(url, limit: WaybackArchiver.max_limit)
25
29
  urls = []
26
30
  start_at_url = Request.build_uri(url).to_s
27
31
  options = {
28
32
  robots: true,
29
33
  user_agent: WaybackArchiver.user_agent
30
34
  }
35
+ options[:limit] = limit unless limit == -1
36
+
31
37
  Spidr.site(start_at_url, options) do |spider|
32
38
  spider.every_html_page do |page|
33
39
  page_url = page.url.to_s
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.0.0'.freeze
3
+ VERSION = '1.1.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-01 00:00:00.000000000 Z
11
+ date: 2017-08-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -176,9 +176,11 @@ files:
176
176
  - bin/wayback_archiver
177
177
  - lib/wayback_archiver.rb
178
178
  - lib/wayback_archiver/archive.rb
179
+ - lib/wayback_archiver/archive_result.rb
179
180
  - lib/wayback_archiver/http_code.rb
180
181
  - lib/wayback_archiver/null_logger.rb
181
182
  - lib/wayback_archiver/request.rb
183
+ - lib/wayback_archiver/response.rb
182
184
  - lib/wayback_archiver/sitemap.rb
183
185
  - lib/wayback_archiver/sitemapper.rb
184
186
  - lib/wayback_archiver/thread_pool.rb