wayback_archiver 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
4
- data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
3
+ metadata.gz: a4ba8820f2974f5a506c3fd125be8b6d6b429e3d
4
+ data.tar.gz: 00e101acf27d03d0fccc48b3031cca01aa4792ae
5
5
  SHA512:
6
- metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
7
- data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
6
+ metadata.gz: 8c0ec54efa365cdbf9ea9847d0289d67407fcc85b41100559fc768af6ea6bb3d1ab90594bd0578cc02c503fb5b2394a67e77e6cd624783314ec226aee3948424
7
+ data.tar.gz: b206a4c9e45fa42159d6cb4013149aa003f51dd70a41b6ddbbdb77978b8bdb09fe91852d3c67b5f56bff7eab9f9305f82b16dadae1487301e2344bf335f77600
@@ -9,6 +9,7 @@ strategy = 'auto'
9
9
  log = STDOUT
10
10
  log_level = Logger::INFO
11
11
  concurrency = WaybackArchiver.concurrency
12
+ limit = WaybackArchiver.max_limit
12
13
 
13
14
  optparse = OptionParser.new do |parser|
14
15
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -33,6 +34,10 @@ optparse = OptionParser.new do |parser|
33
34
  concurrency = value
34
35
  end
35
36
 
37
+ parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
38
+ limit = value
39
+ end
40
+
36
41
  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
37
42
  log = path
38
43
  end
@@ -74,5 +79,10 @@ end
74
79
  # If no strategy has explicitly been given, then default to 'auto'
75
80
  strategy ||= 'auto'
76
81
  urls.each do |url|
77
- WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
82
+ WaybackArchiver.archive(
83
+ url,
84
+ strategy: strategy,
85
+ concurrency: concurrency,
86
+ limit: limit
87
+ )
78
88
  end
@@ -15,36 +15,43 @@ module WaybackArchiver
15
15
  # Default concurrency for archiving URLs
16
16
  DEFAULT_CONCURRENCY = 5
17
17
 
18
+ # Maximum number of links posted (-1 is no limit)
19
+ DEFAULT_MAX_LIMIT = -1
20
+
18
21
  # Send URLs to Wayback Machine.
19
- # @return [Array<String>] of URLs sent to the Wayback Machine.
22
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
20
23
  # @param [String/Array<String>] source for URL(s).
21
24
  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
22
25
  # @example Crawl example.com and send all URLs of the same domain
23
26
  # WaybackArchiver.archive('example.com') # Default strategy is :auto
24
27
  # WaybackArchiver.archive('example.com', strategy: :auto)
25
28
  # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
29
+ # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
26
30
  # WaybackArchiver.archive('example.com', :auto)
27
31
  # @example Crawl example.com and send all URLs of the same domain
28
32
  # WaybackArchiver.archive('example.com', strategy: :crawl)
29
33
  # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
34
+ # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
30
35
  # WaybackArchiver.archive('example.com', :crawl)
31
36
  # @example Send example.com Sitemap URLs
32
37
  # WaybackArchiver.archive('example.com', strategy: :sitemap)
33
38
  # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
39
+ # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
34
40
  # WaybackArchiver.archive('example.com', :sitemap)
35
41
  # @example Send only example.com
36
42
  # WaybackArchiver.archive('example.com', strategy: :url)
37
43
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
44
+ # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
38
45
  # WaybackArchiver.archive('example.com', :url)
39
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
46
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
40
47
  strategy = legacy_strategy || strategy
41
48
 
42
49
  case strategy.to_s
43
- when 'crawl' then crawl(source, concurrency: concurrency)
44
- when 'auto' then auto(source, concurrency: concurrency)
45
- when 'sitemap' then sitemap(source, concurrency: concurrency)
46
- when 'urls' then urls(source, concurrency: concurrency)
47
- when 'url' then urls(source, concurrency: concurrency)
50
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
51
+ when 'auto' then auto(source, concurrency: concurrency, limit: limit)
52
+ when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
53
+ when 'urls' then urls(source, concurrency: concurrency, limit: limit)
54
+ when 'url' then urls(source, concurrency: concurrency, limit: limit)
48
55
  else
49
56
  raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
50
57
  end
@@ -52,15 +59,17 @@ module WaybackArchiver
52
59
 
53
60
  # Look for Sitemap(s) and if nothing is found fallback to crawling.
54
61
  # Then send found URLs to the Wayback Machine.
55
- # @return [Array<String>] of URLs sent to the Wayback Machine.
62
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
56
63
  # @param [String] source (must be a valid URL).
57
64
  # @param concurrency [Integer]
58
65
  # @example Auto archive example.com
59
66
  # WaybackArchiver.auto('example.com') # Default concurrency is 5
60
67
  # @example Auto archive example.com with low concurrency
61
68
  # WaybackArchiver.auto('example.com', concurrency: 1)
69
+ # @example Auto archive example.com and archive max 100 URLs
70
+ # WaybackArchiver.auto('example.com', limit: 100)
62
71
  # @see http://www.sitemaps.org
63
- def self.auto(source, concurrency: WaybackArchiver.concurrency)
72
+ def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
64
73
  urls = Sitemapper.autodiscover(source)
65
74
  return urls(urls, concurrency: concurrency) if urls.any?
66
75
 
@@ -68,41 +77,47 @@ module WaybackArchiver
68
77
  end
69
78
 
70
79
  # Crawl site for URLs to send to the Wayback Machine.
71
- # @return [Array<String>] of URLs sent to the Wayback Machine.
80
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
72
81
  # @param [String] url to start crawling from.
73
82
  # @param concurrency [Integer]
74
83
  # @example Crawl example.com and send all URLs of the same domain
75
84
  # WaybackArchiver.crawl('example.com') # Default concurrency is 5
76
85
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
77
86
  # WaybackArchiver.crawl('example.com', concurrency: 1)
78
- def self.crawl(url, concurrency: WaybackArchiver.concurrency)
87
+ # @example Crawl example.com and archive max 100 URLs
88
+ # WaybackArchiver.crawl('example.com', limit: 100)
89
+ def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
79
90
  WaybackArchiver.logger.info "Crawling #{url}"
80
- Archive.crawl(url, concurrency: concurrency)
91
+ Archive.crawl(url, concurrency: concurrency, limit: limit)
81
92
  end
82
93
 
83
94
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
84
- # @return [Array<String>] of URLs sent to the Wayback Machine.
95
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
85
96
  # @param [String] url to the sitemap.
86
97
  # @param concurrency [Integer]
87
98
  # @example Get example.com sitemap and archive all found URLs
88
99
  # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
89
100
  # @example Get example.com sitemap and archive all found URLs with low concurrency
90
101
  # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
102
+ # @example Get example.com sitemap and archive max 100 URLs
103
+ # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
91
104
  # @see http://www.sitemaps.org
92
- def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
105
+ def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
93
106
  WaybackArchiver.logger.info "Fetching Sitemap"
94
- Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
107
+ Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
95
108
  end
96
109
 
97
110
  # Send URL to the Wayback Machine.
98
- # @return [Array<String>] of URLs sent to the Wayback Machine.
111
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
99
112
  # @param [Array<String>/String] urls or url.
100
113
  # @param concurrency [Integer]
101
114
  # @example Archive example.com
102
115
  # WaybackArchiver.urls('example.com')
103
116
  # @example Archive example.com and google.com
104
117
  # WaybackArchiver.urls(%w(example.com google.com))
105
- def self.urls(urls, concurrency: WaybackArchiver.concurrency)
118
+ # @example Archive example.com, max 100 URLs
119
+ # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
120
+ def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
106
121
  Archive.post(Array(urls), concurrency: concurrency)
107
122
  end
108
123
 
@@ -152,4 +167,17 @@ module WaybackArchiver
152
167
  def self.concurrency
153
168
  @concurrency ||= DEFAULT_CONCURRENCY
154
169
  end
170
+
171
+ # Sets the default max_limit
172
+ # @return [Integer] the desired default max_limit
173
+ # @param [Integer] max_limit the desired default max_limit
174
+ def self.max_limit=(max_limit)
175
+ @max_limit = max_limit
176
+ end
177
+
178
+ # Returns the default max_limit
179
+ # @return [Integer] the configured or the default max_limit
180
+ def self.max_limit
181
+ @max_limit ||= DEFAULT_MAX_LIMIT
182
+ end
155
183
  end
@@ -1,5 +1,6 @@
1
1
  require 'concurrent'
2
2
 
3
+ require 'wayback_archiver/archive_result'
3
4
  require 'wayback_archiver/thread_pool'
4
5
  require 'wayback_archiver/request'
5
6
 
@@ -10,23 +11,40 @@ module WaybackArchiver
10
11
  WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
11
12
 
12
13
  # Send URLs to Wayback Machine.
13
- # @return [Array<String>] with sent URLs.
14
+ # @return [Array<ArchiveResult>] with sent URLs.
14
15
  # @param [Array<String>] urls to send to the Wayback Machine.
15
16
  # @param concurrency [Integer] the default is 5
17
+ # @yield [archive_result] If a block is given, each result will be yielded
18
+ # @yieldparam [ArchiveResult] archive_result
16
19
  # @example Archive urls, asynchronously
17
20
  # Archive.post(['http://example.com'])
21
+ # Archiver.post(['http://example.com']) do |result|
22
+ # puts [result.code || 'error', result.url] # print response status and URL
23
+ # end
18
24
  # @example Archive urls, using only 1 thread
19
25
  # Archive.post(['http://example.com'], concurrency: 1)
20
- def self.post(urls, concurrency: WaybackArchiver.concurrency)
26
+ # @example Stop after archiving 100 links
27
+ # Archive.post(['http://example.com'], limit: 100)
28
+ # @example Explicitly set no limit on how many links are posted
29
+ # Archive.post(['http://example.com'], limit: -1)
30
+ def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
21
31
  WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
22
32
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
23
33
 
34
+ urls_queue = if limit == -1
35
+ urls
36
+ else
37
+ urls[0...limit]
38
+ end
39
+
24
40
  posted_urls = Concurrent::Array.new
25
41
  pool = ThreadPool.build(concurrency)
26
- urls.each do |url|
42
+
43
+ urls_queue.each do |url|
27
44
  pool.post do
28
- posted_url = post_url(url)
29
- posted_urls << posted_url if posted_url
45
+ result = post_url(url)
46
+ yield(result) if block_given?
47
+ posted_urls << result unless result.errored?
30
48
  end
31
49
  end
32
50
 
@@ -38,23 +56,31 @@ module WaybackArchiver
38
56
  end
39
57
 
40
58
  # Send URLs to Wayback Machine by crawling the site.
41
- # @return [Array<String>] with URLs sent to the Wayback Machine.
59
+ # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
42
60
  # @param [String] source for URL to crawl.
43
61
  # @param concurrency [Integer] the default is 5
62
+ # @yield [archive_result] If a block is given, each result will be yielded
63
+ # @yieldparam [ArchiveResult] archive_result
44
64
  # @example Crawl example.com and send all URLs of the same domain
45
- # WaybackArchiver.crawl('example.com')
65
+ # Archiver.crawl('example.com')
66
+ # Archiver.crawl('example.com') do |result|
67
+ # puts [result.code || 'error', result.url] # print response status and URL
68
+ # end
46
69
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
47
- # WaybackArchiver.crawl('example.com', concurrency: 1)
48
- def self.crawl(source, concurrency: WaybackArchiver.concurrency)
70
+ # Archiver.crawl('example.com', concurrency: 1)
71
+ # @example Stop after archiving 100 links
72
+ # Archiver.crawl('example.com', limit: 100)
73
+ def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
49
74
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
50
75
 
51
76
  posted_urls = Concurrent::Array.new
52
77
  pool = ThreadPool.build(concurrency)
53
78
 
54
- found_urls = URLCollector.crawl(source) do |url|
79
+ found_urls = URLCollector.crawl(source, limit: limit) do |url|
55
80
  pool.post do
56
- posted_url = post_url(url)
57
- posted_urls << posted_url if posted_url
81
+ result = post_url(url)
82
+ yield(result) if block_given?
83
+ posted_urls << result unless result.errored?
58
84
  end
59
85
  end
60
86
  WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
@@ -66,18 +92,18 @@ module WaybackArchiver
66
92
  end
67
93
 
68
94
  # Send URL to Wayback Machine.
69
- # @return [String] the sent URL.
95
+ # @return [ArchiveResult] the sent URL.
70
96
  # @param [String] url to send.
71
97
  # @example Archive example.com, with default options
72
98
  # Archive.post_url('http://example.com')
73
99
  def self.post_url(url)
74
100
  request_url = "#{WAYBACK_BASE_URL}#{url}"
75
- response = Request.get(request_url, follow_redirects: false)
101
+ response = Request.get(request_url, follow_redirects: false)
76
102
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
77
- url
103
+ ArchiveResult.new(url, response)
78
104
  rescue Request::Error => e
79
105
  WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
80
- nil
106
+ ArchiveResult.new(url, nil, e)
81
107
  end
82
108
  end
83
109
  end
@@ -0,0 +1,32 @@
1
+ module WaybackArchiver
2
+ # Result data for posting URL to archive
3
+ ArchiveResult = Struct.new(:uri, :response, :error)
4
+ class ArchiveResult
5
+ # @return [String] the URL that was archived
6
+ def archived_url
7
+ uri
8
+ end
9
+
10
+ # @return [String] the requested URL
11
+ def request_url
12
+ return unless response?
13
+ response.uri
14
+ end
15
+
16
+ # @return [String] The HTTP status code if any
17
+ def code
18
+ return unless response?
19
+ response.code
20
+ end
21
+
22
+ # @return [Boolean] true if errored
23
+ def errored?
24
+ !!error
25
+ end
26
+
27
+ # @return [Boolean] true if response is present
28
+ def response?
29
+ !!response
30
+ end
31
+ end
32
+ end
@@ -25,7 +25,7 @@ module WaybackArchiver
25
25
  # @example
26
26
  # HttpCode.success?(nil) # => false
27
27
  def self.success?(code)
28
- code.to_s.match?(/2\d\d/)
28
+ !!code.to_s.match(/2\d\d/)
29
29
  end
30
30
 
31
31
  # Whether the code is a redirect type
@@ -34,7 +34,7 @@ module WaybackArchiver
34
34
  # @example
35
35
  # HttpCode.redirect?('301')
36
36
  def self.redirect?(code)
37
- code.to_s.match?(/3\d\d/)
37
+ !!code.to_s.match(/3\d\d/)
38
38
  end
39
39
 
40
40
  # Whether the code is a error type
@@ -43,7 +43,7 @@ module WaybackArchiver
43
43
  # @example
44
44
  # HttpCode.error?('301')
45
45
  def self.error?(code)
46
- code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
46
+ !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
47
47
  end
48
48
  end
49
49
  end
@@ -5,6 +5,7 @@ require 'uri'
5
5
  require 'zlib'
6
6
 
7
7
  require 'wayback_archiver/http_code'
8
+ require 'wayback_archiver/response'
8
9
 
9
10
  module WaybackArchiver
10
11
  # Make HTTP requests
@@ -32,18 +33,6 @@ module WaybackArchiver
32
33
  # Max number of redirects before an error is raised
33
34
  MAX_REDIRECTS = 10
34
35
 
35
- # Response data struct
36
- Response = Struct.new(:code, :message, :body, :uri, :error)
37
- class Response
38
- # Returns true if a successfull response
39
- # @example check if Response was successfull
40
- # response = Response.new('200', 'OK', 'buren', 'http://example.com')
41
- # response.success? # => true
42
- def success?
43
- HTTPCode.success?(code)
44
- end
45
- end
46
-
47
36
  # Get reponse.
48
37
  # @return [Response] the http response representation.
49
38
  # @param [String, URI] uri to retrieve.
@@ -0,0 +1,13 @@
1
+ module WaybackArchiver
2
+ # Response data struct
3
+ Response = Struct.new(:code, :message, :body, :uri, :error)
4
+ class Response
5
+ # Returns true if a successfull response
6
+ # @example check if Response was successfull
7
+ # response = Response.new('200', 'OK', 'buren', 'http://example.com')
8
+ # response.success? # => true
9
+ def success?
10
+ HTTPCode.success?(code)
11
+ end
12
+ end
13
+ end
@@ -38,7 +38,11 @@ module WaybackArchiver
38
38
  WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
39
39
  sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
40
40
  response = Request.get(sitemap_url, raise_on_http_error: false)
41
- return urls(xml: response.body) if response.success?
41
+
42
+ if response.success?
43
+ WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
44
+ return urls(xml: response.body)
45
+ end
42
46
  end
43
47
 
44
48
  WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
@@ -21,13 +21,19 @@ module WaybackArchiver
21
21
  # @param [String] url domain to crawl URLs from.
22
22
  # @example Crawl URLs defined on example.com
23
23
  # URLCollector.crawl('http://example.com')
24
- def self.crawl(url)
24
+ # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
25
+ # URLCollector.crawl('http://example.com', limit: 100)
26
+ # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
27
+ # URLCollector.crawl('http://example.com', limit: -1)
28
+ def self.crawl(url, limit: WaybackArchiver.max_limit)
25
29
  urls = []
26
30
  start_at_url = Request.build_uri(url).to_s
27
31
  options = {
28
32
  robots: true,
29
33
  user_agent: WaybackArchiver.user_agent
30
34
  }
35
+ options[:limit] = limit unless limit == -1
36
+
31
37
  Spidr.site(start_at_url, options) do |spider|
32
38
  spider.every_html_page do |page|
33
39
  page_url = page.url.to_s
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.0.0'.freeze
3
+ VERSION = '1.1.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-01 00:00:00.000000000 Z
11
+ date: 2017-08-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -176,9 +176,11 @@ files:
176
176
  - bin/wayback_archiver
177
177
  - lib/wayback_archiver.rb
178
178
  - lib/wayback_archiver/archive.rb
179
+ - lib/wayback_archiver/archive_result.rb
179
180
  - lib/wayback_archiver/http_code.rb
180
181
  - lib/wayback_archiver/null_logger.rb
181
182
  - lib/wayback_archiver/request.rb
183
+ - lib/wayback_archiver/response.rb
182
184
  - lib/wayback_archiver/sitemap.rb
183
185
  - lib/wayback_archiver/sitemapper.rb
184
186
  - lib/wayback_archiver/thread_pool.rb