wayback_archiver 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aa04295bda99658571157a5b1e21c5c43ffa9a98
4
- data.tar.gz: 40906449581c37b2c17990a69d58a27a971c8ed7
3
+ metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
4
+ data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
5
5
  SHA512:
6
- metadata.gz: af3d03dcc5781889c1fa10bd1a0851e053ab3e5d91acbd87aa08bebe3f408103ed1acff2a33e95004f5734ec24ae7c06d11d65003cfde411a16453d21920136f
7
- data.tar.gz: f799a26ef78e340e3af0e31fd333917fb8723cd9a50eb29e12c69427591988a58cbb4bb2505c0027ddfa864604a3c8f8a226668ea98c1eab1abf6ebd6a7dbf73
6
+ metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
7
+ data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
@@ -43,15 +43,15 @@ module WaybackArchiver
43
43
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
44
44
  # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
45
45
  # WaybackArchiver.archive('example.com', :url)
46
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
46
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
47
47
  strategy = legacy_strategy || strategy
48
48
 
49
49
  case strategy.to_s
50
- when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
51
- when 'auto' then auto(source, concurrency: concurrency, limit: limit)
52
- when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
53
- when 'urls' then urls(source, concurrency: concurrency, limit: limit)
54
- when 'url' then urls(source, concurrency: concurrency, limit: limit)
50
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
51
+ when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
52
+ when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
53
+ when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
54
+ when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
55
55
  else
56
56
  raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
57
57
  end
@@ -69,11 +69,11 @@ module WaybackArchiver
69
69
  # @example Auto archive example.com and archive max 100 URLs
70
70
  # WaybackArchiver.auto('example.com', limit: 100)
71
71
  # @see http://www.sitemaps.org
72
- def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
72
+ def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
73
73
  urls = Sitemapper.autodiscover(source)
74
- return urls(urls, concurrency: concurrency) if urls.any?
74
+ return urls(urls, concurrency: concurrency, &block) if urls.any?
75
75
 
76
- crawl(source, concurrency: concurrency)
76
+ crawl(source, concurrency: concurrency, &block)
77
77
  end
78
78
 
79
79
  # Crawl site for URLs to send to the Wayback Machine.
@@ -86,9 +86,9 @@ module WaybackArchiver
86
86
  # WaybackArchiver.crawl('example.com', concurrency: 1)
87
87
  # @example Crawl example.com and archive max 100 URLs
88
88
  # WaybackArchiver.crawl('example.com', limit: 100)
89
- def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
89
+ def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
90
90
  WaybackArchiver.logger.info "Crawling #{url}"
91
- Archive.crawl(url, concurrency: concurrency, limit: limit)
91
+ Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
92
92
  end
93
93
 
94
94
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -102,9 +102,9 @@ module WaybackArchiver
102
102
  # @example Get example.com sitemap archive max 100 URLs
103
103
  # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
104
104
  # @see http://www.sitemaps.org
105
- def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
105
+ def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
106
106
  WaybackArchiver.logger.info "Fetching Sitemap"
107
- Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
107
+ Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
108
108
  end
109
109
 
110
110
  # Send URL to the Wayback Machine.
@@ -117,8 +117,8 @@ module WaybackArchiver
117
117
  # WaybackArchiver.urls(%w(example.com google.com))
118
118
  # @example Archive example.com, max 100 URLs
119
119
  # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
120
- def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
121
- Archive.post(Array(urls), concurrency: concurrency)
120
+ def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
121
+ Archive.post(Array(urls), concurrency: concurrency, &block)
122
122
  end
123
123
 
124
124
  # Set logger
@@ -180,4 +180,21 @@ module WaybackArchiver
180
180
  def self.max_limit
181
181
  @max_limit ||= DEFAULT_MAX_LIMIT
182
182
  end
183
+
184
+ # Sets the adapter
185
+ # @return [Object, #call] the configured adapter
186
+ # @param [Object, #call] adapter the adapter, must respond to #call
187
+ def self.adapter=(adapter)
188
+ unless adapter.respond_to?(:call)
189
+ raise(ArgumentError, 'adapter must implement #call')
190
+ end
191
+
192
+ @adapter = adapter
193
+ end
194
+
195
+ # Returns the configured adapter
196
+ # @return [Object, #call] the configured adapter, or WaybackMachine when none has been set
197
+ def self.adapter
198
+ @adapter ||= WaybackMachine
199
+ end
183
200
  end
@@ -0,0 +1,30 @@
1
+ require 'wayback_archiver/archive_result'
2
+ require 'wayback_archiver/request'
3
+
4
+ module WaybackArchiver
5
+ # WaybackMachine adapter
6
+ class WaybackMachine
7
+ # Wayback Machine base URL.
8
+ BASE_URL = 'https://web.archive.org/save/'.freeze
9
+
10
+ # Send URL to Wayback Machine.
11
+ # @return [ArchiveResult] the result of sending the URL.
12
+ # @param [String] url to send.
13
+ # @example Archive example.com, with default options
14
+ # WaybackMachine.call('http://example.com')
15
+ def self.call(url)
16
+ request_url = "#{BASE_URL}#{url}"
17
+ response = Request.get(request_url, follow_redirects: false)
18
+ WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
19
+ ArchiveResult.new(
20
+ url,
21
+ code: response.code,
22
+ request_url: response.uri,
23
+ response_error: response.error
24
+ )
25
+ rescue Request::Error => e
26
+ WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
27
+ ArchiveResult.new(url, error: e)
28
+ end
29
+ end
30
+ end
@@ -1,15 +1,11 @@
1
1
  require 'concurrent'
2
2
 
3
- require 'wayback_archiver/archive_result'
4
3
  require 'wayback_archiver/thread_pool'
5
- require 'wayback_archiver/request'
4
+ require 'wayback_archiver/adapters/wayback_machine'
6
5
 
7
6
  module WaybackArchiver
8
7
  # Post URL(s) to Wayback Machine
9
8
  class Archive
10
- # Wayback Machine base URL.
11
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
12
-
13
9
  # Send URLs to Wayback Machine.
14
10
  # @return [Array<ArchiveResult>] with sent URLs.
15
11
  # @param [Array<String>] urls to send to the Wayback Machine.
@@ -97,18 +93,7 @@ module WaybackArchiver
97
93
  # @example Archive example.com, with default options
98
94
  # Archive.post_url('http://example.com')
99
95
  def self.post_url(url)
100
- request_url = "#{WAYBACK_BASE_URL}#{url}"
101
- response = Request.get(request_url, follow_redirects: false)
102
- WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
103
- ArchiveResult.new(
104
- url,
105
- code: response.code,
106
- request_url: response.uri,
107
- response_error: response.error
108
- )
109
- rescue Request::Error => e
110
- WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
111
- ArchiveResult.new(url, error: e)
96
+ WaybackArchiver.adapter.call(url)
112
97
  end
113
98
  end
114
99
  end
@@ -16,6 +16,11 @@ module WaybackArchiver
16
16
  uri
17
17
  end
18
18
 
19
+ # @return [Boolean] true if success
20
+ def success?
21
+ !errored?
22
+ end
23
+
19
24
  # @return [Boolean] true if errored
20
25
  def errored?
21
26
  !!error
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.1.1'.freeze
3
+ VERSION = '1.2.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-06 00:00:00.000000000 Z
11
+ date: 2017-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -175,6 +175,7 @@ extra_rdoc_files: []
175
175
  files:
176
176
  - bin/wayback_archiver
177
177
  - lib/wayback_archiver.rb
178
+ - lib/wayback_archiver/adapters/wayback_machine.rb
178
179
  - lib/wayback_archiver/archive.rb
179
180
  - lib/wayback_archiver/archive_result.rb
180
181
  - lib/wayback_archiver/http_code.rb