wayback_archiver 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aa04295bda99658571157a5b1e21c5c43ffa9a98
4
- data.tar.gz: 40906449581c37b2c17990a69d58a27a971c8ed7
3
+ metadata.gz: e0c99c305a785b125b14194417d2930a460b4a95
4
+ data.tar.gz: 4dc88ae01e5cdadfc5730aed1091f005d7526e58
5
5
  SHA512:
6
- metadata.gz: af3d03dcc5781889c1fa10bd1a0851e053ab3e5d91acbd87aa08bebe3f408103ed1acff2a33e95004f5734ec24ae7c06d11d65003cfde411a16453d21920136f
7
- data.tar.gz: f799a26ef78e340e3af0e31fd333917fb8723cd9a50eb29e12c69427591988a58cbb4bb2505c0027ddfa864604a3c8f8a226668ea98c1eab1abf6ebd6a7dbf73
6
+ metadata.gz: 1c665d9a1350cbbf8917803a3f8d0a6304a94b4af364497b3520dfe22b526c19ec9ae6174bbe4699c6d0dffd679427f457fb404757c4720d4d642de21afa465d
7
+ data.tar.gz: 5105b2fef3872e3dc528cac943f43c3e3c1b6007838bbebb6064a9f4351f8c44f306976a8e9977579aad8e6da642bf39a56b0b53de128ca5b0d8f89552c5c2c1
@@ -43,15 +43,15 @@ module WaybackArchiver
43
43
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
44
44
  # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
45
45
  # WaybackArchiver.archive('example.com', :url)
46
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
46
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
47
47
  strategy = legacy_strategy || strategy
48
48
 
49
49
  case strategy.to_s
50
- when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
51
- when 'auto' then auto(source, concurrency: concurrency, limit: limit)
52
- when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
53
- when 'urls' then urls(source, concurrency: concurrency, limit: limit)
54
- when 'url' then urls(source, concurrency: concurrency, limit: limit)
50
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
51
+ when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
52
+ when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
53
+ when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
54
+ when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
55
55
  else
56
56
  raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
57
57
  end
@@ -69,11 +69,11 @@ module WaybackArchiver
69
69
  # @example Auto archive example.com and archive max 100 URLs
70
70
  # WaybackArchiver.auto('example.com', limit: 100)
71
71
  # @see http://www.sitemaps.org
72
- def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
72
+ def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
73
73
  urls = Sitemapper.autodiscover(source)
74
- return urls(urls, concurrency: concurrency) if urls.any?
74
+ return urls(urls, concurrency: concurrency, &block) if urls.any?
75
75
 
76
- crawl(source, concurrency: concurrency)
76
+ crawl(source, concurrency: concurrency, &block)
77
77
  end
78
78
 
79
79
  # Crawl site for URLs to send to the Wayback Machine.
@@ -86,9 +86,9 @@ module WaybackArchiver
86
86
  # WaybackArchiver.crawl('example.com', concurrency: 1)
87
87
  # @example Crawl example.com and archive max 100 URLs
88
88
  # WaybackArchiver.crawl('example.com', limit: 100)
89
- def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
89
+ def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
90
90
  WaybackArchiver.logger.info "Crawling #{url}"
91
- Archive.crawl(url, concurrency: concurrency, limit: limit)
91
+ Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
92
92
  end
93
93
 
94
94
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -102,9 +102,9 @@ module WaybackArchiver
102
102
  # @example Get example.com sitemap archive max 100 URLs
103
103
  # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
104
104
  # @see http://www.sitemaps.org
105
- def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
105
+ def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
106
106
  WaybackArchiver.logger.info "Fetching Sitemap"
107
- Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
107
+ Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
108
108
  end
109
109
 
110
110
  # Send URL to the Wayback Machine.
@@ -117,8 +117,8 @@ module WaybackArchiver
117
117
  # WaybackArchiver.urls(%w(example.com google.com))
118
118
  # @example Archive example.com, max 100 URLs
119
119
  # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
120
- def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
121
- Archive.post(Array(urls), concurrency: concurrency)
120
+ def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
121
+ Archive.post(Array(urls), concurrency: concurrency, &block)
122
122
  end
123
123
 
124
124
  # Set logger
@@ -180,4 +180,21 @@ module WaybackArchiver
180
180
  def self.max_limit
181
181
  @max_limit ||= DEFAULT_MAX_LIMIT
182
182
  end
183
+
184
+ # Sets the adapter
185
+ # @return [Object, #call] the configured adapter
186
+ # @param [Object, #call] adapter the adapter to use; must respond to #call
187
+ def self.adapter=(adapter)
188
+ unless adapter.respond_to?(:call)
189
+ raise(ArgumentError, 'adapter must implement #call')
190
+ end
191
+
192
+ @adapter = adapter
193
+ end
194
+
195
+ # Returns the configured adapter
196
+ # @return [Object, #call] the configured or the default adapter
197
+ def self.adapter
198
+ @adapter ||= WaybackMachine
199
+ end
183
200
  end
@@ -0,0 +1,30 @@
1
+ require 'wayback_archiver/archive_result'
2
+ require 'wayback_archiver/request'
3
+
4
+ module WaybackArchiver
5
+ # WaybackMachine adapter
6
+ class WaybackMachine
7
+ # Wayback Machine base URL.
8
+ BASE_URL = 'https://web.archive.org/save/'.freeze
9
+
10
+ # Send URL to Wayback Machine.
11
+ # @return [ArchiveResult] the sent URL.
12
+ # @param [String] url to send.
13
+ # @example Archive example.com, with default options
14
+ # WaybackMachine.call('http://example.com')
15
+ def self.call(url)
16
+ request_url = "#{BASE_URL}#{url}"
17
+ response = Request.get(request_url, follow_redirects: false)
18
+ WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
19
+ ArchiveResult.new(
20
+ url,
21
+ code: response.code,
22
+ request_url: response.uri,
23
+ response_error: response.error
24
+ )
25
+ rescue Request::Error => e
26
+ WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
27
+ ArchiveResult.new(url, error: e)
28
+ end
29
+ end
30
+ end
@@ -1,15 +1,11 @@
1
1
  require 'concurrent'
2
2
 
3
- require 'wayback_archiver/archive_result'
4
3
  require 'wayback_archiver/thread_pool'
5
- require 'wayback_archiver/request'
4
+ require 'wayback_archiver/adapters/wayback_machine'
6
5
 
7
6
  module WaybackArchiver
8
7
  # Post URL(s) to Wayback Machine
9
8
  class Archive
10
- # Wayback Machine base URL.
11
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
12
-
13
9
  # Send URLs to Wayback Machine.
14
10
  # @return [Array<ArchiveResult>] with sent URLs.
15
11
  # @param [Array<String>] urls to send to the Wayback Machine.
@@ -97,18 +93,7 @@ module WaybackArchiver
97
93
  # @example Archive example.com, with default options
98
94
  # Archive.post_url('http://example.com')
99
95
  def self.post_url(url)
100
- request_url = "#{WAYBACK_BASE_URL}#{url}"
101
- response = Request.get(request_url, follow_redirects: false)
102
- WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
103
- ArchiveResult.new(
104
- url,
105
- code: response.code,
106
- request_url: response.uri,
107
- response_error: response.error
108
- )
109
- rescue Request::Error => e
110
- WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
111
- ArchiveResult.new(url, error: e)
96
+ WaybackArchiver.adapter.call(url)
112
97
  end
113
98
  end
114
99
  end
@@ -16,6 +16,11 @@ module WaybackArchiver
16
16
  uri
17
17
  end
18
18
 
19
+ # @return [Boolean] true if success
20
+ def success?
21
+ !errored?
22
+ end
23
+
19
24
  # @return [Boolean] true if errored
20
25
  def errored?
21
26
  !!error
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.1.1'.freeze
3
+ VERSION = '1.2.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-06 00:00:00.000000000 Z
11
+ date: 2017-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -175,6 +175,7 @@ extra_rdoc_files: []
175
175
  files:
176
176
  - bin/wayback_archiver
177
177
  - lib/wayback_archiver.rb
178
+ - lib/wayback_archiver/adapters/wayback_machine.rb
178
179
  - lib/wayback_archiver/archive.rb
179
180
  - lib/wayback_archiver/archive_result.rb
180
181
  - lib/wayback_archiver/http_code.rb