wayback_archiver 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ab2721984ab2e58857a30f83a871cc539248fc7d
4
- data.tar.gz: e4a087f0d0500c930ae9e846dffb18bf72ec5696
3
+ metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
4
+ data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
5
5
  SHA512:
6
- metadata.gz: 26cbd46ab21a7364b3ccd38ba7cd7405ed495eee635fc035a1b99a9addcdc9f670f662d1122ac7aa23ec691297da0e45f31083dc13681c1db74626f6705c0e67
7
- data.tar.gz: c65fb8fd27f3dc9d58e33a501e66a79c24d807640c2ad3ce63a52e98e182ab46334fadf216adb3c20bdb176c1512232e7f1c4f0c0253a42d6559a21565809fab
6
+ metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
7
+ data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
data/bin/wayback_archiver CHANGED
@@ -5,6 +5,8 @@ require 'wayback_archiver'
5
5
  url = ARGV[0]
6
6
  from = ARGV[1]
7
7
 
8
+ WaybackArchiver.logger = Logger.new(STDOUT)
9
+
8
10
  if from.nil?
9
11
  WaybackArchiver.archive(url)
10
12
  else
@@ -4,29 +4,45 @@ module WaybackArchiver
4
4
  # Wayback Machine base URL.
5
5
  WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
6
6
  # Default concurrency for archiving URLs
7
- DEFAULT_CONCURRENCY = 10
7
+ DEFAULT_CONCURRENCY = 5
8
8
  # Send URLs to Wayback Machine.
9
9
  # @return [Array] with sent URLs.
10
10
  # @param [Array] urls URLs to send.
11
11
  # @param [Hash] options
12
- # @example Archive example.com, with default options
12
+ # @example Archive urls, asynchronously
13
13
  # Archive.post(['http://example.com'])
14
- # @example Archive example.com, using only 1 thread
14
+ # @example Archive urls, using only 1 thread
15
15
  # Archive.post(['http://example.com'], concurrency: 1)
16
- def self.post(urls, options = {})
17
- options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
18
- concurrency = options[:concurrency]
19
-
20
- puts "=== WAYBACK ARCHIVER ==="
21
- puts "Request are sent with up to #{concurrency} parallel threads"
22
- puts "Total urls to be sent: #{urls.length}"
16
+ def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
17
+ WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
18
+ WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
19
+ WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"
23
20
 
24
- ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
21
+ pool = Concurrent::FixedThreadPool.new(concurrency)
22
+ urls.each do |url|
23
+ pool.post { Archive.post_url(url) }
24
+ end
25
25
 
26
- puts "#{urls.length} URLs sent to Internet archive"
26
+ WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
27
27
  urls
28
28
  end
29
29
 
30
+ # Send URLs to Wayback Machine by crawling the site.
31
+ # @return [Array] with URLs sent to the Wayback Machine.
32
+ # @param [String] source for URL to crawl.
33
+ # @param [Integer] concurrency (default is 5).
34
+ # @example Crawl example.com and send all URLs of the same domain
35
+ # WaybackArchiver.crawl('example.com')
36
+ # @example Crawl example.com and send all URLs of the same domain with low concurrency
37
+ # WaybackArchiver.crawl('example.com', concurrency: 1)
38
+ def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
39
+ pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
40
+
41
+ UrlCollector.crawl(source) do |url|
42
+ pool.post { Archive.post_url(url) }
43
+ end
44
+ end
45
+
30
46
  # Send URL to Wayback Machine.
31
47
  # @return [String] the sent URL.
32
48
  # @param [String] url to send.
@@ -35,11 +51,11 @@ module WaybackArchiver
35
51
  def self.post_url(url)
36
52
  request_url = "#{WAYBACK_BASE_URL}#{url}"
37
53
  response = Request.response(request_url)
38
- puts "[#{response.code}, #{response.message}] #{url}"
54
+ WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
39
55
  url
40
56
  rescue Exception => e
41
- puts "Error message: #{e.message}"
42
- puts "Failed to archive: #{url}"
57
+ WaybackArchiver.logger.error "Error message: #{e.message}"
58
+ WaybackArchiver.logger.error "Failed to archive: #{url}"
43
59
  end
44
60
  end
45
61
  end
@@ -0,0 +1,9 @@
1
+ require 'logger'
2
+
3
+ class NullLogger < Logger
4
+ def initialize(*args)
5
+ end
6
+
7
+ def add(*args, &block)
8
+ end
9
+ end
@@ -27,7 +27,7 @@ module WaybackArchiver
27
27
  spider.every_html_page do |page|
28
28
  page_url = page.url.to_s
29
29
  urls << page_url
30
- puts "Found: #{page_url}"
30
+ WaybackArchiver.logger.info "Found: #{page_url}"
31
31
  yield(page_url) if block_given?
32
32
  end
33
33
  end
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '0.1.0'.freeze
3
+ VERSION = '0.2.0'.freeze
4
4
  end
@@ -1,11 +1,13 @@
1
1
  require 'uri'
2
2
  require 'net/http'
3
3
 
4
+ require 'concurrent'
5
+
6
+ require 'wayback_archiver/null_logger'
4
7
  require 'wayback_archiver/version'
5
8
  require 'wayback_archiver/url_collector'
6
9
  require 'wayback_archiver/archive'
7
10
  require 'wayback_archiver/request'
8
- require 'wayback_archiver/process_queue'
9
11
 
10
12
  # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
11
13
  module WaybackArchiver
@@ -28,11 +30,31 @@ module WaybackArchiver
28
30
  def self.archive(source, type = :crawl)
29
31
  case type.to_s
30
32
  when 'file' then Archive.post(UrlCollector.file(source))
31
- when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
33
+ when 'crawl' then crawl(source)
32
34
  when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
33
35
  when 'url' then Archive.post_url(Request.resolve_url(source))
34
36
  else
35
37
  raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
36
38
  end
37
39
  end
40
+
41
+ # Crawl site for URLs to send to the Wayback Machine.
42
+ # @return [Array] with URLs sent to the Wayback Machine.
43
+ # @param [String] source for URL(s).
44
+ # @param [Integer] concurrency.
45
+ # @example Crawl example.com and send all URLs of the same domain
46
+ # WaybackArchiver.crawl('example.com') # Default concurrency is 5
47
+ # @example Crawl example.com and send all URLs of the same domain with low concurrency
48
+ # WaybackArchiver.crawl('example.com', concurrency: 1)
49
+ def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
50
+ Archive.crawl(source, concurrency: concurrency)
51
+ end
52
+
53
+ def self.logger=(logger)
54
+ @logger = logger
55
+ end
56
+
57
+ def self.logger
58
+ @logger ||= NullLogger.new
59
+ end
38
60
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-20 00:00:00.000000000 Z
11
+ date: 2017-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0.1'
55
+ - !ruby/object:Gem::Dependency
56
+ name: concurrent-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: bundler
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -150,7 +164,8 @@ dependencies:
150
164
  - - ">"
151
165
  - !ruby/object:Gem::Version
152
166
  version: '0'
153
- description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
167
+ description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
168
+ file or single URL.
154
169
  email:
155
170
  - burenstam@gmail.com
156
171
  executables:
@@ -161,7 +176,7 @@ files:
161
176
  - bin/wayback_archiver
162
177
  - lib/wayback_archiver.rb
163
178
  - lib/wayback_archiver/archive.rb
164
- - lib/wayback_archiver/process_queue.rb
179
+ - lib/wayback_archiver/null_logger.rb
165
180
  - lib/wayback_archiver/request.rb
166
181
  - lib/wayback_archiver/url_collector.rb
167
182
  - lib/wayback_archiver/version.rb
@@ -177,7 +192,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
177
192
  requirements:
178
193
  - - ">="
179
194
  - !ruby/object:Gem::Version
180
- version: 1.9.3
195
+ version: 2.0.0
181
196
  required_rubygems_version: !ruby/object:Gem::Requirement
182
197
  requirements:
183
198
  - - ">="
@@ -185,8 +200,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
185
200
  version: '0'
186
201
  requirements: []
187
202
  rubyforge_project:
188
- rubygems_version: 2.6.10
203
+ rubygems_version: 2.6.11
189
204
  signing_key:
190
205
  specification_version: 4
191
- summary: Send URLs to Wayback Machine
206
+ summary: Send URLs to Wayback Machine (Internet Archive)
192
207
  test_files: []
@@ -1,28 +0,0 @@
1
- require 'thread'
2
-
3
- module WaybackArchiver
4
- class ProcessQueue
5
- # Process enumerable data in parallel.
6
- # @return [Array] of URLs defined found during crawl.
7
- # @param [Object] Enumberable object
8
- # @example Print list of names in parallel
9
- # ProcessQueue.process(%w(jacob peter eva)) { |v| puts n }
10
- # @example Print list of names using 2 threads
11
- # ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts n }
12
- def self.process(data_array, threads_count: 5)
13
- queue = Queue.new
14
- data_array.each { |data| queue.push(data) }
15
- workers = threads_count.times.map do
16
- Thread.new do
17
- begin
18
- while data = queue.pop(true)
19
- yield(data)
20
- end
21
- rescue ThreadError
22
- end
23
- end
24
- end
25
- workers.map(&:join)
26
- end
27
- end
28
- end