wayback_archiver 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ab2721984ab2e58857a30f83a871cc539248fc7d
- data.tar.gz: e4a087f0d0500c930ae9e846dffb18bf72ec5696
+ metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
+ data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
  SHA512:
- metadata.gz: 26cbd46ab21a7364b3ccd38ba7cd7405ed495eee635fc035a1b99a9addcdc9f670f662d1122ac7aa23ec691297da0e45f31083dc13681c1db74626f6705c0e67
- data.tar.gz: c65fb8fd27f3dc9d58e33a501e66a79c24d807640c2ad3ce63a52e98e182ab46334fadf216adb3c20bdb176c1512232e7f1c4f0c0253a42d6559a21565809fab
+ metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
+ data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
data/bin/wayback_archiver CHANGED
@@ -5,6 +5,8 @@ require 'wayback_archiver'
  url = ARGV[0]
  from = ARGV[1]

+ WaybackArchiver.logger = Logger.new(STDOUT)
+
  if from.nil?
  WaybackArchiver.archive(url)
  else
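The CLI hunk above routes output through the new WaybackArchiver.logger hook instead of bare puts. A minimal sketch of how a consumer of the gem could point that hook somewhere else; the log file path and level here are illustrative, not part of the gem:

    require 'logger'
    require 'wayback_archiver'

    # Hypothetical setup: log to a file instead of STDOUT, info level and up.
    logger = Logger.new('wayback_archiver.log')   # illustrative path
    logger.level = Logger::INFO
    WaybackArchiver.logger = logger

    WaybackArchiver.archive('example.com')        # log lines now go to the file

Leaving the logger unset keeps the default NullLogger introduced further down, so library use stays silent unless the caller opts in.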
data/lib/wayback_archiver/archive.rb CHANGED
@@ -4,29 +4,45 @@ module WaybackArchiver
  # Wayback Machine base URL.
  WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
  # Default concurrency for archiving URLs
- DEFAULT_CONCURRENCY = 10
+ DEFAULT_CONCURRENCY = 5
  # Send URLs to Wayback Machine.
  # @return [Array] with sent URLs.
  # @param [Array] urls URLs to send.
  # @param [Hash] options
- # @example Archive example.com, with default options
+ # @example Archive urls, asynchronously
  # Archive.post(['http://example.com'])
- # @example Archive example.com, using only 1 thread
+ # @example Archive urls, using only 1 thread
  # Archive.post(['http://example.com'], concurrency: 1)
- def self.post(urls, options = {})
- options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
- concurrency = options[:concurrency]
-
- puts "=== WAYBACK ARCHIVER ==="
- puts "Request are sent with up to #{concurrency} parallel threads"
- puts "Total urls to be sent: #{urls.length}"
+ def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
+ WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
+ WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
+ WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"

- ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
+ pool = Concurrent::FixedThreadPool.new(concurrency)
+ urls.each do |url|
+ pool.post { Archive.post_url(url) }
+ end

- puts "#{urls.length} URLs sent to Internet archive"
+ WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
  urls
  end

+ # Send URLs to Wayback Machine by crawling the site.
+ # @return [Array] with URLs sent to the Wayback Machine.
+ # @param [String] source for URL to crawl.
+ # @param [Integer] concurrency (default is 5).
+ # @example Crawl example.com and send all URLs of the same domain
+ # WaybackArchiver.crawl('example.com')
+ # @example Crawl example.com and send all URLs of the same domain with low concurrency
+ # WaybackArchiver.crawl('example.com', concurrency: 1)
+ def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
+ pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
+
+ UrlCollector.crawl(source) do |url|
+ pool.post { Archive.post_url(url) }
+ end
+ end
+
  # Send URL to Wayback Machine.
  # @return [String] the sent URL.
  # @param [String] url to send.
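The rewrite above swaps the home-grown ProcessQueue for concurrent-ruby's Concurrent::FixedThreadPool. A minimal sketch of that pattern in isolation; the job list and the shutdown/wait calls are illustrative and not part of the gem code shown in this diff:

    require 'concurrent'

    pool = Concurrent::FixedThreadPool.new(5)      # pool backed by up to 5 worker threads

    %w[http://example.com http://example.org].each do |url|
      pool.post { puts "archiving #{url}" }        # enqueue work; #post returns immediately
    end

    pool.shutdown                                  # stop accepting new jobs
    pool.wait_for_termination                      # block until queued jobs have run

Worth noting: the new post and crawl bodies in the diff create the pool and post work but do not appear to wait on it, so they may return before all queued requests have completed.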
@@ -35,11 +51,11 @@ module WaybackArchiver
  def self.post_url(url)
  request_url = "#{WAYBACK_BASE_URL}#{url}"
  response = Request.response(request_url)
- puts "[#{response.code}, #{response.message}] #{url}"
+ WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
  url
  rescue Exception => e
- puts "Error message: #{e.message}"
- puts "Failed to archive: #{url}"
+ WaybackArchiver.logger.error "Error message: #{e.message}"
+ WaybackArchiver.logger.error "Failed to archive: #{url}"
  end
  end
  end
data/lib/wayback_archiver/null_logger.rb ADDED
@@ -0,0 +1,9 @@
+ require 'logger'
+
+ class NullLogger < Logger
+ def initialize(*args)
+ end
+
+ def add(*args, &block)
+ end
+ end
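NullLogger becomes the default logger: Ruby's stdlib Logger funnels every severity helper (info, warn, error, ...) through #add, so overriding #add with a no-op silences them all until a caller assigns a real logger. A small sketch of that behaviour, with illustrative variable names:

    require 'wayback_archiver'   # loads NullLogger and the stdlib Logger

    null = NullLogger.new          # no log device needed; initialize is a no-op
    null.info  "never printed"     # Logger#info delegates to #add, which does nothing
    null.error "never printed"

    real = Logger.new(STDOUT)
    real.info "printed as usual"   # a regular Logger still logs normally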
data/lib/wayback_archiver/url_collector.rb CHANGED
@@ -27,7 +27,7 @@ module WaybackArchiver
  spider.every_html_page do |page|
  page_url = page.url.to_s
  urls << page_url
- puts "Found: #{page_url}"
+ WaybackArchiver.logger.info "Found: #{page_url}"
  yield(page_url) if block_given?
  end
  end
data/lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
  # Gem version
- VERSION = '0.1.0'.freeze
+ VERSION = '0.2.0'.freeze
  end
data/lib/wayback_archiver.rb CHANGED
@@ -1,11 +1,13 @@
  require 'uri'
  require 'net/http'

+ require 'concurrent'
+
+ require 'wayback_archiver/null_logger'
  require 'wayback_archiver/version'
  require 'wayback_archiver/url_collector'
  require 'wayback_archiver/archive'
  require 'wayback_archiver/request'
- require 'wayback_archiver/process_queue'

  # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
  module WaybackArchiver
@@ -28,11 +30,31 @@ module WaybackArchiver
  def self.archive(source, type = :crawl)
  case type.to_s
  when 'file' then Archive.post(UrlCollector.file(source))
- when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
+ when 'crawl' then crawl(source)
  when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
  when 'url' then Archive.post_url(Request.resolve_url(source))
  else
  raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
  end
  end
+
+ # Crawl site for URLs to send to the Wayback Machine.
+ # @return [Array] with URLs sent to the Wayback Machine.
+ # @param [String] source for URL(s).
+ # @param [Integer] concurrency.
+ # @example Crawl example.com and send all URLs of the same domain
+ # WaybackArchiver.crawl('example.com') # Default concurrency is 5
+ # @example Crawl example.com and send all URLs of the same domain with low concurrency
+ # WaybackArchiver.crawl('example.com', concurrency: 1)
+ def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
+ Archive.crawl(source, concurrency: concurrency)
+ end
+
+ def self.logger=(logger)
+ @logger = logger
+ end
+
+ def self.logger
+ @logger ||= NullLogger.new
+ end
  end
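Taken together, 0.2.0's public surface gains a crawl shortcut and a configurable logger. A short usage sketch of the new entry points as they appear in this diff; example.com and the concurrency value are placeholders:

    require 'wayback_archiver'

    # Optional: direct log output somewhere; the default is the silent NullLogger.
    WaybackArchiver.logger = Logger.new(STDOUT)

    # Crawl a site and post every discovered URL, using two worker threads.
    WaybackArchiver.crawl('example.com', concurrency: 2)

    # The existing entry point still works; :crawl now delegates to the method above.
    WaybackArchiver.archive('example.com', :crawl)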
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.2.0
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-02-20 00:00:00.000000000 Z
+ date: 2017-07-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: spidr
@@ -52,6 +52,20 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: '0.1'
+ - !ruby/object:Gem::Dependency
+ name: concurrent-ruby
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.0'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
@@ -150,7 +164,8 @@ dependencies:
  - - ">"
  - !ruby/object:Gem::Version
  version: '0'
- description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
+ description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
+ file or single URL.
  email:
  - burenstam@gmail.com
  executables:
@@ -161,7 +176,7 @@ files:
  - bin/wayback_archiver
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/archive.rb
- - lib/wayback_archiver/process_queue.rb
+ - lib/wayback_archiver/null_logger.rb
  - lib/wayback_archiver/request.rb
  - lib/wayback_archiver/url_collector.rb
  - lib/wayback_archiver/version.rb
@@ -177,7 +192,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: 1.9.3
+ version: 2.0.0
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -185,8 +200,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.6.10
+ rubygems_version: 2.6.11
  signing_key:
  specification_version: 4
- summary: Send URLs to Wayback Machine
+ summary: Send URLs to Wayback Machine (Internet Archive)
  test_files: []
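The metadata changes above correspond to declaring concurrent-ruby ~> 1.0 as a runtime dependency. The gemspec itself is not part of this diff, so the following is only a plausible reconstruction of that declaration:

    # wayback_archiver.gemspec (not shown in this diff; sketched for illustration)
    Gem::Specification.new do |spec|
      spec.name    = 'wayback_archiver'
      spec.version = WaybackArchiver::VERSION

      spec.add_dependency 'spidr', '~> 0.1'              # already present in 0.1.0
      spec.add_dependency 'concurrent-ruby', '~> 1.0'    # new runtime dependency in 0.2.0
    end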
data/lib/wayback_archiver/process_queue.rb DELETED
@@ -1,28 +0,0 @@
- require 'thread'
-
- module WaybackArchiver
- class ProcessQueue
- # Process enumerable data in parallel.
- # @return [Array] of URLs defined found during crawl.
- # @param [Object] Enumberable object
- # @example Print list of names in parallel
- # ProcessQueue.process(%w(jacob peter eva)) { |v| puts n }
- # @example Print list of names using 2 threads
- # ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts n }
- def self.process(data_array, threads_count: 5)
- queue = Queue.new
- data_array.each { |data| queue.push(data) }
- workers = threads_count.times.map do
- Thread.new do
- begin
- while data = queue.pop(true)
- yield(data)
- end
- rescue ThreadError
- end
- end
- end
- workers.map(&:join)
- end
- end
- end