wayback_archiver 0.0.11 → 0.0.12

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
- data.tar.gz: 629963ef30283123820418a714ae73160fa20293
+ metadata.gz: fd01b7c39f7432bba6bb0abf29cff5cced700e90
+ data.tar.gz: 887582fb5f46f8e42a126b09d7c31b3d005bb1cf
  SHA512:
- metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
- data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
+ metadata.gz: 4f66455e9ddfdd2cb191910bacfe784a3822e31b0e7ef7bce395fd60614887c9c700b262e958b6acfff9fa59941068c023a0729d09cf00116f44b359c01b2213
+ data.tar.gz: 8c8f98adfeb91808296d8c1bc7e01d219918074b5680d2afccc22ac00cb02b6d6ff18b29129d3650fa660238b023e8948a851aeb7ce36b42c34b9be7bb913386
lib/wayback_archiver.rb CHANGED
@@ -1,5 +1,3 @@
- require 'site_mapper'
-
  require 'uri'
  require 'net/http'
 
@@ -7,13 +5,14 @@ require 'wayback_archiver/version'
  require 'wayback_archiver/url_collector'
  require 'wayback_archiver/archive'
  require 'wayback_archiver/request'
+ require 'wayback_archiver/process_queue'
 
  # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
  module WaybackArchiver
  # Link to gem on rubygems.org, part of the sent User-Agent
- INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+ INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
  # WaybackArchiver User-Agent
- USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
+ USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
 
  # Send URLs to Wayback Machine.
  # @return [Array] with URLs sent to the Wayback Machine.
@@ -29,7 +28,7 @@ module WaybackArchiver
  def self.archive(source, type = :crawl)
  case type.to_s
  when 'file' then Archive.post(UrlCollector.file(source))
- when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
+ when 'crawl' then Archive.post(UrlCollector.crawl(source))
  when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
  when 'url' then Archive.post_url(Request.resolve_url(source))
  else
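
For reference, the dispatch above can be exercised like this; a minimal sketch based on the signatures in this diff, with illustrative sources (the file path is hypothetical):

    require 'wayback_archiver'

    WaybackArchiver.archive('example.com')               # default :crawl, posts every URL found
    WaybackArchiver.archive('/path/to/urls.txt', :file)  # one URL per line (hypothetical path)
    WaybackArchiver.archive('example.com', :sitemap)     # reads example.com/sitemap.xml
    WaybackArchiver.archive('http://example.com', :url)  # archive a single URL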
lib/wayback_archiver/archive.rb CHANGED
@@ -2,46 +2,44 @@ module WaybackArchiver
  # Post URL(s) to Wayback Machine
  class Archive
  # Wayback Machine base URL.
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+ WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
  # Default concurrency for archiving URLs
  DEFAULT_CONCURRENCY = 10
- class << self
- # Send URLs to Wayback Machine.
- # @return [Array] with sent URLs.
- # @param [Array] urls URLs to send.
- # @param [Hash] options
- # @example Archive example.com, with default options
- # Archive.post(['http://example.com'])
- # @example Archive example.com, using only 1 thread
- # Archive.post(['http://example.com'], concurrency: 1)
- def post(urls, options = {})
- options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
- concurrency = options[:concurrency]
- puts "Request are sent with up to #{concurrency} parallel threads"
- puts "Total urls to be sent: #{urls.length}"
- group_size = (urls.length / concurrency) + 1
- urls.each_slice(group_size).to_a.map! do |archive_urls|
- Thread.new { archive_urls.each { |url| post_url(url) } }
- end.each(&:join)
- puts "#{urls.length} URLs sent to Internet archive"
- urls
- end
+ # Send URLs to Wayback Machine.
+ # @return [Array] with sent URLs.
+ # @param [Array] urls URLs to send.
+ # @param [Hash] options
+ # @example Archive example.com, with default options
+ # Archive.post(['http://example.com'])
+ # @example Archive example.com, using only 1 thread
+ # Archive.post(['http://example.com'], concurrency: 1)
+ def self.post(urls, options = {})
+ options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
+ concurrency = options[:concurrency]
 
- # Send URL to Wayback Machine.
- # @return [String] the sent URL.
- # @param [String] url to send.
- # @example Archive example.com, with default options
- # Archive.post_url('http://example.com')
- def post_url(url)
- resolved_url = Request.resolve_url(url)
- request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
- response = Request.response(request_url)
- puts "[#{response.code}, #{response.message}] #{resolved_url}"
- resolved_url
- rescue Exception => e
- puts "Error message: #{e.message}"
- puts "Failed to archive: #{resolved_url}"
- end
+ puts "=== WAYBACK ARCHIVER ==="
+ puts "Requests are sent with up to #{concurrency} parallel threads"
+ puts "Total URLs to be sent: #{urls.length}"
+
+ ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
+
+ puts "#{urls.length} URLs sent to Internet archive"
+ urls
+ end
+
+ # Send URL to Wayback Machine.
+ # @return [String] the sent URL.
+ # @param [String] url to send.
+ # @example Archive example.com, with default options
+ # Archive.post_url('http://example.com')
+ def self.post_url(url)
+ request_url = "#{WAYBACK_BASE_URL}#{url}"
+ response = Request.response(request_url)
+ puts "[#{response.code}, #{response.message}] #{url}"
+ url
+ rescue Exception => e
+ puts "Error message: #{e.message}"
+ puts "Failed to archive: #{url}"
  end
  end
  end
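
Archive.post now hands threading off to ProcessQueue instead of slicing the URL list itself. A minimal usage sketch, taken from the @example tags above:

    require 'wayback_archiver'

    # Default concurrency (DEFAULT_CONCURRENCY = 10 threads)
    WaybackArchiver::Archive.post(['http://example.com'])

    # Single-threaded
    WaybackArchiver::Archive.post(['http://example.com'], concurrency: 1)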
lib/wayback_archiver/process_queue.rb ADDED
@@ -0,0 +1,28 @@
+ require 'thread'
+
+ module WaybackArchiver
+ class ProcessQueue
+ # Process enumerable data in parallel.
+ # @return [Array] of joined worker threads.
+ # @param [Enumerable] data_array data to process.
+ # @example Print list of names in parallel
+ # ProcessQueue.process(%w(jacob peter eva)) { |v| puts v }
+ # @example Print list of names using 2 threads
+ # ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts v }
+ def self.process(data_array, threads_count: 5)
+ queue = Queue.new
+ data_array.each { |data| queue.push(data) }
+ workers = threads_count.times.map do
+ Thread.new do
+ begin
+ while data = queue.pop(true)
+ yield(data)
+ end
+ rescue ThreadError
+ end
+ end
+ end
+ workers.map(&:join)
+ end
+ end
+ end
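
Worker shutdown here relies on Queue#pop(true): the non-blocking pop raises ThreadError once the queue drains, which each worker rescues to exit. A standalone sketch of the same pattern (names and thread count are illustrative):

    require 'thread'

    queue = Queue.new
    %w(jacob peter eva).each { |name| queue.push(name) }

    workers = 2.times.map do
      Thread.new do
        begin
          # pop(true) is non-blocking and raises ThreadError when empty
          loop { puts queue.pop(true) }
        rescue ThreadError
          # queue drained; worker exits
        end
      end
    end
    workers.each(&:join)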
lib/wayback_archiver/request.rb CHANGED
@@ -3,64 +3,60 @@ require 'url_resolver' # TODO: Allow users to use any resolver
  module WaybackArchiver
  # Request and parse HTML & XML documents
  class Request
- class << self
- # Get and parse HTML & XML documents.
- # @return [Array] with links sent to the Wayback Machine.
- # @param [String] url to retrieve and parse.
- # @example Request and parse example.com
- # Request.document('example.com')
- # @example Request and parse google.com/sitemap.xml
- # Request.document('google.com/sitemap.xml')
- def document(url)
- response_body = Request.response(url).body
- Nokogiri::HTML(response_body)
- end
-
- # Get reponse.
- # @return [Net::HTTP*] the http response.
- # @param [String] url URL to retrieve.
- # @param [Boolean] resolve whether to resolve the URL.
- # @example Resolve example.com and request
- # Request.response('example.com', true)
- # @example Request http://example.com
- # Request.response('http://example.com', false)
- def response(url, resolve = true)
- resolved_url = resolve ? resolve_url(url) : url
- uri = URI.parse(resolved_url)
- http = Net::HTTP.new(uri.host, uri.port)
- http.use_ssl = true if resolved_url.start_with?('https://')
+ # Get and parse HTML & XML documents.
+ # @return [Nokogiri::HTML::Document] the parsed document.
+ # @param [String] url to retrieve and parse.
+ # @example Request and parse example.com
+ # Request.document('example.com')
+ # @example Request and parse google.com/sitemap.xml
+ # Request.document('google.com/sitemap.xml')
+ def self.document(url)
+ response_body = Request.response(url).body
+ Nokogiri::HTML(response_body)
+ end
 
- request = Net::HTTP::Get.new(uri.request_uri)
- request['User-Agent'] = WaybackArchiver::USER_AGENT
- http.request(request)
- end
+ # Get response.
+ # @return [Net::HTTP*] the http response.
+ # @param [String] url URL to retrieve.
+ # @param [Boolean] resolve whether to resolve the URL.
+ # @example Resolve example.com and request
+ # Request.response('example.com', true)
+ # @example Request http://example.com
+ # Request.response('http://example.com', false)
+ def self.response(url, resolve = true)
+ resolved_url = resolve ? resolve_url(url) : url
+ uri = URI.parse(resolved_url)
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.use_ssl = true if resolved_url.start_with?('https://')
 
- # Resolve the URL, follows redirects.
- # @return [String] the resolved URL.
- # @param [String] url to retrieve.
- # @example Resolve example.com and request
- # Request.resolve_url('example.com')
- def resolve_url(url)
- resolved = UrlResolver.resolve(url)
- resolved = resolved.prepend('http://') unless has_protocol?(resolved)
- resolved
- end
+ request = Net::HTTP::Get.new(uri.request_uri)
+ request['User-Agent'] = WaybackArchiver::USER_AGENT
+ http.request(request)
+ end
 
- private
+ # Resolve the URL, follows redirects.
+ # @return [String] the resolved URL.
+ # @param [String] url to retrieve.
+ # @example Resolve example.com and request
+ # Request.resolve_url('example.com')
+ def self.resolve_url(url)
+ resolved = UrlResolver.resolve(url)
+ resolved = resolved.prepend('http://') unless protocol?(resolved)
+ resolved
+ end
 
- # Resolve the URL, follows redirects.
- # @return [Boolean] true if string includes protocol.
- # @param [String] url to check.
- # @example Check if string includes protocol
- # Request.has_protocol?('example.com')
- # # => false
- # Request.has_protocol?('https://example.com')
- # # => true
- # Request.has_protocol?('http://example.com')
- # # => true
- def has_protocol?(url)
- url.start_with?('http://') || url.start_with?('https://')
- end
+ # Check whether the URL string includes a protocol.
+ # @return [Boolean] true if string includes protocol.
+ # @param [String] url to check.
+ # @example Check if string includes protocol
+ # Request.protocol?('example.com')
+ # # => false
+ # Request.protocol?('https://example.com')
+ # # => true
+ # Request.protocol?('http://example.com')
+ # # => true
+ def self.protocol?(url)
+ url.start_with?('http://') || url.start_with?('https://')
  end
  end
  end
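
With the class << self block gone, these are plain class methods that can be called directly. A short sketch of how they compose (the resolved value depends on the url_resolver gem, so treat the output as illustrative):

    require 'wayback_archiver'

    resolved = WaybackArchiver::Request.resolve_url('example.com') # prepends http:// unless a protocol is present
    response = WaybackArchiver::Request.response(resolved, false)  # false skips re-resolving the URL
    puts "#{response.code} #{response.message}"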
lib/wayback_archiver/url_collector.rb CHANGED
@@ -1,40 +1,51 @@
+ require 'spidr'
+ require 'robots'
+
  module WaybackArchiver
  # Retrieve URLs from different sources
  class UrlCollector
- class << self
- # Retrieve URLs from Sitemap.
- # @return [Array] of URLs defined in Sitemap.
- # @param [String] url domain to retrieve Sitemap from.
- # @example Get URLs defined in Sitemap for google.com
- # UrlCollector.sitemap('https://google.com')
- def sitemap(url)
- resolved = Request.resolve_url("#{url}/sitemap.xml")
- sitemap = Request.document(resolved)
- sitemap.css('loc').map { |element| element.text }
- end
+ # Retrieve URLs from Sitemap.
+ # @return [Array] of URLs defined in Sitemap.
+ # @param [String] url domain to retrieve Sitemap from.
+ # @example Get URLs defined in Sitemap for google.com
+ # UrlCollector.sitemap('https://google.com')
+ def self.sitemap(url)
+ resolved = Request.resolve_url("#{url}/sitemap.xml")
+ sitemap = Request.document(resolved)
+ sitemap.css('loc').map(&:text)
+ end
 
- # Retrieve URLs by crawling.
- # @return [Array] of URLs defined found during crawl.
- # @param [String] url domain to crawl URLs from.
- # @example Crawl URLs defined on example.com
- # UrlCollector.crawl('http://example.com')
- def crawl(url)
- SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
+ # Retrieve URLs by crawling.
+ # @return [Array] of URLs found during crawl.
+ # @param [String] url domain to crawl URLs from.
+ # @example Crawl URLs defined on example.com
+ # UrlCollector.crawl('http://example.com')
+ def self.crawl(url)
+ urls = []
+ resolved_url = Request.resolve_url(url)
+ Spidr.site(resolved_url, robots: true) do |spider|
+ spider.every_html_page do |page|
+ page_url = page.url.to_s
+ urls << page_url
+ puts "Found: #{page_url}"
+ yield(page_url) if block_given?
+ end
  end
+ urls
+ end
 
- # Retrieve URLs listed in file.
- # @return [Array] of URLs defined in file.
- # @param [String] path to get URLs from.
- # @example Get URLs defined in /path/to/file
- # UrlCollector.file('/path/to/file')
- def file(path)
- raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
- urls = []
- File.open(path).read
+ # Retrieve URLs listed in file.
+ # @return [Array] of URLs defined in file.
+ # @param [String] path to get URLs from.
+ # @example Get URLs defined in /path/to/file
+ # UrlCollector.file('/path/to/file')
+ def self.file(path)
+ raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
+ urls = []
+ File.open(path).read
  .gsub(/\r\n?/, "\n")
- .each_line { |line| urls << line.gsub(/\n/, '').strip }
- urls.reject(&:empty?)
- end
+ .each_line { |line| urls << line.delete("\n").strip }
+ urls.reject(&:empty?)
  end
  end
  end
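
Since crawl now both accumulates URLs and yields each one as Spidr finds it, callers can use either style; a sketch with an illustrative domain:

    require 'wayback_archiver'

    # Collect everything, then work with the array
    urls = WaybackArchiver::UrlCollector.crawl('example.com')

    # Or stream each URL as the crawler discovers it
    WaybackArchiver::UrlCollector.crawl('example.com') { |url| puts url }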
lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
  # Gem version
- VERSION = '0.0.11'
+ VERSION = '0.0.12'.freeze
  end
metadata CHANGED
@@ -1,29 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
- version: 0.0.11
+ version: 0.0.12
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-04-02 00:00:00.000000000 Z
+ date: 2017-02-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
- name: site_mapper
+ name: spidr
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 0.6.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 0.6.0
+ - !ruby/object:Gem::Dependency
+ name: robots
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.1'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.1'
  - !ruby/object:Gem::Dependency
  name: url_resolver
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,20 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: '3.2'
+ - !ruby/object:Gem::Dependency
+ name: byebug
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">"
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">"
+ - !ruby/object:Gem::Version
+ version: '0'
  description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
  email:
  - burenstam@gmail.com
@@ -133,6 +161,7 @@ files:
  - bin/wayback_archiver
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/archive.rb
+ - lib/wayback_archiver/process_queue.rb
  - lib/wayback_archiver/request.rb
  - lib/wayback_archiver/url_collector.rb
  - lib/wayback_archiver/version.rb
@@ -156,9 +185,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.5.2
  signing_key:
  specification_version: 4
  summary: Send URLs to Wayback Machine
  test_files: []
- has_rdoc: