wayback_archiver 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
-   data.tar.gz: 629963ef30283123820418a714ae73160fa20293
+   metadata.gz: fd01b7c39f7432bba6bb0abf29cff5cced700e90
+   data.tar.gz: 887582fb5f46f8e42a126b09d7c31b3d005bb1cf
  SHA512:
-   metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
-   data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
+   metadata.gz: 4f66455e9ddfdd2cb191910bacfe784a3822e31b0e7ef7bce395fd60614887c9c700b262e958b6acfff9fa59941068c023a0729d09cf00116f44b359c01b2213
+   data.tar.gz: 8c8f98adfeb91808296d8c1bc7e01d219918074b5680d2afccc22ac00cb02b6d6ff18b29129d3650fa660238b023e8948a851aeb7ce36b42c34b9be7bb913386
lib/wayback_archiver.rb CHANGED
@@ -1,5 +1,3 @@
- require 'site_mapper'
-
  require 'uri'
  require 'net/http'

@@ -7,13 +5,14 @@ require 'wayback_archiver/version'
  require 'wayback_archiver/url_collector'
  require 'wayback_archiver/archive'
  require 'wayback_archiver/request'
+ require 'wayback_archiver/process_queue'

  # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
  module WaybackArchiver
    # Link to gem on rubygems.org, part of the sent User-Agent
-   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
    # WaybackArchiver User-Agent
-   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
+   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze

    # Send URLs to Wayback Machine.
    # @return [Array] with URLs sent to the Wayback Machine.
@@ -29,7 +28,7 @@ module WaybackArchiver
    def self.archive(source, type = :crawl)
      case type.to_s
      when 'file' then Archive.post(UrlCollector.file(source))
-     when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
+     when 'crawl' then Archive.post(UrlCollector.crawl(source))
      when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
      when 'url' then Archive.post_url(Request.resolve_url(source))
      else
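
In 0.0.12 the :crawl branch of the dispatcher above collects the crawled URLs first and submits them in bulk through Archive.post, instead of posting each URL from inside the crawl block. A minimal usage sketch of the public entry point; the domain and file path are illustrative:

  require 'wayback_archiver'

  # Crawl a site and submit every discovered URL to the Wayback Machine.
  WaybackArchiver.archive('example.com', :crawl)

  # The other source types handled by the same case statement:
  WaybackArchiver.archive('https://example.com', :sitemap)  # reads /sitemap.xml
  WaybackArchiver.archive('/path/to/urls.txt', :file)       # one URL per line
  WaybackArchiver.archive('http://example.com/page', :url)  # a single URL
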
lib/wayback_archiver/archive.rb CHANGED
@@ -2,46 +2,44 @@ module WaybackArchiver
    # Post URL(s) to Wayback Machine
    class Archive
      # Wayback Machine base URL.
-     WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+     WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
      # Default concurrency for archiving URLs
      DEFAULT_CONCURRENCY = 10
-     class << self
-       # Send URLs to Wayback Machine.
-       # @return [Array] with sent URLs.
-       # @param [Array] urls URLs to send.
-       # @param [Hash] options
-       # @example Archive example.com, with default options
-       # Archive.post(['http://example.com'])
-       # @example Archive example.com, using only 1 thread
-       # Archive.post(['http://example.com'], concurrency: 1)
-       def post(urls, options = {})
-         options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
-         concurrency = options[:concurrency]
-         puts "Request are sent with up to #{concurrency} parallel threads"
-         puts "Total urls to be sent: #{urls.length}"
-         group_size = (urls.length / concurrency) + 1
-         urls.each_slice(group_size).to_a.map! do |archive_urls|
-           Thread.new { archive_urls.each { |url| post_url(url) } }
-         end.each(&:join)
-         puts "#{urls.length} URLs sent to Internet archive"
-         urls
-       end
+     # Send URLs to Wayback Machine.
+     # @return [Array] with sent URLs.
+     # @param [Array] urls URLs to send.
+     # @param [Hash] options
+     # @example Archive example.com, with default options
+     # Archive.post(['http://example.com'])
+     # @example Archive example.com, using only 1 thread
+     # Archive.post(['http://example.com'], concurrency: 1)
+     def self.post(urls, options = {})
+       options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
+       concurrency = options[:concurrency]

-       # Send URL to Wayback Machine.
-       # @return [String] the sent URL.
-       # @param [String] url to send.
-       # @example Archive example.com, with default options
-       # Archive.post_url('http://example.com')
-       def post_url(url)
-         resolved_url = Request.resolve_url(url)
-         request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
-         response = Request.response(request_url)
-         puts "[#{response.code}, #{response.message}] #{resolved_url}"
-         resolved_url
-       rescue Exception => e
-         puts "Error message: #{e.message}"
-         puts "Failed to archive: #{resolved_url}"
-       end
+       puts "=== WAYBACK ARCHIVER ==="
+       puts "Request are sent with up to #{concurrency} parallel threads"
+       puts "Total urls to be sent: #{urls.length}"
+
+       ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
+
+       puts "#{urls.length} URLs sent to Internet archive"
+       urls
+     end
+
+     # Send URL to Wayback Machine.
+     # @return [String] the sent URL.
+     # @param [String] url to send.
+     # @example Archive example.com, with default options
+     # Archive.post_url('http://example.com')
+     def self.post_url(url)
+       request_url = "#{WAYBACK_BASE_URL}#{url}"
+       response = Request.response(request_url)
+       puts "[#{response.code}, #{response.message}] #{url}"
+       url
+     rescue Exception => e
+       puts "Error message: #{e.message}"
+       puts "Failed to archive: #{url}"
      end
    end
  end
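
A short sketch of calling the reworked Archive class methods above; the URL list and concurrency value are illustrative:

  urls = ['http://example.com', 'http://example.com/about']

  # Sends the URLs through ProcessQueue with up to 5 worker threads.
  WaybackArchiver::Archive.post(urls, concurrency: 5)

  # Or submit a single URL; it is appended to WAYBACK_BASE_URL and requested.
  WaybackArchiver::Archive.post_url('http://example.com')
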
lib/wayback_archiver/process_queue.rb ADDED
@@ -0,0 +1,28 @@
+ require 'thread'
+
+ module WaybackArchiver
+   class ProcessQueue
+     # Process enumerable data in parallel.
+     # @return [Array] of URLs defined found during crawl.
+     # @param [Object] Enumberable object
+     # @example Print list of names in parallel
+     # ProcessQueue.process(%w(jacob peter eva)) { |v| puts n }
+     # @example Print list of names using 2 threads
+     # ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |v| puts n }
+     def self.process(data_array, threads_count: 5)
+       queue = Queue.new
+       data_array.each { |data| queue.push(data) }
+       workers = threads_count.times.map do
+         Thread.new do
+           begin
+             while data = queue.pop(true)
+               yield(data)
+             end
+           rescue ThreadError
+           end
+         end
+       end
+       workers.map(&:join)
+     end
+   end
+ end
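
The new ProcessQueue replaces the hand-rolled each_slice/Thread logic that used to live in Archive.post: every item is pushed onto a thread-safe Queue, and a fixed pool of workers drains it with the non-blocking queue.pop(true), which raises ThreadError once the queue is empty and lets each worker exit. An illustrative call with a made-up URL list:

  urls = %w(http://example.com http://example.com/about http://example.com/contact)

  # Each URL is yielded to the block from one of two worker threads.
  WaybackArchiver::ProcessQueue.process(urls, threads_count: 2) do |url|
    puts "processing #{url}"
  end
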
lib/wayback_archiver/request.rb CHANGED
@@ -3,64 +3,60 @@ require 'url_resolver' # TODO: Allow users to use any resolver
  module WaybackArchiver
    # Request and parse HTML & XML documents
    class Request
-     class << self
-       # Get and parse HTML & XML documents.
-       # @return [Array] with links sent to the Wayback Machine.
-       # @param [String] url to retrieve and parse.
-       # @example Request and parse example.com
-       # Request.document('example.com')
-       # @example Request and parse google.com/sitemap.xml
-       # Request.document('google.com/sitemap.xml')
-       def document(url)
-         response_body = Request.response(url).body
-         Nokogiri::HTML(response_body)
-       end
-
-       # Get reponse.
-       # @return [Net::HTTP*] the http response.
-       # @param [String] url URL to retrieve.
-       # @param [Boolean] resolve whether to resolve the URL.
-       # @example Resolve example.com and request
-       # Request.response('example.com', true)
-       # @example Request http://example.com
-       # Request.response('http://example.com', false)
-       def response(url, resolve = true)
-         resolved_url = resolve ? resolve_url(url) : url
-         uri = URI.parse(resolved_url)
-         http = Net::HTTP.new(uri.host, uri.port)
-         http.use_ssl = true if resolved_url.start_with?('https://')
+     # Get and parse HTML & XML documents.
+     # @return [Array] with links sent to the Wayback Machine.
+     # @param [String] url to retrieve and parse.
+     # @example Request and parse example.com
+     # Request.document('example.com')
+     # @example Request and parse google.com/sitemap.xml
+     # Request.document('google.com/sitemap.xml')
+     def self.document(url)
+       response_body = Request.response(url).body
+       Nokogiri::HTML(response_body)
+     end

-       request = Net::HTTP::Get.new(uri.request_uri)
-       request['User-Agent'] = WaybackArchiver::USER_AGENT
-       http.request(request)
-     end
+     # Get reponse.
+     # @return [Net::HTTP*] the http response.
+     # @param [String] url URL to retrieve.
+     # @param [Boolean] resolve whether to resolve the URL.
+     # @example Resolve example.com and request
+     # Request.response('example.com', true)
+     # @example Request http://example.com
+     # Request.response('http://example.com', false)
+     def self.response(url, resolve = true)
+       resolved_url = resolve ? resolve_url(url) : url
+       uri = URI.parse(resolved_url)
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = true if resolved_url.start_with?('https://')

-       # Resolve the URL, follows redirects.
-       # @return [String] the resolved URL.
-       # @param [String] url to retrieve.
-       # @example Resolve example.com and request
-       # Request.resolve_url('example.com')
-       def resolve_url(url)
-         resolved = UrlResolver.resolve(url)
-         resolved = resolved.prepend('http://') unless has_protocol?(resolved)
-         resolved
-       end
+       request = Net::HTTP::Get.new(uri.request_uri)
+       request['User-Agent'] = WaybackArchiver::USER_AGENT
+       http.request(request)
+     end

-       private
+     # Resolve the URL, follows redirects.
+     # @return [String] the resolved URL.
+     # @param [String] url to retrieve.
+     # @example Resolve example.com and request
+     # Request.resolve_url('example.com')
+     def self.resolve_url(url)
+       resolved = UrlResolver.resolve(url)
+       resolved = resolved.prepend('http://') unless protocol?(resolved)
+       resolved
+     end

-       # Resolve the URL, follows redirects.
-       # @return [Boolean] true if string includes protocol.
-       # @param [String] url to check.
-       # @example Check if string includes protocol
-       # Request.has_protocol?('example.com')
-       # # => false
-       # Request.has_protocol?('https://example.com')
-       # # => true
-       # Request.has_protocol?('http://example.com')
-       # # => true
-       def has_protocol?(url)
-         url.start_with?('http://') || url.start_with?('https://')
-       end
+     # Resolve the URL, follows redirects.
+     # @return [Boolean] true if string includes protocol.
+     # @param [String] url to check.
+     # @example Check if string includes protocol
+     # Request.protocol?('example.com')
+     # # => false
+     # Request.protocol?('https://example.com')
+     # # => true
+     # Request.protocol?('http://example.com')
+     # # => true
+     def self.protocol?(url)
+       url.start_with?('http://') || url.start_with?('https://')
      end
    end
  end
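
For reference, illustrative calls to the Request helpers above; the domain is an example and the exact string returned by resolve_url depends on the url_resolver gem:

  # Follows redirects and prepends 'http://' when no protocol is present.
  WaybackArchiver::Request.resolve_url('example.com')

  # Plain GET using the gem's User-Agent; pass false to skip resolving.
  response = WaybackArchiver::Request.response('http://example.com', false)
  puts response.code

  # Fetch a page (or sitemap) and parse it into a Nokogiri document.
  WaybackArchiver::Request.document('example.com')
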
lib/wayback_archiver/url_collector.rb CHANGED
@@ -1,40 +1,51 @@
+ require 'spidr'
+ require 'robots'
+
  module WaybackArchiver
    # Retrive URLs from different sources
    class UrlCollector
-     class << self
-       # Retrieve URLs from Sitemap.
-       # @return [Array] of URLs defined in Sitemap.
-       # @param [String] url domain to retrieve Sitemap from.
-       # @example Get URLs defined in Sitemap for google.com
-       # UrlCollector.sitemap('https://google.com')
-       def sitemap(url)
-         resolved = Request.resolve_url("#{url}/sitemap.xml")
-         sitemap = Request.document(resolved)
-         sitemap.css('loc').map { |element| element.text }
-       end
+     # Retrieve URLs from Sitemap.
+     # @return [Array] of URLs defined in Sitemap.
+     # @param [String] url domain to retrieve Sitemap from.
+     # @example Get URLs defined in Sitemap for google.com
+     # UrlCollector.sitemap('https://google.com')
+     def self.sitemap(url)
+       resolved = Request.resolve_url("#{url}/sitemap.xml")
+       sitemap = Request.document(resolved)
+       sitemap.css('loc').map(&:text)
+     end

-       # Retrieve URLs by crawling.
-       # @return [Array] of URLs defined found during crawl.
-       # @param [String] url domain to crawl URLs from.
-       # @example Crawl URLs defined on example.com
-       # UrlCollector.crawl('http://example.com')
-       def crawl(url)
-         SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
+     # Retrieve URLs by crawling.
+     # @return [Array] of URLs defined found during crawl.
+     # @param [String] url domain to crawl URLs from.
+     # @example Crawl URLs defined on example.com
+     # UrlCollector.crawl('http://example.com')
+     def self.crawl(url)
+       urls = []
+       resolved_url = Request.resolve_url(url)
+       Spidr.site(resolved_url, robots: true) do |spider|
+         spider.every_html_page do |page|
+           page_url = page.url.to_s
+           urls << page_url
+           puts "Found: #{page_url}"
+           yield(page_url) if block_given?
+         end
        end
+       urls
+     end

-       # Retrieve URLs listed in file.
-       # @return [Array] of URLs defined in file.
-       # @param [String] path to get URLs from.
-       # @example Get URLs defined in /path/to/file
-       # UrlCollector.file('/path/to/file')
-       def file(path)
-         raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-         urls = []
-         File.open(path).read
+     # Retrieve URLs listed in file.
+     # @return [Array] of URLs defined in file.
+     # @param [String] path to get URLs from.
+     # @example Get URLs defined in /path/to/file
+     # UrlCollector.file('/path/to/file')
+     def self.file(path)
+       raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
+       urls = []
+       File.open(path).read
            .gsub(/\r\n?/, "\n")
-           .each_line { |line| urls << line.gsub(/\n/, '').strip }
-         urls.reject(&:empty?)
-       end
+           .each_line { |line| urls << line.delete("\n").strip }
+       urls.reject(&:empty?)
      end
    end
  end
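
A usage sketch for the Spidr-based collector above: the crawl obeys robots.txt (robots: true, backed by the new robots dependency), returns the collected URLs, and also yields each page URL as it is found when a block is given. Domain and file path are illustrative:

  urls = WaybackArchiver::UrlCollector.crawl('example.com') do |url|
    puts "found #{url}"
  end
  puts urls.length

  # The other collectors:
  WaybackArchiver::UrlCollector.sitemap('https://example.com')  # <loc> entries from /sitemap.xml
  WaybackArchiver::UrlCollector.file('/path/to/urls.txt')       # non-empty lines from a file
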
lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
    # Gem version
-   VERSION = '0.0.11'
+   VERSION = '0.0.12'.freeze
  end
metadata CHANGED
@@ -1,29 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
-   version: 0.0.11
+   version: 0.0.12
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-04-02 00:00:00.000000000 Z
+ date: 2017-02-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
-   name: site_mapper
+   name: spidr
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 0.6.0
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 0.6.0
+ - !ruby/object:Gem::Dependency
+   name: robots
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.1'
  - !ruby/object:Gem::Dependency
    name: url_resolver
    requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,20 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '3.2'
+ - !ruby/object:Gem::Dependency
+   name: byebug
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">"
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">"
+       - !ruby/object:Gem::Version
+         version: '0'
  description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
  email:
  - burenstam@gmail.com
@@ -133,6 +161,7 @@ files:
  - bin/wayback_archiver
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/archive.rb
+ - lib/wayback_archiver/process_queue.rb
  - lib/wayback_archiver/request.rb
  - lib/wayback_archiver/url_collector.rb
  - lib/wayback_archiver/version.rb
@@ -156,9 +185,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.5.2
  signing_key:
  specification_version: 4
  summary: Send URLs to Wayback Machine
  test_files: []
- has_rdoc: