wayback_archiver 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ddbffea2e55297390c66201d287b85fb6336d864
-  data.tar.gz: b419745edba1f8dcf9d6e83ce5b74cd70c9abd0f
+  metadata.gz: b64e289cd025ebddc9af5472a3c5038af4e98535
+  data.tar.gz: 903a6e3f3bfb2f6ab81db471b8633b27f5f8c419
 SHA512:
-  metadata.gz: 5bb23d2bab242cc55d1a9e851e5fd719431371f2149a2640ce34ede4be817f881ace982d39cff04691d435b140bbc54419bf90affb28d0621261cb9ee7d34a69
-  data.tar.gz: 91c6651a5cbeb1333a9f24ab3596ee0b284c54e4aa7c375b158bf6c3bbb54892a6c78c930a1942a3dcc9f4e0ab937c60a08e3f60a8339f0e9342cea6f0959c5f
+  metadata.gz: cc1853305301920afbcdd7a79f592cd6863f27fdb57c22d634424346526441d8ddd788bee8b7e23f938f334fbbd7a2d162b397041ddc6fe0f1193d816d4fac59
+  data.tar.gz: e64370ece062319402c773464493fb767148712e115ae3ce5aebf3b66361dae1f94177a9d50b6aeaf0770d8ddcb86d3e7b779c170bfc6294633fab4283e711c4
lib/wayback_archiver/archive.rb CHANGED
@@ -1,31 +1,27 @@
 module WaybackArchiver
   class Archive
-    MAX_THREAD_COUNT = 8
+    WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+    MAX_THREAD_COUNT = 10
 
-    def self.post(all_urls)
+    def self.post(urls)
       puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
-
-      puts "Total urls to be sent: #{all_urls.length}"
-      threads = []
-      group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
-      all_urls.each_slice(group_size).to_a.each do |urls|
-        threads << Thread.new do
-          urls.each_with_index do |url, index|
-            resolved_url = Request.resolve_url(url)
-            request_url = "#{BASE_URL}#{resolved_url}"
-            begin
-              res = Request.get_response(request_url)
-              puts "[#{res.code}, #{res.message}] #{resolved_url}"
-            rescue Exception => e
-              puts "Error message: #{e.message}"
-              puts "Failed to archive: #{resolved_url}"
-            end
-          end
-        end
-      end
-      threads.each(&:join)
-      puts "#{all_urls.length} URLs sent to Internet archive"
-      all_urls
+      puts "Total urls to be sent: #{urls.length}"
+      group_size = (urls.length / MAX_THREAD_COUNT) + 1
+      urls.each_slice(group_size).to_a.map do |archive_urls|
+        Thread.new { archive_urls.each { |url| post_url(url) } }
+      end.each(&:join)
+      puts "#{urls.length} URLs sent to Internet archive"
+      urls
+    end
+
+    def self.post_url(archive_url)
+      resolved_url = Request.resolve_url(archive_url)
+      request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
+      response = Request.get_response(request_url)
+      puts "[#{response.code}, #{response.message}] #{resolved_url}"
+    rescue Exception => e
+      puts "Error message: #{e.message}"
+      puts "Failed to archive: #{resolved_url}"
     end
   end
 end
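
For orientation, a minimal usage sketch of the refactored Archive API above, assuming the 0.0.8 gem is installed and the example URLs are placeholders:

    require 'wayback_archiver'

    urls = ['http://example.com', 'http://example.com/about']

    # Splits the list across up to MAX_THREAD_COUNT threads and submits each
    # URL via the new Archive.post_url helper; returns the original list.
    WaybackArchiver::Archive.post(urls)

    # A single URL can also be submitted directly.
    WaybackArchiver::Archive.post_url('http://example.com/contact')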
lib/wayback_archiver/collector.rb CHANGED
@@ -2,12 +2,9 @@ module WaybackArchiver
   class Collector
     class << self
      def urls_from_sitemap(url)
-        urls = []
-        xml_data = Request.get_response(Request.resolve_url(url)).body
-        document = REXML::Document.new(xml_data)
-
-        document.elements.each('urlset/url/loc') { |element| urls << element.text }
-        urls
+        resolved = Request.resolve_url(url)
+        sitemap = Request.get_page(resolved)
+        sitemap.css('loc').map { |element| element.text }
       end
 
       def urls_from_crawl(url)
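
A rough sketch of what the Nokogiri-based extraction above does; the sample sitemap below is made up, and Request.get_page parses the response as HTML by default:

    require 'nokogiri'

    sitemap = <<-XML
    <?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>http://example.com/</loc></url>
      <url><loc>http://example.com/about</loc></url>
    </urlset>
    XML

    # Mirrors sitemap.css('loc').map { |element| element.text } in the new code.
    puts Nokogiri::HTML(sitemap).css('loc').map(&:text)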
lib/wayback_archiver/crawl_url.rb CHANGED
@@ -4,7 +4,8 @@ module WaybackArchiver
 
     def initialize(base_url)
       @resolved_base_url = Request.resolve_url(base_url)
-      @base_hostname = URI.parse(@resolved_base_url).host
+      @base_hostname = URI.parse(@resolved_base_url).hostname
+      @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
     end
 
     def absolute_url_from(raw_url, get_url)
@@ -12,7 +13,7 @@ module WaybackArchiver
       parsed_url = URI.parse(raw_url) rescue URI.parse('')
       if parsed_url.relative?
         url_from_relative(raw_url, get_url)
-      elsif base_hostname.eql?(parsed_url.hostname)
+      elsif same_domain?(raw_url, @resolved_base_url)
         raw_url
       else
         nil
@@ -65,5 +66,9 @@ module WaybackArchiver
       dont_end.each { |pattern| return false if href.end_with?(pattern) }
       true
     end
+
+    def same_domain?(first, second)
+      first.include?(second)
+    end
   end
 end
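
The new same_domain? helper is a plain substring check; roughly, with made-up URLs:

    base = 'http://example.com'

    puts 'http://example.com/blog'.include?(base)     # => true, link is followed
    puts 'http://other-site.org/page'.include?(base)  # => false, link is skipped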
lib/wayback_archiver/crawler.rb CHANGED
@@ -1,19 +1,19 @@
 require 'set'
-require 'nokogiri'
+require 'nokogiri'
 
 module WaybackArchiver
   class Crawler
     CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     HEADERS_HASH = {
-      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+      'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
     }
 
-    def initialize(url, resolve: false)
+    def initialize(url, resolve = false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = Set.new
-      @procesed = Set.new
+      @processed = Set.new
       @fetch_queue << @crawl_url.resolved_base_url
     end
 
@@ -21,14 +21,14 @@ module WaybackArchiver
       new(base_url).collect_urls
     end
 
-    def collect_urls
+    def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
         page_links(url)
       end
-      puts "Crawling finished, #{@procesed.length} links found"
-      @procesed.to_a
+      puts "Crawling finished, #{@processed.length} links found"
+      @processed.to_a
     rescue Interrupt, IRB::Abort
       puts 'Crawl interrupted.'
       @fetch_queue.to_a
@@ -38,21 +38,17 @@ module WaybackArchiver
 
     def page_links(get_url)
       puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
-      link_elements = get_page(get_url).css('a') rescue []
-      @procesed << get_url
+      link_elements = Request.get_page(get_url).css('a') rescue []
+      @processed << get_url
       link_elements.each do |page_link|
         absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
         if absolute_url
           resolved_url = resolve(absolute_url)
-          @fetch_queue << resolved_url if !@procesed.include?(resolved_url)
+          @fetch_queue << resolved_url if !@processed.include?(resolved_url)
         end
       end
     end
 
-    def get_page(url)
-      Nokogiri::HTML(Request.get_response(url).body)
-    end
-
     def resolve(url)
       @options[:resolve] ? Request.resolve_url(url) : url
     end
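
A minimal sketch of driving the crawler after this change, assuming the 0.0.8 gem is installed; note that resolve is now a positional argument rather than a keyword argument:

    require 'wayback_archiver'

    # Crawl a site and return every internal URL that was found.
    urls = WaybackArchiver::Crawler.collect_urls('http://example.com')

    # Or build a crawler directly, resolving each discovered URL (positional flag).
    crawler = WaybackArchiver::Crawler.new('http://example.com', true)
    urls = crawler.collect_urls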
lib/wayback_archiver/request.rb CHANGED
@@ -5,19 +5,31 @@ module WaybackArchiver
     INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
 
-    def self.get_response(url, resolve: false)
-      resolved_url = resolve ? resolve_url(url) : url
-      uri = URI.parse(resolved_url)
-      http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = true if resolved_url.include?('https://')
+    class << self
+      def get_page(url, document_type = :html)
+        response = Request.get_response(url).body
+        case document_type
+        when :xml
+          Nokogiri::XML(response)
+        else
+          Nokogiri::HTML(response)
+        end
+      end
 
-      request = Net::HTTP::Get.new(uri.request_uri)
-      request['User-Agent'] = USER_AGENT
-      http.request(request)
-    end
+      def get_response(url, resolve = false)
+        resolved_url = resolve ? resolve_url(url) : url
+        uri = URI.parse(resolved_url)
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = true if resolved_url.include?('https://')
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = USER_AGENT
+        http.request(request)
+      end
 
-    def self.resolve_url(url)
-      UrlResolver.resolve(url)
+      def resolve_url(url)
+        UrlResolver.resolve(url)
+      end
     end
   end
 end
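
A short sketch of the new Request.get_page helper, which fetches a URL and returns a parsed Nokogiri document (HTML by default, XML when requested); the URLs are placeholders:

    require 'wayback_archiver'

    # Parsed as HTML by default.
    page = WaybackArchiver::Request.get_page('http://example.com')
    puts page.css('a').length

    # Pass :xml to get a Nokogiri::XML document instead.
    sitemap = WaybackArchiver::Request.get_page('http://example.com/sitemap.xml', :xml)
    puts sitemap.root.name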
lib/wayback_archiver/version.rb CHANGED
@@ -1,3 +1,3 @@
 module WaybackArchiver
-  VERSION = '0.0.7'
+  VERSION = '0.0.8'
 end
lib/wayback_archiver.rb CHANGED
@@ -1,6 +1,5 @@
 require 'uri'
 require 'net/http'
-require 'rexml/document'
 
 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
@@ -9,18 +8,12 @@ require 'wayback_archiver/crawler'
 require 'wayback_archiver/crawl_url'
 
 module WaybackArchiver
-  BASE_URL = 'https://web.archive.org/save/'
-
-  def self.archive(source, from = :sitemap)
+  def self.archive(source, from = :crawl)
     urls = case from.to_s
-           when 'sitemap'
-             Collector.urls_from_sitemap("#{source}/sitemap.xml")
-           when 'url'
-             [Request.resolve_url(source)]
-           when 'file'
-             Collector.urls_from_file(source)
-           when 'crawl', 'crawler'
-             Collector.urls_from_crawl(source)
+           when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
+           when 'url' then [Request.resolve_url(source)]
+           when 'file' then Collector.urls_from_file(source)
+           when 'crawl' then Collector.urls_from_crawl(source)
            else
              raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
            end
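
A minimal sketch of the top-level entry point after this release, where the default source type is now :crawl (example.com stands in for a real site):

    require 'wayback_archiver'

    # Crawl the site and submit every discovered URL to the Wayback Machine.
    WaybackArchiver.archive('example.com')

    # The other source types are still selected explicitly.
    WaybackArchiver.archive('example.com', :sitemap)          # fetches example.com/sitemap.xml
    WaybackArchiver.archive('http://example.com/page', :url)  # archive a single URL
    WaybackArchiver.archive('urls.txt', :file)                # read URLs from a local file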
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-20 00:00:00.000000000 Z
+date: 2014-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri