wayback_archiver 0.0.7 → 0.0.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ddbffea2e55297390c66201d287b85fb6336d864
-  data.tar.gz: b419745edba1f8dcf9d6e83ce5b74cd70c9abd0f
+  metadata.gz: b64e289cd025ebddc9af5472a3c5038af4e98535
+  data.tar.gz: 903a6e3f3bfb2f6ab81db471b8633b27f5f8c419
 SHA512:
-  metadata.gz: 5bb23d2bab242cc55d1a9e851e5fd719431371f2149a2640ce34ede4be817f881ace982d39cff04691d435b140bbc54419bf90affb28d0621261cb9ee7d34a69
-  data.tar.gz: 91c6651a5cbeb1333a9f24ab3596ee0b284c54e4aa7c375b158bf6c3bbb54892a6c78c930a1942a3dcc9f4e0ab937c60a08e3f60a8339f0e9342cea6f0959c5f
+  metadata.gz: cc1853305301920afbcdd7a79f592cd6863f27fdb57c22d634424346526441d8ddd788bee8b7e23f938f334fbbd7a2d162b397041ddc6fe0f1193d816d4fac59
+  data.tar.gz: e64370ece062319402c773464493fb767148712e115ae3ce5aebf3b66361dae1f94177a9d50b6aeaf0770d8ddcb86d3e7b779c170bfc6294633fab4283e711c4
lib/wayback_archiver/archive.rb CHANGED
@@ -1,31 +1,27 @@
 module WaybackArchiver
   class Archive
-    MAX_THREAD_COUNT = 8
+    WAYBACK_BASE_URL = 'https://web.archive.org/save/'
+    MAX_THREAD_COUNT = 10

-    def self.post(all_urls)
+    def self.post(urls)
       puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
-
-      puts "Total urls to be sent: #{all_urls.length}"
-      threads = []
-      group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
-      all_urls.each_slice(group_size).to_a.each do |urls|
-        threads << Thread.new do
-          urls.each_with_index do |url, index|
-            resolved_url = Request.resolve_url(url)
-            request_url = "#{BASE_URL}#{resolved_url}"
-            begin
-              res = Request.get_response(request_url)
-              puts "[#{res.code}, #{res.message}] #{resolved_url}"
-            rescue Exception => e
-              puts "Error message: #{e.message}"
-              puts "Failed to archive: #{resolved_url}"
-            end
-          end
-        end
-      end
-      threads.each(&:join)
-      puts "#{all_urls.length} URLs sent to Internet archive"
-      all_urls
+      puts "Total urls to be sent: #{urls.length}"
+      group_size = (urls.length / MAX_THREAD_COUNT) + 1
+      urls.each_slice(group_size).to_a.map do |archive_urls|
+        Thread.new { archive_urls.each { |url| post_url(url) } }
+      end.each(&:join)
+      puts "#{urls.length} URLs sent to Internet archive"
+      urls
+    end
+
+    def self.post_url(archive_url)
+      resolved_url = Request.resolve_url(archive_url)
+      request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
+      response = Request.get_response(request_url)
+      puts "[#{response.code}, #{response.message}] #{resolved_url}"
+    rescue Exception => e
+      puts "Error message: #{e.message}"
+      puts "Failed to archive: #{resolved_url}"
     end
   end
 end
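The posting logic is now split between Archive.post (threaded batches) and Archive.post_url (a single submission), with the save endpoint moved from the top-level BASE_URL constant into Archive::WAYBACK_BASE_URL. A minimal usage sketch, where the example URLs are placeholders:

require 'wayback_archiver'

# Submit one URL to the Wayback Machine save endpoint
WaybackArchiver::Archive.post_url('http://example.com/about')

# Submit a batch; URLs are sliced into groups and posted from parallel threads
WaybackArchiver::Archive.post(['http://example.com/', 'http://example.com/blog'])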
lib/wayback_archiver/collector.rb CHANGED
@@ -2,12 +2,9 @@ module WaybackArchiver
   class Collector
     class << self
       def urls_from_sitemap(url)
-        urls = []
-        xml_data = Request.get_response(Request.resolve_url(url)).body
-        document = REXML::Document.new(xml_data)
-
-        document.elements.each('urlset/url/loc') { |element| urls << element.text }
-        urls
+        resolved = Request.resolve_url(url)
+        sitemap = Request.get_page(resolved)
+        sitemap.css('loc').map { |element| element.text }
       end

       def urls_from_crawl(url)
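urls_from_sitemap now fetches the sitemap through the new Request.get_page and maps the <loc> elements to their text with Nokogiri, replacing the REXML-based version. A short call sketch, with the sitemap URL as a placeholder:

# Returns an array of page URLs listed in the sitemap's <loc> elements
urls = WaybackArchiver::Collector.urls_from_sitemap('http://example.com/sitemap.xml')
puts "Found #{urls.length} URLs"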
lib/wayback_archiver/crawl_url.rb CHANGED
@@ -4,7 +4,8 @@ module WaybackArchiver

     def initialize(base_url)
       @resolved_base_url = Request.resolve_url(base_url)
-      @base_hostname = URI.parse(@resolved_base_url).host
+      @base_hostname = URI.parse(@resolved_base_url).hostname
+      @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
     end

     def absolute_url_from(raw_url, get_url)
@@ -12,7 +13,7 @@ module WaybackArchiver
       parsed_url = URI.parse(raw_url) rescue URI.parse('')
       if parsed_url.relative?
         url_from_relative(raw_url, get_url)
-      elsif base_hostname.eql?(parsed_url.hostname)
+      elsif same_domain?(raw_url, @resolved_base_url)
         raw_url
       else
         nil
@@ -65,5 +66,9 @@ module WaybackArchiver
       dont_end.each { |pattern| return false if href.end_with?(pattern) }
       true
     end
+
+    def same_domain?(first, second)
+      first.include?(second)
+    end
   end
 end
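Note that the new same_domain? helper is a plain substring test (first.include?(second)) rather than the previous hostname comparison, so any URL string that contains the resolved base URL counts as on-domain. Illustrating just that semantics in isolation, with placeholder URLs:

base = 'http://example.com'
'http://example.com/about'.include?(base)                  # => true  (kept by the crawler)
'http://other.org/?ref=http://example.com'.include?(base)  # => true  (also kept, since it is only a substring check)
'http://other.org/page'.include?(base)                      # => false (skipped)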
lib/wayback_archiver/crawler.rb CHANGED
@@ -1,19 +1,19 @@
 require 'set'
-require 'nokogiri'
+require 'nokogiri'

 module WaybackArchiver
   class Crawler
     CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     HEADERS_HASH = {
-      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+      'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
     }

-    def initialize(url, resolve: false)
+    def initialize(url, resolve = false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = Set.new
-      @procesed = Set.new
+      @processed = Set.new
       @fetch_queue << @crawl_url.resolved_base_url
     end

@@ -21,14 +21,14 @@ module WaybackArchiver
       new(base_url).collect_urls
     end

-    def collect_urls
+    def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
         page_links(url)
       end
-      puts "Crawling finished, #{@procesed.length} links found"
-      @procesed.to_a
+      puts "Crawling finished, #{@processed.length} links found"
+      @processed.to_a
     rescue Interrupt, IRB::Abort
       puts 'Crawl interrupted.'
       @fetch_queue.to_a
@@ -38,21 +38,17 @@ module WaybackArchiver

     def page_links(get_url)
       puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
-      link_elements = get_page(get_url).css('a') rescue []
-      @procesed << get_url
+      link_elements = Request.get_page(get_url).css('a') rescue []
+      @processed << get_url
       link_elements.each do |page_link|
         absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
         if absolute_url
           resolved_url = resolve(absolute_url)
-          @fetch_queue << resolved_url if !@procesed.include?(resolved_url)
+          @fetch_queue << resolved_url if !@processed.include?(resolved_url)
         end
       end
     end

-    def get_page(url)
-      Nokogiri::HTML(Request.get_response(url).body)
-    end
-
     def resolve(url)
       @options[:resolve] ? Request.resolve_url(url) : url
     end
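The crawler's constructor now takes resolve as a positional argument instead of a keyword, the misspelled @procesed set is renamed to @processed, and page fetching is delegated to Request.get_page (the private get_page helper is gone). The diff also shows a class-level helper that wraps new(base_url).collect_urls. A hedged instance-level sketch with a placeholder URL:

# Crawl from the resolved base URL and return the URLs found;
# the second argument enables per-link resolving
crawler = WaybackArchiver::Crawler.new('http://example.com', true)
urls = crawler.collect_urls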
lib/wayback_archiver/request.rb CHANGED
@@ -5,19 +5,31 @@ module WaybackArchiver
     INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
     USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"

-    def self.get_response(url, resolve: false)
-      resolved_url = resolve ? resolve_url(url) : url
-      uri = URI.parse(resolved_url)
-      http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = true if resolved_url.include?('https://')
+    class << self
+      def get_page(url, document_type = :html)
+        response = Request.get_response(url).body
+        case document_type
+        when :xml
+          Nokogiri::XML(response)
+        else
+          Nokogiri::HTML(response)
+        end
+      end

-      request = Net::HTTP::Get.new(uri.request_uri)
-      request['User-Agent'] = USER_AGENT
-      http.request(request)
-    end
+      def get_response(url, resolve = false)
+        resolved_url = resolve ? resolve_url(url) : url
+        uri = URI.parse(resolved_url)
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = true if resolved_url.include?('https://')
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = USER_AGENT
+        http.request(request)
+      end

-    def self.resolve_url(url)
-      UrlResolver.resolve(url)
+      def resolve_url(url)
+        UrlResolver.resolve(url)
+      end
     end
   end
 end
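Request now groups its class methods under class << self and gains get_page, which wraps get_response and parses the body with Nokogiri: HTML by default, or XML when :xml is passed. A short sketch with placeholder URLs:

html_doc = WaybackArchiver::Request.get_page('http://example.com')                    # Nokogiri HTML document
xml_doc  = WaybackArchiver::Request.get_page('http://example.com/sitemap.xml', :xml)  # Nokogiri XML document
links    = html_doc.css('a').map { |link| link.attr('href') }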
lib/wayback_archiver/version.rb CHANGED
@@ -1,3 +1,3 @@
 module WaybackArchiver
-  VERSION = '0.0.7'
+  VERSION = '0.0.8'
 end
lib/wayback_archiver.rb CHANGED
@@ -1,6 +1,5 @@
 require 'uri'
 require 'net/http'
-require 'rexml/document'

 require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
@@ -9,18 +8,12 @@ require 'wayback_archiver/crawler'
 require 'wayback_archiver/crawl_url'

 module WaybackArchiver
-  BASE_URL = 'https://web.archive.org/save/'
-
-  def self.archive(source, from = :sitemap)
+  def self.archive(source, from = :crawl)
     urls = case from.to_s
-           when 'sitemap'
-             Collector.urls_from_sitemap("#{source}/sitemap.xml")
-           when 'url'
-             [Request.resolve_url(source)]
-           when 'file'
-             Collector.urls_from_file(source)
-           when 'crawl', 'crawler'
-             Collector.urls_from_crawl(source)
+           when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
+           when 'url' then [Request.resolve_url(source)]
+           when 'file' then Collector.urls_from_file(source)
+           when 'crawl' then Collector.urls_from_crawl(source)
            else
              raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
            end
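The top-level entry point now defaults to crawling (from = :crawl) instead of the sitemap, the 'crawler' alias for the crawl type is dropped, and the old BASE_URL constant lives on as Archive::WAYBACK_BASE_URL. A usage sketch with a placeholder domain:

require 'wayback_archiver'

WaybackArchiver.archive('example.com')                      # crawl example.com and post every URL found
WaybackArchiver.archive('example.com', :sitemap)            # read example.com/sitemap.xml instead
WaybackArchiver.archive('http://example.com/page', :url)    # archive a single URL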
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-20 00:00:00.000000000 Z
+date: 2014-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri