wayback_archiver 0.0.6 → 0.0.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: aee3e5d863178ea9283eb2b723e347d17aefa6b1
-  data.tar.gz: 597d40bdfa0d7538477dbbba3f6671a72f22d82a
+  metadata.gz: ddbffea2e55297390c66201d287b85fb6336d864
+  data.tar.gz: b419745edba1f8dcf9d6e83ce5b74cd70c9abd0f
 SHA512:
-  metadata.gz: 37f2b0372499d7999eed48a80647218127de4d240c661260f18011456c281956cdf778aa3bd66852be08caa3f0cfc03fdf8668895644197c31d1889cb464ddbe
-  data.tar.gz: 505e63ed0e5ce661d96acee3e53d2c950f1e550e611ce64951d7011e76642f595f26e1cc6f522581e3253d1317c1496e861ace64a19032418deb7d14c4f4f991
+  metadata.gz: 5bb23d2bab242cc55d1a9e851e5fd719431371f2149a2640ce34ede4be817f881ace982d39cff04691d435b140bbc54419bf90affb28d0621261cb9ee7d34a69
+  data.tar.gz: 91c6651a5cbeb1333a9f24ab3596ee0b284c54e4aa7c375b158bf6c3bbb54892a6c78c930a1942a3dcc9f4e0ab937c60a08e3f60a8339f0e9342cea6f0959c5f
lib/wayback_archiver/archive.rb CHANGED
@@ -3,7 +3,7 @@ module WaybackArchiver
     MAX_THREAD_COUNT = 8

     def self.post(all_urls)
-      puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
+      puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"

       puts "Total urls to be sent: #{all_urls.length}"
       threads = []
@@ -11,18 +11,20 @@ module WaybackArchiver
       all_urls.each_slice(group_size).to_a.each do |urls|
         threads << Thread.new do
           urls.each_with_index do |url, index|
-            request_url = "#{BASE_URL}#{url}"
+            resolved_url = Request.resolve_url(url)
+            request_url = "#{BASE_URL}#{resolved_url}"
             begin
               res = Request.get_response(request_url)
-              print "#{url} #{res.code} => #{res.message} \n"
+              puts "[#{res.code}, #{res.message}] #{resolved_url}"
             rescue Exception => e
               puts "Error message: #{e.message}"
-              puts "Failed to archive: #{url}"
+              puts "Failed to archive: #{resolved_url}"
             end
           end
         end
       end
       threads.each(&:join)
+      puts "#{all_urls.length} URLs sent to Internet archive"
       all_urls
     end
   end
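Archive.post now resolves every URL through Request.resolve_url before posting it to the Wayback Machine save endpoint, prints a per-URL status and a final summary. A minimal usage sketch (the URLs are illustrative):

# Illustrative use of the updated Archive.post; each URL is resolved via
# Request.resolve_url and then sent to https://web.archive.org/save/<url>.
require 'wayback_archiver'

urls = %w(http://example.com http://example.com/about)
WaybackArchiver::Archive.post(urls)
# prints "[200, OK] http://example.com/"-style lines, then
# "2 URLs sent to Internet archive", and returns the urls array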
lib/wayback_archiver/collector.rb CHANGED
@@ -1,27 +1,27 @@
 module WaybackArchiver
   class Collector
-
     class << self
-
       def urls_from_sitemap(url)
-        urls = Array.new
-        xml_data = Request.get_response(url).body
+        urls = []
+        xml_data = Request.get_response(Request.resolve_url(url)).body
         document = REXML::Document.new(xml_data)

         document.elements.each('urlset/url/loc') { |element| urls << element.text }
         urls
       end

+      def urls_from_crawl(url)
+        Crawler.collect_urls(url)
+      end
+
       def urls_from_file(path)
         raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-        urls = Array.new
+        urls = []
         text = File.open(path).read
         text.gsub!(/\r\n?/, "\n") # Normalize line endings
         text.each_line { |line| urls << line.gsub(/\n/, '').strip }
         urls.reject(&:empty?)
       end
-
     end
-
   end
 end
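Collector now offers three URL sources, with the new urls_from_crawl delegating to the crawler. A quick sketch of the three entry points (inputs are illustrative); each returns an array of URL strings:

require 'wayback_archiver'

sitemap_urls = WaybackArchiver::Collector.urls_from_sitemap('http://example.com/sitemap.xml') # <loc> entries
crawled_urls = WaybackArchiver::Collector.urls_from_crawl('example.com')                      # delegates to Crawler
file_urls    = WaybackArchiver::Collector.urls_from_file('/path/to/urls.txt')                 # one URL per line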
lib/wayback_archiver/crawl_url.rb ADDED
@@ -0,0 +1,69 @@
+module WaybackArchiver
+  class CrawlUrl
+    attr_reader :resolved_base_url, :base_hostname
+
+    def initialize(base_url)
+      @resolved_base_url = Request.resolve_url(base_url)
+      @base_hostname = URI.parse(@resolved_base_url).host
+    end
+
+    def absolute_url_from(raw_url, get_url)
+      return nil unless eligible_url?(raw_url)
+      parsed_url = URI.parse(raw_url) rescue URI.parse('')
+      if parsed_url.relative?
+        url_from_relative(raw_url, get_url)
+      elsif base_hostname.eql?(parsed_url.hostname)
+        raw_url
+      else
+        nil
+      end
+    end
+
+    private
+
+    def url_from_relative(url, current_page_url)
+      if url.start_with?('/')
+        "#{without_path_suffix(resolved_base_url)}#{url}"
+      elsif url.start_with?('../')
+        "#{url_from_dotted_url(url, current_page_url)}"
+      else
+        "#{with_path_suffix(resolved_base_url)}#{url}"
+      end
+    end
+
+    def url_from_dotted_url(url, current_page_url)
+      absolute_url = with_path_suffix(current_page_url.dup)
+      found_dots = without_path_suffix(url).scan('../').length
+      removed_dots = 0
+      max_levels = 4
+      while found_dots >= removed_dots && max_levels > removed_dots
+        index = absolute_url.rindex('/') or break
+        absolute_url = absolute_url[0..(index - 1)]
+        removed_dots += 1
+      end
+      "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
+    end
+
+    def with_path_suffix(passed_url)
+      url = passed_url.dup
+      url.end_with?('/') ? url : url << '/'
+    end
+
+    def without_path_suffix(passed_url)
+      url = passed_url.dup
+      url.end_with?('/') ? url[0...(url.length - 1)] : url
+    end
+
+    def eligible_url?(href)
+      return false if href.nil? || href.empty?
+      dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
+      dont_include = %w(/email-protection#)
+      dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
+
+      dont_start.each { |pattern| return false if href.start_with?(pattern) }
+      dont_include.each { |pattern| return false if href.include?(pattern) }
+      dont_end.each { |pattern| return false if href.end_with?(pattern) }
+      true
+    end
+  end
+end
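CrawlUrl, new in this release, converts hrefs found on a crawled page into absolute URLs on the crawled host and drops mailto:/javascript:/binary links. A behaviour sketch, assuming 'example.com' resolves to 'http://example.com/' (actual resolution is delegated to the url_resolver gem):

crawl_url = WaybackArchiver::CrawlUrl.new('example.com')

crawl_url.absolute_url_from('/about', 'http://example.com/')
# => "http://example.com/about"     (root-relative href)
crawl_url.absolute_url_from('pricing', 'http://example.com/')
# => "http://example.com/pricing"   (relative href)
crawl_url.absolute_url_from('http://example.com/blog', 'http://example.com/')
# => "http://example.com/blog"      (same host, kept as-is)
crawl_url.absolute_url_from('http://other.com/', 'http://example.com/')
# => nil                            (different host)
crawl_url.absolute_url_from('mailto:someone@example.com', 'http://example.com/')
# => nil                            (filtered out by eligible_url?)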
lib/wayback_archiver/crawler.rb CHANGED
@@ -1,22 +1,27 @@
 require 'set'
 require 'nokogiri'
-require 'open-uri'

 module WaybackArchiver
   class Crawler
-    def initialize(base_url)
-      @base_url = base_url
-      @hostname = URI.parse(@base_url).host
+    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+    HEADERS_HASH = {
+      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+    }
+
+    def initialize(url, resolve: false)
+      base_url = Request.resolve_url(url)
+      @options = { resolve: resolve }
+      @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = Set.new
       @procesed = Set.new
-      @fetch_queue << @base_url
+      @fetch_queue << @crawl_url.resolved_base_url
     end

     def self.collect_urls(base_url)
       new(base_url).collect_urls
     end

-    def collect_urls
+    def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
@@ -24,27 +29,32 @@ module WaybackArchiver
       end
       puts "Crawling finished, #{@procesed.length} links found"
       @procesed.to_a
+    rescue Interrupt, IRB::Abort
+      puts 'Crawl interrupted.'
+      @fetch_queue.to_a
     end

-    def page_links(url)
-      puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
-      link_elements = Nokogiri::HTML(open(url)).css('a') rescue []
-      @procesed << url
-      link_elements.each do |link|
-        href = sanitize_url(link.attr('href'))
-        @fetch_queue << href if href && !@procesed.include?(href)
+    private
+
+    def page_links(get_url)
+      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
+      link_elements = get_page(get_url).css('a') rescue []
+      @procesed << get_url
+      link_elements.each do |page_link|
+        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+        if absolute_url
+          resolved_url = resolve(absolute_url)
+          @fetch_queue << resolved_url if !@procesed.include?(resolved_url)
+        end
       end
     end

-    def sanitize_url(raw_url)
-      url = URI.parse(raw_url) rescue URI.parse('')
-      if url.host.nil?
-        sanitized_url = "#{@base_url}#{url.path}"
-        sanitized_url += "?#{url.query}" unless url.query.nil?
-        sanitized_url
-      else
-        raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
-      end
+    def get_page(url)
+      Nokogiri::HTML(Request.get_response(url).body)
+    end
+
+    def resolve(url)
+      @options[:resolve] ? Request.resolve_url(url) : url
     end
   end
 end
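The crawler now builds absolute links through CrawlUrl, fetches pages via Request instead of open-uri, returns what it has queued when interrupted, and can optionally resolve every link it finds. A usage sketch (domain illustrative):

require 'wayback_archiver'

# One-shot class method, as used by Collector.urls_from_crawl:
urls = WaybackArchiver::Crawler.collect_urls('example.com')
# => array of same-host URLs discovered from the start page

# Instance form with per-link resolution (slower; each link goes through url_resolver):
crawler = WaybackArchiver::Crawler.new('example.com', resolve: true)
urls = crawler.collect_urls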
lib/wayback_archiver/request.rb CHANGED
@@ -1,16 +1,23 @@
+require 'url_resolver' # TODO: Allow users to use any resolver
+
 module WaybackArchiver
   class Request
+    INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+    USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"

-    def self.get_response(url)
-      uri = URI.parse(url)
-
-      http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = true if url.include?('https://')
+    def self.get_response(url, resolve: false)
+      resolved_url = resolve ? resolve_url(url) : url
+      uri = URI.parse(resolved_url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = true if resolved_url.include?('https://')

-      request = Net::HTTP::Get.new(uri.request_uri)
-      response = http.request(request)
-      response
+      request = Net::HTTP::Get.new(uri.request_uri)
+      request['User-Agent'] = USER_AGENT
+      http.request(request)
     end

+    def self.resolve_url(url)
+      UrlResolver.resolve(url)
+    end
   end
 end
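Request now sends an identifying User-Agent and exposes resolve_url, backed by the new url_resolver runtime dependency. A usage sketch (URL illustrative):

require 'wayback_archiver'

WaybackArchiver::Request.resolve_url('example.com')
# => the URL returned by UrlResolver.resolve

res = WaybackArchiver::Request.get_response('https://example.com', resolve: true)
res.code # => "200" (a Net::HTTPResponse); the request carried the
         # "WaybackArchiver/0.0.7 (+https://rubygems.org/gems/wayback_archiver)" User-Agent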
lib/wayback_archiver/version.rb CHANGED
@@ -1,3 +1,3 @@
 module WaybackArchiver
-  VERSION = '0.0.6'
+  VERSION = '0.0.7'
 end
lib/wayback_archiver.rb CHANGED
@@ -6,6 +6,7 @@ require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
 require 'wayback_archiver/request'
 require 'wayback_archiver/crawler'
+require 'wayback_archiver/crawl_url'

 module WaybackArchiver
   BASE_URL = 'https://web.archive.org/save/'
@@ -15,15 +16,14 @@ module WaybackArchiver
     when 'sitemap'
       Collector.urls_from_sitemap("#{source}/sitemap.xml")
     when 'url'
-      Array(source)
+      [Request.resolve_url(source)]
     when 'file'
       Collector.urls_from_file(source)
     when 'crawl', 'crawler'
-      Crawler.collect_urls(source)
+      Collector.urls_from_crawl(source)
     else
-      raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
+      raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
     end
     Archive.post(urls)
   end
-
 end
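The top-level entry point now resolves single URLs, routes crawling through Collector.urls_from_crawl, and advertises 'crawl' instead of 'crawler' in its error message. A hedged sketch, assuming the surrounding method (not shown in this hunk) is WaybackArchiver.archive(source, from); the domain is illustrative:

require 'wayback_archiver'

WaybackArchiver.archive('example.com', 'crawl')            # crawl the site, then post every URL found
WaybackArchiver.archive('example.com', 'sitemap')          # read example.com/sitemap.xml
WaybackArchiver.archive('http://example.com/page', 'url')  # archive a single (resolved) URL
WaybackArchiver.archive('/path/to/urls.txt', 'file')       # one URL per line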
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Jacob Burenstam
@@ -24,6 +24,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: url_resolver
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -52,6 +66,48 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: coveralls
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: 'Send URLs to Wayback Machine. From: sitemap, file or single URL.'
 email:
 - burenstam@gmail.com
@@ -63,6 +119,7 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver/archive.rb
 - lib/wayback_archiver/collector.rb
+- lib/wayback_archiver/crawl_url.rb
 - lib/wayback_archiver/crawler.rb
 - lib/wayback_archiver/request.rb
 - lib/wayback_archiver/version.rb