wayback_archiver 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: aee3e5d863178ea9283eb2b723e347d17aefa6b1
-  data.tar.gz: 597d40bdfa0d7538477dbbba3f6671a72f22d82a
+  metadata.gz: ddbffea2e55297390c66201d287b85fb6336d864
+  data.tar.gz: b419745edba1f8dcf9d6e83ce5b74cd70c9abd0f
 SHA512:
-  metadata.gz: 37f2b0372499d7999eed48a80647218127de4d240c661260f18011456c281956cdf778aa3bd66852be08caa3f0cfc03fdf8668895644197c31d1889cb464ddbe
-  data.tar.gz: 505e63ed0e5ce661d96acee3e53d2c950f1e550e611ce64951d7011e76642f595f26e1cc6f522581e3253d1317c1496e861ace64a19032418deb7d14c4f4f991
+  metadata.gz: 5bb23d2bab242cc55d1a9e851e5fd719431371f2149a2640ce34ede4be817f881ace982d39cff04691d435b140bbc54419bf90affb28d0621261cb9ee7d34a69
+  data.tar.gz: 91c6651a5cbeb1333a9f24ab3596ee0b284c54e4aa7c375b158bf6c3bbb54892a6c78c930a1942a3dcc9f4e0ab937c60a08e3f60a8339f0e9342cea6f0959c5f
lib/wayback_archiver/archive.rb CHANGED
@@ -3,7 +3,7 @@ module WaybackArchiver
     MAX_THREAD_COUNT = 8
 
     def self.post(all_urls)
-      puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
+      puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
 
       puts "Total urls to be sent: #{all_urls.length}"
       threads = []
@@ -11,18 +11,20 @@ module WaybackArchiver
       all_urls.each_slice(group_size).to_a.each do |urls|
         threads << Thread.new do
           urls.each_with_index do |url, index|
-            request_url = "#{BASE_URL}#{url}"
+            resolved_url = Request.resolve_url(url)
+            request_url = "#{BASE_URL}#{resolved_url}"
             begin
               res = Request.get_response(request_url)
-              print "#{url} #{res.code} => #{res.message} \n"
+              puts "[#{res.code}, #{res.message}] #{resolved_url}"
             rescue Exception => e
               puts "Error message: #{e.message}"
-              puts "Failed to archive: #{url}"
+              puts "Failed to archive: #{resolved_url}"
             end
           end
         end
       end
       threads.each(&:join)
+      puts "#{all_urls.length} URLs sent to Internet archive"
       all_urls
     end
   end
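Note on the threading change above: Archive.post slices the URL list into groups, gives each slice its own thread, and now resolves each URL before hitting the Wayback Machine save endpoint. The hunk does not show how group_size is derived; a minimal standalone sketch of the same fan-out pattern, assuming a ceiling division over MAX_THREAD_COUNT, looks like this:

# Sketch of the each_slice/Thread.new fan-out used in Archive.post.
# group_size is an assumption (ceiling division); the diff does not show it.
MAX_THREAD_COUNT = 8
all_urls = (1..20).map { |n| "http://example.com/page#{n}" }
group_size = (all_urls.length.to_f / MAX_THREAD_COUNT).ceil
threads = all_urls.each_slice(group_size).map do |urls|
  Thread.new do
    urls.each { |url| puts "would archive #{url}" } # stand-in for the HTTP request
  end
end
threads.each(&:join)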
lib/wayback_archiver/collector.rb CHANGED
@@ -1,27 +1,27 @@
 module WaybackArchiver
   class Collector
-
     class << self
-
       def urls_from_sitemap(url)
-        urls = Array.new
-        xml_data = Request.get_response(url).body
+        urls = []
+        xml_data = Request.get_response(Request.resolve_url(url)).body
         document = REXML::Document.new(xml_data)
 
         document.elements.each('urlset/url/loc') { |element| urls << element.text }
         urls
       end
 
+      def urls_from_crawl(url)
+        Crawler.collect_urls(url)
+      end
+
       def urls_from_file(path)
         raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-        urls = Array.new
+        urls = []
         text = File.open(path).read
         text.gsub!(/\r\n?/, "\n") # Normalize line endings
         text.each_line { |line| urls << line.gsub(/\n/, '').strip }
         urls.reject(&:empty?)
       end
-
     end
-
   end
 end
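The new urls_from_crawl keeps Collector as the single facade for every URL source (sitemap, file, crawl). A hypothetical usage sketch, assuming the gem and its url_resolver dependency are installed:

require 'wayback_archiver'

# Delegates to Crawler.collect_urls and returns an array of URL strings.
crawled = WaybackArchiver::Collector.urls_from_crawl('http://example.com')

# Sitemap URLs are now resolved (via Request.resolve_url) before fetching.
from_sitemap = WaybackArchiver::Collector.urls_from_sitemap('http://example.com/sitemap.xml')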
lib/wayback_archiver/crawl_url.rb ADDED
@@ -0,0 +1,69 @@
+module WaybackArchiver
+  class CrawlUrl
+    attr_reader :resolved_base_url, :base_hostname
+
+    def initialize(base_url)
+      @resolved_base_url = Request.resolve_url(base_url)
+      @base_hostname = URI.parse(@resolved_base_url).host
+    end
+
+    def absolute_url_from(raw_url, get_url)
+      return nil unless eligible_url?(raw_url)
+      parsed_url = URI.parse(raw_url) rescue URI.parse('')
+      if parsed_url.relative?
+        url_from_relative(raw_url, get_url)
+      elsif base_hostname.eql?(parsed_url.hostname)
+        raw_url
+      else
+        nil
+      end
+    end
+
+    private
+
+    def url_from_relative(url, current_page_url)
+      if url.start_with?('/')
+        "#{without_path_suffix(resolved_base_url)}#{url}"
+      elsif url.start_with?('../')
+        "#{url_from_dotted_url(url, current_page_url)}"
+      else
+        "#{with_path_suffix(resolved_base_url)}#{url}"
+      end
+    end
+
+    def url_from_dotted_url(url, current_page_url)
+      absolute_url = with_path_suffix(current_page_url.dup)
+      found_dots = without_path_suffix(url).scan('../').length
+      removed_dots = 0
+      max_levels = 4
+      while found_dots >= removed_dots && max_levels > removed_dots
+        index = absolute_url.rindex('/') or break
+        absolute_url = absolute_url[0..(index - 1)]
+        removed_dots += 1
+      end
+      "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
+    end
+
+    def with_path_suffix(passed_url)
+      url = passed_url.dup
+      url.end_with?('/') ? url : url << '/'
+    end
+
+    def without_path_suffix(passed_url)
+      url = passed_url.dup
+      url.end_with?('/') ? url[0...(url.length - 1)] : url
+    end
+
+    def eligible_url?(href)
+      return false if href.nil? || href.empty?
+      dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
+      dont_include = %w(/email-protection#)
+      dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
+
+      dont_start.each { |pattern| return false if href.start_with?(pattern) }
+      dont_include.each { |pattern| return false if href.include?(pattern) }
+      dont_end.each { |pattern| return false if href.end_with?(pattern) }
+      true
+    end
+  end
+end
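CrawlUrl extracts and extends the URL normalization that previously lived inline in Crawler#sanitize_url: relative hrefs are joined onto the resolved base URL, ../ prefixes are collapsed (up to four levels), same-host absolute URLs pass through, and everything else (other hosts, mailto:/javascript:/fragment links, binary downloads) returns nil. Roughly, assuming the resolver returns the base URL unchanged:

crawl_url = WaybackArchiver::CrawlUrl.new('http://example.com')

crawl_url.absolute_url_from('/about', 'http://example.com/')
# => "http://example.com/about"
crawl_url.absolute_url_from('mailto:someone@example.com', 'http://example.com/')
# => nil (rejected by eligible_url?)
crawl_url.absolute_url_from('http://other.example.org/page', 'http://example.com/')
# => nil (different hostname)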
lib/wayback_archiver/crawler.rb CHANGED
@@ -1,22 +1,27 @@
 require 'set'
 require 'nokogiri'
-require 'open-uri'
 
 module WaybackArchiver
   class Crawler
-    def initialize(base_url)
-      @base_url = base_url
-      @hostname = URI.parse(@base_url).host
+    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+    HEADERS_HASH = {
+      'User-Agent' => "WaybackArchiver/#{VERSION} (+#{CRAWLER_INFO_LINK})"
+    }
+
+    def initialize(url, resolve: false)
+      base_url = Request.resolve_url(url)
+      @options = { resolve: resolve }
+      @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = Set.new
       @procesed = Set.new
-      @fetch_queue << @base_url
+      @fetch_queue << @crawl_url.resolved_base_url
     end
 
     def self.collect_urls(base_url)
       new(base_url).collect_urls
     end
 
-    def collect_urls
+    def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
@@ -24,27 +29,32 @@ module WaybackArchiver
       end
       puts "Crawling finished, #{@procesed.length} links found"
       @procesed.to_a
+    rescue Interrupt, IRB::Abort
+      puts 'Crawl interrupted.'
+      @fetch_queue.to_a
     end
 
-    def page_links(url)
-      puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
-      link_elements = Nokogiri::HTML(open(url)).css('a') rescue []
-      @procesed << url
-      link_elements.each do |link|
-        href = sanitize_url(link.attr('href'))
-        @fetch_queue << href if href && !@procesed.include?(href)
+    private
+
+    def page_links(get_url)
+      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
+      link_elements = get_page(get_url).css('a') rescue []
+      @procesed << get_url
+      link_elements.each do |page_link|
+        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+        if absolute_url
+          resolved_url = resolve(absolute_url)
+          @fetch_queue << resolved_url if !@procesed.include?(resolved_url)
+        end
       end
     end
 
-    def sanitize_url(raw_url)
-      url = URI.parse(raw_url) rescue URI.parse('')
-      if url.host.nil?
-        sanitized_url = "#{@base_url}#{url.path}"
-        sanitized_url += "?#{url.query}" unless url.query.nil?
-        sanitized_url
-      else
-        raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
-      end
+    def get_page(url)
+      Nokogiri::HTML(Request.get_response(url).body)
+    end
+
+    def resolve(url)
+      @options[:resolve] ? Request.resolve_url(url) : url
     end
   end
 end
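With this change every page fetch goes through Request (and so carries the gem's User-Agent), discovered hrefs are normalized by CrawlUrl, and interrupting a long crawl with Ctrl-C returns the URLs gathered so far instead of raising. Note that the class-level collect_urls shown above does not forward the new resolve: keyword, so opting in requires instantiating directly; a hedged sketch:

# Convenience call; resolve: defaults to false.
urls = WaybackArchiver::Crawler.collect_urls('http://example.com')

# Opt in to resolving each discovered URL before it is queued.
crawler = WaybackArchiver::Crawler.new('http://example.com', resolve: true)
urls = crawler.collect_urls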
lib/wayback_archiver/request.rb CHANGED
@@ -1,16 +1,23 @@
+require 'url_resolver' # TODO: Allow users to use any resolver
+
 module WaybackArchiver
   class Request
+    INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
+    USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
 
-    def self.get_response(url)
-      uri = URI.parse(url)
-
-      http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = true if url.include?('https://')
+    def self.get_response(url, resolve: false)
+      resolved_url = resolve ? resolve_url(url) : url
+      uri = URI.parse(resolved_url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = true if resolved_url.include?('https://')
 
-      request = Net::HTTP::Get.new(uri.request_uri)
-      response = http.request(request)
-      response
+      request = Net::HTTP::Get.new(uri.request_uri)
+      request['User-Agent'] = USER_AGENT
+      http.request(request)
     end
 
+    def self.resolve_url(url)
+      UrlResolver.resolve(url)
+    end
   end
 end
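Request now sends a descriptive User-Agent on every GET and exposes resolution (delegated to the new url_resolver runtime dependency) both as an opt-in keyword and as a standalone method. A small sketch; exactly what UrlResolver.resolve does (e.g. following redirects) is up to that gem:

require 'wayback_archiver'

res = WaybackArchiver::Request.get_response('http://example.com')
puts "#{res.code} #{res.message}"

# Resolve the URL first, then fetch it.
res = WaybackArchiver::Request.get_response('http://example.com', resolve: true)

# Resolution on its own; delegates to UrlResolver.resolve.
final_url = WaybackArchiver::Request.resolve_url('http://example.com')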
lib/wayback_archiver/version.rb CHANGED
@@ -1,3 +1,3 @@
 module WaybackArchiver
-  VERSION = '0.0.6'
+  VERSION = '0.0.7'
 end
lib/wayback_archiver.rb CHANGED
@@ -6,6 +6,7 @@ require 'wayback_archiver/collector'
 require 'wayback_archiver/archive'
 require 'wayback_archiver/request'
 require 'wayback_archiver/crawler'
+require 'wayback_archiver/crawl_url'
 
 module WaybackArchiver
   BASE_URL = 'https://web.archive.org/save/'
@@ -15,15 +16,14 @@ module WaybackArchiver
     when 'sitemap'
       Collector.urls_from_sitemap("#{source}/sitemap.xml")
     when 'url'
-      Array(source)
+      [Request.resolve_url(source)]
     when 'file'
       Collector.urls_from_file(source)
     when 'crawl', 'crawler'
-      Crawler.collect_urls(source)
+      Collector.urls_from_crawl(source)
     else
-      raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
+      raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
     end
     Archive.post(urls)
   end
-
 end
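The case statement above is the gem's dispatch point: each source type yields an array of URLs that Archive.post then submits. The hunk does not show the enclosing method's name or signature; assuming a hypothetical entry point taking source and from (as the when clauses suggest), usage would look roughly like:

require 'wayback_archiver'

# Hypothetical entry point wrapping the case statement shown above;
# the actual method name is not visible in this hunk.
WaybackArchiver.archive('http://example.com', 'crawl')     # crawl the site
WaybackArchiver.archive('http://example.com', 'sitemap')   # fetch /sitemap.xml
WaybackArchiver.archive('http://example.com/page', 'url')  # a single URL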
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Jacob Burenstam
@@ -24,6 +24,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: url_resolver
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -52,6 +66,48 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: coveralls
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: 'Send URLs to Wayback Machine. From: sitemap, file or single URL.'
 email:
 - burenstam@gmail.com
@@ -63,6 +119,7 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver/archive.rb
 - lib/wayback_archiver/collector.rb
+- lib/wayback_archiver/crawl_url.rb
 - lib/wayback_archiver/crawler.rb
 - lib/wayback_archiver/request.rb
 - lib/wayback_archiver/version.rb