wayback_archiver 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ab0900a297f8e0413c436d9cf084ebfd66c09a9
4
- data.tar.gz: 8801173b1f3bd9d10e6184bcc185056a835c30d9
3
+ metadata.gz: 3df67fcb7faa06087ad72e6c57ff985d9dbe4fe9
4
+ data.tar.gz: 5c21d0bf3353dd2ce732cef3b59cd3ba807acc8a
5
5
  SHA512:
6
- metadata.gz: fb35d8b324df42b37aaa10b3fbf90428f9d227800db935d64a9bf674a64f438c64e24a1ba0a529a6171edae144300ea6d991e8591b278e3373d3e5c3b8b1869a
7
- data.tar.gz: b9c18c4062d056e4e6a766b337701deae9178124da8455631ad45a10350a6498f885ad44d6c6757246ecf43653b6b48d94254e9c768b8df37a91f141887f41fc
6
+ metadata.gz: d21d169fb9fa143b777d771a8cdc5ba8c3c4186f6472aa8f2ff2806034becf18e3cc260c7ce0a9b97d1148a2d1781d9ddc0daef08d7abe1d2b13140125513231
7
+ data.tar.gz: 15a413ab5bd0f51c40640cd0be764142ca0380315f97fc335cf1bf19f4645b327dc0716400c6c10af7a79b8ceb898b6e0f89a21114865af7f84da1e2bf3d383c
@@ -1,22 +1,22 @@
1
+ require 'site_mapper'
2
+
1
3
  require 'uri'
2
4
  require 'net/http'
3
5
 
6
+ require 'wayback_archiver/version'
4
7
  require 'wayback_archiver/collector'
5
8
  require 'wayback_archiver/archive'
6
9
  require 'wayback_archiver/request'
7
- require 'wayback_archiver/crawler'
8
- require 'wayback_archiver/crawl_url'
9
10
 
10
11
  module WaybackArchiver
11
12
  def self.archive(source, from = :crawl)
12
13
  urls = case from.to_s
13
- when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
14
- when 'url' then [Request.resolve_url(source)]
15
- when 'file' then Collector.urls_from_file(source)
16
- when 'crawl' then Collector.urls_from_crawl(source)
14
+ when 'file' then Archive.post(Collector.urls_from_file(source))
15
+ when 'crawl' then Collector.urls_from_crawl(source) { |url| Archive.post_url(url) }
16
+ when 'sitemap' then Archive.post(Collector.urls_from_sitemap("#{source}/sitemap.xml"))
17
+ when 'url' then Archive.post_url(Request.resolve_url(source))
17
18
  else
18
19
  raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
19
20
  end
20
- Archive.post(urls)
21
21
  end
22
22
  end
@@ -7,7 +7,7 @@ module WaybackArchiver
7
7
  puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
8
8
  puts "Total urls to be sent: #{urls.length}"
9
9
  group_size = (urls.length / MAX_THREAD_COUNT) + 1
10
- urls.each_slice(group_size).to_a.map do |archive_urls|
10
+ urls.each_slice(group_size).to_a.map! do |archive_urls|
11
11
  Thread.new { archive_urls.each { |url| post_url(url) } }
12
12
  end.each(&:join)
13
13
  puts "#{urls.length} URLs sent to Internet archive"
@@ -4,19 +4,19 @@ module WaybackArchiver
4
4
  def urls_from_sitemap(url)
5
5
  resolved = Request.resolve_url(url)
6
6
  sitemap = Request.get_page(resolved)
7
- sitemap.css('loc').map { |element| element.text }
7
+ sitemap.css('loc').map! { |element| element.text }
8
8
  end
9
9
 
10
10
  def urls_from_crawl(url)
11
- Crawler.collect_urls(url)
11
+ SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
12
12
  end
13
13
 
14
14
  def urls_from_file(path)
15
15
  raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
16
16
  urls = []
17
17
  text = File.open(path).read
18
- text.gsub!(/\r\n?/, "\n") # Normalize line endings
19
- text.each_line { |line| urls << line.gsub(/\n/, '').strip }
18
+ text.gsub!(/\r\n?/, "\n")
19
+ .each_line { |line| urls << line.gsub!(/\n/, '').strip }
20
20
  urls.reject(&:empty?)
21
21
  end
22
22
  end
@@ -3,7 +3,7 @@ require 'url_resolver' # TODO: Allow users to use any resolver
3
3
  module WaybackArchiver
4
4
  class Request
5
5
  INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
6
- USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
6
+ USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
7
7
 
8
8
  class << self
9
9
  def get_page(url, document_type = :html)
@@ -1,3 +1,3 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.9'
2
+ VERSION = '0.0.10'
3
3
  end
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-21 00:00:00.000000000 Z
11
+ date: 2014-10-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: site_mapper
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: url_resolver
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '0.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '0.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -56,59 +56,59 @@ dependencies:
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ~>
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: '10.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ~>
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '10.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: '3.1'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
80
+ - - ~>
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: '3.1'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '>='
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
- version: '0'
89
+ version: '0.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '>='
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
- version: '0'
96
+ version: '0.8'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: coveralls
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - '>='
101
+ - - ~>
102
102
  - !ruby/object:Gem::Version
103
- version: '0'
103
+ version: '0.7'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - '>='
108
+ - - ~>
109
109
  - !ruby/object:Gem::Version
110
- version: '0'
111
- description: 'Send URLs to Wayback Machine. From: sitemap, file or single URL.'
110
+ version: '0.7'
111
+ description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
112
112
  email:
113
113
  - burenstam@gmail.com
114
114
  executables:
@@ -119,8 +119,6 @@ files:
119
119
  - bin/wayback_archiver
120
120
  - lib/wayback_archiver/archive.rb
121
121
  - lib/wayback_archiver/collector.rb
122
- - lib/wayback_archiver/crawl_url.rb
123
- - lib/wayback_archiver/crawler.rb
124
122
  - lib/wayback_archiver/request.rb
125
123
  - lib/wayback_archiver/version.rb
126
124
  - lib/wayback_archiver.rb
@@ -136,7 +134,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
136
134
  requirements:
137
135
  - - '>='
138
136
  - !ruby/object:Gem::Version
139
- version: '0'
137
+ version: 1.9.3
140
138
  required_rubygems_version: !ruby/object:Gem::Requirement
141
139
  requirements:
142
140
  - - '>='
@@ -1,74 +0,0 @@
1
- module WaybackArchiver
2
- class CrawlUrl
3
- attr_reader :resolved_base_url, :base_hostname
4
-
5
- def initialize(base_url)
6
- @resolved_base_url = Request.resolve_url(base_url)
7
- @base_hostname = URI.parse(@resolved_base_url).hostname
8
- @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
9
- end
10
-
11
- def absolute_url_from(raw_url, get_url)
12
- return nil unless eligible_url?(raw_url)
13
- parsed_url = URI.parse(raw_url) rescue URI.parse('')
14
- if parsed_url.relative?
15
- url_from_relative(raw_url, get_url)
16
- elsif same_domain?(raw_url, @resolved_base_url)
17
- raw_url
18
- else
19
- nil
20
- end
21
- end
22
-
23
- private
24
-
25
- def url_from_relative(url, current_page_url)
26
- if url.start_with?('/')
27
- "#{without_path_suffix(resolved_base_url)}#{url}"
28
- elsif url.start_with?('../')
29
- "#{url_from_dotted_url(url, current_page_url)}"
30
- else
31
- "#{with_path_suffix(resolved_base_url)}#{url}"
32
- end
33
- end
34
-
35
- def url_from_dotted_url(url, current_page_url)
36
- absolute_url = with_path_suffix(current_page_url.dup)
37
- found_dots = without_path_suffix(url).scan('../').length
38
- removed_dots = 0
39
- max_levels = 4
40
- while found_dots >= removed_dots && max_levels > removed_dots
41
- index = absolute_url.rindex('/') or break
42
- absolute_url = absolute_url[0..(index - 1)]
43
- removed_dots += 1
44
- end
45
- "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
46
- end
47
-
48
- def with_path_suffix(passed_url)
49
- url = passed_url.dup
50
- url.end_with?('/') ? url : url << '/'
51
- end
52
-
53
- def without_path_suffix(passed_url)
54
- url = passed_url.dup
55
- url.end_with?('/') ? url[0...(url.length - 1)] : url
56
- end
57
-
58
- def eligible_url?(href)
59
- return false if href.nil? || href.empty?
60
- dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
61
- dont_include = %w(/email-protection#)
62
- dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
63
-
64
- dont_start.each { |pattern| return false if href.start_with?(pattern) }
65
- dont_include.each { |pattern| return false if href.include?(pattern) }
66
- dont_end.each { |pattern| return false if href.end_with?(pattern) }
67
- true
68
- end
69
-
70
- def same_domain?(first, second)
71
- first.include?(second)
72
- end
73
- end
74
- end
@@ -1,56 +0,0 @@
1
- require 'set'
2
- require 'nokogiri'
3
-
4
- module WaybackArchiver
5
- class Crawler
6
- CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
7
- HEADERS_HASH = {
8
- 'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
9
- }
10
-
11
- def initialize(url, resolve = false)
12
- base_url = Request.resolve_url(url)
13
- @options = { resolve: resolve }
14
- @crawl_url = CrawlUrl.new(base_url)
15
- @fetch_queue = Set.new
16
- @processed = Set.new
17
- @fetch_queue << @crawl_url.resolved_base_url
18
- end
19
-
20
- def self.collect_urls(base_url)
21
- new(base_url).collect_urls
22
- end
23
-
24
- def collect_urls
25
- until @fetch_queue.empty?
26
- url = @fetch_queue.first
27
- @fetch_queue.delete(@fetch_queue.first)
28
- page_links(url)
29
- end
30
- puts "Crawling finished, #{@processed.length} links found"
31
- @processed.to_a
32
- rescue Interrupt, IRB::Abort
33
- puts 'Crawl interrupted.'
34
- @fetch_queue.to_a
35
- end
36
-
37
- private
38
-
39
- def page_links(get_url)
40
- puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
41
- link_elements = Request.get_page(get_url).css('a') rescue []
42
- @processed << get_url
43
- link_elements.each do |page_link|
44
- absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
45
- if absolute_url
46
- resolved_url = resolve(absolute_url)
47
- @fetch_queue << resolved_url if !@processed.include?(resolved_url)
48
- end
49
- end
50
- end
51
-
52
- def resolve(url)
53
- @options[:resolve] ? Request.resolve_url(url) : url
54
- end
55
- end
56
- end