wayback_archiver 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ab0900a297f8e0413c436d9cf084ebfd66c09a9
4
- data.tar.gz: 8801173b1f3bd9d10e6184bcc185056a835c30d9
3
+ metadata.gz: 3df67fcb7faa06087ad72e6c57ff985d9dbe4fe9
4
+ data.tar.gz: 5c21d0bf3353dd2ce732cef3b59cd3ba807acc8a
5
5
  SHA512:
6
- metadata.gz: fb35d8b324df42b37aaa10b3fbf90428f9d227800db935d64a9bf674a64f438c64e24a1ba0a529a6171edae144300ea6d991e8591b278e3373d3e5c3b8b1869a
7
- data.tar.gz: b9c18c4062d056e4e6a766b337701deae9178124da8455631ad45a10350a6498f885ad44d6c6757246ecf43653b6b48d94254e9c768b8df37a91f141887f41fc
6
+ metadata.gz: d21d169fb9fa143b777d771a8cdc5ba8c3c4186f6472aa8f2ff2806034becf18e3cc260c7ce0a9b97d1148a2d1781d9ddc0daef08d7abe1d2b13140125513231
7
+ data.tar.gz: 15a413ab5bd0f51c40640cd0be764142ca0380315f97fc335cf1bf19f4645b327dc0716400c6c10af7a79b8ceb898b6e0f89a21114865af7f84da1e2bf3d383c
@@ -1,22 +1,22 @@
1
+ require 'site_mapper'
2
+
1
3
  require 'uri'
2
4
  require 'net/http'
3
5
 
6
+ require 'wayback_archiver/version'
4
7
  require 'wayback_archiver/collector'
5
8
  require 'wayback_archiver/archive'
6
9
  require 'wayback_archiver/request'
7
- require 'wayback_archiver/crawler'
8
- require 'wayback_archiver/crawl_url'
9
10
 
10
11
  module WaybackArchiver
11
12
  def self.archive(source, from = :crawl)
12
13
  urls = case from.to_s
13
- when 'sitemap' then Collector.urls_from_sitemap("#{source}/sitemap.xml")
14
- when 'url' then [Request.resolve_url(source)]
15
- when 'file' then Collector.urls_from_file(source)
16
- when 'crawl' then Collector.urls_from_crawl(source)
14
+ when 'file' then Archive.post(Collector.urls_from_file(source))
15
+ when 'crawl' then Collector.urls_from_crawl(source) { |url| Archive.post_url(url) }
16
+ when 'sitemap' then Archive.post(Collector.urls_from_sitemap("#{source}/sitemap.xml"))
17
+ when 'url' then Archive.post_url(Request.resolve_url(source))
17
18
  else
18
19
  raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
19
20
  end
20
- Archive.post(urls)
21
21
  end
22
22
  end
@@ -7,7 +7,7 @@ module WaybackArchiver
7
7
  puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
8
8
  puts "Total urls to be sent: #{urls.length}"
9
9
  group_size = (urls.length / MAX_THREAD_COUNT) + 1
10
- urls.each_slice(group_size).to_a.map do |archive_urls|
10
+ urls.each_slice(group_size).to_a.map! do |archive_urls|
11
11
  Thread.new { archive_urls.each { |url| post_url(url) } }
12
12
  end.each(&:join)
13
13
  puts "#{urls.length} URLs sent to Internet archive"
@@ -4,19 +4,19 @@ module WaybackArchiver
4
4
  def urls_from_sitemap(url)
5
5
  resolved = Request.resolve_url(url)
6
6
  sitemap = Request.get_page(resolved)
7
- sitemap.css('loc').map { |element| element.text }
7
+ sitemap.css('loc').map! { |element| element.text }
8
8
  end
9
9
 
10
10
  def urls_from_crawl(url)
11
- Crawler.collect_urls(url)
11
+ SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
12
12
  end
13
13
 
14
14
  def urls_from_file(path)
15
15
  raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
16
16
  urls = []
17
17
  text = File.open(path).read
18
- text.gsub!(/\r\n?/, "\n") # Normalize line endings
19
- text.each_line { |line| urls << line.gsub(/\n/, '').strip }
18
+ text.gsub!(/\r\n?/, "\n")
19
+ .each_line { |line| urls << line.gsub!(/\n/, '').strip }
20
20
  urls.reject(&:empty?)
21
21
  end
22
22
  end
@@ -3,7 +3,7 @@ require 'url_resolver' # TODO: Allow users to use any resolver
3
3
  module WaybackArchiver
4
4
  class Request
5
5
  INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
6
- USER_AGENT = "WaybackArchiver/#{VERSION} (+#{INFO_LINK})"
6
+ USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
7
7
 
8
8
  class << self
9
9
  def get_page(url, document_type = :html)
@@ -1,3 +1,3 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.9'
2
+ VERSION = '0.0.10'
3
3
  end
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-21 00:00:00.000000000 Z
11
+ date: 2014-10-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: site_mapper
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: url_resolver
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '0.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '0.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -56,59 +56,59 @@ dependencies:
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ~>
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: '10.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ~>
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '10.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: '3.1'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
80
+ - - ~>
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: '3.1'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '>='
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
- version: '0'
89
+ version: '0.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '>='
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
- version: '0'
96
+ version: '0.8'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: coveralls
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - '>='
101
+ - - ~>
102
102
  - !ruby/object:Gem::Version
103
- version: '0'
103
+ version: '0.7'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - '>='
108
+ - - ~>
109
109
  - !ruby/object:Gem::Version
110
- version: '0'
111
- description: 'Send URLs to Wayback Machine. From: sitemap, file or single URL.'
110
+ version: '0.7'
111
+ description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
112
112
  email:
113
113
  - burenstam@gmail.com
114
114
  executables:
@@ -119,8 +119,6 @@ files:
119
119
  - bin/wayback_archiver
120
120
  - lib/wayback_archiver/archive.rb
121
121
  - lib/wayback_archiver/collector.rb
122
- - lib/wayback_archiver/crawl_url.rb
123
- - lib/wayback_archiver/crawler.rb
124
122
  - lib/wayback_archiver/request.rb
125
123
  - lib/wayback_archiver/version.rb
126
124
  - lib/wayback_archiver.rb
@@ -136,7 +134,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
136
134
  requirements:
137
135
  - - '>='
138
136
  - !ruby/object:Gem::Version
139
- version: '0'
137
+ version: 1.9.3
140
138
  required_rubygems_version: !ruby/object:Gem::Requirement
141
139
  requirements:
142
140
  - - '>='
@@ -1,74 +0,0 @@
1
- module WaybackArchiver
2
- class CrawlUrl
3
- attr_reader :resolved_base_url, :base_hostname
4
-
5
- def initialize(base_url)
6
- @resolved_base_url = Request.resolve_url(base_url)
7
- @base_hostname = URI.parse(@resolved_base_url).hostname
8
- @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
9
- end
10
-
11
- def absolute_url_from(raw_url, get_url)
12
- return nil unless eligible_url?(raw_url)
13
- parsed_url = URI.parse(raw_url) rescue URI.parse('')
14
- if parsed_url.relative?
15
- url_from_relative(raw_url, get_url)
16
- elsif same_domain?(raw_url, @resolved_base_url)
17
- raw_url
18
- else
19
- nil
20
- end
21
- end
22
-
23
- private
24
-
25
- def url_from_relative(url, current_page_url)
26
- if url.start_with?('/')
27
- "#{without_path_suffix(resolved_base_url)}#{url}"
28
- elsif url.start_with?('../')
29
- "#{url_from_dotted_url(url, current_page_url)}"
30
- else
31
- "#{with_path_suffix(resolved_base_url)}#{url}"
32
- end
33
- end
34
-
35
- def url_from_dotted_url(url, current_page_url)
36
- absolute_url = with_path_suffix(current_page_url.dup)
37
- found_dots = without_path_suffix(url).scan('../').length
38
- removed_dots = 0
39
- max_levels = 4
40
- while found_dots >= removed_dots && max_levels > removed_dots
41
- index = absolute_url.rindex('/') or break
42
- absolute_url = absolute_url[0..(index - 1)]
43
- removed_dots += 1
44
- end
45
- "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
46
- end
47
-
48
- def with_path_suffix(passed_url)
49
- url = passed_url.dup
50
- url.end_with?('/') ? url : url << '/'
51
- end
52
-
53
- def without_path_suffix(passed_url)
54
- url = passed_url.dup
55
- url.end_with?('/') ? url[0...(url.length - 1)] : url
56
- end
57
-
58
- def eligible_url?(href)
59
- return false if href.nil? || href.empty?
60
- dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
61
- dont_include = %w(/email-protection#)
62
- dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)
63
-
64
- dont_start.each { |pattern| return false if href.start_with?(pattern) }
65
- dont_include.each { |pattern| return false if href.include?(pattern) }
66
- dont_end.each { |pattern| return false if href.end_with?(pattern) }
67
- true
68
- end
69
-
70
- def same_domain?(first, second)
71
- first.include?(second)
72
- end
73
- end
74
- end
@@ -1,56 +0,0 @@
1
- require 'set'
2
- require 'nokogiri'
3
-
4
- module WaybackArchiver
5
- class Crawler
6
- CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
7
- HEADERS_HASH = {
8
- 'User-Agent' => "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{CRAWLER_INFO_LINK})"
9
- }
10
-
11
- def initialize(url, resolve = false)
12
- base_url = Request.resolve_url(url)
13
- @options = { resolve: resolve }
14
- @crawl_url = CrawlUrl.new(base_url)
15
- @fetch_queue = Set.new
16
- @processed = Set.new
17
- @fetch_queue << @crawl_url.resolved_base_url
18
- end
19
-
20
- def self.collect_urls(base_url)
21
- new(base_url).collect_urls
22
- end
23
-
24
- def collect_urls
25
- until @fetch_queue.empty?
26
- url = @fetch_queue.first
27
- @fetch_queue.delete(@fetch_queue.first)
28
- page_links(url)
29
- end
30
- puts "Crawling finished, #{@processed.length} links found"
31
- @processed.to_a
32
- rescue Interrupt, IRB::Abort
33
- puts 'Crawl interrupted.'
34
- @fetch_queue.to_a
35
- end
36
-
37
- private
38
-
39
- def page_links(get_url)
40
- puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
41
- link_elements = Request.get_page(get_url).css('a') rescue []
42
- @processed << get_url
43
- link_elements.each do |page_link|
44
- absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
45
- if absolute_url
46
- resolved_url = resolve(absolute_url)
47
- @fetch_queue << resolved_url if !@processed.include?(resolved_url)
48
- end
49
- end
50
- end
51
-
52
- def resolve(url)
53
- @options[:resolve] ? Request.resolve_url(url) : url
54
- end
55
- end
56
- end