wayback_archiver 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3df67fcb7faa06087ad72e6c57ff985d9dbe4fe9
4
- data.tar.gz: 5c21d0bf3353dd2ce732cef3b59cd3ba807acc8a
3
+ metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
4
+ data.tar.gz: 629963ef30283123820418a714ae73160fa20293
5
5
  SHA512:
6
- metadata.gz: d21d169fb9fa143b777d771a8cdc5ba8c3c4186f6472aa8f2ff2806034becf18e3cc260c7ce0a9b97d1148a2d1781d9ddc0daef08d7abe1d2b13140125513231
7
- data.tar.gz: 15a413ab5bd0f51c40640cd0be764142ca0380315f97fc335cf1bf19f4645b327dc0716400c6c10af7a79b8ceb898b6e0f89a21114865af7f84da1e2bf3d383c
6
+ metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
7
+ data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
@@ -4,19 +4,36 @@ require 'uri'
4
4
  require 'net/http'
5
5
 
6
6
  require 'wayback_archiver/version'
7
- require 'wayback_archiver/collector'
7
+ require 'wayback_archiver/url_collector'
8
8
  require 'wayback_archiver/archive'
9
9
  require 'wayback_archiver/request'
10
10
 
11
+ # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
11
12
  module WaybackArchiver
12
- def self.archive(source, from = :crawl)
13
- urls = case from.to_s
14
- when 'file' then Archive.post(Collector.urls_from_file(source))
15
- when 'crawl' then Collector.urls_from_crawl(source) { |url| Archive.post_url(url) }
16
- when 'sitemap' then Archive.post(Collector.urls_from_sitemap("#{source}/sitemap.xml"))
13
+ # Link to gem on rubygems.org, part of the sent User-Agent
14
+ INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
15
+ # WaybackArchiver User-Agent
16
+ USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
17
+
18
+ # Send URLs to Wayback Machine.
19
+ # @return [Array] with URLs sent to the Wayback Machine.
20
+ # @param [String] source for URL(s).
21
+ # @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
22
+ # @example Crawl example.com and send all URLs of the same domain
23
+ # WaybackArchiver.archive('example.com') # Default type is :crawl
24
+ # WaybackArchiver.archive('example.com', :crawl)
25
+ # @example Send only example.com
26
+ # WaybackArchiver.archive('example.com', :url)
27
+ # @example Send URL on each line in specified file
28
+ # WaybackArchiver.archive('/path/to/file', :file)
29
+ def self.archive(source, type = :crawl)
30
+ case type.to_s
31
+ when 'file' then Archive.post(UrlCollector.file(source))
32
+ when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
33
+ when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
17
34
  when 'url' then Archive.post_url(Request.resolve_url(source))
18
35
  else
19
- raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
36
+ raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
20
37
  end
21
38
  end
22
39
  end
@@ -1,27 +1,47 @@
1
1
  module WaybackArchiver
2
+ # Post URL(s) to Wayback Machine
2
3
  class Archive
3
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'
4
- MAX_THREAD_COUNT = 10
4
+ # Wayback Machine base URL.
5
+ WAYBACK_BASE_URL = 'https://web.archive.org/save/'
6
+ # Default concurrency for archiving URLs
7
+ DEFAULT_CONCURRENCY = 10
8
+ class << self
9
+ # Send URLs to Wayback Machine.
10
+ # @return [Array] with sent URLs.
11
+ # @param [Array] urls URLs to send.
12
+ # @param [Hash] options
13
+ # @example Archive example.com, with default options
14
+ # Archive.post(['http://example.com'])
15
+ # @example Archive example.com, using only 1 thread
16
+ # Archive.post(['http://example.com'], concurrency: 1)
17
+ def post(urls, options = {})
18
+ options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
19
+ concurrency = options[:concurrency]
20
+ puts "Request are sent with up to #{concurrency} parallel threads"
21
+ puts "Total urls to be sent: #{urls.length}"
22
+ group_size = (urls.length / concurrency) + 1
23
+ urls.each_slice(group_size).to_a.map! do |archive_urls|
24
+ Thread.new { archive_urls.each { |url| post_url(url) } }
25
+ end.each(&:join)
26
+ puts "#{urls.length} URLs sent to Internet archive"
27
+ urls
28
+ end
5
29
 
6
- def self.post(urls)
7
- puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
8
- puts "Total urls to be sent: #{urls.length}"
9
- group_size = (urls.length / MAX_THREAD_COUNT) + 1
10
- urls.each_slice(group_size).to_a.map! do |archive_urls|
11
- Thread.new { archive_urls.each { |url| post_url(url) } }
12
- end.each(&:join)
13
- puts "#{urls.length} URLs sent to Internet archive"
14
- urls
15
- end
16
-
17
- def self.post_url(archive_url)
18
- resolved_url = Request.resolve_url(archive_url)
19
- request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
20
- response = Request.get_response(request_url)
21
- puts "[#{response.code}, #{response.message}] #{resolved_url}"
22
- rescue Exception => e
23
- puts "Error message: #{e.message}"
24
- puts "Failed to archive: #{resolved_url}"
30
+ # Send URL to Wayback Machine.
31
+ # @return [String] the sent URL.
32
+ # @param [String] url to send.
33
+ # @example Archive example.com, with default options
34
+ # Archive.post_url('http://example.com')
35
+ def post_url(url)
36
+ resolved_url = Request.resolve_url(url)
37
+ request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
38
+ response = Request.response(request_url)
39
+ puts "[#{response.code}, #{response.message}] #{resolved_url}"
40
+ resolved_url
41
+ rescue Exception => e
42
+ puts "Error message: #{e.message}"
43
+ puts "Failed to archive: #{resolved_url}"
44
+ end
25
45
  end
26
46
  end
27
47
  end
@@ -1,34 +1,65 @@
1
1
  require 'url_resolver' # TODO: Allow users to use any resolver
2
2
 
3
3
  module WaybackArchiver
4
+ # Request and parse HTML & XML documents
4
5
  class Request
5
- INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
6
- USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
7
-
8
6
  class << self
9
- def get_page(url, document_type = :html)
10
- response = Request.get_response(url).body
11
- case document_type
12
- when :xml
13
- Nokogiri::XML(response)
14
- else
15
- Nokogiri::HTML(response)
16
- end
7
+ # Get and parse HTML & XML documents.
8
+ # @return [Nokogiri::HTML::Document] the response body parsed with Nokogiri::HTML.
9
+ # @param [String] url to retrieve and parse.
10
+ # @example Request and parse example.com
11
+ # Request.document('example.com')
12
+ # @example Request and parse google.com/sitemap.xml
13
+ # Request.document('google.com/sitemap.xml')
14
+ def document(url)
15
+ response_body = Request.response(url).body
16
+ Nokogiri::HTML(response_body)
17
17
  end
18
18
 
19
- def get_response(url, resolve = false)
19
+ # Get response.
20
+ # @return [Net::HTTP*] the http response.
21
+ # @param [String] url URL to retrieve.
22
+ # @param [Boolean] resolve whether to resolve the URL.
23
+ # @example Resolve example.com and request
24
+ # Request.response('example.com', true)
25
+ # @example Request http://example.com
26
+ # Request.response('http://example.com', false)
27
+ def response(url, resolve = true)
20
28
  resolved_url = resolve ? resolve_url(url) : url
21
29
  uri = URI.parse(resolved_url)
22
30
  http = Net::HTTP.new(uri.host, uri.port)
23
- http.use_ssl = true if resolved_url.include?('https://')
31
+ http.use_ssl = true if resolved_url.start_with?('https://')
24
32
 
25
33
  request = Net::HTTP::Get.new(uri.request_uri)
26
- request['User-Agent'] = USER_AGENT
34
+ request['User-Agent'] = WaybackArchiver::USER_AGENT
27
35
  http.request(request)
28
36
  end
29
37
 
38
+ # Resolve the URL, follows redirects.
39
+ # @return [String] the resolved URL.
40
+ # @param [String] url to retrieve.
41
+ # @example Resolve example.com and request
42
+ # Request.resolve_url('example.com')
30
43
  def resolve_url(url)
31
- UrlResolver.resolve(url)
44
+ resolved = UrlResolver.resolve(url)
45
+ resolved = resolved.prepend('http://') unless has_protocol?(resolved)
46
+ resolved
47
+ end
48
+
49
+ private
50
+
51
+ # Check whether the string starts with an http:// or https:// protocol.
52
+ # @return [Boolean] true if string includes protocol.
53
+ # @param [String] url to check.
54
+ # @example Check if string includes protocol
55
+ # Request.has_protocol?('example.com')
56
+ # # => false
57
+ # Request.has_protocol?('https://example.com')
58
+ # # => true
59
+ # Request.has_protocol?('http://example.com')
60
+ # # => true
61
+ def has_protocol?(url)
62
+ url.start_with?('http://') || url.start_with?('https://')
32
63
  end
33
64
  end
34
65
  end
@@ -0,0 +1,40 @@
1
+ module WaybackArchiver
2
+ # Retrieve URLs from different sources
3
+ class UrlCollector
4
+ class << self
5
+ # Retrieve URLs from Sitemap.
6
+ # @return [Array] of URLs defined in Sitemap.
7
+ # @param [String] url domain to retrieve Sitemap from.
8
+ # @example Get URLs defined in Sitemap for google.com
9
+ # UrlCollector.sitemap('https://google.com')
10
+ def sitemap(url)
11
+ resolved = Request.resolve_url("#{url}/sitemap.xml")
12
+ sitemap = Request.document(resolved)
13
+ sitemap.css('loc').map { |element| element.text }
14
+ end
15
+
16
+ # Retrieve URLs by crawling.
17
+ # @return [Array] of URLs found during crawl.
18
+ # @param [String] url domain to crawl URLs from.
19
+ # @example Crawl URLs defined on example.com
20
+ # UrlCollector.crawl('http://example.com')
21
+ def crawl(url)
22
+ SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
23
+ end
24
+
25
+ # Retrieve URLs listed in file.
26
+ # @return [Array] of URLs defined in file.
27
+ # @param [String] path to get URLs from.
28
+ # @example Get URLs defined in /path/to/file
29
+ # UrlCollector.file('/path/to/file')
30
+ def file(path)
31
+ raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
32
+ urls = []
33
+ File.open(path).read
34
+ .gsub(/\r\n?/, "\n")
35
+ .each_line { |line| urls << line.gsub(/\n/, '').strip }
36
+ urls.reject(&:empty?)
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,3 +1,4 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.10'
2
+ # Gem version
3
+ VERSION = '0.0.11'
3
4
  end
metadata CHANGED
@@ -1,113 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-22 00:00:00.000000000 Z
11
+ date: 2015-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: site_mapper
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: url_resolver
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
61
  version: '10.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '10.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ~>
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
75
  version: '3.1'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ~>
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '3.1'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: '0.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ~>
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0.8'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: coveralls
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - ~>
101
+ - - "~>"
102
102
  - !ruby/object:Gem::Version
103
103
  version: '0.7'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - ~>
108
+ - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: redcarpet
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.2'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.2'
111
125
  description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
112
126
  email:
113
127
  - burenstam@gmail.com
@@ -117,11 +131,11 @@ extensions: []
117
131
  extra_rdoc_files: []
118
132
  files:
119
133
  - bin/wayback_archiver
134
+ - lib/wayback_archiver.rb
120
135
  - lib/wayback_archiver/archive.rb
121
- - lib/wayback_archiver/collector.rb
122
136
  - lib/wayback_archiver/request.rb
137
+ - lib/wayback_archiver/url_collector.rb
123
138
  - lib/wayback_archiver/version.rb
124
- - lib/wayback_archiver.rb
125
139
  homepage: https://github.com/buren/wayback_archiver
126
140
  licenses:
127
141
  - MIT
@@ -132,17 +146,17 @@ require_paths:
132
146
  - lib
133
147
  required_ruby_version: !ruby/object:Gem::Requirement
134
148
  requirements:
135
- - - '>='
149
+ - - ">="
136
150
  - !ruby/object:Gem::Version
137
151
  version: 1.9.3
138
152
  required_rubygems_version: !ruby/object:Gem::Requirement
139
153
  requirements:
140
- - - '>='
154
+ - - ">="
141
155
  - !ruby/object:Gem::Version
142
156
  version: '0'
143
157
  requirements: []
144
158
  rubyforge_project:
145
- rubygems_version: 2.0.0
159
+ rubygems_version: 2.2.2
146
160
  signing_key:
147
161
  specification_version: 4
148
162
  summary: Send URLs to Wayback Machine
@@ -1,24 +0,0 @@
1
- module WaybackArchiver
2
- class Collector
3
- class << self
4
- def urls_from_sitemap(url)
5
- resolved = Request.resolve_url(url)
6
- sitemap = Request.get_page(resolved)
7
- sitemap.css('loc').map! { |element| element.text }
8
- end
9
-
10
- def urls_from_crawl(url)
11
- SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
12
- end
13
-
14
- def urls_from_file(path)
15
- raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
16
- urls = []
17
- text = File.open(path).read
18
- text.gsub!(/\r\n?/, "\n")
19
- .each_line { |line| urls << line.gsub!(/\n/, '').strip }
20
- urls.reject(&:empty?)
21
- end
22
- end
23
- end
24
- end