wayback_archiver 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3df67fcb7faa06087ad72e6c57ff985d9dbe4fe9
4
- data.tar.gz: 5c21d0bf3353dd2ce732cef3b59cd3ba807acc8a
3
+ metadata.gz: fa0d72d6164b6b280db7c683535e6894ba64778d
4
+ data.tar.gz: 629963ef30283123820418a714ae73160fa20293
5
5
  SHA512:
6
- metadata.gz: d21d169fb9fa143b777d771a8cdc5ba8c3c4186f6472aa8f2ff2806034becf18e3cc260c7ce0a9b97d1148a2d1781d9ddc0daef08d7abe1d2b13140125513231
7
- data.tar.gz: 15a413ab5bd0f51c40640cd0be764142ca0380315f97fc335cf1bf19f4645b327dc0716400c6c10af7a79b8ceb898b6e0f89a21114865af7f84da1e2bf3d383c
6
+ metadata.gz: 1021437d5c97ecfad14822548522cad49d3257992c91ee8ccf90a30e3cd4cd98fae6c44d977fc3c1cd4f61334bd7c935fcc50bdee92a68a4399470d4610836be
7
+ data.tar.gz: 5c805b9bb8b7cc1123be943966b2f4e94619580d17eb051947d38b1e3d4a165b2c61dd3f117c6bc7c8d858768731108bb1aab1cc59472ad0a3e120fb8df9cae4
@@ -4,19 +4,36 @@ require 'uri'
4
4
  require 'net/http'
5
5
 
6
6
  require 'wayback_archiver/version'
7
- require 'wayback_archiver/collector'
7
+ require 'wayback_archiver/url_collector'
8
8
  require 'wayback_archiver/archive'
9
9
  require 'wayback_archiver/request'
10
10
 
11
+ # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
11
12
  module WaybackArchiver
12
- def self.archive(source, from = :crawl)
13
- urls = case from.to_s
14
- when 'file' then Archive.post(Collector.urls_from_file(source))
15
- when 'crawl' then Collector.urls_from_crawl(source) { |url| Archive.post_url(url) }
16
- when 'sitemap' then Archive.post(Collector.urls_from_sitemap("#{source}/sitemap.xml"))
13
+ # Link to gem on rubygems.org, part of the sent User-Agent
14
+ INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
15
+ # WaybackArchiver User-Agent
16
+ USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
17
+
18
+ # Send URLs to Wayback Machine.
19
+ # @return [Array] with URLs sent to the Wayback Machine.
20
+ # @param [String] source for URL(s).
21
+ # @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
22
+ # @example Crawl example.com and send all URLs of the same domain
23
+ # WaybackArchiver.archive('example.com') # Default type is :crawl
24
+ # WaybackArchiver.archive('example.com', :crawl)
25
+ # @example Send only example.com
26
+ # WaybackArchiver.archive('example.com', :url)
27
+ # @example Send URL on each line in specified file
28
+ # WaybackArchiver.archive('/path/to/file', :file)
29
+ def self.archive(source, type = :crawl)
30
+ case type.to_s
31
+ when 'file' then Archive.post(UrlCollector.file(source))
32
+ when 'crawl' then UrlCollector.crawl(source) { |url| Archive.post_url(url) }
33
+ when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
17
34
  when 'url' then Archive.post_url(Request.resolve_url(source))
18
35
  else
19
- raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawl"
36
+ raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
20
37
  end
21
38
  end
22
39
  end
@@ -1,27 +1,47 @@
1
1
  module WaybackArchiver
2
+ # Post URL(s) to Wayback Machine
2
3
  class Archive
3
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'
4
- MAX_THREAD_COUNT = 10
4
+ # Wayback Machine base URL.
5
+ WAYBACK_BASE_URL = 'https://web.archive.org/save/'
6
+ # Default concurrency for archiving URLs
7
+ DEFAULT_CONCURRENCY = 10
8
+ class << self
9
+ # Send URLs to Wayback Machine.
10
+ # @return [Array] with sent URLs.
11
+ # @param [Array] urls URLs to send.
12
+ # @param [Hash] options
13
+ # @example Archive example.com, with default options
14
+ # Archive.post(['http://example.com'])
15
+ # @example Archive example.com, using only 1 thread
16
+ # Archive.post(['http://example.com'], concurrency: 1)
17
+ def post(urls, options = {})
18
+ options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
19
+ concurrency = options[:concurrency]
20
+ puts "Request are sent with up to #{concurrency} parallel threads"
21
+ puts "Total urls to be sent: #{urls.length}"
22
+ group_size = (urls.length / concurrency) + 1
23
+ urls.each_slice(group_size).to_a.map! do |archive_urls|
24
+ Thread.new { archive_urls.each { |url| post_url(url) } }
25
+ end.each(&:join)
26
+ puts "#{urls.length} URLs sent to Internet archive"
27
+ urls
28
+ end
5
29
 
6
- def self.post(urls)
7
- puts "Request are sent with up to #{MAX_THREAD_COUNT} parallel threads"
8
- puts "Total urls to be sent: #{urls.length}"
9
- group_size = (urls.length / MAX_THREAD_COUNT) + 1
10
- urls.each_slice(group_size).to_a.map! do |archive_urls|
11
- Thread.new { archive_urls.each { |url| post_url(url) } }
12
- end.each(&:join)
13
- puts "#{urls.length} URLs sent to Internet archive"
14
- urls
15
- end
16
-
17
- def self.post_url(archive_url)
18
- resolved_url = Request.resolve_url(archive_url)
19
- request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
20
- response = Request.get_response(request_url)
21
- puts "[#{response.code}, #{response.message}] #{resolved_url}"
22
- rescue Exception => e
23
- puts "Error message: #{e.message}"
24
- puts "Failed to archive: #{resolved_url}"
30
+ # Send URL to Wayback Machine.
31
+ # @return [String] the sent URL.
32
+ # @param [String] url to send.
33
+ # @example Archive example.com, with default options
34
+ # Archive.post_url('http://example.com')
35
+ def post_url(url)
36
+ resolved_url = Request.resolve_url(url)
37
+ request_url = "#{WAYBACK_BASE_URL}#{resolved_url}"
38
+ response = Request.response(request_url)
39
+ puts "[#{response.code}, #{response.message}] #{resolved_url}"
40
+ resolved_url
41
+ rescue Exception => e
42
+ puts "Error message: #{e.message}"
43
+ puts "Failed to archive: #{resolved_url}"
44
+ end
25
45
  end
26
46
  end
27
47
  end
@@ -1,34 +1,65 @@
1
1
  require 'url_resolver' # TODO: Allow users to use any resolver
2
2
 
3
3
  module WaybackArchiver
4
+ # Request and parse HTML & XML documents
4
5
  class Request
5
- INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
6
- USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
7
-
8
6
  class << self
9
- def get_page(url, document_type = :html)
10
- response = Request.get_response(url).body
11
- case document_type
12
- when :xml
13
- Nokogiri::XML(response)
14
- else
15
- Nokogiri::HTML(response)
16
- end
7
+ # Get and parse HTML & XML documents.
8
+ # @return [Nokogiri::HTML::Document] the parsed document.
9
+ # @param [String] url to retrieve and parse.
10
+ # @example Request and parse example.com
11
+ # Request.document('example.com')
12
+ # @example Request and parse google.com/sitemap.xml
13
+ # Request.document('google.com/sitemap.xml')
14
+ def document(url)
15
+ response_body = Request.response(url).body
16
+ Nokogiri::HTML(response_body)
17
17
  end
18
18
 
19
- def get_response(url, resolve = false)
19
+ # Get response.
20
+ # @return [Net::HTTP*] the http response.
21
+ # @param [String] url URL to retrieve.
22
+ # @param [Boolean] resolve whether to resolve the URL.
23
+ # @example Resolve example.com and request
24
+ # Request.response('example.com', true)
25
+ # @example Request http://example.com
26
+ # Request.response('http://example.com', false)
27
+ def response(url, resolve = true)
20
28
  resolved_url = resolve ? resolve_url(url) : url
21
29
  uri = URI.parse(resolved_url)
22
30
  http = Net::HTTP.new(uri.host, uri.port)
23
- http.use_ssl = true if resolved_url.include?('https://')
31
+ http.use_ssl = true if resolved_url.start_with?('https://')
24
32
 
25
33
  request = Net::HTTP::Get.new(uri.request_uri)
26
- request['User-Agent'] = USER_AGENT
34
+ request['User-Agent'] = WaybackArchiver::USER_AGENT
27
35
  http.request(request)
28
36
  end
29
37
 
38
+ # Resolve the URL, follows redirects.
39
+ # @return [String] the resolved URL.
40
+ # @param [String] url to retrieve.
41
+ # @example Resolve example.com and request
42
+ # Request.resolve_url('example.com')
30
43
  def resolve_url(url)
31
- UrlResolver.resolve(url)
44
+ resolved = UrlResolver.resolve(url)
45
+ resolved = resolved.prepend('http://') unless has_protocol?(resolved)
46
+ resolved
47
+ end
48
+
49
+ private
50
+
51
+ # Check whether the given string starts with a protocol (http:// or https://).
52
+ # @return [Boolean] true if string includes protocol.
53
+ # @param [String] url to check.
54
+ # @example Check if string includes protocol
55
+ # Request.has_protocol?('example.com')
56
+ # # => false
57
+ # Request.has_protocol?('https://example.com')
58
+ # # => true
59
+ # Request.has_protocol?('http://example.com')
60
+ # # => true
61
+ def has_protocol?(url)
62
+ url.start_with?('http://') || url.start_with?('https://')
32
63
  end
33
64
  end
34
65
  end
@@ -0,0 +1,40 @@
1
+ module WaybackArchiver
2
+ # Retrieve URLs from different sources
3
+ class UrlCollector
4
+ class << self
5
+ # Retrieve URLs from Sitemap.
6
+ # @return [Array] of URLs defined in Sitemap.
7
+ # @param [String] url domain to retrieve Sitemap from.
8
+ # @example Get URLs defined in Sitemap for google.com
9
+ # UrlCollector.sitemap('https://google.com')
10
+ def sitemap(url)
11
+ resolved = Request.resolve_url("#{url}/sitemap.xml")
12
+ sitemap = Request.document(resolved)
13
+ sitemap.css('loc').map { |element| element.text }
14
+ end
15
+
16
+ # Retrieve URLs by crawling.
17
+ # @return [Array] of URLs found during crawl.
18
+ # @param [String] url domain to crawl URLs from.
19
+ # @example Crawl URLs defined on example.com
20
+ # UrlCollector.crawl('http://example.com')
21
+ def crawl(url)
22
+ SiteMapper.map(url, user_agent: WaybackArchiver::USER_AGENT) { |new_url| yield(new_url) if block_given? }
23
+ end
24
+
25
+ # Retrieve URLs listed in file.
26
+ # @return [Array] of URLs defined in file.
27
+ # @param [String] path to get URLs from.
28
+ # @example Get URLs defined in /path/to/file
29
+ # UrlCollector.file('/path/to/file')
30
+ def file(path)
31
+ raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
32
+ urls = []
33
+ File.open(path).read
34
+ .gsub(/\r\n?/, "\n")
35
+ .each_line { |line| urls << line.gsub(/\n/, '').strip }
36
+ urls.reject(&:empty?)
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,3 +1,4 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.10'
2
+ # Gem version
3
+ VERSION = '0.0.11'
3
4
  end
metadata CHANGED
@@ -1,113 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-22 00:00:00.000000000 Z
11
+ date: 2015-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: site_mapper
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: url_resolver
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.1'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
61
  version: '10.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '10.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ~>
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
75
  version: '3.1'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ~>
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '3.1'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: '0.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ~>
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0.8'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: coveralls
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - ~>
101
+ - - "~>"
102
102
  - !ruby/object:Gem::Version
103
103
  version: '0.7'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - ~>
108
+ - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: redcarpet
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.2'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.2'
111
125
  description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
112
126
  email:
113
127
  - burenstam@gmail.com
@@ -117,11 +131,11 @@ extensions: []
117
131
  extra_rdoc_files: []
118
132
  files:
119
133
  - bin/wayback_archiver
134
+ - lib/wayback_archiver.rb
120
135
  - lib/wayback_archiver/archive.rb
121
- - lib/wayback_archiver/collector.rb
122
136
  - lib/wayback_archiver/request.rb
137
+ - lib/wayback_archiver/url_collector.rb
123
138
  - lib/wayback_archiver/version.rb
124
- - lib/wayback_archiver.rb
125
139
  homepage: https://github.com/buren/wayback_archiver
126
140
  licenses:
127
141
  - MIT
@@ -132,17 +146,17 @@ require_paths:
132
146
  - lib
133
147
  required_ruby_version: !ruby/object:Gem::Requirement
134
148
  requirements:
135
- - - '>='
149
+ - - ">="
136
150
  - !ruby/object:Gem::Version
137
151
  version: 1.9.3
138
152
  required_rubygems_version: !ruby/object:Gem::Requirement
139
153
  requirements:
140
- - - '>='
154
+ - - ">="
141
155
  - !ruby/object:Gem::Version
142
156
  version: '0'
143
157
  requirements: []
144
158
  rubyforge_project:
145
- rubygems_version: 2.0.0
159
+ rubygems_version: 2.2.2
146
160
  signing_key:
147
161
  specification_version: 4
148
162
  summary: Send URLs to Wayback Machine
@@ -1,24 +0,0 @@
1
- module WaybackArchiver
2
- class Collector
3
- class << self
4
- def urls_from_sitemap(url)
5
- resolved = Request.resolve_url(url)
6
- sitemap = Request.get_page(resolved)
7
- sitemap.css('loc').map! { |element| element.text }
8
- end
9
-
10
- def urls_from_crawl(url)
11
- SiteMapper.map(url) { |new_url| yield(new_url) if block_given? }
12
- end
13
-
14
- def urls_from_file(path)
15
- raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
16
- urls = []
17
- text = File.open(path).read
18
- text.gsub!(/\r\n?/, "\n")
19
- .each_line { |line| urls << line.gsub!(/\n/, '').strip }
20
- urls.reject(&:empty?)
21
- end
22
- end
23
- end
24
- end