wayback_archiver 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b50fa3a16a416da9fea6e7eb544173f9bd8c10ea
4
- data.tar.gz: cfd8ebea5a0ed30fbc06d038a157e10961a4b5c9
3
+ metadata.gz: c846ec4d2baeb3bc0087387f0501d25587424563
4
+ data.tar.gz: c7d4b89507ba88b2504e2c409ebe81440aaa1eec
5
5
  SHA512:
6
- metadata.gz: 0ab0b992bbc1da9433684c5d4f6266d603f07f73ecaa7c9c3a3fafcb1932bb30077e53d5f2e866990756943bb23638e542d7a9cee80d91e71768467042f26b68
7
- data.tar.gz: 37ab8183c5738c0d1c9540b39395b25c06f80fbb5c8412848892f71a8ada68d5a0611d9fc831a427d638d350fd58c32aac27bf9d19df0eade960359b51b3b4d3
6
+ metadata.gz: 16f09492eebc97040f466702cd5308260524dcd18bc04e289aad9813e48f8cf43fb3a77ed636e48a1fae4932c52e57c5b3316508518f327ac754cbfe036440d6
7
+ data.tar.gz: 8f2842913687e9d917d5ae609c12f598919bf97079d8e9a464ea6f9c679c4ff642708c63c3ab4bf37b30336878d2aab6eef3c73665f7961d971e896aa9a39af2
@@ -1,4 +1,12 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'wayback_archiver'
4
- WaybackArchiver.archive(ARGV[0], ARGV[1])
4
+
5
+ url = ARGV[0]
6
+ from = ARGV[1]
7
+
8
+ if from.nil?
9
+ WaybackArchiver.archive(url)
10
+ else
11
+ WaybackArchiver.archive(url, from)
12
+ end
@@ -1,46 +1,26 @@
1
- require 'net/http'
2
1
  require 'uri'
2
+ require 'net/http'
3
3
  require 'rexml/document'
4
4
 
5
- module WaybackArchiver
6
- WAYBACK_BASE_URL = 'https://web.archive.org/save/'
7
-
8
- def self.archive(source, from)
9
- from = from || :sitemap
10
- urls = Array.new
11
- case from.to_sym
12
- when :sitemap
13
- urls = WaybackArchiver.urls_from_sitemap("#{source}/sitemap.xml")
14
- when :url
15
- urls << source
16
- when :file
17
- urls = WaybackArchiver.urls_from_file(source)
18
- end
19
-
20
- urls.each_with_index do |url, index|
21
- puts "Archiving (#{index + 1}/#{urls.length}): #{url}"
22
- request_url = URI("#{WAYBACK_BASE_URL}#{url}")
23
- res = Net::HTTP.get_response(request_url)
5
+ require 'wayback_archiver/collector'
6
+ require 'wayback_archiver/archive'
7
+ require 'wayback_archiver/request'
24
8
 
25
- puts "#{res.code} => #{res.message}"
9
+ module WaybackArchiver
10
+ BASE_URL = 'https://web.archive.org/save/'
11
+
12
+ def self.archive(source, from = :sitemap)
13
+ urls = case from
14
+ when :sitemap, 'sitemap'
15
+ Collector.urls_from_sitemap("#{source}/sitemap.xml")
16
+ when :url, 'url'
17
+ Array(source)
18
+ when :file, 'file'
19
+ Collector.urls_from_file(source)
20
+ else
21
+ raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file"
26
22
  end
27
- end
28
-
29
- def self.urls_from_sitemap(url)
30
- urls = Array.new
31
- xml_data = Net::HTTP.get_response(URI.parse(url)).body
32
- document = REXML::Document.new(xml_data)
33
-
34
- document.elements.each('urlset/url/loc') { |element| urls << element.text }
35
- urls
36
- end
37
-
38
- def self.urls_from_file(path)
39
- urls = Array.new
40
- text = File.open(path).read
41
- text.gsub!(/\r\n?/, "\n") # Normalize line endings
42
- text.each_line { |line| urls << line.gsub(/\n/, '') }
43
- urls
23
+ Archive.post(urls)
44
24
  end
45
25
 
46
26
  end
@@ -0,0 +1,19 @@
1
+ module WaybackArchiver
2
+ class Archive
3
+
4
+ def self.post(urls)
5
+ urls.each_with_index do |url, index|
6
+ request_url = "#{BASE_URL}#{url}"
7
+ puts "Archiving (#{index + 1}/#{urls.length}): #{url}"
8
+ begin
9
+ res = Request.get_response(request_url)
10
+ puts "#{res.code} => #{res.message}"
11
+ rescue Exception => e
12
+ puts "Error message: #{e.message}"
13
+ puts "Failed to archive: #{url}"
14
+ end
15
+ end
16
+ end
17
+
18
+ end
19
+ end
@@ -0,0 +1,27 @@
1
+ module WaybackArchiver
2
+ class Collector
3
+
4
+ class << self
5
+
6
+ def urls_from_sitemap(url)
7
+ urls = Array.new
8
+ xml_data = Request.get_response(url).body
9
+ document = REXML::Document.new(xml_data)
10
+
11
+ document.elements.each('urlset/url/loc') { |element| urls << element.text }
12
+ urls
13
+ end
14
+
15
+ def urls_from_file(path)
16
+ raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
17
+ urls = Array.new
18
+ text = File.open(path).read
19
+ text.gsub!(/\r\n?/, "\n") # Normalize line endings
20
+ text.each_line { |line| urls << line.gsub(/\n/, '').strip }
21
+ urls.reject(&:empty?)
22
+ end
23
+
24
+ end
25
+
26
+ end
27
+ end
@@ -0,0 +1,9 @@
1
+ module WaybackArchiver
2
+ class Request
3
+
4
+ def self.get_response(url)
5
+ Net::HTTP.get_response(URI.parse(url))
6
+ end
7
+
8
+ end
9
+ end
@@ -1,3 +1,3 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-17 00:00:00.000000000 Z
11
+ date: 2014-07-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -46,15 +46,12 @@ executables:
46
46
  extensions: []
47
47
  extra_rdoc_files: []
48
48
  files:
49
- - .gitignore
50
- - Gemfile
51
- - LICENSE.txt
52
- - README.md
53
- - Rakefile
54
49
  - bin/wayback_archiver
55
- - lib/wayback_archiver.rb
50
+ - lib/wayback_archiver/archive.rb
51
+ - lib/wayback_archiver/collector.rb
52
+ - lib/wayback_archiver/request.rb
56
53
  - lib/wayback_archiver/version.rb
57
- - wayback_archiver.gemspec
54
+ - lib/wayback_archiver.rb
58
55
  homepage: https://github.com/buren/wayback_archiver
59
56
  licenses:
60
57
  - MIT
@@ -77,6 +74,6 @@ requirements: []
77
74
  rubyforge_project:
78
75
  rubygems_version: 2.0.0
79
76
  signing_key:
80
- specification_version: 3
77
+ specification_version: 4
81
78
  summary: Send URLs to Wayback Machine
82
79
  test_files: []
data/.gitignore DELETED
@@ -1,17 +0,0 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp
data/Gemfile DELETED
@@ -1,4 +0,0 @@
1
- source 'https://rubygems.org'
2
-
3
- # Specify your gem's dependencies in wayback_archiver.gemspec
4
- gemspec
@@ -1,22 +0,0 @@
1
- Copyright (c) 2014 Jacob Burenstam
2
-
3
- MIT License
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md DELETED
@@ -1,30 +0,0 @@
1
- # WaybackArchiver
2
- Send URLs to [Wayback Machine](https://archive.org/web/) from [/sitemap.xml](http://www.sitemaps.org), single URL or file with URLs.
3
-
4
- ## Installation
5
- Install the gem:
6
- ```bash
7
- gem install wayback_archiver
8
- ```
9
-
10
- ## Usage
11
-
12
- ```bash
13
- wayback_archiver http://example.com # Send each URL listed in http://example.com/sitemap.xml
14
- wayback_archiver http://example.com/some/path url # Only send http://example.com/some/path to Wayback Machine
15
- wayback_archiver /path/to/some/file file # Send each URL in the file (one URL per line)
16
- ```
17
-
18
- View domain archive: https://web.archive.org/web/*/http://example.com
19
-
20
-
21
- ## Contributing
22
-
23
- 1. Fork it
24
- 2. Create your feature branch (`git checkout -b my-new-feature`)
25
- 3. Commit your changes (`git commit -am 'Add some feature'`)
26
- 4. Push to the branch (`git push origin my-new-feature`)
27
- 5. Create new Pull Request
28
-
29
- ---------
30
- Don't know what a sitemap is? [http://sitemaps.org](http://www.sitemaps.org)
data/Rakefile DELETED
@@ -1 +0,0 @@
1
- require 'bundler/gem_tasks'
@@ -1,34 +0,0 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'wayback_archiver/version'
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = 'wayback_archiver'
8
- spec.version = WaybackArchiver::VERSION
9
-
10
- spec.required_rubygems_version = Gem::Requirement.new('>= 0') if spec.respond_to? :required_rubygems_version=
11
- spec.authors = ['Jacob Burenstam']
12
- spec.email = ['burenstam@gmail.com']
13
- spec.summary = %q{Send URLs to Wayback Machine}
14
- spec.description = %q{Send URLs to Wayback Machine. From: sitemap, file or single URL.}
15
- spec.homepage = 'https://github.com/buren/wayback_archiver'
16
- spec.license = 'MIT'
17
-
18
- spec.files = `git ls-files`.split($/)
19
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21
- spec.require_paths = ['lib']
22
-
23
- if spec.respond_to? :specification_version then
24
- spec.specification_version = 3
25
-
26
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
27
- else
28
- end
29
- else
30
- end
31
-
32
- spec.add_development_dependency 'bundler', '~> 1.3'
33
- spec.add_development_dependency 'rake'
34
- end