wayback_machine_downloader 0.1.2 → 0.1.3
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +27 -1
- data/lib/wayback_machine_downloader.rb +53 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
+  data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
+  data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
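For reference, these digests can be recomputed locally. A minimal sketch, assuming metadata.gz and data.tar.gz have been extracted from the .gem archive into the current directory (the paths are illustrative):

require 'digest'

# Hypothetical local copies of the gem artifacts; adjust the paths as needed.
%w[metadata.gz data.tar.gz].each do |artifact|
  data = File.binread(artifact)
  puts "#{artifact} SHA1:   #{Digest::SHA1.hexdigest(data)}"
  puts "#{artifact} SHA512: #{Digest::SHA512.hexdigest(data)}"
end

If the artifacts come from the 0.1.3 gem, the output should match the + lines above.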
data/bin/wayback_machine_downloader
CHANGED
@@ -1,4 +1,30 @@
 #!/usr/bin/env ruby
 
 require 'wayback_machine_downloader'
-
+require 'optparse'
+require 'pry-rescue'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download a website from Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional option:"
+
+  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
+    options[:timestamp] = t
+  end
+end.parse!
+
+if base_url = ARGV[0]
+  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
+  puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
+  binding.pry
+  wayback_machine_downloader.download_files
+else
+  puts "You need to specify a websire to backup. (ie. http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
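The new OptionParser block is what lets the executable accept -t/--timestamp ahead of the positional URL. A minimal standalone sketch of the same parsing pattern (the argument vector below is made up, not taken from the gem):

require 'optparse'

options = {}
argv = ["-t", "20150806225358", "http://example.com"]  # illustrative input

OptionParser.new do |opts|
  # Same shape as the gem's option: -t is coerced to an Integer timestamp.
  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied") do |t|
    options[:timestamp] = t
  end
end.parse!(argv)

p options[:timestamp]  # => 20150806225358 (an Integer)
p argv                 # => ["http://example.com"]

parse! strips the recognized option from the argument vector, leaving the positional URL behind, just as base_url = ARGV[0] expects in the real executable.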
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -1,28 +1,75 @@
 require 'open-uri'
+require 'fileutils'
 
 class WaybackMachineDownloader
 
-  attr_accessor :base_url
+  attr_accessor :base_url, :timestamp
 
   def initialize params
     @base_url = params[:base_url]
+    @timestamp = params[:timestamp]
+  end
+
+  def backup_name
+    @base_url.split('/')[2]
+  end
+
+  def backup_path
+    'websites/' + backup_name + '/'
   end
 
   def file_list_curated
-    file_list_raw = open "http://web.archive.org/
+    file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
     file_list_curated = Hash.new
     file_list_raw.each_line do |line|
       line = line.split(' ')
       timestamp = line[1].to_i
       file_url = line[2]
-
-
-
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = URI.unescape file_id
+      if file_list_curated[file_id]
+        unless file_list_curated[file_id][:timestamp] > timestamp
+          file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
         end
       else
-        file_list_curated[
+        file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
      end
     end
     file_list_curated
   end
+
+  def download_files
+    file_list_curated.each do |file_id, file_remote_info|
+      timestamp = file_remote_info[:timestamp]
+      file_url = file_remote_info[:file_url]
+      file_path_elements = file_id.split('/')
+      if file_id == ""
+        dir_path = backup_path
+        file_path = backup_path + 'index.html'
+      elsif file_url[-1] == '/'
+        dir_path = backup_path + file_path_elements[0..-1].join('/')
+        file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
+      else
+        dir_path = backup_path + file_path_elements[0..-2].join('/')
+        file_path = backup_path + file_path_elements[0..-1].join('/')
+      end
+      unless File.exists? file_path
+        FileUtils::mkdir_p dir_path unless File.exists? dir_path
+        open(file_path, "wb") do |file|
+          begin
+            open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
+              file.write(uri.read)
+            end
+          rescue OpenURI::HTTPError => e
+            puts "#{file_url} # 404"
+            file.write(e.io.read)
+          end
+        end
+        puts "#{file_url} -> #{file_path}"
+      else
+        puts "#{file_url} # #{file_path} already exists."
+      end
+    end
+  end
+
 end
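file_list_curated keys each capture by its path and keeps only the newest timestamp per path; download_files then fetches every entry through the Wayback Machine's id_ (raw content) URL form. A minimal standalone sketch of that per-file fetch, with made-up values for the timestamp, URL, and local path:

require 'open-uri'
require 'fileutils'

# Illustrative values, not taken from the gem:
timestamp = 20150806225358
file_url  = "http://example.com/assets/logo.png"
file_path = "websites/example.com/assets/logo.png"

FileUtils.mkdir_p File.dirname(file_path)

# Same snapshot URL shape as download_files: /web/<timestamp>id_/<original url>.
# (On Ruby 3+, open-uri no longer patches Kernel#open; use URI.open for the remote read.)
open(file_path, "wb") do |file|
  open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
    file.write(uri.read)
  end
end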