wayback_machine_downloader 0.1.3 → 0.1.5
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +2 -5
- data/lib/wayback_machine_downloader.rb +43 -15
- metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8532168d675aff20ea6578d90cbf8c1087cdbd9a
+  data.tar.gz: e4476b5c8504a2466b42be02c5a1fefe7e898c95
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3527721f2675c6aba366c88b5a70da8b85615d9beefc9db51403a607040d8dfa012ba9f58bb202d6f0b6d5f8b8b4316ee892d8cbc588153da72086df14bbd509
+  data.tar.gz: 240f40fbca00affa948adfcecc67fa8426888c166c926b5b890761502426baf1b4a5f93c12c6a7b64847a4e0276c36cf1e8f3766943c61ac2a3e279ea913b6fd
data/bin/wayback_machine_downloader CHANGED

@@ -2,14 +2,13 @@
 
 require 'wayback_machine_downloader'
 require 'optparse'
-require 'pry-rescue'
 
 options = {}
 option_parser = OptionParser.new do |opts|
   opts.banner = "Usage: wayback_machine_downloader http://example.com"
 
   opts.separator ""
-  opts.separator "Download
+  opts.separator "Download any website from the Wayback Machine."
 
   opts.separator ""
   opts.separator "Optional option:"
@@ -21,10 +20,8 @@ end.parse!
 
 if base_url = ARGV[0]
   wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
-  puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
-  binding.pry
   wayback_machine_downloader.download_files
 else
-  puts "You need to specify a
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
   puts "Run `wayback_machine_downloader --help` for more help."
 end
data/lib/wayback_machine_downloader.rb CHANGED

@@ -3,11 +3,13 @@ require 'fileutils'
 
 class WaybackMachineDownloader
 
+  VERSION = "0.1.5"
+
   attr_accessor :base_url, :timestamp
 
   def initialize params
     @base_url = params[:base_url]
-    @timestamp = params[:timestamp]
+    @timestamp = params[:timestamp].to_i
   end
 
   def backup_name
@@ -18,58 +20,84 @@ class WaybackMachineDownloader
     'websites/' + backup_name + '/'
   end
 
-  def
+  def get_file_list_curated
     file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
     file_list_curated = Hash.new
     file_list_raw.each_line do |line|
       line = line.split(' ')
-
+      file_timestamp = line[1].to_i
       file_url = line[2]
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = URI.unescape file_id
-      if
-
-        file_list_curated[file_id]
+      if @timestamp == 0 or file_timestamp <= @timestamp
+        if file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
-      else
-        file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
       end
     end
     file_list_curated
   end
 
   def download_files
+    puts "Downlading #{@base_url} from Wayback Machine..."
+    puts
+    file_list_curated = get_file_list_curated
+    count = 0
     file_list_curated.each do |file_id, file_remote_info|
-
+      count += 1
       file_url = file_remote_info[:file_url]
       file_path_elements = file_id.split('/')
       if file_id == ""
         dir_path = backup_path
         file_path = backup_path + 'index.html'
-      elsif file_url[-1] == '/'
+      elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
         dir_path = backup_path + file_path_elements[0..-1].join('/')
-        file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
+        file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
      else
         dir_path = backup_path + file_path_elements[0..-2].join('/')
         file_path = backup_path + file_path_elements[0..-1].join('/')
       end
       unless File.exists? file_path
-
+        structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
             open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
-            puts "#{file_url} #
+            puts "#{file_url} # #{e}"
             file.write(e.io.read)
+          rescue Exception => e
+            puts "#{file_url} # #{e}"
           end
         end
-        puts "#{file_url} -> #{file_path}"
+        puts "#{file_url} -> #{file_path} (#{count}/#{file_list_curated.size})"
       else
-        puts "#{file_url} # #{file_path} already exists."
+        puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_curated.size})"
       end
     end
+    puts
+    puts "Download complete, saved in #{backup_path}. (#{file_list_curated.size} files)"
+  end
+
+  def structure_dir_path dir_path
+    begin
+      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+    rescue Errno::EEXIST => e
+      puts "# #{e}"
+      file_already_existing = e.to_s.split("File exists @ dir_s_mkdir - ")[-1]
+      file_already_existing_temporary = file_already_existing + '.temp'
+      file_already_existing_permanent = file_already_existing + '/index.html'
+      FileUtils::mv file_already_existing, file_already_existing_temporary
+      FileUtils::mkdir_p file_already_existing
+      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+      structure_dir_path dir_path
+    end
   end
 
 end
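For reference, the class above can be driven from plain Ruby the same way the updated bin script drives it. A minimal sketch, assuming an illustrative URL and timestamp (neither value comes from the gem itself):

require 'wayback_machine_downloader'

# Illustrative values only. The timestamp is optional; after this release it is
# coerced with .to_i in initialize, so a nil timestamp becomes 0, which
# get_file_list_curated treats as "no upper bound on snapshot age".
wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://example.com',
                                                          timestamp: 20131231235959
# Downloads the curated file list into websites/<backup_name>/ as per backup_path.
wayback_machine_downloader.download_files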