wayback_machine_downloader 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +2 -5
- data/lib/wayback_machine_downloader.rb +43 -15
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8532168d675aff20ea6578d90cbf8c1087cdbd9a
+  data.tar.gz: e4476b5c8504a2466b42be02c5a1fefe7e898c95
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3527721f2675c6aba366c88b5a70da8b85615d9beefc9db51403a607040d8dfa012ba9f58bb202d6f0b6d5f8b8b4316ee892d8cbc588153da72086df14bbd509
+  data.tar.gz: 240f40fbca00affa948adfcecc67fa8426888c166c926b5b890761502426baf1b4a5f93c12c6a7b64847a4e0276c36cf1e8f3766943c61ac2a3e279ea913b6fd
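To check a locally fetched copy of the gem against these published digests, Ruby's standard Digest library is sufficient; the artifact names match the entries above, but the fetch commands and working directory are assumptions for illustration.

```ruby
require 'digest'

# Assumes the gem was fetched and unpacked first, e.g.:
#   gem fetch wayback_machine_downloader -v 0.1.5
#   tar -xf wayback_machine_downloader-0.1.5.gem
%w[metadata.gz data.tar.gz].each do |artifact|
  puts "#{artifact} SHA1:   #{Digest::SHA1.file(artifact).hexdigest}"
  puts "#{artifact} SHA512: #{Digest::SHA512.file(artifact).hexdigest}"
end
```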
data/bin/wayback_machine_downloader
CHANGED

@@ -2,14 +2,13 @@
 
 require 'wayback_machine_downloader'
 require 'optparse'
-require 'pry-rescue'
 
 options = {}
 option_parser = OptionParser.new do |opts|
   opts.banner = "Usage: wayback_machine_downloader http://example.com"
 
   opts.separator ""
-  opts.separator "Download
+  opts.separator "Download any website from the Wayback Machine."
 
   opts.separator ""
   opts.separator "Optional option:"
@@ -21,10 +20,8 @@ end.parse!
 
 if base_url = ARGV[0]
   wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
-  puts "Downlading #{wayback_machine_downloader.base_url} from Wayback Machine..."
-  binding.pry
   wayback_machine_downloader.download_files
 else
-  puts "You need to specify a
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
   puts "Run `wayback_machine_downloader --help` for more help."
 end
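With `pry-rescue` and `binding.pry` removed, the executable now only parses options, builds the downloader, and calls `#download_files` (the progress message moves into the library, as the next file shows). A minimal programmatic equivalent of what the script does, with a placeholder URL:

```ruby
require 'wayback_machine_downloader'

# Same call the executable makes; timestamp is optional and, as of this
# release, coerced with #to_i (nil becomes 0, which means "no cutoff").
downloader = WaybackMachineDownloader.new base_url: 'http://example.com',
                                          timestamp: nil
downloader.download_files
```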
data/lib/wayback_machine_downloader.rb
CHANGED

@@ -3,11 +3,13 @@ require 'fileutils'
 
 class WaybackMachineDownloader
 
+  VERSION = "0.1.5"
+
   attr_accessor :base_url, :timestamp
 
   def initialize params
     @base_url = params[:base_url]
-    @timestamp = params[:timestamp]
+    @timestamp = params[:timestamp].to_i
   end
 
   def backup_name
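The `.to_i` coercion on the constructor argument feeds the snapshot filter added in the next hunk: a missing timestamp becomes `0`, which that filter treats as "no cutoff", while a string such as "20130101" becomes an integer that can be compared against CDX timestamps. Plain Ruby, just to illustrate the coercion:

```ruby
nil.to_i                                 # => 0, i.e. no cutoff was given
"20130101".to_i                          # => 20130101
"20130101000000".to_i <= 20140101000000  # => true, snapshot is old enough to keep
```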
@@ -18,58 +20,84 @@ class WaybackMachineDownloader
     'websites/' + backup_name + '/'
   end
 
-  def
+  def get_file_list_curated
     file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
     file_list_curated = Hash.new
     file_list_raw.each_line do |line|
       line = line.split(' ')
-
+      file_timestamp = line[1].to_i
       file_url = line[2]
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = URI.unescape file_id
-      if
-
-      file_list_curated[file_id]
+      if @timestamp == 0 or file_timestamp <= @timestamp
+        if file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
-      else
-        file_list_curated[file_id] = {file_url: file_url, timestamp: timestamp}
       end
     end
     file_list_curated
   end
 
   def download_files
+    puts "Downlading #{@base_url} from Wayback Machine..."
+    puts
+    file_list_curated = get_file_list_curated
+    count = 0
     file_list_curated.each do |file_id, file_remote_info|
-
+      count += 1
       file_url = file_remote_info[:file_url]
       file_path_elements = file_id.split('/')
      if file_id == ""
         dir_path = backup_path
         file_path = backup_path + 'index.html'
-      elsif file_url[-1] == '/'
+      elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
         dir_path = backup_path + file_path_elements[0..-1].join('/')
-        file_path = backup_path + file_path_elements[0..-1].join('/') + 'index.html'
+        file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
       else
         dir_path = backup_path + file_path_elements[0..-2].join('/')
         file_path = backup_path + file_path_elements[0..-1].join('/')
       end
       unless File.exists? file_path
-
+        structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
             open("http://web.archive.org/web/#{timestamp}id_/#{file_url}") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
-            puts "#{file_url} #
+            puts "#{file_url} # #{e}"
             file.write(e.io.read)
+          rescue Exception => e
+            puts "#{file_url} # #{e}"
           end
         end
-        puts "#{file_url} -> #{file_path}"
+        puts "#{file_url} -> #{file_path} (#{count}/#{file_list_curated.size})"
       else
-        puts "#{file_url} # #{file_path} already exists."
+        puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_curated.size})"
       end
     end
+    puts
+    puts "Download complete, saved in #{backup_path}. (#{file_list_curated.size} files)"
+  end
+
+  def structure_dir_path dir_path
+    begin
+      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+    rescue Errno::EEXIST => e
+      puts "# #{e}"
+      file_already_existing = e.to_s.split("File exists @ dir_s_mkdir - ")[-1]
+      file_already_existing_temporary = file_already_existing + '.temp'
+      file_already_existing_permanent = file_already_existing + '/index.html'
+      FileUtils::mv file_already_existing, file_already_existing_temporary
+      FileUtils::mkdir_p file_already_existing
+      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+      structure_dir_path dir_path
+    end
   end
 
 end
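Taken together, the new `get_file_list_curated` keeps, for each path, the newest snapshot whose CDX timestamp does not exceed the cutoff (a cutoff of 0 means no limit), and `download_files` then requests each file through a `http://web.archive.org/web/<timestamp>id_/<url>` URL, where the `id_` suffix asks the Wayback Machine for the original bytes rather than the rewritten page. A self-contained sketch of that selection rule over made-up CDX rows (the sample data and variable names are illustrative, not part of the gem):

```ruby
# Each CDX row looks like "urlkey timestamp original ..."; only the
# timestamp (field 1) and original URL (field 2) matter here.
rows = [
  "com,example)/about 20120101000000 http://example.com/about",
  "com,example)/about 20130601000000 http://example.com/about",
  "com,example)/about 20150101000000 http://example.com/about"
]
cutoff = 20140101000000 # 0 would mean "keep every snapshot"

curated = {}
rows.each do |row|
  fields = row.split(' ')
  ts, url = fields[1].to_i, fields[2]
  next unless cutoff == 0 || ts <= cutoff
  # Keep the newest snapshot seen so far for this URL, mirroring the
  # unless/else branches in get_file_list_curated.
  curated[url] = { file_url: url, timestamp: ts } if curated[url].nil? || curated[url][:timestamp] <= ts
end

curated.each_value do |info|
  puts "#{info[:file_url]} @ #{info[:timestamp]}"
  # => http://example.com/about @ 20130601000000 (newest snapshot at or before the cutoff)
end
```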