wayback_machine_downloader 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +27 -1
- data/lib/wayback_machine_downloader.rb +53 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ace9b823151cb8ec95c7b942cbe07a1de388218
|
4
|
+
data.tar.gz: 7a4af6e514823ddc120272ac3d236ea435747e22
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e76d5c4fb1cb024619e5eb90fdc43b4a71d27b0430b8f5251a07ac0b7367c8247a6c2ed29660d5e38e45cd7614d20f250eec3ea6e4633e35b0e6bc740869e6a2
|
7
|
+
data.tar.gz: 5f9307192fecf31509894dc86c4c98b07d9cd5aa28ae046b0d93d797b08eb7e019c9d59023f35c4e625826d4987b8c1716916759df94bc0394e2af726580fddd
|
@@ -1,4 +1,30 @@
|
|
1
1
|
#!/usr/bin/env ruby

# Command-line entry point: parses the options, then downloads the website
# given as first argument from the Wayback Machine.

require 'wayback_machine_downloader'
require 'optparse'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download a website from Wayback Machine."

  opts.separator ""
  opts.separator "Optional option:"

  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
    options[:timestamp] = t
  end
end
# parse! strips the recognised switches from ARGV, so ARGV[0] below is the
# first positional argument.
option_parser.parse!

base_url = ARGV[0]
if base_url
  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
  puts "Downloading #{wayback_machine_downloader.base_url} from Wayback Machine..."
  # No debugger hooks here: the executable must run unattended and must not
  # need debugging gems at runtime.
  wayback_machine_downloader.download_files
else
  puts "You need to specify a website to backup. (ie. http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end
|
@@ -1,28 +1,75 @@
|
|
1
1
|
require 'open-uri'
|
2
|
+
require 'fileutils'
|
2
3
|
|
3
4
|
# Downloads a website from the Wayback Machine (web.archive.org) into a local
# "websites/<host>/" directory, keeping for every file the snapshot with the
# greatest timestamp found in the index.
class WaybackMachineDownloader

  attr_accessor :base_url, :timestamp

  # params - Hash with the keys:
  #   :base_url  - URL of the website to back up (ie. "http://example.com").
  #   :timestamp - optional upper bound (ie. 20150806225358); snapshots with a
  #                greater timestamp are ignored. nil means no limit.
  def initialize(params)
    @base_url = params[:base_url]
    @timestamp = params[:timestamp]
  end

  # Returns the host part of base_url, used as the backup directory name.
  # A base_url given without a scheme ("example.com/path") is supported too.
  def backup_name
    if @base_url.include?('//')
      @base_url.split('/')[2]
    else
      @base_url.split('/').first
    end
  end

  # Returns the relative directory (with trailing slash) receiving the files.
  def backup_path
    'websites/' + backup_name + '/'
  end

  # Fetches the snapshot index for base_url and returns a Hash mapping each
  # file id (unescaped path relative to the site root, "" for the root) to
  # {file_url: String, timestamp: Integer} for the retained snapshot.
  def file_list_curated
    newest_allowed = @timestamp && @timestamp.to_i
    curated = {}
    # Block form so the index response is closed once it has been read.
    open_remote("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*") do |file_list_raw|
      file_list_raw.each_line do |line|
        # The second field is read as the snapshot timestamp and the third
        # as the archived file's URL.
        fields = line.split(' ')
        file_timestamp = fields[1].to_i
        file_url = fields[2]
        # Blank or truncated lines carry no URL.
        next unless file_url
        next if newest_allowed && file_timestamp > newest_allowed
        file_id = (file_url.split('/')[3..-1] || []).join('/')
        # URI.unescape no longer exists in current Ruby; the parser object
        # provides the same plain percent-decoding.
        file_id = URI::DEFAULT_PARSER.unescape(file_id)
        known = curated[file_id]
        # Keep the already stored snapshot only when it is strictly newer.
        next if known && known[:timestamp] > file_timestamp
        curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
      end
    end
    curated
  end

  # Downloads every file returned by file_list_curated below backup_path.
  # Files that already exist locally are left untouched. On an HTTP error the
  # body of the error response is written to the file instead.
  def download_files
    file_list_curated.each do |file_id, file_remote_info|
      file_timestamp = file_remote_info[:timestamp]
      file_url = file_remote_info[:file_url]
      dir_path, file_path = local_paths_for(file_id, file_url)
      if File.exist?(file_path)
        puts "#{file_url} # #{file_path} already exists."
        next
      end
      FileUtils.mkdir_p(dir_path)
      File.open(file_path, "wb") do |file|
        begin
          # NOTE(review): the "id_" suffix presumably requests the original,
          # unrewritten file from the Wayback Machine - confirm.
          open_remote("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}") do |remote|
            file.write(remote.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # 404"
          file.write(e.io.read)
        end
      end
      puts "#{file_url} -> #{file_path}"
    end
  end

  private

  # Returns [dir_path, file_path] for a remote file. The site root and URLs
  # ending in "/" are stored as "index.html" inside their directory.
  def local_paths_for(file_id, file_url)
    file_path_elements = file_id.split('/')
    if file_id == ""
      [backup_path, backup_path + 'index.html']
    elsif file_url[-1] == '/'
      dir_path = backup_path + file_path_elements.join('/')
      # The explicit separator keeps "about/" from becoming "aboutindex.html".
      [dir_path, dir_path + '/index.html']
    else
      [backup_path + file_path_elements[0..-2].join('/'), backup_path + file_path_elements.join('/')]
    end
  end

  # Opens url through open-uri, passing the block along when one is given.
  # Uses URI.open when available and falls back to Kernel#open on Rubies
  # that predate it.
  def open_remote(url, &block)
    if URI.respond_to?(:open)
      URI.open(url, &block)
    else
      open(url, &block)
    end
  end

end
|