wayback_machine_downloader 0.1.8 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +1 -1
- data/lib/wayback_machine_downloader.rb +27 -14
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 20560ddf17786139b6ad1bd4e01d4b5e4e41dafe
|
4
|
+
data.tar.gz: 3d8357785e11e81b20123a97e55f084a0442f03b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 333c6bcfcce0ab972d4dc927af130eb9992ebff1c12a45caee68a109aa459c76a0f6a4ace5058612742559b731876cd3af64f3aa2402b790ec756e2dc54a179a
|
7
|
+
data.tar.gz: 3dddac44a54eaf44a396f94c5428fa68da97a2e92bd2ec1c25f55eb569a5b2f64cd3082e429393be3893be29779faa4387a74952cd99fef09161d1a8dc070148
|
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -3,7 +3,7 @@ require 'fileutils'
|
|
3
3
|
|
4
4
|
class WaybackMachineDownloader
|
5
5
|
|
6
|
-
VERSION = "0.1.8"
|
6
|
+
VERSION = "0.1.9"
|
7
7
|
|
8
8
|
attr_accessor :base_url, :timestamp
|
9
9
|
|
@@ -21,35 +21,48 @@ class WaybackMachineDownloader
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def get_file_list_curated
|
24
|
-
|
24
|
+
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
25
|
+
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
25
26
|
file_list_curated = Hash.new
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
if
|
34
|
-
|
27
|
+
[index_file_list_raw, all_file_list_raw].each do |file|
|
28
|
+
file.each_line do |line|
|
29
|
+
line = line.split(' ')
|
30
|
+
file_timestamp = line[1].to_i
|
31
|
+
file_url = line[2]
|
32
|
+
file_id = file_url.split('/')[3..-1].join('/')
|
33
|
+
file_id = URI.unescape file_id
|
34
|
+
if @timestamp == 0 or file_timestamp <= @timestamp
|
35
|
+
if file_list_curated[file_id]
|
36
|
+
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
37
|
+
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
38
|
+
end
|
39
|
+
else
|
35
40
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
36
41
|
end
|
37
|
-
else
|
38
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
39
42
|
end
|
40
43
|
end
|
41
44
|
end
|
42
45
|
file_list_curated
|
43
46
|
end
|
44
47
|
|
48
|
+
def file_list_by_timestamp
|
49
|
+
file_list_curated = get_file_list_curated
|
50
|
+
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
51
|
+
file_list_curated.map do |file_remote_info|
|
52
|
+
file_remote_info[1][:file_id] = file_remote_info[0]
|
53
|
+
file_remote_info[1]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
45
57
|
def download_files
|
46
58
|
puts "Downlading #{@base_url} from Wayback Machine..."
|
47
59
|
puts
|
48
60
|
file_list_curated = get_file_list_curated
|
49
61
|
count = 0
|
50
|
-
|
62
|
+
file_list_by_timestamp.each do |file_remote_info|
|
51
63
|
count += 1
|
52
64
|
file_url = file_remote_info[:file_url]
|
65
|
+
file_id = file_remote_info[:file_id]
|
53
66
|
file_path_elements = file_id.split('/')
|
54
67
|
if file_id == ""
|
55
68
|
dir_path = backup_path
|