wayback_machine_downloader 1.0.0 → 1.1.0
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +9 -5
- data/lib/wayback_machine_downloader/archive_api.rb +12 -9
- data/lib/wayback_machine_downloader.rb +40 -25
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
data/bin/wayback_machine_downloader
CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"

-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end

@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end

-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end

-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end

@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end

-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end

-  opts.on("-
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page ") do |t|
+    options[:maximum_pages] = t
+  end
+
   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end

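As a quick sanity check, here is a minimal Ruby sketch (illustrative values, not part of the release) of how the new --maximum-snapshot flag lands in options[:maximum_pages]:

require 'optparse'

# Parse the new flag the same way the executable above does; "5" is illustrative.
options = {}
OptionParser.new do |opts|
  opts.on("-p", "--maximum-snapshot NUMBER", Integer) { |t| options[:maximum_pages] = t }
end.parse!(["--maximum-snapshot", "5"])
options[:maximum_pages] # => 5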
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
@@ -1,20 +1,20 @@
 module ArchiveAPI

-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/xd?url="
     request_url += url
-    request_url += parameters_for_api
-
-
-
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end

-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end

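For orientation, a rough sketch (example.com and the page number are placeholders) of the CDX request URL these methods now build when @all is not set and no timestamp filters are given:

# Approximation of get_raw_list_from_api("http://example.com/*", 2) under the
# assumptions above; not a verbatim trace of the library.
request_url  = "http://web.archive.org/cdx/search/xd?url="
request_url += "http://example.com/*"
request_url += "&fl=timestamp,original&collapse=digest&gzip=false" \
               "&filter=statuscode:200" \
               "&page=2"
puts request_url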
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -13,9 +13,9 @@ class WaybackMachineDownloader

   include ArchiveAPI

-  VERSION = "1.0.0"
+  VERSION = "1.1.0"

-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count

   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end

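A short usage sketch (hypothetical caller; only the option names come from the diff): :maximum_pages falls back to 100 when omitted, mirroring the initializer above.

# Hypothetical caller; base_url and the counts are illustrative.
downloader = WaybackMachineDownloader.new(
  base_url:      "http://example.com",
  maximum_pages: 5,   # consider at most 5 CDX result pages instead of the default 100
  threads_count: 1
)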
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end

+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      unless file_list_curated[file_id][:timestamp] > file_timestamp
-        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-      end
-    else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
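To make the parsing in get_file_list_curated concrete, a small sketch (the CDX line is made up) of how one response line splits into timestamp, url, and file id:

require 'cgi'

# One CDX response line: 14-digit timestamp, a space, then the original url.
line = "20160924123456 http://example.com/assets/logo.png\n"
file_timestamp = line[0..13].to_i                      # => 20160924123456
file_url       = line[15..-2]                          # => "http://example.com/assets/logo.png"
file_id        = file_url.split('/')[3..-1].join('/')  # => "assets/logo.png"
file_id        = CGI::unescape file_id                 # decodes any %-escaped characters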
@@ -126,7 +139,7 @@ class WaybackMachineDownloader

   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
     puts

     if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
       puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
+
+    puts "#{file_list_by_timestamp.count} files to download:"

     threads = []
     @processed_file_count = 0
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake