wayback_machine_downloader 1.0.0 → 1.1.0
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +9 -5
- data/lib/wayback_machine_downloader/archive_api.rb +12 -9
- data/lib/wayback_machine_downloader.rb +40 -25
- metadata +2 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
```
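These checksums simply track the rebuilt 1.1.0 artifacts. For reference, a minimal sketch of how such values can be recomputed locally with Ruby's standard `digest` library, assuming the `metadata.gz` and `data.tar.gz` entries have already been extracted from the downloaded `.gem` archive (the paths below are placeholders):

```ruby
require "digest"

# Placeholder paths to the extracted gem entries.
%w[metadata.gz data.tar.gz].each do |artifact|
  bytes = File.binread(artifact)
  puts "#{artifact} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{artifact} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```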
data/bin/wayback_machine_downloader
CHANGED
```diff
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"
 
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end
 
@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end
 
@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page ") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end
 
```
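The new `-p`/`--maximum-snapshot` switch only sets `options[:maximum_pages]`, which the library reads in `initialize` (falling back to 100). As a rough sketch, a call like `wayback_machine_downloader http://example.com -p 5 -c 20` boils down to roughly the following Ruby (the URL and numbers are placeholders):

```ruby
require 'wayback_machine_downloader'

WaybackMachineDownloader.new(
  base_url:      'http://example.com', # placeholder site
  maximum_pages: 5,                    # set by -p / --maximum-snapshot, default 100
  threads_count: 20                    # set by -c / --concurrency
).download_files
```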
|
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
```diff
@@ -1,20 +1,20 @@
 module ArchiveAPI
 
-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/xd?url="
     request_url += url
-    request_url += parameters_for_api
-
-
-
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end
 
-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end
 
```
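With `page_index` threaded through, each call now fetches a single page of the Wayback Machine CDX index, and `collapse=digest` skips consecutive snapshots whose content digest is identical. A minimal sketch of the URL this assembles for one page, using a placeholder domain and no `--from`/`--to`/`--all` options:

```ruby
require 'net/http'
require 'uri'

# Roughly what get_raw_list_from_api("example.com/*", 2) ends up requesting
# (placeholder domain; timestamp filters omitted).
request_url = "http://web.archive.org/cdx/search/xd?url=" \
              "example.com/*" \
              "&fl=timestamp,original&collapse=digest&gzip=false" \
              "&filter=statuscode:200" \
              "&page=2"

response = Net::HTTP.get_response(URI.parse(request_url))
puts response.body # one "timestamp original_url" line per snapshot
```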
|
data/lib/wayback_machine_downloader.rb
CHANGED
```diff
@@ -13,9 +13,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "1.
+  VERSION = "1.1.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
 
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end
 
+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          unless file_list_curated[file_id][:timestamp] > file_timestamp
-            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-          end
-        else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
           end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
       end
     end
@@ -126,7 +139,7 @@ class WaybackMachineDownloader
 
   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
     puts
 
     if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
       puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
 
     threads = []
     @processed_file_count = 0
```
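The curated file list is now built from the paginated snapshot text instead of two one-off API calls. Each CDX line is a 14-digit timestamp, a space, and the original URL; a standalone sketch of the per-line parsing the new loop performs (the sample line is invented for illustration, and the gem's additional `tidy_bytes` cleanup is omitted):

```ruby
require 'cgi'

# Invented CDX line in the "timestamp original_url" format the loop expects.
line = "20160924123456 http://example.com/assets/style.css?v=2\n"

file_timestamp = line[0..13].to_i                     # => 20160924123456
file_url       = line[15..-2]                         # drop timestamp, space and trailing newline
file_id        = file_url.split('/')[3..-1].join('/') # path after the host
file_id        = CGI::unescape(file_id)

# The curated hash keeps one entry per file_id, preferring the oldest timestamp.
puts({ file_id => { file_url: file_url, timestamp: file_timestamp } })
```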
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
```