wayback_machine_downloader 2.0.0 → 2.3.0
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
+  data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
+  data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
bin/wayback_machine_downloader
CHANGED

@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
     options[:directory] = t
   end
 
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
     options[:from_timestamp] = t
   end
@@ -26,6 +30,10 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
+  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
@@ -38,15 +46,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
     options[:maximum_pages] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
     options[:list] = true
   end
 
@@ -58,7 +66,7 @@ end.parse!
 if (base_url = ARGV[-1])
   options[:base_url] = base_url
   wayback_machine_downloader = WaybackMachineDownloader.new options
-  if
+  if options[:list]
     wayback_machine_downloader.list_files
   else
     wayback_machine_downloader.download_files
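Taken together, the new flags slot into the existing CLI: `-s/--all-timestamps` downloads every archived version of each file, while `-e/--exact-url` restricts the run to the single URL given instead of the whole site. A hypothetical invocation (example.com is a placeholder):

    # Download every archived version of every file on the site
    wayback_machine_downloader -s http://example.com

    # List the snapshots of a single page as JSON, without downloading
    wayback_machine_downloader -e -l http://example.com/page.html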
lib/wayback_machine_downloader.rb
CHANGED

@@ -14,19 +14,22 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.
+  VERSION = "2.3.0"
 
-  attr_accessor :base_url, :
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
     @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
-    @list = params[:list]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
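Since `initialize` just reads keys off a params hash, the class can be driven from Ruby as well as from the CLI. A minimal sketch, assuming the gem is installed (the URL is a placeholder):

    require 'wayback_machine_downloader'

    # Passing all_timestamps directly mirrors the CLI's new -s flag
    wmd = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      all_timestamps: true,
      threads_count: 4
    )
    wmd.download_files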
@@ -78,30 +81,29 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots,
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider =
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
-
-
-
-
-
-
-
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
     end
-    puts " found #{snapshot_list_to_consider.
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
 
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.
-      next unless
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
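The block now destructures each snapshot into |file_timestamp, file_url| because get_raw_list_from_api (see archive_api.rb below) returns parsed CDX rows as two-element arrays rather than raw text lines to slice. Illustrative shape only, with invented values:

    [
      ["20060716231334", "http://example.com/"],
      ["20070402100000", "http://example.com/assets/style.css"]
    ]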
@@ -124,20 +126,61 @@ class WaybackMachineDownloader
     file_list_curated
   end
 
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
   def get_file_list_by_timestamp
-
-
-
-
-
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
     end
   end
 
   def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
 
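Redirecting $stdout to $stderr while the file list is built keeps the "Getting snapshot pages" progress dots out of the machine-readable output, and printing the last element separately avoids a trailing comma, so --list now emits a valid JSON array. Roughly like this (field values invented for illustration):

    [
    {"file_url":"http://example.com/style.css","timestamp":"20070402100000","file_id":"style.css"},
    {"file_url":"http://example.com/index.html","timestamp":"20060716231334","file_id":"index.html"}
    ]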
@@ -179,7 +222,7 @@ class WaybackMachineDownloader
 
   def structure_dir_path dir_path
     begin
-      FileUtils::mkdir_p dir_path unless File.
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
     rescue Errno::EEXIST => e
       error_to_string = e.to_s
       puts "# #{error_to_string}"
@@ -217,14 +260,15 @@ class WaybackMachineDownloader
       file_path = backup_path + file_path_elements[0..-1].join('/')
     end
     if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.
+    unless File.exist? file_path
       begin
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("
+            URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
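Two details in this hunk: sanitizing dir_path as well as file_path fixes directory creation on Windows for the same set of illegal characters, and the id_ modifier in the web.archive.org URL asks the Wayback Machine for the original bytes of the capture, without the injected toolbar and link rewriting. The gsub percent-encodes each illegal character by its codepoint in hex; for a hypothetical path:

    'page?q=1:2'.gsub(/[:*?&=<>\\|]/) { |s| '%' + s.ord.to_s(16) }
    # => "page%3fq%3d1%3a2"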
@@ -240,7 +284,7 @@ class WaybackMachineDownloader
     rescue StandardError => e
       puts "#{file_url} # #{e}"
     ensure
-      if not @all and File.
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         puts "#{file_path} was empty and was removed."
       end
lib/wayback_machine_downloader/archive_api.rb
CHANGED

@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
+
 module ArchiveAPI
 
-
-
-
-
+  def get_raw_list_from_api url, page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
 
-
-
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
+  end
 
-
-
-    if
-      parameters
-    else
-      parameters += "&filter=statuscode:200"
+  def parameters_for_api page_index
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters
+      parameters.push(["page", page_index])
    end
     parameters
   end
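Building the query from an array of key/value pairs with URI.encode_www_form, instead of concatenating strings like the old &filter=statuscode:200, means every value is form-encoded. For a default run on page 0 the request URL comes out roughly as follows (shape inferred from the params above; example.com is a placeholder):

    https://web.archive.org/cdx/search/xd?output=json&url=http%3A%2F%2Fexample.com%2F*&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0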
lib/wayback_machine_downloader/tidy_bytes.rb
CHANGED

@@ -60,7 +60,7 @@ module TibyBytes
     bytes.each_index do |i|
 
       byte = bytes[i]
-
+      _is_ascii = byte < 128
       is_cont = byte > 127 && byte < 192
       is_lead = byte > 191 && byte < 245
       is_unused = byte > 240
@@ -70,7 +70,7 @@ module TibyBytes
       if is_unused || is_restricted
         bytes[i] = tidy_byte(byte)
       elsif is_cont
-        # Not expecting
+        # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
         conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
       else
         if conts_expected > 0
@@ -78,7 +78,7 @@ module TibyBytes
         # the leading byte.
         begin
           (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-        rescue NoMethodError
+        rescue NoMethodError
           next
         end
         conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
     end
     begin
       bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-    rescue ArgumentError
+    rescue ArgumentError
       nil
     end
   end
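The pack("C*").unpack("U*").pack("U*") round trip at the end re-encodes the cleaned byte array as UTF-8; unpack("U*") raises ArgumentError on malformed sequences, which is exactly what the rescue converts to nil. A standalone illustration:

    bytes = "café".bytes                        # => [99, 97, 102, 195, 169]
    bytes.pack("C*").unpack("U*")               # => [99, 97, 102, 233]  decodes UTF-8 bytes to codepoints
    bytes.pack("C*").unpack("U*").pack("U*")    # => "café", re-encoded as a UTF-8 string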
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake

@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib

@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-
-
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []