wayback_machine_downloader 2.0.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
+  data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
+  data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
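The new checksums above can be re-derived from a local copy of the gem. A minimal Ruby sketch, not part of the package (the .gem file name and path are assumptions):

  require 'digest'
  require 'rubygems/package'

  # A .gem file is a tar archive whose members include metadata.gz and data.tar.gz;
  # hashing those members should reproduce the SHA256 values listed above.
  tar = Gem::Package::TarReader.new(File.open('wayback_machine_downloader-2.3.0.gem', 'rb'))
  tar.each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end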
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
     options[:directory] = t
   end
 
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
     options[:from_timestamp] = t
   end
@@ -26,6 +30,10 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
+  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
@@ -38,15 +46,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
     options[:maximum_pages] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
     options[:list] = true
   end
 
@@ -58,7 +66,7 @@ end.parse!
 if (base_url = ARGV[-1])
   options[:base_url] = base_url
   wayback_machine_downloader = WaybackMachineDownloader.new options
-  if
+  if options[:list]
     wayback_machine_downloader.list_files
   else
     wayback_machine_downloader.download_files
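For reference, the two new flags simply add keys to the options hash handed to WaybackMachineDownloader, exactly as the existing flags do. A minimal sketch of the equivalent library call (the example URL is hypothetical; the require path is the gem's main lib file):

  require 'wayback_machine_downloader'

  options = {
    base_url: 'http://example.com',   # positional ARGV[-1] in the executable
    all_timestamps: true,             # set by the new -s / --all-timestamps flag
    exact_url: false,                 # set (to true) by the new -e / --exact-url flag
    list: false                       # -l / --list
  }

  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end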
@@ -14,19 +14,22 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.
+  VERSION = "2.3.0"
 
-  attr_accessor :base_url, :
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
     @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
     @only_filter = params[:only_filter]
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
-    @list = params[:list]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
@@ -78,30 +81,29 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots,
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider =
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
-
-
-
-
-
-
-
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
     end
-    puts " found #{snapshot_list_to_consider.
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
 
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.
-      next unless
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
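Each entry yielded by get_all_snapshots_to_consider is a timestamp/URL pair coming from the CDX API response (see the ArchiveAPI changes further down). Illustratively, an entry looks something like this; the values are hypothetical:

  # Illustrative values only; real entries come from the Wayback Machine CDX API.
  snapshot = ["20060716231334", "http://example.com:80/index.html"]
  file_timestamp, file_url = snapshot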
@@ -124,20 +126,61 @@ class WaybackMachineDownloader
     file_list_curated
   end
 
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
   def get_file_list_by_timestamp
-
-
-
-
-
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
     end
   end
 
   def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
 
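With this change, --list prints the last entry without a trailing comma, so the listing is valid JSON, and progress messages go to stderr instead of polluting it. The output shape is roughly the following (example URLs and timestamps, not taken from the diff):

  [
  {"file_url":"http://example.com/index.html","timestamp":"20060716231334","file_id":"index.html"},
  {"file_url":"http://example.com/style.css","timestamp":"20060716231334","file_id":"style.css"}
  ]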
@@ -179,7 +222,7 @@ class WaybackMachineDownloader
 
   def structure_dir_path dir_path
     begin
-      FileUtils::mkdir_p dir_path unless File.
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
     rescue Errno::EEXIST => e
       error_to_string = e.to_s
       puts "# #{error_to_string}"
@@ -217,14 +260,15 @@ class WaybackMachineDownloader
       file_path = backup_path + file_path_elements[0..-1].join('/')
     end
     if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.
+    unless File.exist? file_path
       begin
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("
+            URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e
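The new URI.open call is open-uri's supported entry point (the bare Kernel#open form was deprecated for URLs in newer Rubies), and the "id_" URL form asks the Wayback Machine for the original response body without its replay toolbar. A standalone sketch of that single-file fetch, with hypothetical values:

  require 'open-uri'

  file_timestamp = 20060716231334                 # hypothetical snapshot timestamp
  file_url = 'http://example.com/index.html'      # hypothetical archived URL

  URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
    File.binwrite('index.html', uri.read)
  end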
@@ -240,7 +284,7 @@ class WaybackMachineDownloader
     rescue StandardError => e
       puts "#{file_url} # #{e}"
     ensure
-      if not @all and File.
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         puts "#{file_path} was empty and was removed."
       end
@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
+
 module ArchiveAPI
 
-
-
-
-
+  def get_raw_list_from_api url, page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
 
-
-
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
+  end
 
-
-
-    if
-      parameters
-    else
-      parameters += "&filter=statuscode:200"
+  def parameters_for_api page_index
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters
+      parameters.push(["page", page_index])
     end
     parameters
   end
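The rewritten ArchiveAPI builds its query with URI.encode_www_form instead of string concatenation, which handles escaping. A sketch of the URL produced for the first page with the default status filter (example host only, outside the gem):

  require 'uri'

  request_url = URI("https://web.archive.org/cdx/search/xd")
  params = [["output", "json"], ["url", "example.com"]]
  params += [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"],
             ["filter", "statuscode:200"], ["page", 0]]
  request_url.query = URI.encode_www_form(params)
  puts request_url
  # => https://web.archive.org/cdx/search/xd?output=json&url=example.com&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0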
@@ -60,7 +60,7 @@ module TibyBytes
     bytes.each_index do |i|
 
       byte = bytes[i]
-
+      _is_ascii = byte < 128
       is_cont = byte > 127 && byte < 192
       is_lead = byte > 191 && byte < 245
       is_unused = byte > 240
@@ -70,7 +70,7 @@ module TibyBytes
       if is_unused || is_restricted
         bytes[i] = tidy_byte(byte)
       elsif is_cont
-        # Not expecting
+        # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
        conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
       else
         if conts_expected > 0
@@ -78,7 +78,7 @@ module TibyBytes
           # the leading byte.
           begin
             (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-          rescue NoMethodError
+          rescue NoMethodError
             next
           end
           conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
     end
     begin
       bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-    rescue ArgumentError
+    rescue ArgumentError
       nil
     end
   end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-
-
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []