wayback_machine_downloader 2.1.1 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 54752c73ebfac815e91ef6bba40547a36282e5ec9c3ef2792370c13352fce0b6
|
4
|
+
data.tar.gz: df2f5d94981eeb2d1e55d2b4a9dd8fe57a24e8b29cf79a700ca520b7c3bc1a21
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 108d33cf57b738ba69ccf960f503ab5ea44b296ba043716fb2e83e9fa5bebcaec9a488bc4a5ab64dad55c1f23434c2b71005a86389e9b26fd07b38372f96b6d4
|
7
|
+
data.tar.gz: 62afad1698415e0c80b85599da7aba1e19574ec571862f8d69c56d1fe718f8c65cae3e3be2293d8418ecc7dd09803b4d9908186e93f3062ccd85b363a5e7dde4
|
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
|
|
18
18
|
options[:directory] = t
|
19
19
|
end
|
20
20
|
|
21
|
+
opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
|
22
|
+
options[:all_timestamps] = true
|
23
|
+
end
|
24
|
+
|
21
25
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
22
26
|
options[:from_timestamp] = t
|
23
27
|
end
|
@@ -42,7 +46,7 @@ option_parser = OptionParser.new do |opts|
|
|
42
46
|
options[:all] = true
|
43
47
|
end
|
44
48
|
|
45
|
-
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to
|
49
|
+
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
|
46
50
|
options[:threads_count] = t
|
47
51
|
end
|
48
52
|
|
@@ -1,28 +1,38 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'uri'
|
3
|
+
|
1
4
|
module ArchiveAPI
|
2
5
|
|
3
6
|
# Fetches one page of snapshot rows for +url+ from the Wayback Machine CDX
# search endpoint.
#
# url        - sent as the "url" query parameter.
# page_index - passed through to parameters_for_api, which decides whether a
#              "page" parameter is added.
#
# Returns the parsed JSON response. When the first row is the
# ["timestamp", "original"] header it is removed. Returns [] when the
# response body is not valid JSON.
def get_raw_list_from_api url, page_index
  request_url = URI("https://web.archive.org/cdx/search/xd")
  params = [["output", "json"], ["url", url]]
  params += parameters_for_api page_index
  request_url.query = URI.encode_www_form(params)

  begin
    # Block form: the response stream is closed as soon as the body is read
    # instead of being left open until garbage collection.
    json = JSON.parse(request_url.open { |response| response.read })
    json.shift if json[0] == ["timestamp", "original"]
    json
  rescue JSON::ParserError
    []
  end
end
|
10
22
|
|
11
23
|
# Builds the extra CDX query parameters for a snapshot-list request.
#
# page_index - when truthy (0 included), appended as the "page" parameter.
#
# Reads @all, @from_timestamp and @to_timestamp:
#   * unless @all is set, results are filtered to status code 200;
#   * "from"/"to" are added only for timestamps that are set and non-zero.
#
# Returns an Array of [name, value] pairs.
def parameters_for_api page_index
  parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
  parameters.push(["filter", "statuscode:200"]) unless @all
  parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
  parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
  parameters.push(["page", page_index]) if page_index
  parameters
end
|
@@ -70,7 +70,7 @@ module TibyBytes
|
|
70
70
|
if is_unused || is_restricted
|
71
71
|
bytes[i] = tidy_byte(byte)
|
72
72
|
elsif is_cont
|
73
|
-
# Not expecting
|
73
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
74
74
|
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
75
75
|
else
|
76
76
|
if conts_expected > 0
|
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.
|
17
|
+
VERSION = "2.3.1"
|
18
18
|
|
19
|
-
attr_accessor :base_url, :exact_url, :directory,
|
19
|
+
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
20
20
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
21
21
|
:all, :maximum_pages, :threads_count
|
22
22
|
|
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
|
|
24
24
|
@base_url = params[:base_url]
|
25
25
|
@exact_url = params[:exact_url]
|
26
26
|
@directory = params[:directory]
|
27
|
+
@all_timestamps = params[:all_timestamps]
|
27
28
|
@from_timestamp = params[:from_timestamp].to_i
|
28
29
|
@to_timestamp = params[:to_timestamp].to_i
|
29
30
|
@only_filter = params[:only_filter]
|
@@ -83,7 +84,7 @@ class WaybackMachineDownloader
|
|
83
84
|
# Note: Passing a page index parameter allow us to get more snapshots,
|
84
85
|
# but from a less fresh index
|
85
86
|
print "Getting snapshot pages"
|
86
|
-
snapshot_list_to_consider =
|
87
|
+
snapshot_list_to_consider = []
|
87
88
|
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
88
89
|
print "."
|
89
90
|
unless @exact_url
|
@@ -94,17 +95,15 @@ class WaybackMachineDownloader
|
|
94
95
|
print "."
|
95
96
|
end
|
96
97
|
end
|
97
|
-
puts " found #{snapshot_list_to_consider.
|
98
|
+
puts " found #{snapshot_list_to_consider.length} snaphots to consider."
|
98
99
|
puts
|
99
100
|
snapshot_list_to_consider
|
100
101
|
end
|
101
102
|
|
102
103
|
def get_file_list_curated
|
103
104
|
file_list_curated = Hash.new
|
104
|
-
get_all_snapshots_to_consider.
|
105
|
-
next unless
|
106
|
-
file_timestamp = line[0..13].to_i
|
107
|
-
file_url = line[15..-2]
|
105
|
+
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
106
|
+
next unless file_url.include?('/')
|
108
107
|
file_id = file_url.split('/')[3..-1].join('/')
|
109
108
|
file_id = CGI::unescape file_id
|
110
109
|
file_id = file_id.tidy_bytes unless file_id == ""
|
@@ -127,22 +126,61 @@ class WaybackMachineDownloader
|
|
127
126
|
file_list_curated
|
128
127
|
end
|
129
128
|
|
129
|
+
# Builds the list of files to download when every snapshot of a file is
# wanted, keyed by "<timestamp>/<file id>" so that each timestamp of the same
# file gets its own entry.
#
# Iterates the [timestamp, url] pairs from get_all_snapshots_to_consider and
# skips urls that contain no "/", are malformed, match the exclude filter,
# or fail the only filter. A repeated key is ignored (reported only when
# @verbose is set).
#
# Returns a Hash of key => {file_url:, timestamp:}.
def get_file_list_all_timestamps
  file_list_curated = {}
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
    next unless file_url.include?('/')

    # The slice is nil when the url has fewer than three "/"-separated parts;
    # safe navigation lets the malformed-url branch run instead of raising
    # NoMethodError on nil.join.
    file_id = file_url.split('/')[3..-1]&.join('/')
    if file_id.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end

    # The key always contains a "/", so it is never empty.
    file_id_and_timestamp = CGI::unescape([file_timestamp, file_id].join('/')).tidy_bytes

    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif file_list_curated[file_id_and_timestamp]
      puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
    else
      file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
    end
  end
  puts "file_list_curated: " + file_list_curated.count.to_s
  file_list_curated
end
|
154
|
+
|
155
|
+
|
130
156
|
# Returns the files to download as an Array of info hashes, each carrying its
# key under :file_id in addition to the fields already stored in the hash.
#
# With @all_timestamps set, entries come from get_file_list_all_timestamps in
# that hash's own order. Otherwise they come from get_file_list_curated,
# ordered by :timestamp, newest first.
#
# Note: the :file_id key is written into the info hashes returned by those
# methods (they are mutated, not copied).
def get_file_list_by_timestamp
  file_list =
    if @all_timestamps
      get_file_list_all_timestamps
    else
      get_file_list_curated.sort_by { |_file_id, file_info| file_info[:timestamp] }.reverse
    end

  file_list.map do |file_id, file_info|
    file_info[:file_id] = file_id
    file_info
  end
end
|
138
172
|
|
139
173
|
# Prints the file list as a JSON array on standard output, one entry per line.
#
# While the list is being retrieved, $stdout is pointed at $stderr so the
# progress messages printed by retrieval do not end up inside the JSON.
# The original stream is restored even if retrieval raises.
# An empty list prints "[" and "]" with nothing in between.
def list_files
  # retrieval produces its own output
  @orig_stdout = $stdout
  $stdout = $stderr
  begin
    files = get_file_list_by_timestamp
  ensure
    $stdout = @orig_stdout
  end
  puts "["
  # Joining puts the separator only between entries, so there is no trailing
  # comma and no stray "null" when the list is empty.
  puts files.map(&:to_json).join(",\n") unless files.empty?
  puts "]"
end
|
148
186
|
|
@@ -222,6 +260,7 @@ class WaybackMachineDownloader
|
|
222
260
|
file_path = backup_path + file_path_elements[0..-1].join('/')
|
223
261
|
end
|
224
262
|
if Gem.win_platform?
|
263
|
+
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
225
264
|
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
226
265
|
end
|
227
266
|
unless File.exist? file_path
|
@@ -229,7 +268,7 @@ class WaybackMachineDownloader
|
|
229
268
|
structure_dir_path dir_path
|
230
269
|
open(file_path, "wb") do |file|
|
231
270
|
begin
|
232
|
-
|
271
|
+
URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
|
233
272
|
file.write(uri.read)
|
234
273
|
end
|
235
274
|
rescue OpenURI::HTTPError => e
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
|
|
57
57
|
licenses:
|
58
58
|
- MIT
|
59
59
|
metadata: {}
|
60
|
-
post_install_message:
|
60
|
+
post_install_message:
|
61
61
|
rdoc_options: []
|
62
62
|
require_paths:
|
63
63
|
- lib
|
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
version: '0'
|
74
74
|
requirements: []
|
75
|
-
|
76
|
-
|
77
|
-
signing_key:
|
75
|
+
rubygems_version: 3.1.4
|
76
|
+
signing_key:
|
78
77
|
specification_version: 4
|
79
78
|
summary: Download an entire website from the Wayback Machine.
|
80
79
|
test_files: []
|