wayback_machine_downloader 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
|
4
|
+
data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
|
7
|
+
data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
|
@@ -46,7 +46,7 @@ option_parser = OptionParser.new do |opts|
|
|
46
46
|
options[:all] = true
|
47
47
|
end
|
48
48
|
|
49
|
-
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to
|
49
|
+
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
|
50
50
|
options[:threads_count] = t
|
51
51
|
end
|
52
52
|
|
@@ -14,7 +14,7 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.
|
17
|
+
VERSION = "2.3.0"
|
18
18
|
|
19
19
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
20
20
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
@@ -84,7 +84,7 @@ class WaybackMachineDownloader
|
|
84
84
|
# Note: Passing a page index parameter allow us to get more snapshots,
|
85
85
|
# but from a less fresh index
|
86
86
|
print "Getting snapshot pages"
|
87
|
-
snapshot_list_to_consider =
|
87
|
+
snapshot_list_to_consider = []
|
88
88
|
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
89
89
|
print "."
|
90
90
|
unless @exact_url
|
@@ -95,17 +95,15 @@ class WaybackMachineDownloader
|
|
95
95
|
print "."
|
96
96
|
end
|
97
97
|
end
|
98
|
-
puts " found #{snapshot_list_to_consider.
|
98
|
+
puts " found #{snapshot_list_to_consider.length} snaphots to consider."
|
99
99
|
puts
|
100
100
|
snapshot_list_to_consider
|
101
101
|
end
|
102
102
|
|
103
103
|
def get_file_list_curated
|
104
104
|
file_list_curated = Hash.new
|
105
|
-
get_all_snapshots_to_consider.
|
106
|
-
next unless
|
107
|
-
file_timestamp = line[0..13].to_i
|
108
|
-
file_url = line[15..-2]
|
105
|
+
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
106
|
+
next unless file_url.include?('/')
|
109
107
|
file_id = file_url.split('/')[3..-1].join('/')
|
110
108
|
file_id = CGI::unescape file_id
|
111
109
|
file_id = file_id.tidy_bytes unless file_id == ""
|
@@ -130,10 +128,8 @@ class WaybackMachineDownloader
|
|
130
128
|
|
131
129
|
def get_file_list_all_timestamps
|
132
130
|
file_list_curated = Hash.new
|
133
|
-
get_all_snapshots_to_consider.
|
134
|
-
next unless
|
135
|
-
file_timestamp = line[0..13].to_i
|
136
|
-
file_url = line[15..-2]
|
131
|
+
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
132
|
+
next unless file_url.include?('/')
|
137
133
|
file_id = file_url.split('/')[3..-1].join('/')
|
138
134
|
file_id_and_timestamp = [file_timestamp, file_id].join('/')
|
139
135
|
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
|
@@ -176,11 +172,15 @@ class WaybackMachineDownloader
|
|
176
172
|
|
177
173
|
def list_files
|
178
174
|
# retrieval produces its own output
|
175
|
+
@orig_stdout = $stdout
|
176
|
+
$stdout = $stderr
|
179
177
|
files = get_file_list_by_timestamp
|
178
|
+
$stdout = @orig_stdout
|
180
179
|
puts "["
|
181
|
-
files.each do |file|
|
180
|
+
files[0...-1].each do |file|
|
182
181
|
puts file.to_json + ","
|
183
182
|
end
|
183
|
+
puts files[-1].to_json
|
184
184
|
puts "]"
|
185
185
|
end
|
186
186
|
|
@@ -268,7 +268,7 @@ class WaybackMachineDownloader
|
|
268
268
|
structure_dir_path dir_path
|
269
269
|
open(file_path, "wb") do |file|
|
270
270
|
begin
|
271
|
-
open("
|
271
|
+
URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
|
272
272
|
file.write(uri.read)
|
273
273
|
end
|
274
274
|
rescue OpenURI::HTTPError => e
|
@@ -1,28 +1,38 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'uri'
|
3
|
+
|
1
4
|
module ArchiveAPI
|
2
5
|
|
3
6
|
def get_raw_list_from_api url, page_index
|
4
|
-
request_url = "
|
5
|
-
|
6
|
-
|
7
|
+
request_url = URI("https://web.archive.org/cdx/search/xd")
|
8
|
+
params = [["output", "json"], ["url", url]]
|
9
|
+
params += parameters_for_api page_index
|
10
|
+
request_url.query = URI.encode_www_form(params)
|
7
11
|
|
8
|
-
|
12
|
+
begin
|
13
|
+
json = JSON.parse(URI(request_url).open.read)
|
14
|
+
if (json[0] <=> ["timestamp","original"]) == 0
|
15
|
+
json.shift
|
16
|
+
end
|
17
|
+
json
|
18
|
+
rescue JSON::ParserError
|
19
|
+
[]
|
20
|
+
end
|
9
21
|
end
|
10
22
|
|
11
23
|
def parameters_for_api page_index
|
12
|
-
parameters = "
|
13
|
-
if
|
14
|
-
parameters
|
15
|
-
else
|
16
|
-
parameters += "&filter=statuscode:200"
|
24
|
+
parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
|
25
|
+
if !@all
|
26
|
+
parameters.push(["filter", "statuscode:200"])
|
17
27
|
end
|
18
28
|
if @from_timestamp and @from_timestamp != 0
|
19
|
-
parameters
|
29
|
+
parameters.push(["from", @from_timestamp.to_s])
|
20
30
|
end
|
21
31
|
if @to_timestamp and @to_timestamp != 0
|
22
|
-
parameters
|
32
|
+
parameters.push(["to", @to_timestamp.to_s])
|
23
33
|
end
|
24
34
|
if page_index
|
25
|
-
parameters
|
35
|
+
parameters.push(["page", page_index])
|
26
36
|
end
|
27
37
|
parameters
|
28
38
|
end
|
@@ -70,7 +70,7 @@ module TibyBytes
|
|
70
70
|
if is_unused || is_restricted
|
71
71
|
bytes[i] = tidy_byte(byte)
|
72
72
|
elsif is_cont
|
73
|
-
# Not expecting
|
73
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
74
74
|
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
75
75
|
else
|
76
76
|
if conts_expected > 0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
|
|
57
57
|
licenses:
|
58
58
|
- MIT
|
59
59
|
metadata: {}
|
60
|
-
post_install_message:
|
60
|
+
post_install_message:
|
61
61
|
rdoc_options: []
|
62
62
|
require_paths:
|
63
63
|
- lib
|
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
version: '0'
|
74
74
|
requirements: []
|
75
|
-
|
76
|
-
|
77
|
-
signing_key:
|
75
|
+
rubygems_version: 3.1.4
|
76
|
+
signing_key:
|
78
77
|
specification_version: 4
|
79
78
|
summary: Download an entire website from the Wayback Machine.
|
80
79
|
test_files: []
|