wayback_machine_downloader_straw 2.3.10 → 2.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
+  data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
+  data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28

bin/wayback_machine_downloader CHANGED

@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
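
For orientation, the two new switches above feed the recursive_subdomains and subdomain_depth options of WaybackMachineDownloader (wired up in the library changes further down). A minimal Ruby sketch of the equivalent API call; the URL and output directory are placeholder values, not part of this release:

    require 'wayback_machine_downloader'

    # Roughly: wayback_machine_downloader https://example.com --recursive-subdomains --subdomain-depth 2
    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",    # placeholder site
      directory: "websites/example.com",  # placeholder output directory
      recursive_subdomains: true,         # set by the new --recursive-subdomains flag
      subdomain_depth: 2                  # set by the new --subdomain-depth flag (default 1)
    )
    downloader.download_files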

lib/wayback_machine_downloader/subdom_processor.rb ADDED

@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+
+module SubdomainProcessor
+  def process_subdomains
+    return unless @recursive_subdomains
+
+    puts "Starting subdomain processing..."
+
+    # extract base domain from the URL for comparison
+    base_domain = extract_base_domain(@base_url)
+    @processed_domains = Set.new([base_domain])
+    @subdomain_queue = Queue.new
+
+    # scan downloaded files for subdomain links
+    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+    if subdomains_found.empty?
+      puts "No subdomains found in downloaded content."
+      return
+    end
+
+    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+    # add found subdomains to the queue
+    subdomains_found.each do |subdomain|
+      full_domain = "#{subdomain}.#{base_domain}"
+      @subdomain_queue << "https://#{full_domain}/"
+    end
+
+    # process the subdomain queue
+    download_subdomains(base_domain)
+
+    # after all downloads, rewrite all URLs to make local references
+    rewrite_subdomain_links(base_domain) if @rewrite
+  end
+
+  private
+
+  def extract_base_domain(url)
+    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+    return nil unless uri
+
+    host = uri.host || uri.path.split('/').first
+    host = host.downcase
+
+    # extract the base domain (e.g., "example.com" from "sub.example.com")
+    parts = host.split('.')
+    return host if parts.size <= 2
+
+    # for domains like co.uk, we want to keep the last 3 parts
+    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+      parts.last(3).join('.')
+    else
+      parts.last(2).join('.')
+    end
+  end
+
+  def scan_files_for_subdomains(files, base_domain)
+    return [] unless base_domain
+
+    subdomains = Set.new
+
+    files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+
+        # extract URLs from HTML href/src attributes
+        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from CSS
+        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from JavaScript strings
+        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+      rescue => e
+        puts "Error scanning file #{file_path}: #{e.message}"
+      end
+    end
+
+    subdomains.to_a
+  end
+
+  def download_subdomains(base_domain)
+    puts "Starting subdomain downloads..."
+    depth = 0
+    max_depth = @subdomain_depth || 1
+
+    while depth < max_depth && !@subdomain_queue.empty?
+      current_batch = []
+
+      # get all subdomains at current depth
+      while !@subdomain_queue.empty?
+        current_batch << @subdomain_queue.pop
+      end
+
+      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+      # download each subdomain
+      current_batch.each do |subdomain_url|
+        download_subdomain(subdomain_url, base_domain)
+      end
+
+      # if we need to go deeper, scan the newly downloaded files
+      if depth + 1 < max_depth
+        # get all files in the subdomains directory
+        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+        new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+        # filter out already processed subdomains
+        new_subdomains.each do |subdomain|
+          full_domain = "#{subdomain}.#{base_domain}"
+          unless @processed_domains.include?(full_domain)
+            @processed_domains.add(full_domain)
+            @subdomain_queue << "https://#{full_domain}/"
+          end
+        end
+
+        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+      end
+
+      depth += 1
+    end
+  end
+
+  def download_subdomain(subdomain_url, base_domain)
+    begin
+      uri = URI.parse(subdomain_url)
+      subdomain_host = uri.host
+
+      # skip if already processed
+      if @processed_domains.include?(subdomain_host)
+        puts "Skipping already processed subdomain: #{subdomain_host}"
+        return
+      end
+
+      @processed_domains.add(subdomain_host)
+      puts "Downloading subdomain: #{subdomain_url}"
+
+      # create the directory for this subdomain
+      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+      FileUtils.mkdir_p(subdomain_dir)
+
+      # create subdomain downloader with appropriate options
+      subdomain_options = {
+        base_url: subdomain_url,
+        directory: subdomain_dir,
+        from_timestamp: @from_timestamp,
+        to_timestamp: @to_timestamp,
+        all: @all,
+        threads_count: @threads_count,
+        maximum_pages: [@maximum_pages / 2, 10].max,
+        rewrite: @rewrite,
+        # don't recursively process subdomains from here
+        recursive_subdomains: false
+      }
+
+      # download the subdomain content
+      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+      subdomain_downloader.download_files
+
+      puts "Completed download of subdomain: #{subdomain_host}"
+    rescue => e
+      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+    end
+  end
+
+  def rewrite_subdomain_links(base_domain)
+    puts "Rewriting all files to use local subdomain references..."
+
+    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+    puts "Found #{all_files.size} files to check for rewriting"
+    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+    rewritten_count = 0
+
+    all_files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+        original_content = content.dup
+
+        # replace subdomain URLs with local paths
+        subdomains.each do |subdomain_host|
+          # for HTML attributes (href, src, etc.)
+          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            prefix, path, suffix = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+          end
+
+          # for CSS url()
+          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+            path = $1
+            path = "/index.html" if path.empty? || path == "/"
+            "url(\"../subdomains/#{subdomain_host}#{path}\")"
+          end
+
+          # for JavaScript strings
+          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            quote_start, path, quote_end = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+          end
+        end
+
+        # save if modified
+        if content != original_content
+          File.write(file_path, content)
+          rewritten_count += 1
+        end
+      rescue => e
+        puts "Error rewriting file #{file_path}: #{e.message}"
+      end
+    end
+
+    puts "Rewrote links in #{rewritten_count} files"
+  end
+end

lib/wayback_machine_downloader.rb CHANGED

@@ -14,6 +14,7 @@ require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/subdom_processor'
 
 class ConnectionPool
   MAX_AGE = 300

@@ -112,8 +113,9 @@ end
 class WaybackMachineDownloader
 
   include ArchiveAPI
+  include SubdomainProcessor
 
-  VERSION = "2.3.10"
+  VERSION = "2.3.12"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2

@@ -123,9 +125,11 @@ class WaybackMachineDownloader
   STATE_CDX_FILENAME = ".cdx.json"
   STATE_DB_FILENAME = ".downloaded.txt"
 
+
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+    :snapshot_at
 
   def initialize params
     validate_params(params)

@@ -153,6 +157,12 @@ class WaybackMachineDownloader
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
     @rewrite = params[:rewrite] || false
+    @recursive_subdomains = params[:recursive_subdomains] || false
+    @subdomain_depth = params[:subdomain_depth] || 1
+    @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+
+    # URL for rejecting invalid/unencoded wayback urls
+    @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
 
     handle_reset
   end

@@ -196,7 +206,7 @@ class WaybackMachineDownloader
 
   def match_only_filter file_url
     if @only_filter
-      only_filter_regex = @only_filter.to_regex
+      only_filter_regex = @only_filter.to_regex(detect: true)
       if only_filter_regex
         only_filter_regex =~ file_url
       else

@@ -209,7 +219,7 @@ class WaybackMachineDownloader
 
   def match_exclude_filter file_url
     if @exclude_filter
-      exclude_filter_regex = @exclude_filter.to_regex
+      exclude_filter_regex = @exclude_filter.to_regex(detect: true)
       if exclude_filter_regex
         exclude_filter_regex =~ file_url
       else

@@ -322,6 +332,36 @@ class WaybackMachineDownloader
     snapshot_list_to_consider
   end
 
+  # Get a composite snapshot file list for a specific timestamp
+  def get_composite_snapshot_file_list(target_timestamp)
+    file_versions = {}
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      next if file_timestamp.to_i > target_timestamp
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      next if file_id.nil?
+      next if match_exclude_filter(file_url)
+      next unless match_only_filter(file_url)
+      # Select the most recent version <= target_timestamp
+      if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+        file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+      end
+    end
+    file_versions.values
+  end
+
+  # Returns a list of files for the composite snapshot
+  def get_file_list_composite_snapshot(target_timestamp)
+    file_list = get_composite_snapshot_file_list(target_timestamp)
+    file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+    file_list.map do |file_remote_info|
+      file_remote_info[1][:file_id] = file_remote_info[0]
+      file_remote_info[1]
+    end
+  end
+
   def get_file_list_curated
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|

@@ -376,7 +416,9 @@ class WaybackMachineDownloader
 
 
   def get_file_list_by_timestamp
-    if @all_timestamps
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
       file_list_curated = get_file_list_all_timestamps
       file_list_curated.map do |file_remote_info|
         file_remote_info[1][:file_id] = file_remote_info[0]

@@ -513,6 +555,16 @@ class WaybackMachineDownloader
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+    # process subdomains if enabled
+    if @recursive_subdomains
+      subdomain_start_time = Time.now
+      process_subdomains
+      subdomain_end_time = Time.now
+      subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+      puts "Subdomain processing finished in #{subdomain_time}s."
+    end
+
     puts "Results saved in #{backup_path}"
     cleanup
   end

@@ -709,7 +761,22 @@ class WaybackMachineDownloader
   end
 
   def file_list_by_timestamp
-    @
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    end
   end
 
   private

@@ -740,6 +807,12 @@ class WaybackMachineDownloader
     # Escape square brackets because they are not valid in URI()
     wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
 
+    # reject invalid/unencoded wayback_url, behaving as if the resource weren't found
+    if not @url_regexp.match?(wayback_url)
+      @logger.warn("Skipped #{file_url}: invalid URL")
+      return :skipped_not_found
+    end
+
     request = Net::HTTP::Get.new(URI(wayback_url))
     request["Connection"] = "keep-alive"
     request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
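
The new :snapshot_at option introduced above has no command-line flag in this diff; it is reached through the Ruby API and builds a "composite" snapshot, taking for each file the most recent capture at or before the cutoff (see get_composite_snapshot_file_list). A hedged sketch — the URL is a placeholder, and the cutoff is assumed to be a Wayback-style YYYYMMDDHHMMSS integer matching the CDX timestamps the method compares against:

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",  # placeholder site
      snapshot_at: 20240601000000       # assumed YYYYMMDDHHMMSS cutoff
    )
    downloader.download_files           # per file: newest capture at or before snapshot_at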

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.10
+  version: 2.3.12
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby

@@ -71,6 +71,7 @@ files:
 - bin/wayback_machine_downloader
 - lib/wayback_machine_downloader.rb
 - lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader