wayback_machine_downloader_straw 2.3.11 → 2.3.12
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
+  data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
+  data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
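The SHA-256 digests above can be checked against a locally downloaded copy of the gem. A minimal sketch, assuming the .gem archive has already been unpacked (for example with tar) so that metadata.gz and data.tar.gz sit in the working directory; the unpacking step and file layout are assumptions, only the digests come from this diff:

require 'digest'

# Published SHA-256 digests from checksums.yaml (2.3.12)
EXPECTED = {
  'metadata.gz' => '67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043',
  'data.tar.gz' => 'a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9'
}.freeze

EXPECTED.each do |file, expected_sha256|
  actual = Digest::SHA256.file(file).hexdigest  # digest of the file on disk
  status = actual == expected_sha256 ? 'OK' : "MISMATCH (got #{actual})"
  puts "#{file}: #{status}"
end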
lib/wayback_machine_downloader/subdom_processor.rb
ADDED
@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+
+module SubdomainProcessor
+  def process_subdomains
+    return unless @recursive_subdomains
+
+    puts "Starting subdomain processing..."
+
+    # extract base domain from the URL for comparison
+    base_domain = extract_base_domain(@base_url)
+    @processed_domains = Set.new([base_domain])
+    @subdomain_queue = Queue.new
+
+    # scan downloaded files for subdomain links
+    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+    if subdomains_found.empty?
+      puts "No subdomains found in downloaded content."
+      return
+    end
+
+    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+    # add found subdomains to the queue
+    subdomains_found.each do |subdomain|
+      full_domain = "#{subdomain}.#{base_domain}"
+      @subdomain_queue << "https://#{full_domain}/"
+    end
+
+    # process the subdomain queue
+    download_subdomains(base_domain)
+
+    # after all downloads, rewrite all URLs to make local references
+    rewrite_subdomain_links(base_domain) if @rewrite
+  end
+
+  private
+
+  def extract_base_domain(url)
+    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+    return nil unless uri
+
+    host = uri.host || uri.path.split('/').first
+    host = host.downcase
+
+    # extract the base domain (e.g., "example.com" from "sub.example.com")
+    parts = host.split('.')
+    return host if parts.size <= 2
+
+    # for domains like co.uk, we want to keep the last 3 parts
+    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+      parts.last(3).join('.')
+    else
+      parts.last(2).join('.')
+    end
+  end
+
+  def scan_files_for_subdomains(files, base_domain)
+    return [] unless base_domain
+
+    subdomains = Set.new
+
+    files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+
+        # extract URLs from HTML href/src attributes
+        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from CSS
+        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from JavaScript strings
+        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+      rescue => e
+        puts "Error scanning file #{file_path}: #{e.message}"
+      end
+    end
+
+    subdomains.to_a
+  end
+
+  def download_subdomains(base_domain)
+    puts "Starting subdomain downloads..."
+    depth = 0
+    max_depth = @subdomain_depth || 1
+
+    while depth < max_depth && !@subdomain_queue.empty?
+      current_batch = []
+
+      # get all subdomains at current depth
+      while !@subdomain_queue.empty?
+        current_batch << @subdomain_queue.pop
+      end
+
+      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+      # download each subdomain
+      current_batch.each do |subdomain_url|
+        download_subdomain(subdomain_url, base_domain)
+      end
+
+      # if we need to go deeper, scan the newly downloaded files
+      if depth + 1 < max_depth
+        # get all files in the subdomains directory
+        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+        new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+        # filter out already processed subdomains
+        new_subdomains.each do |subdomain|
+          full_domain = "#{subdomain}.#{base_domain}"
+          unless @processed_domains.include?(full_domain)
+            @processed_domains.add(full_domain)
+            @subdomain_queue << "https://#{full_domain}/"
+          end
+        end
+
+        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+      end
+
+      depth += 1
+    end
+  end
+
+  def download_subdomain(subdomain_url, base_domain)
+    begin
+      uri = URI.parse(subdomain_url)
+      subdomain_host = uri.host
+
+      # skip if already processed
+      if @processed_domains.include?(subdomain_host)
+        puts "Skipping already processed subdomain: #{subdomain_host}"
+        return
+      end
+
+      @processed_domains.add(subdomain_host)
+      puts "Downloading subdomain: #{subdomain_url}"
+
+      # create the directory for this subdomain
+      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+      FileUtils.mkdir_p(subdomain_dir)
+
+      # create subdomain downloader with appropriate options
+      subdomain_options = {
+        base_url: subdomain_url,
+        directory: subdomain_dir,
+        from_timestamp: @from_timestamp,
+        to_timestamp: @to_timestamp,
+        all: @all,
+        threads_count: @threads_count,
+        maximum_pages: [@maximum_pages / 2, 10].max,
+        rewrite: @rewrite,
+        # don't recursively process subdomains from here
+        recursive_subdomains: false
+      }
+
+      # download the subdomain content
+      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+      subdomain_downloader.download_files
+
+      puts "Completed download of subdomain: #{subdomain_host}"
+    rescue => e
+      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+    end
+  end
+
+  def rewrite_subdomain_links(base_domain)
+    puts "Rewriting all files to use local subdomain references..."
+
+    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+    puts "Found #{all_files.size} files to check for rewriting"
+    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+    rewritten_count = 0
+
+    all_files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+        original_content = content.dup
+
+        # replace subdomain URLs with local paths
+        subdomains.each do |subdomain_host|
+          # for HTML attributes (href, src, etc.)
+          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            prefix, path, suffix = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+          end
+
+          # for CSS url()
+          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+            path = $1
+            path = "/index.html" if path.empty? || path == "/"
+            "url(\"../subdomains/#{subdomain_host}#{path}\")"
+          end

+          # for JavaScript strings
+          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            quote_start, path, quote_end = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+          end
+        end
+
+        # save if modified
+        if content != original_content
+          File.write(file_path, content)
+          rewritten_count += 1
+        end
+      rescue => e
+        puts "Error rewriting file #{file_path}: #{e.message}"
+      end
+    end
+
+    puts "Rewrote links in #{rewritten_count} files"
+  end
+end
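The new SubdomainProcessor module is mixed into WaybackMachineDownloader (see the include in the next file) and reads the @recursive_subdomains, @subdomain_depth and @rewrite options. Below is a minimal usage sketch, not taken from this diff: the option keys mirror the ones the module itself forwards to WaybackMachineDownloader.new, while the site URL and output directory are placeholders, and the CLI flags wired to these options are not shown here.

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',       # placeholder site to mirror
  directory: './websites/example.com',   # placeholder output directory
  threads_count: 4,
  rewrite: true,                # rewrite captured links to local copies
  recursive_subdomains: true,   # without this, process_subdomains returns immediately
  subdomain_depth: 2            # discovery rounds used by download_subdomains
)
downloader.download_files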
lib/wayback_machine_downloader.rb
CHANGED
@@ -115,7 +115,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.3.11"
+  VERSION = "2.3.12"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -128,7 +128,8 @@ class WaybackMachineDownloader
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+    :snapshot_at
 
   def initialize params
     validate_params(params)
@@ -158,6 +159,7 @@ class WaybackMachineDownloader
     @rewrite = params[:rewrite] || false
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
+    @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -330,6 +332,36 @@ class WaybackMachineDownloader
     snapshot_list_to_consider
   end
 
+  # Get a composite snapshot file list for a specific timestamp
+  def get_composite_snapshot_file_list(target_timestamp)
+    file_versions = {}
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      next if file_timestamp.to_i > target_timestamp
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      next if file_id.nil?
+      next if match_exclude_filter(file_url)
+      next unless match_only_filter(file_url)
+      # Select the most recent version <= target_timestamp
+      if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+        file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+      end
+    end
+    file_versions.values
+  end
+
+  # Returns a list of files for the composite snapshot
+  def get_file_list_composite_snapshot(target_timestamp)
+    file_list = get_composite_snapshot_file_list(target_timestamp)
+    file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+    file_list.map do |file_remote_info|
+      file_remote_info[1][:file_id] = file_remote_info[0]
+      file_remote_info[1]
+    end
+  end
+
   def get_file_list_curated
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
@@ -384,7 +416,9 @@ class WaybackMachineDownloader
 
 
   def get_file_list_by_timestamp
-    if @all_timestamps
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
       file_list_curated = get_file_list_all_timestamps
       file_list_curated.map do |file_remote_info|
         file_remote_info[1][:file_id] = file_remote_info[0]
@@ -727,7 +761,22 @@ class WaybackMachineDownloader
   end
 
   def file_list_by_timestamp
-    @
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    end
   end
 
   private
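The new snapshot_at option builds a composite snapshot: get_composite_snapshot_file_list keeps, for every file, the most recent capture at or before the target timestamp, so the download approximates the site as it stood at that moment even when individual files were captured at different times. A minimal usage sketch, not taken from this diff; the URL, directory and timestamp values are placeholders, and the timestamp is assumed to use the same 14-digit YYYYMMDDhhmmss form as from_timestamp/to_timestamp.

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',       # placeholder site to mirror
  directory: './websites/example.com',   # placeholder output directory
  snapshot_at: 20240101000000            # per file, newest capture <= this timestamp is used
)
downloader.download_files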
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.11
+  version: 2.3.12
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-07-
+date: 2025-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
 - bin/wayback_machine_downloader
 - lib/wayback_machine_downloader.rb
 - lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader