wayback_machine_downloader_straw 2.4.4 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader/archive_api.rb +19 -1
- data/lib/wayback_machine_downloader/page_requisites.rb +33 -0
- data/lib/wayback_machine_downloader/url_rewrite.rb +71 -60
- data/lib/wayback_machine_downloader.rb +187 -19
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
+  data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
+  data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
data/bin/wayback_machine_downloader
CHANGED
@@ -86,6 +86,10 @@ option_parser = OptionParser.new do |opts|
     options[:subdomain_depth] = t
   end
 
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
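
For orientation: the new flag simply sets a boolean that initialize later reads as params[:page_requisites]. A minimal sketch of the equivalent programmatic call; the URL is a placeholder, and the base_url key is assumed from the attr_accessor list further down in this diff:

    require 'wayback_machine_downloader'

    # sketch: what --page-requisites amounts to when driving the class directly
    wmd = WaybackMachineDownloader.new(
      base_url: 'https://example.com',  # placeholder URL
      page_requisites: true             # same key the new opts.on block sets
    )
    wmd.download_files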
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED
@@ -16,6 +16,10 @@ module ArchiveAPI
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
     begin
       response = http.get(request_url)
       body = response.body.to_s.strip
@@ -26,7 +30,21 @@
       json.shift if json.first == ["timestamp", "original"]
       json
     rescue JSON::ParserError => e
-      warn "Failed to
+      warn "Failed to parse JSON from API for #{url}: #{e.message}"
+      []
+    rescue Net::ReadTimeout, Net::OpenTimeout => e
+      if retries < max_retries
+        retries += 1
+        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+        []
+      end
+    rescue StandardError => e
+      # treat any other transient-ish error similarly, though without retries for now
+      warn "Error fetching CDX data for #{url}: #{e.message}"
       []
     end
   end
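
The rescue clauses above amount to linear backoff: sleep(delay * retries) waits 2 s, then 4 s, then 6 s with the default constants. A self-contained sketch of the same pattern (method name and structure are illustrative, not from the gem):

    require 'net/http'

    # sketch: retry a block on timeout, backing off linearly
    def fetch_with_retry(max_retries: 3, delay: 2)
      retries = 0
      begin
        yield
      rescue Net::ReadTimeout, Net::OpenTimeout => e
        raise if retries >= max_retries
        retries += 1
        warn "timeout (#{e.class}), retry #{retries}/#{max_retries}"
        sleep(delay * retries)  # 2s, 4s, 6s...
        retry
      end
    end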
data/lib/wayback_machine_downloader/page_requisites.rb
ADDED
@@ -0,0 +1,33 @@
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
+end
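
A quick usage sketch of the new module; the HTML input is made up, and the output follows from ASSET_REGEX plus the srcset and valid_asset? handling above:

    require 'wayback_machine_downloader/page_requisites'

    html = <<~HTML
      <link href="/css/site.css" rel="stylesheet">
      <img src="logo.png" srcset="logo.png 1x, logo@2x.png 2x">
      <a href="mailto:hi@example.com">mail</a>
    HTML

    PageRequisites.extract(html)
    # => ["/css/site.css", "logo.png", "logo@2x.png"]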
data/lib/wayback_machine_downloader/url_rewrite.rb
CHANGED
@@ -1,74 +1,85 @@
 # frozen_string_literal: true
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "#{prefix}
-      else
-        "#{prefix}#{url}#{suffix}"
+module URLRewrite
+  # server-side extensions that should work locally
+  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
+
+  def rewrite_html_attr_urls(content)
+    # rewrite URLs to relative paths
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
+    end
+
+    # rewrite absolute URLs to same domain as relative
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
     end
+
+    content
   end
-  content
-end
 
-
-
+  def rewrite_css_urls(content)
+    # rewrite URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
+    end
 
-
-      url
-
-
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "url(\"#{path}\")"
-      rescue
-        "url(\"#{url}\")"
-      end
-    elsif url.start_with?('/')
-      "url(\"./#{url[1..-1]}\")"
-    else
-      "url(\"#{url}\")"
+    # rewrite absolute URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
     end
+
+    content
   end
-  content
-end
 
-
-
-
-
-
+  def rewrite_js_urls(content)
+    # rewrite archive.org URLs in JavaScript strings
+    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    # rewrite absolute URLs in JavaScript
+    content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    content
+  end
+
+  private
+
+  def normalize_path_for_local(path)
+    return "./index.html" if path.empty? || path == "/"
 
-
-
-
-
-
-
-
-
-      end
-    elsif url.start_with?('/')
-      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    # handle query strings - they're already part of the filename
+    path = path.split('?').first if path.include?('?')
+
+    # check if this is a server-side script
+    ext = File.extname(path).downcase
+    if SERVER_SIDE_EXTS.include?(ext)
+      # keep the path as-is but ensure it starts with ./
+      path = "./#{path}" unless path.start_with?('./', '/')
     else
-
+      # regular file handling
+      path = "./#{path}" unless path.start_with?('./', '/')
+
+      # if it looks like a directory, add index.html
+      if path.end_with?('/') || !path.include?('.')
+        path = "#{path.chomp('/')}/index.html"
+      end
     end
+
+    path
   end
-
-  content
 end
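
Worked by hand from the branches of normalize_path_for_local above (the method is private, so this is shown as input to result rather than a runnable call):

    # ""                   => "./index.html"        (empty path)
    # "/blog/"             => "/blog/index.html"    (trailing slash => directory)
    # "/about"             => "/about/index.html"   (no dot => treated as directory)
    # "assets/app.js"      => "./assets/app.js"     (relative file gets ./ prefix)
    # "/cgi-bin/form.cgi"  => "/cgi-bin/form.cgi"   (server-side ext kept as-is)
    # "/p.php?id=3"        => "/p.php"              (query string dropped)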
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -15,6 +15,7 @@ require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/page_requisites'
 require_relative 'wayback_machine_downloader/subdom_processor'
 require_relative 'wayback_machine_downloader/url_rewrite'
 
@@ -127,8 +128,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
   include SubdomainProcessor
+  include URLRewrite
 
-  VERSION = "2.4.4"
+  VERSION = "2.4.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -142,7 +144,7 @@ class WaybackMachineDownloader
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
-    :snapshot_at
+    :snapshot_at, :page_requisites
 
   def initialize params
     validate_params(params)
@@ -175,6 +177,8 @@ class WaybackMachineDownloader
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
     @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+    @page_requisites = params[:page_requisites] || false
+    @pending_jobs = Concurrent::AtomicFixnum.new(0)
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -183,18 +187,29 @@ class WaybackMachineDownloader
   end
 
   def backup_name
-    url_to_process = @base_url
+    url_to_process = @base_url
+    url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
     raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
       url_to_process
     end
 
+    # if it looks like a wildcard pattern, normalize to a safe host-ish name
+    if raw&.start_with?('*.')
+      raw = raw.sub(/\A\*\./, 'all-')
+    end
+
     # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
     if Gem.win_platform?
       raw = raw.gsub(/[:*?"<>|]/, '_')
       raw = raw.gsub(/[ .]+\z/, '')
+    else
+      # still good practice to strip path separators (and maybe '*' for POSIX too)
+      raw = raw.gsub(/[\/:*?"<>|]/, '_')
     end
+
     raw = 'site' if raw.nil? || raw.empty?
     raw
   end
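
One example of the new wildcard handling, traced by hand through the branches above:

    # sketch: backup_name with a wildcard base_url
    # base_url = "*.example.com/*"
    #   chomp('/*')           -> "*.example.com"
    #   no '//' in the string -> raw = "*.example.com"
    #   '*.' prefix           -> raw = "all-example.com"
    #   POSIX gsub            -> nothing left to strip
    # backup_name             -> "all-example.com"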
@@ -313,7 +328,11 @@ class WaybackMachineDownloader
 
     futures.each do |future|
       begin
-
+        val = future.value
+        # only append if valid
+        if val && val.is_a?(Array) && val.first.is_a?(Integer)
+          results << val
+        end
       rescue => e
         puts "\nError fetching page #{future}: #{e.message}"
       end
@@ -549,7 +568,7 @@ class WaybackMachineDownloader
       end
     end
   end
-
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -570,6 +589,12 @@ class WaybackMachineDownloader
 
     # Load IDs of already downloaded files
     downloaded_ids = load_downloaded_ids
+
+    # We use a thread-safe Set to track what we have queued/downloaded in this session
+    # to avoid infinite loops with page requisites
+    @session_downloaded_ids = Concurrent::Set.new
+    downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
     files_to_process = files_to_download.reject do |file_info|
       downloaded_ids.include?(file_info[:file_id])
     end
@@ -580,8 +605,8 @@ class WaybackMachineDownloader
     if skipped_count > 0
       puts "Found #{skipped_count} previously downloaded files, skipping them."
     end
-
-    if remaining_count == 0
+
+    if remaining_count == 0 && !@page_requisites
       puts "All matching files have already been downloaded."
       cleanup
       return
@@ -594,12 +619,22 @@ class WaybackMachineDownloader
     @download_mutex = Mutex.new
 
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
-
+    @worker_pool = Concurrent::FixedThreadPool.new(thread_count)
+
+    # initial batch
+    files_to_process.each do |file_remote_info|
+      @session_downloaded_ids.add(file_remote_info[:file_id])
+      submit_download_job(file_remote_info)
+    end
 
-
+    # wait for all jobs to finish
+    loop do
+      sleep 0.5
+      break if @pending_jobs.value == 0
+    end
 
-
-
+    @worker_pool.shutdown
+    @worker_pool.wait_for_termination
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
@@ -617,6 +652,138 @@ class WaybackMachineDownloader
     cleanup
   end
 
+  # helper to submit jobs and increment the counter
+  def submit_download_job(file_remote_info)
+    @pending_jobs.increment
+    @worker_pool.post do
+      begin
+        process_single_file(file_remote_info)
+      ensure
+        @pending_jobs.decrement
+      end
+    end
+  end
+
+  def process_single_file(file_remote_info)
+    download_success = false
+    downloaded_path = nil
+
+    @connection_pool.with_connection do |connection|
+      result_message, path = download_file(file_remote_info, connection)
+      downloaded_path = path
+
+      if result_message && result_message.include?(' -> ')
+        download_success = true
+      end
+
+      @download_mutex.synchronize do
+        @processed_file_count += 1 if @processed_file_count < @total_to_download
+        # only print if it's a "User" file or a requisite we found
+        puts result_message if result_message
+      end
+    end
+
+    if download_success
+      append_to_db(file_remote_info[:file_id])
+
+      if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+        process_page_requisites(downloaded_path, file_remote_info)
+      end
+    end
+  rescue => e
+    @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+  end
+
+  def process_page_requisites(file_path, parent_remote_info)
+    return unless File.exist?(file_path)
+
+    content = File.read(file_path)
+    content = content.force_encoding('UTF-8').scrub
+
+    assets = PageRequisites.extract(content)
+
+    # prepare base URI for resolving relative paths
+    parent_raw = parent_remote_info[:file_url]
+    parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+
+    begin
+      base_uri = URI(parent_raw)
+      # calculate the "root" host of the site we are downloading to compare later
+      current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+    rescue URI::InvalidURIError
+      return
+    end
+
+    parent_timestamp = parent_remote_info[:timestamp]
+
+    assets.each do |asset_rel_url|
+      begin
+        # resolve full URL (handles relative paths like "../img/logo.png")
+        resolved_uri = base_uri + asset_rel_url
+
+        # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
+        asset_timestamp = parent_timestamp
+        if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+          embedded_ts = $1
+          begin
+            orig_uri = URI($2)
+            resolved_uri = orig_uri
+            asset_timestamp = embedded_ts.to_i
+          rescue URI::InvalidURIError
+            # fall back to original resolved_uri and parent timestamp
+          end
+        end
+
+        # filter out navigation links (pages) vs assets
+        # skip if extension is empty or looks like an HTML page
+        path = resolved_uri.path
+        ext = File.extname(path).downcase
+        if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+          next
+        end
+
+        # construct the URL for the Wayback API
+        asset_wbm_url = resolved_uri.host + resolved_uri.path
+        asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+        # construct the local file ID
+        # if the asset is on the SAME domain, strip the domain from the folder path
+        # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+        if resolved_uri.host == current_project_host
+          # e.g. /static/css/style.css
+          asset_file_id = resolved_uri.path
+          asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+        else
+          # e.g. cdn.google.com/jquery.js
+          asset_file_id = asset_wbm_url
+        end
+
+      rescue URI::InvalidURIError, StandardError
+        next
+      end
+
+      # sanitize and queue
+      asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+      unless @session_downloaded_ids.include?(asset_id)
+        @session_downloaded_ids.add(asset_id)
+
+        new_file_info = {
+          file_url: asset_wbm_url,
+          timestamp: asset_timestamp,
+          file_id: asset_id
+        }
+
+        @download_mutex.synchronize do
+          @total_to_download += 1
+          puts "Queued requisite: #{asset_file_id}"
+        end
+
+        submit_download_job(new_file_info)
+      end
+    end
+  end
+
   def structure_dir_path dir_path
     begin
       FileUtils::mkdir_p dir_path unless File.exist? dir_path
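
The methods above form a small dynamic work queue: a job may enqueue more jobs (requisites discovered in a downloaded page), so completion is detected with an atomic in-flight counter rather than by draining a fixed batch. A standalone sketch of the same shape using the same concurrent-ruby primitives (names are illustrative):

    require 'concurrent'

    pool    = Concurrent::FixedThreadPool.new(4)
    pending = Concurrent::AtomicFixnum.new(0)

    submit = lambda do |n|
      pending.increment
      pool.post do
        begin
          submit.call(n - 1) if n > 0  # a job may spawn follow-up work
        ensure
          pending.decrement
        end
      end
    end

    submit.call(3)
    sleep 0.1 until pending.value == 0  # done only when nothing is queued or running
    pool.shutdown
    pool.wait_for_termination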
@@ -648,7 +815,8 @@ class WaybackMachineDownloader
     begin
       content = File.binread(file_path)
 
-
+      # detect encoding for HTML files
+      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
         encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
         content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
       else
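
The charset sniff here is a plain regex over the raw bytes; for example (illustrative input):

    content = '<html><head><meta charset="windows-1251"></head>'
    content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first
    # => "windows-1251"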
@@ -664,13 +832,13 @@ class WaybackMachineDownloader
       # URLs in JavaScript
       content = rewrite_js_urls(content)
 
-      # for URLs
+      # for URLs that start with a single slash, make them relative
       content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
         prefix, path, suffix = $1, $2, $3
         "#{prefix}./#{path}#{suffix}"
       end
 
-      # for URLs in CSS that start with a single slash
+      # for URLs in CSS that start with a single slash, make them relative
       content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
         path = $1
         "url(\"./#{path}\")"
@@ -723,7 +891,7 @@ class WaybackMachineDownloader
       # check existence *before* download attempt
       # this handles cases where a file was created manually or by a previous partial run without a .db entry
       if File.exist? file_path
-        return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
       end
 
       begin
@@ -735,13 +903,13 @@ class WaybackMachineDownloader
         if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
           rewrite_urls_to_relative(file_path)
         end
-        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
       when :skipped_not_found
-        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
       else
         # ideally, this case should not be reached if download_with_retry behaves as expected.
         @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
-        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
       end
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -749,7 +917,7 @@ class WaybackMachineDownloader
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
-      msg
+      return [msg, nil]
     end
   end
 
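
Note the contract change threaded through these last hunks: download_file now returns a [message, local_path] pair instead of a bare string, which is what lets process_single_file (earlier in this diff) recover the saved path and hand it to process_page_requisites. The consuming side, condensed from the diff:

    result_message, downloaded_path = download_file(file_remote_info, connection)
    puts result_message if result_message
    if @page_requisites && downloaded_path
      process_page_requisites(downloaded_path, file_remote_info)
    end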
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.4
+  version: 2.4.5
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date:
+date: 2026-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
 - bin/wayback_machine_downloader
 - lib/wayback_machine_downloader.rb
 - lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/page_requisites.rb
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb