wayback_machine_downloader_straw 2.4.4 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
-  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
+  metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
+  data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
 SHA512:
-  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
-  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
+  metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
+  data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
bin/wayback_machine_downloader CHANGED
@@ -86,6 +86,10 @@ option_parser = OptionParser.new do |opts|
     options[:subdomain_depth] = t
   end
 
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
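The flag is opt-in and takes no argument; a typical invocation (site URL hypothetical) would be:

    wayback_machine_downloader https://example.com --page-requisites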
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -16,6 +16,10 @@ module ArchiveAPI
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
     begin
       response = http.get(request_url)
       body = response.body.to_s.strip
@@ -26,7 +30,21 @@ module ArchiveAPI
       json.shift if json.first == ["timestamp", "original"]
       json
     rescue JSON::ParserError => e
-      warn "Failed to fetch data from API: #{e.message}"
+      warn "Failed to parse JSON from API for #{url}: #{e.message}"
+      []
+    rescue Net::ReadTimeout, Net::OpenTimeout => e
+      if retries < max_retries
+        retries += 1
+        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+        []
+      end
+    rescue StandardError => e
+      # treat any other transient-ish error similarly, though without retries for now
+      warn "Error fetching CDX data for #{url}: #{e.message}"
       []
     end
   end
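The added rescue clauses back off linearly (sleep(delay * retries), i.e. 2s, 4s, 6s with the defaults) before retrying timed-out CDX requests. A minimal standalone sketch of the same pattern, with a hypothetical fetch_with_retry helper:

    require 'net/http'
    require 'uri'

    # Linear-backoff retry, mirroring the rescue logic above (sketch only).
    def fetch_with_retry(uri, max_retries: 3, delay: 2)
      retries = 0
      begin
        Net::HTTP.get_response(uri)
      rescue Net::ReadTimeout, Net::OpenTimeout => e
        raise if retries >= max_retries
        retries += 1
        warn "Timeout (#{e.class}), retry #{retries}/#{max_retries}..."
        sleep(delay * retries) # 2s, then 4s, then 6s
        retry
      end
    end

Note the delay grows linearly rather than exponentially, which caps the total worst-case wait at delay * (1 + 2 + ... + max_retries).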
lib/wayback_machine_downloader/page_requisites.rb ADDED
@@ -0,0 +1,33 @@
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
+end
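A quick illustration of what the extractor returns (input hypothetical): srcset values are split on commas with their size descriptors dropped, while data:, mailto:, anchor, and javascript: targets are filtered out:

    html = <<~HTML
      <img src="/img/logo.png" srcset="/img/logo.png 1x, /img/logo@2x.png 2x">
      <link rel="stylesheet" href="/css/site.css">
      <a href="mailto:someone@example.com">contact</a>
    HTML

    PageRequisites.extract(html)
    # => ["/img/logo.png", "/img/logo@2x.png", "/css/site.css"]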
lib/wayback_machine_downloader/url_rewrite.rb CHANGED
@@ -1,74 +1,85 @@
 # frozen_string_literal: true
 
-# URLs in HTML attributes
-def rewrite_html_attr_urls(content)
-
-  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    prefix, url, suffix = $1, $2, $3
-
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "#{prefix}#{path}#{suffix}"
-      rescue
-        "#{prefix}#{url}#{suffix}"
-      end
-    elsif url.start_with?('/')
-      "#{prefix}./#{url[1..-1]}#{suffix}"
-    else
-      "#{prefix}#{url}#{suffix}"
+module URLRewrite
+  # server-side extensions that should work locally
+  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
+
+  def rewrite_html_attr_urls(content)
+    # rewrite URLs to relative paths
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
+    end
+
+    # rewrite absolute URLs to same domain as relative
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
     end
+
+    content
   end
-  content
-end
 
-# URLs in CSS
-def rewrite_css_urls(content)
+  def rewrite_css_urls(content)
+    # rewrite URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
+    end
 
-  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-    url = $1
-
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "url(\"#{path}\")"
-      rescue
-        "url(\"#{url}\")"
-      end
-    elsif url.start_with?('/')
-      "url(\"./#{url[1..-1]}\")"
-    else
-      "url(\"#{url}\")"
+    # rewrite absolute URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
     end
+
+    content
   end
-  content
-end
 
-# URLs in JavaScript
-def rewrite_js_urls(content)
-
-  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    quote_start, url, quote_end = $1, $2, $3
+  def rewrite_js_urls(content)
+    # rewrite archive.org URLs in JavaScript strings
+    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    # rewrite absolute URLs in JavaScript
+    content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    content
+  end
+
+  private
+
+  def normalize_path_for_local(path)
+    return "./index.html" if path.empty? || path == "/"
 
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "#{quote_start}#{path}#{quote_end}"
-      rescue
-        "#{quote_start}#{url}#{quote_end}"
-      end
-    elsif url.start_with?('/')
-      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    # handle query strings - they're already part of the filename
+    path = path.split('?').first if path.include?('?')
+
+    # check if this is a server-side script
+    ext = File.extname(path).downcase
+    if SERVER_SIDE_EXTS.include?(ext)
+      # keep the path as-is but ensure it starts with ./
+      path = "./#{path}" unless path.start_with?('./', '/')
     else
-      "#{quote_start}#{url}#{quote_end}"
+      # regular file handling
+      path = "./#{path}" unless path.start_with?('./', '/')
+
+      # if it looks like a directory, add index.html
+      if path.end_with?('/') || !path.include?('.')
+        path = "#{path.chomp('/')}/index.html"
+      end
     end
+
+    path
   end
-
-  content
-end
+end
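Expected behavior of the new normalize_path_for_local helper on a few representative inputs (paths hypothetical; the method is private once the module is included, so send is used here):

    rewriter = Object.new.extend(URLRewrite)

    rewriter.send(:normalize_path_for_local, "")                  # => "./index.html"
    rewriter.send(:normalize_path_for_local, "/about")            # => "/about/index.html"  (no extension => directory)
    rewriter.send(:normalize_path_for_local, "/css/site.css?v=3") # => "/css/site.css"      (query string stripped)
    rewriter.send(:normalize_path_for_local, "/search.php")       # => "/search.php"        (server-side ext kept as-is)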
lib/wayback_machine_downloader.rb CHANGED
@@ -15,6 +15,7 @@ require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/page_requisites'
 require_relative 'wayback_machine_downloader/subdom_processor'
 require_relative 'wayback_machine_downloader/url_rewrite'
 
@@ -127,8 +128,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
   include SubdomainProcessor
+  include URLRewrite
 
-  VERSION = "2.4.4"
+  VERSION = "2.4.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -142,7 +144,7 @@ class WaybackMachineDownloader
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
                 :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
                 :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
-                :snapshot_at
+                :snapshot_at, :page_requisites
 
   def initialize params
     validate_params(params)
@@ -175,6 +177,8 @@ class WaybackMachineDownloader
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
     @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+    @page_requisites = params[:page_requisites] || false
+    @pending_jobs = Concurrent::AtomicFixnum.new(0)
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -183,18 +187,29 @@ class WaybackMachineDownloader
   end
 
   def backup_name
-    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+    url_to_process = @base_url
+    url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
     raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
      url_to_process
     end
 
+    # if it looks like a wildcard pattern, normalize to a safe host-ish name
+    if raw&.start_with?('*.')
+      raw = raw.sub(/\A\*\./, 'all-')
+    end
+
    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
    if Gem.win_platform?
      raw = raw.gsub(/[:*?"<>|]/, '_')
      raw = raw.gsub(/[ .]+\z/, '')
+    else
+      # still good practice to strip path separators (and maybe '*' for POSIX too)
+      raw = raw.gsub(/[\/:*?"<>|]/, '_')
    end
+
    raw = 'site' if raw.nil? || raw.empty?
    raw
  end
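Illustrative effect of the new normalization on hypothetical inputs: a wildcard base URL such as "http://*.example.com" now yields the directory name "all-example.com", and on POSIX a host:port like "example.com:8080" becomes "example.com_8080" rather than a directory name containing a colon.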
@@ -313,7 +328,11 @@ class WaybackMachineDownloader
 
     futures.each do |future|
       begin
-        results << future.value
+        val = future.value
+        # only append if valid
+        if val && val.is_a?(Array) && val.first.is_a?(Integer)
+          results << val
+        end
       rescue => e
         puts "\nError fetching page #{future}: #{e.message}"
       end
@@ -549,7 +568,7 @@ class WaybackMachineDownloader
       end
     end
   end
-
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -570,6 +589,12 @@ class WaybackMachineDownloader
 
     # Load IDs of already downloaded files
     downloaded_ids = load_downloaded_ids
+
+    # We use a thread-safe Set to track what we have queued/downloaded in this session
+    # to avoid infinite loops with page requisites
+    @session_downloaded_ids = Concurrent::Set.new
+    downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
     files_to_process = files_to_download.reject do |file_info|
       downloaded_ids.include?(file_info[:file_id])
     end
@@ -580,8 +605,8 @@ class WaybackMachineDownloader
     if skipped_count > 0
       puts "Found #{skipped_count} previously downloaded files, skipping them."
     end
-
-    if remaining_count == 0
+
+    if remaining_count == 0 && !@page_requisites
       puts "All matching files have already been downloaded."
       cleanup
       return
@@ -594,12 +619,22 @@ class WaybackMachineDownloader
     @download_mutex = Mutex.new
 
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
-    pool = Concurrent::FixedThreadPool.new(thread_count)
+    @worker_pool = Concurrent::FixedThreadPool.new(thread_count)
+
+    # initial batch
+    files_to_process.each do |file_remote_info|
+      @session_downloaded_ids.add(file_remote_info[:file_id])
+      submit_download_job(file_remote_info)
+    end
 
-    processing_files(pool, files_to_process)
+    # wait for all jobs to finish
+    loop do
+      sleep 0.5
+      break if @pending_jobs.value == 0
+    end
 
-    pool.shutdown
-    pool.wait_for_termination
+    @worker_pool.shutdown
+    @worker_pool.wait_for_termination
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
@@ -617,6 +652,138 @@ class WaybackMachineDownloader
     cleanup
   end
 
+  # helper to submit jobs and increment the counter
+  def submit_download_job(file_remote_info)
+    @pending_jobs.increment
+    @worker_pool.post do
+      begin
+        process_single_file(file_remote_info)
+      ensure
+        @pending_jobs.decrement
+      end
+    end
+  end
+
+  def process_single_file(file_remote_info)
+    download_success = false
+    downloaded_path = nil
+
+    @connection_pool.with_connection do |connection|
+      result_message, path = download_file(file_remote_info, connection)
+      downloaded_path = path
+
+      if result_message && result_message.include?(' -> ')
+        download_success = true
+      end
+
+      @download_mutex.synchronize do
+        @processed_file_count += 1 if @processed_file_count < @total_to_download
+        # only print if it's a "User" file or a requisite we found
+        puts result_message if result_message
+      end
+    end
+
+    if download_success
+      append_to_db(file_remote_info[:file_id])
+
+      if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+        process_page_requisites(downloaded_path, file_remote_info)
+      end
+    end
+  rescue => e
+    @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+  end
+
+  def process_page_requisites(file_path, parent_remote_info)
+    return unless File.exist?(file_path)
+
+    content = File.read(file_path)
+    content = content.force_encoding('UTF-8').scrub
+
+    assets = PageRequisites.extract(content)
+
+    # prepare base URI for resolving relative paths
+    parent_raw = parent_remote_info[:file_url]
+    parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+
+    begin
+      base_uri = URI(parent_raw)
+      # calculate the "root" host of the site we are downloading to compare later
+      current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+    rescue URI::InvalidURIError
+      return
+    end
+
+    parent_timestamp = parent_remote_info[:timestamp]
+
+    assets.each do |asset_rel_url|
+      begin
+        # resolve full URL (handles relative paths like "../img/logo.png")
+        resolved_uri = base_uri + asset_rel_url
+
+        # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
+        asset_timestamp = parent_timestamp
+        if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+          embedded_ts = $1
+          begin
+            orig_uri = URI($2)
+            resolved_uri = orig_uri
+            asset_timestamp = embedded_ts.to_i
+          rescue URI::InvalidURIError
+            # fall back to original resolved_uri and parent timestamp
+          end
+        end
+
+        # filter out navigation links (pages) vs assets
+        # skip if extension is empty or looks like an HTML page
+        path = resolved_uri.path
+        ext = File.extname(path).downcase
+        if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+          next
+        end
+
+        # construct the URL for the Wayback API
+        asset_wbm_url = resolved_uri.host + resolved_uri.path
+        asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+        # construct the local file ID
+        # if the asset is on the SAME domain, strip the domain from the folder path
+        # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+        if resolved_uri.host == current_project_host
+          # e.g. /static/css/style.css
+          asset_file_id = resolved_uri.path
+          asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+        else
+          # e.g. cdn.google.com/jquery.js
+          asset_file_id = asset_wbm_url
+        end
+
+      rescue URI::InvalidURIError, StandardError
+        next
+      end
+
+      # sanitize and queue
+      asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+      unless @session_downloaded_ids.include?(asset_id)
+        @session_downloaded_ids.add(asset_id)
+
+        new_file_info = {
+          file_url: asset_wbm_url,
+          timestamp: asset_timestamp,
+          file_id: asset_id
+        }
+
+        @download_mutex.synchronize do
+          @total_to_download += 1
+          puts "Queued requisite: #{asset_file_id}"
+        end
+
+        submit_download_job(new_file_info)
+      end
+    end
+  end
+
   def structure_dir_path dir_path
     begin
       FileUtils::mkdir_p dir_path unless File.exist? dir_path
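One detail worth noting in process_page_requisites above: assets whose URLs already point into the archive (a rewritten src of the /web/<timestamp>/<original-url> form) are unwrapped so the original URL and its own snapshot timestamp are used instead of the parent page's. A hypothetical check of that regex:

    path = "/web/20100815123456id_/http://cdn.example.com/js/app.js"
    path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
    $1 # => "20100815123456"
    $2 # => "http://cdn.example.com/js/app.js"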
@@ -648,7 +815,8 @@ class WaybackMachineDownloader
     begin
       content = File.binread(file_path)
 
-      if file_ext == '.html' || file_ext == '.htm'
+      # detect encoding for HTML files
+      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
         encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
         content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
       else
@@ -664,13 +832,13 @@ class WaybackMachineDownloader
       # URLs in JavaScript
       content = rewrite_js_urls(content)
 
-      # for URLs in HTML attributes that start with a single slash
+      # for URLs that start with a single slash, make them relative
       content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
         prefix, path, suffix = $1, $2, $3
         "#{prefix}./#{path}#{suffix}"
       end
 
-      # for URLs in CSS that start with a single slash
+      # for URLs in CSS that start with a single slash, make them relative
       content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
         path = $1
         "url(\"./#{path}\")"
@@ -723,7 +891,7 @@ class WaybackMachineDownloader
     # check existence *before* download attempt
     # this handles cases where a file was created manually or by a previous partial run without a .db entry
     if File.exist? file_path
-      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+      return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
     end
 
     begin
@@ -735,13 +903,13 @@ class WaybackMachineDownloader
       if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
         rewrite_urls_to_relative(file_path)
       end
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
     when :skipped_not_found
-      "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
     else
       # ideally, this case should not be reached if download_with_retry behaves as expected.
       @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
-      "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+      return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
     end
   rescue StandardError => e
     msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -749,7 +917,7 @@ class WaybackMachineDownloader
       File.delete(file_path)
       msg += "\n#{file_path} was empty and was removed."
     end
-    msg
+    return [msg, nil]
   end
 end
 
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.4
+  version: 2.4.5
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-10-27 00:00:00.000000000 Z
+date: 2026-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
 - bin/wayback_machine_downloader
 - lib/wayback_machine_downloader.rb
 - lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/page_requisites.rb
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb