wayback_machine_downloader_straw 2.3.5 → 2.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d04d7ddf4b722425377ce84ad5c0f917e35553e38edb755c029ae7a2b8f8055d
-  data.tar.gz: 3d93f41ef2ba3b366a3adf071b947eabb66caf931d022b7ad8a521d3930dfe27
+  metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
+  data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
 SHA512:
-  metadata.gz: 312dcd879a3589aa0a75d47296dbe04920eee06ae2fb83c18274d918d12517c8a2064c1e9e7a3c774e656636187cd5fc510f249ae71ea6a319cb437ee8d0314b
-  data.tar.gz: 8cd5dcd421077405f920a8ff966387817450e6169aa9578a9b5f25284d4100f1f5bf77304cda727f975464782064643c96d3ee0985a2b57e16aec523c9e17429
+  metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
+  data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
     options[:rewritten] = true
   end
 
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
   opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
     options[:reset] = true
   end
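Note that the new flag is spelled --local on the command line but is stored as options[:rewrite], which is the key the downloader class reads. A minimal invocation, assuming the gem's executable keeps the upstream name wayback_machine_downloader and using an invented target site:

    wayback_machine_downloader https://example.com --local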
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.5"
+  VERSION = "2.3.7"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
 
   def initialize params
     validate_params(params)
@@ -148,6 +148,7 @@ class WaybackMachineDownloader
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
+    @rewrite = params[:rewrite] || false
 
     handle_reset
   end
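Because initialize reads params[:rewrite] directly, the rewriting behavior is also reachable programmatically, not only via the CLI flag. A minimal sketch (the base_url value is illustrative):

    # enable local-browsing URL rewriting when driving the class from Ruby
    wmd = WaybackMachineDownloader.new(
      base_url: 'https://example.com',
      rewrite: true
    )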
@@ -476,8 +477,8 @@ class WaybackMachineDownloader
       begin
         @connection_pool.with_connection do |connection|
           result_message = download_file(file_remote_info, connection)
-          # for now, assume success if no exception and message doesn't indicate error/skip
-          if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+          # assume download success if the result message contains ' -> '
+          if result_message && result_message.include?(' -> ')
             download_success = true
           end
           @download_mutex.synchronize do
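The ' -> ' marker is reliable here because only the success branch of download_file builds a message of the form "#{file_url} -> #{file_path} (...)"; the skip and failure messages never contain it. For illustration (the paths and counts are hypothetical):

    "https://example.com/index.html -> websites/example.com/index.html (1/42)".include?(' -> ')  # => true
    "Skipped (not found): https://example.com/missing.png (2/42)".include?(' -> ')               # => false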
@@ -533,6 +534,101 @@ class WaybackMachineDownloader
     end
   end
 
+  def rewrite_urls_to_relative(file_path)
+    return unless File.exist?(file_path)
+
+    file_ext = File.extname(file_path).downcase
+
+    begin
+      content = File.binread(file_path)
+
+      if file_ext == '.html' || file_ext == '.htm'
+        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
+        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
+      else
+        content.force_encoding('UTF-8')
+      end
+
+      # URLs in HTML attributes
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        prefix, url, suffix = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{prefix}#{path}#{suffix}"
+          rescue
+            "#{prefix}#{url}#{suffix}"
+          end
+        elsif url.start_with?('/')
+          "#{prefix}./#{url[1..-1]}#{suffix}"
+        else
+          "#{prefix}#{url}#{suffix}"
+        end
+      end
+
+      # URLs in CSS
+      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+        url = $1
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "url(\"#{path}\")"
+          rescue
+            "url(\"#{url}\")"
+          end
+        elsif url.start_with?('/')
+          "url(\"./#{url[1..-1]}\")"
+        else
+          "url(\"#{url}\")"
+        end
+      end
+
+      # URLs in JavaScript
+      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        quote_start, url, quote_end = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{quote_start}#{path}#{quote_end}"
+          rescue
+            "#{quote_start}#{url}#{quote_end}"
+          end
+        elsif url.start_with?('/')
+          "#{quote_start}./#{url[1..-1]}#{quote_end}"
+        else
+          "#{quote_start}#{url}#{quote_end}"
+        end
+      end
+
+      # for URLs in HTML attributes that start with a single slash
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
+        prefix, path, suffix = $1, $2, $3
+        "#{prefix}./#{path}#{suffix}"
+      end
+
+      # for URLs in CSS that start with a single slash
+      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
+        path = $1
+        "url(\"./#{path}\")"
+      end
+
+      # save the modified content back to the file
+      File.binwrite(file_path, content)
+      puts "Rewrote URLs in #{file_path} to be relative."
+    rescue Errno::ENOENT => e
+      @logger.warn("Error reading file #{file_path}: #{e.message}")
+    end
+  end
+
   def download_file (file_remote_info, http)
     current_encoding = "".encoding
     file_url = file_remote_info[:file_url].encode(current_encoding)
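To make the effect of rewrite_urls_to_relative concrete, here is a hedged before/after pair for the HTML-attribute pass (the example.com URL and the 20240101000000 timestamp are invented for illustration):

    <!-- before: absolute Wayback Machine URL -->
    <link rel="stylesheet" href="https://web.archive.org/web/20240101000000/https://example.com/css/site.css">
    <!-- after: archive prefix, scheme, and host stripped; leading slash removed -->
    <link rel="stylesheet" href="css/site.css">

The CSS and JavaScript passes apply the same path extraction to url(...) values and quoted string literals, and the final two passes convert remaining root-relative paths ("/foo") into "./foo" so the saved files can be browsed straight from the local directory tree.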
@@ -563,11 +659,24 @@ class WaybackMachineDownloader
 
     begin
       structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
+
+      case status
+      when :saved
+        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
+          rewrite_urls_to_relative(file_path)
+        end
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      when :skipped_not_found
+        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      else
+        # ideally, this case should not be reached if download_with_retry behaves as expected.
+        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
+        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+      end
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
-      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+      if File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
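download_with_retry now returns a symbol (:saved or :skipped_not_found) instead of an unstructured value, and download_file dispatches on it; URL rewriting only runs for :saved files whose extension looks like HTML, CSS, or JavaScript. Illustrative checks of that extension test (the file names are hypothetical):

    File.extname("index.html") =~ /\.(html?|css|js)$/i  # => 0 (match)
    File.extname("logo.png")   =~ /\.(html?|css|js)$/i  # => nil (no match)

Note that the empty-file cleanup in the rescue branch no longer checks @all, so zero-byte leftovers are removed after a failure in every mode.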
@@ -615,8 +724,7 @@ class WaybackMachineDownloader
 
     response = connection.request(request)
 
-    case response
-    when Net::HTTPSuccess
+    save_response_body = lambda do
       File.open(file_path, "wb") do |file|
         body = response.body
         if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -626,26 +734,48 @@ class WaybackMachineDownloader
             gz.close
             file.write(decompressed_body)
           rescue Zlib::GzipFile::Error => e
-            @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+            @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
             file.write(body)
           end
         else
           file.write(body) if body
         end
       end
-    when Net::HTTPRedirection
-      raise "Too many redirects for #{file_url}" if redirect_count >= 2
-      location = response['location']
-      @logger.warn("Redirect found for #{file_url} -> #{location}")
-      return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
-    when Net::HTTPTooManyRequests
-      sleep(RATE_LIMIT * 2)
-      raise "Rate limited, retrying..."
-    when Net::HTTPNotFound
-      @logger.warn("File not found, skipping: #{file_url}")
-      return
-    else
-      raise "HTTP Error: #{response.code} #{response.message}"
+    end
+
+    if @all
+      case response
+      when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
+        save_response_body.call
+        if response.is_a?(Net::HTTPRedirection)
+          @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
+        elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
+          @logger.info("Saved error page for #{file_url} (status #{response.code}).")
+        end
+        return :saved
+      else
+        # for any other response type when --all is true, treat as an error to be retried or failed
+        raise "Unhandled HTTP response: #{response.code} #{response.message}"
+      end
+    else # not @all (our default behavior)
+      case response
+      when Net::HTTPSuccess
+        save_response_body.call
+        return :saved
+      when Net::HTTPRedirection
+        raise "Too many redirects for #{file_url}" if redirect_count >= 2
+        location = response['location']
+        @logger.warn("Redirect found for #{file_url} -> #{location}")
+        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+      when Net::HTTPTooManyRequests
+        sleep(RATE_LIMIT * 2)
+        raise "Rate limited, retrying..."
+      when Net::HTTPNotFound
+        @logger.warn("File not found, skipping: #{file_url}")
+        return :skipped_not_found
+      else
+        raise "HTTP Error: #{response.code} #{response.message}"
+      end
     end
 
   rescue StandardError => e
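The body-writing logic is now shared through the save_response_body lambda, and the two modes follow distinct contracts, summarized below in a hedged sketch (the comments paraphrase the diff; they are not part of the gem):

    # @all (--all) mode: archive everything the Wayback Machine returns
    #   2xx, 3xx, 4xx, 5xx  -> body written to disk, returns :saved
    #   anything else       -> raised, handled by the surrounding retry logic
    # default mode: only keep real content
    #   2xx -> written, :saved      3xx -> followed (max 2 redirects)
    #   429 -> sleep and retry      404 -> :skipped_not_found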
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.5
+  version: 2.3.7
 platform: ruby
 authors:
 - strawberrymaster
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-04-30 00:00:00.000000000 Z
+date: 2025-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -86,7 +86,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 1.9.2
+      version: 3.4.3
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="