wayback_machine_downloader_straw 2.3.4 → 2.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 220999514eb0c1dd5bce948a2ac028e4527eb07f089b9f6b437f02a6a00860be
4
- data.tar.gz: '03780351285ee37d38ba04652725ffa33f6112837e01e68469c8be3cda13eb45'
3
+ metadata.gz: 04ac6f9f045b4f92a7481ad8544f2f9138454b9eabdcf6f47b28195c1dd1cdaf
4
+ data.tar.gz: '09a16685d1299afb338d86495d1c58825482a6785e7e1a596bb02eb2da1fc7f1'
5
5
  SHA512:
6
- metadata.gz: 3b05448a6271b8e45d5655b5ee415851f6e8e2daaec5f9bb12b0681e58292c06fe4ab91ab4f2ca1530edb0632755808dc8a465165c5e73fea2673481dddad610
7
- data.tar.gz: 95440ee51316da6f2e48c3ec1d54f9fc391b2d59447625f07052222ecfeacf6fc26d430ac64ede6da589c0115fddc7a71fc7eb2fa45ff403491f1b3dc51b66ec
6
+ metadata.gz: fd157e047c8631ff5cdfd4ca540840a7d49196131dc4de9f9725c3989164151e4c05dda0dae0dc884bfb9bbb51483f061378ef7a1e737b36d1d11882719bcf60
7
+ data.tar.gz: e9b814bbbed6caef69972b9e94891f7af9be61674cf50bdd3bb1bf4a60c3622156e93b07de8a3761dba87a852bd67aa10439481c4ca72bffe564019f04451ed5
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
62
62
  options[:rewritten] = true
63
63
  end
64
64
 
65
+ opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
66
+ options[:rewrite] = true
67
+ end
68
+
65
69
  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
66
70
  options[:reset] = true
67
71
  end
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
113
113
 
114
114
  include ArchiveAPI
115
115
 
116
- VERSION = "2.3.4"
116
+ VERSION = "2.3.6"
117
117
  DEFAULT_TIMEOUT = 30
118
118
  MAX_RETRIES = 3
119
119
  RETRY_DELAY = 2
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
125
125
 
126
126
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
127
127
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
128
- :all, :maximum_pages, :threads_count, :logger, :reset, :keep
128
+ :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
129
129
 
130
130
  def initialize params
131
131
  validate_params(params)
@@ -148,6 +148,7 @@ class WaybackMachineDownloader
148
148
  @failed_downloads = Concurrent::Array.new
149
149
  @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
150
150
  @db_mutex = Mutex.new
151
+ @rewrite = params[:rewrite] || false
151
152
 
152
153
  handle_reset
153
154
  end
@@ -533,15 +534,109 @@ class WaybackMachineDownloader
533
534
  end
534
535
  end
535
536
 
537
# Maps a URL that was found behind a Wayback Machine prefix to the reference
# that is written into the local copy.
#
# * URLs starting with "http" are reduced to their path component without the
#   leading slash. If the URL cannot be parsed it is returned unchanged.
# * Root-relative URLs ("/x/y") become "./x/y".
# * Anything else is returned unchanged.
#
# NOTE(review): for absolute URLs only the path is kept, so the host, query
# string and fragment are discarded, and the result is relative to the
# directory of the document being rewritten -- confirm this is the intended
# layout for pages stored in sub-directories.
#
# @param url [String] the URL captured after the Wayback Machine prefix
# @return [String] the reference to write into the file
private def wayback_local_reference(url)
  if url.start_with?('http')
    begin
      path = URI.parse(url).path
      path = path[1..-1] if path.start_with?('/')
      path
    rescue StandardError
      # Unparseable URL (non-ASCII bytes, stray spaces, no path, ...): keep
      # the original target rather than guessing.
      url
    end
  elsif url.start_with?('/')
    "./#{url[1..-1]}"
  else
    url
  end
end

# Rewrites Wayback Machine URLs and root-relative URLs inside a downloaded
# file so that they become relative references, then saves the file in place.
#
# Passes, in order:
# 1. Wayback URLs in href/src/action/data-src/data-url attributes
# 2. Wayback URLs inside CSS url(...)
# 3. Any remaining quoted Wayback URL (e.g. in JavaScript)
# 4. Root-relative ("/x") values of the attributes listed in pass 1
# 5. Root-relative CSS url(/x)
#
# The content is matched as raw bytes. Every pattern is pure ASCII, so this
# yields the same result for any ASCII-compatible encoding, and it cannot
# raise "invalid byte sequence" on files that are not valid in the encoding
# they declare (or do not declare one at all).
#
# CSS url(...) replacements keep the quoting style of the original (single,
# double or none) so that url() values inside a quoted style="..." attribute
# do not terminate the attribute early.
#
# @param file_path [String] path of the file to rewrite; nothing happens when
#   the file does not exist
def rewrite_urls_to_relative(file_path)
  return unless File.exist?(file_path)

  content = File.binread(file_path)

  archive = 'https?://web\.archive\.org/web/[0-9]+(?:id_)?/'
  url_attr = '\s(?:href|src|action|data-src|data-url)='

  # 1. URLs in HTML attributes
  content.gsub!(%r{(#{url_attr}["'])#{archive}([^"']+)(["'])}i) do
    m = Regexp.last_match
    "#{m[1]}#{wayback_local_reference(m[2])}#{m[3]}"
  end

  # 2. URLs in CSS; \1 re-matches whatever quote (if any) opened the value
  content.gsub!(%r{url\(\s*(["']?)#{archive}([^"'\)]+)\1\s*\)}i) do
    m = Regexp.last_match
    "url(#{m[1]}#{wayback_local_reference(m[2])}#{m[1]})"
  end

  # 3. URLs in JavaScript (any remaining quoted archive URL)
  content.gsub!(%r{(["'])#{archive}([^"']+)(["'])}i) do
    m = Regexp.last_match
    "#{m[1]}#{wayback_local_reference(m[2])}#{m[3]}"
  end

  # 4. HTML attributes that start with a single slash ("//host/..." is left alone)
  content.gsub!(%r{(#{url_attr}["'])/([^"'/][^"']*)(["'])}i) do
    m = Regexp.last_match
    "#{m[1]}./#{m[2]}#{m[3]}"
  end

  # 5. CSS url() values that start with a single slash
  content.gsub!(%r{url\(\s*(["']?)/([^"'\)/][^"'\)]*?)\1\s*\)}i) do
    m = Regexp.last_match
    "url(#{m[1]}./#{m[2]}#{m[1]})"
  end

  # save the modified content back to the file
  File.binwrite(file_path, content)
  puts "Rewrote URLs in #{file_path} to be relative."
rescue Errno::ENOENT => e
  # The file can disappear between the exist? check and the read/write.
  @logger.warn("Error reading file #{file_path}: #{e.message}")
end
631
+
536
632
  def download_file (file_remote_info, http)
537
633
  current_encoding = "".encoding
538
634
  file_url = file_remote_info[:file_url].encode(current_encoding)
539
635
  file_id = file_remote_info[:file_id]
540
636
  file_timestamp = file_remote_info[:timestamp]
541
- original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
542
- file_path_elements = original_file_id.split('/')
637
+ file_path_elements = file_id.split('/')
543
638
 
544
- if original_file_id == ""
639
+ if file_id == ""
545
640
  dir_path = backup_path
546
641
  file_path = backup_path + 'index.html'
547
642
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -565,10 +660,13 @@ class WaybackMachineDownloader
565
660
  begin
566
661
  structure_dir_path dir_path
567
662
  download_with_retry(file_path, file_url, file_timestamp, http)
663
+ if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
664
+ rewrite_urls_to_relative(file_path)
665
+ end
568
666
  "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
569
667
  rescue StandardError => e
570
668
  msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
571
- if not @all and File.exist?(file_path) and File.size(file_path) == 0
669
+ if File.exist?(file_path) and File.size(file_path) == 0
572
670
  File.delete(file_path)
573
671
  msg += "\n#{file_path} was empty and was removed."
574
672
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.4
4
+ version: 2.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-04-19 00:00:00.000000000 Z
11
+ date: 2025-05-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: concurrent-ruby
@@ -86,7 +86,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 1.9.2
89
+ version: 3.4.3
90
90
  required_rubygems_version: !ruby/object:Gem::Requirement
91
91
  requirements:
92
92
  - - ">="