wayback_machine_downloader_straw 2.3.5 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +153 -23
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
+  data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
+  data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
data/bin/wayback_machine_downloader
CHANGED
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
     options[:rewritten] = true
   end
 
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
   opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
     options[:reset] = true
   end
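
The new --local switch stores options[:rewrite], which the library reads below. A hypothetical invocation (assuming the gem's executable is on your PATH; the target URL is an example):

  wayback_machine_downloader https://example.com --local

This downloads the site as usual, then rewrites web.archive.org URLs inside the saved HTML, CSS, and JS files so the copy can be browsed straight from the local filesystem.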
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.5"
+  VERSION = "2.3.7"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
 
   def initialize params
     validate_params(params)
@@ -148,6 +148,7 @@ class WaybackMachineDownloader
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
+    @rewrite = params[:rewrite] || false
 
     handle_reset
   end
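
For programmatic use, the same option can presumably be passed to the constructor as part of the params hash. A minimal sketch, assuming the upstream wayback_machine_downloader API (base_url and download_files are the gem's usual entry points; the URL is hypothetical):

  require 'wayback_machine_downloader'

  # Hypothetical example: :rewrite is the option wired up in this release.
  downloader = WaybackMachineDownloader.new(
    base_url: 'https://example.com',
    rewrite:  true
  )
  downloader.download_files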
@@ -476,8 +477,8 @@ class WaybackMachineDownloader
       begin
         @connection_pool.with_connection do |connection|
           result_message = download_file(file_remote_info, connection)
-          #
-          if result_message &&
+          # assume download success if the result message contains ' -> '
+          if result_message && result_message.include?(' -> ')
             download_success = true
           end
           @download_mutex.synchronize do
@@ -533,6 +534,101 @@ class WaybackMachineDownloader
     end
   end
 
+  def rewrite_urls_to_relative(file_path)
+    return unless File.exist?(file_path)
+
+    file_ext = File.extname(file_path).downcase
+
+    begin
+      content = File.binread(file_path)
+
+      if file_ext == '.html' || file_ext == '.htm'
+        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
+        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
+      else
+        content.force_encoding('UTF-8')
+      end
+
+      # URLs in HTML attributes
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        prefix, url, suffix = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{prefix}#{path}#{suffix}"
+          rescue
+            "#{prefix}#{url}#{suffix}"
+          end
+        elsif url.start_with?('/')
+          "#{prefix}./#{url[1..-1]}#{suffix}"
+        else
+          "#{prefix}#{url}#{suffix}"
+        end
+      end
+
+      # URLs in CSS
+      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+        url = $1
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "url(\"#{path}\")"
+          rescue
+            "url(\"#{url}\")"
+          end
+        elsif url.start_with?('/')
+          "url(\"./#{url[1..-1]}\")"
+        else
+          "url(\"#{url}\")"
+        end
+      end
+
+      # URLs in JavaScript
+      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        quote_start, url, quote_end = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{quote_start}#{path}#{quote_end}"
+          rescue
+            "#{quote_start}#{url}#{quote_end}"
+          end
+        elsif url.start_with?('/')
+          "#{quote_start}./#{url[1..-1]}#{quote_end}"
+        else
+          "#{quote_start}#{url}#{quote_end}"
+        end
+      end
+
+      # for URLs in HTML attributes that start with a single slash
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
+        prefix, path, suffix = $1, $2, $3
+        "#{prefix}./#{path}#{suffix}"
+      end
+
+      # for URLs in CSS that start with a single slash
+      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
+        path = $1
+        "url(\"./#{path}\")"
+      end
+
+      # save the modified content back to the file
+      File.binwrite(file_path, content)
+      puts "Rewrote URLs in #{file_path} to be relative."
+    rescue Errno::ENOENT => e
+      @logger.warn("Error reading file #{file_path}: #{e.message}")
+    end
+  end
+
   def download_file (file_remote_info, http)
     current_encoding = "".encoding
     file_url = file_remote_info[:file_url].encode(current_encoding)
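
To make the first substitution concrete, here is a standalone sketch that runs the same HTML-attribute pattern over a made-up snippet (the URL and file name are hypothetical):

  require 'uri'

  html = '<img src="https://web.archive.org/web/20230101000000id_/https://example.com/images/logo.png">'

  rewritten = html.gsub(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    prefix, url, suffix = $1, $2, $3
    if url.start_with?('http')
      # strip the archive prefix and keep only the original site's path
      path = URI.parse(url).path.sub(/\A\//, '')
      "#{prefix}#{path}#{suffix}"
    else
      "#{prefix}#{url}#{suffix}"
    end
  end

  puts rewritten  # => <img src="images/logo.png">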
@@ -563,11 +659,24 @@ class WaybackMachineDownloader
 
     begin
       structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
-
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
+
+      case status
+      when :saved
+        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
+          rewrite_urls_to_relative(file_path)
+        end
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      when :skipped_not_found
+        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      else
+        # ideally, this case should not be reached if download_with_retry behaves as expected.
+        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
+        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+      end
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
-      if
+      if File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
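
These symbols form a small contract: download_with_retry returns :saved or :skipped_not_found, download_file renders each as a message, and the caller in the earlier hunk treats any message containing ' -> ' as a success. A minimal illustration of that final check (classify is a made-up helper, not part of the gem):

  def classify(result_message)
    result_message && result_message.include?(' -> ') ? :success : :not_downloaded
  end

  classify("https://example.com/a.html -> websites/example.com/a.html (1/10)")  # => :success
  classify("Skipped (not found): https://example.com/b.png (2/10)")             # => :not_downloaded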
@@ -615,8 +724,7 @@ class WaybackMachineDownloader
 
       response = connection.request(request)
 
-
-      when Net::HTTPSuccess
+      save_response_body = lambda do
         File.open(file_path, "wb") do |file|
           body = response.body
           if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -626,26 +734,48 @@ class WaybackMachineDownloader
             gz.close
             file.write(decompressed_body)
           rescue Zlib::GzipFile::Error => e
-            @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+            @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
             file.write(body)
           end
         else
           file.write(body) if body
         end
       end
-
-
-
-
-
-
-
-
-
-
-
-
-
+      end
+
+      if @all
+        case response
+        when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
+          save_response_body.call
+          if response.is_a?(Net::HTTPRedirection)
+            @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
+          elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
+            @logger.info("Saved error page for #{file_url} (status #{response.code}).")
+          end
+          return :saved
+        else
+          # for any other response type when --all is true, treat as an error to be retried or failed
+          raise "Unhandled HTTP response: #{response.code} #{response.message}"
+        end
+      else # not @all (our default behavior)
+        case response
+        when Net::HTTPSuccess
+          save_response_body.call
+          return :saved
+        when Net::HTTPRedirection
+          raise "Too many redirects for #{file_url}" if redirect_count >= 2
+          location = response['location']
+          @logger.warn("Redirect found for #{file_url} -> #{location}")
+          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+        when Net::HTTPTooManyRequests
+          sleep(RATE_LIMIT * 2)
+          raise "Rate limited, retrying..."
+        when Net::HTTPNotFound
+          @logger.warn("File not found, skipping: #{file_url}")
+          return :skipped_not_found
+        else
+          raise "HTTP Error: #{response.code} #{response.message}"
+        end
       end
 
     rescue StandardError => e
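
The split above means --all saves whatever the archive returns (including redirect and error pages), while the default path follows at most two redirects and skips 404s. A compact, self-contained sketch of the bounded-redirect pattern (illustrative names, not the gem's API):

  require 'net/http'
  require 'uri'

  # Follow at most 2 redirects, mirroring the redirect_count >= 2 guard above.
  def fetch_with_bounded_redirects(url, redirect_count = 0)
    response = Net::HTTP.get_response(URI.parse(url))
    case response
    when Net::HTTPRedirection
      raise "Too many redirects for #{url}" if redirect_count >= 2
      fetch_with_bounded_redirects(response['location'], redirect_count + 1)
    else
      response
    end
  end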
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.5
+  version: 2.3.7
 platform: ruby
 authors:
 - strawberrymaster
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -86,7 +86,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version:
+      version: 3.4.3
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="