wayback_machine_downloader_straw 2.3.4 → 2.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +104 -6
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz: '
|
3
|
+
metadata.gz: 04ac6f9f045b4f92a7481ad8544f2f9138454b9eabdcf6f47b28195c1dd1cdaf
|
4
|
+
data.tar.gz: '09a16685d1299afb338d86495d1c58825482a6785e7e1a596bb02eb2da1fc7f1'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd157e047c8631ff5cdfd4ca540840a7d49196131dc4de9f9725c3989164151e4c05dda0dae0dc884bfb9bbb51483f061378ef7a1e737b36d1d11882719bcf60
|
7
|
+
data.tar.gz: e9b814bbbed6caef69972b9e94891f7af9be61674cf50bdd3bb1bf4a60c3622156e93b07de8a3761dba87a852bd67aa10439481c4ca72bffe564019f04451ed5
|
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
|
|
62
62
|
options[:rewritten] = true
|
63
63
|
end
|
64
64
|
|
65
|
+
opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
|
66
|
+
options[:rewrite] = true
|
67
|
+
end
|
68
|
+
|
65
69
|
opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
|
66
70
|
options[:reset] = true
|
67
71
|
end
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
113
113
|
|
114
114
|
include ArchiveAPI
|
115
115
|
|
116
|
-
VERSION = "2.3.4"
|
116
|
+
VERSION = "2.3.6"
|
117
117
|
DEFAULT_TIMEOUT = 30
|
118
118
|
MAX_RETRIES = 3
|
119
119
|
RETRY_DELAY = 2
|
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
|
|
125
125
|
|
126
126
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
127
127
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
128
|
-
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
|
128
|
+
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
129
129
|
|
130
130
|
def initialize params
|
131
131
|
validate_params(params)
|
@@ -148,6 +148,7 @@ class WaybackMachineDownloader
|
|
148
148
|
@failed_downloads = Concurrent::Array.new
|
149
149
|
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
|
150
150
|
@db_mutex = Mutex.new
|
151
|
+
@rewrite = params[:rewrite] || false
|
151
152
|
|
152
153
|
handle_reset
|
153
154
|
end
|
@@ -533,15 +534,109 @@ class WaybackMachineDownloader
|
|
533
534
|
end
|
534
535
|
end
|
535
536
|
|
537
|
+
def rewrite_urls_to_relative(file_path)
|
538
|
+
return unless File.exist?(file_path)
|
539
|
+
|
540
|
+
file_ext = File.extname(file_path).downcase
|
541
|
+
|
542
|
+
begin
|
543
|
+
content = File.binread(file_path)
|
544
|
+
|
545
|
+
if file_ext == '.html' || file_ext == '.htm'
|
546
|
+
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
|
547
|
+
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
|
548
|
+
else
|
549
|
+
content.force_encoding('UTF-8')
|
550
|
+
end
|
551
|
+
|
552
|
+
# URLs in HTML attributes
|
553
|
+
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
554
|
+
prefix, url, suffix = $1, $2, $3
|
555
|
+
|
556
|
+
if url.start_with?('http')
|
557
|
+
begin
|
558
|
+
uri = URI.parse(url)
|
559
|
+
path = uri.path
|
560
|
+
path = path[1..-1] if path.start_with?('/')
|
561
|
+
"#{prefix}#{path}#{suffix}"
|
562
|
+
rescue
|
563
|
+
"#{prefix}#{url}#{suffix}"
|
564
|
+
end
|
565
|
+
elsif url.start_with?('/')
|
566
|
+
"#{prefix}./#{url[1..-1]}#{suffix}"
|
567
|
+
else
|
568
|
+
"#{prefix}#{url}#{suffix}"
|
569
|
+
end
|
570
|
+
end
|
571
|
+
|
572
|
+
# URLs in CSS
|
573
|
+
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
|
574
|
+
url = $1
|
575
|
+
|
576
|
+
if url.start_with?('http')
|
577
|
+
begin
|
578
|
+
uri = URI.parse(url)
|
579
|
+
path = uri.path
|
580
|
+
path = path[1..-1] if path.start_with?('/')
|
581
|
+
"url(\"#{path}\")"
|
582
|
+
rescue
|
583
|
+
"url(\"#{url}\")"
|
584
|
+
end
|
585
|
+
elsif url.start_with?('/')
|
586
|
+
"url(\"./#{url[1..-1]}\")"
|
587
|
+
else
|
588
|
+
"url(\"#{url}\")"
|
589
|
+
end
|
590
|
+
end
|
591
|
+
|
592
|
+
# URLs in JavaScript
|
593
|
+
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
594
|
+
quote_start, url, quote_end = $1, $2, $3
|
595
|
+
|
596
|
+
if url.start_with?('http')
|
597
|
+
begin
|
598
|
+
uri = URI.parse(url)
|
599
|
+
path = uri.path
|
600
|
+
path = path[1..-1] if path.start_with?('/')
|
601
|
+
"#{quote_start}#{path}#{quote_end}"
|
602
|
+
rescue
|
603
|
+
"#{quote_start}#{url}#{quote_end}"
|
604
|
+
end
|
605
|
+
elsif url.start_with?('/')
|
606
|
+
"#{quote_start}./#{url[1..-1]}#{quote_end}"
|
607
|
+
else
|
608
|
+
"#{quote_start}#{url}#{quote_end}"
|
609
|
+
end
|
610
|
+
end
|
611
|
+
|
612
|
+
# for URLs in HTML attributes that start with a single slash
|
613
|
+
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
614
|
+
prefix, path, suffix = $1, $2, $3
|
615
|
+
"#{prefix}./#{path}#{suffix}"
|
616
|
+
end
|
617
|
+
|
618
|
+
# for URLs in CSS that start with a single slash
|
619
|
+
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
|
620
|
+
path = $1
|
621
|
+
"url(\"./#{path}\")"
|
622
|
+
end
|
623
|
+
|
624
|
+
# save the modified content back to the file
|
625
|
+
File.binwrite(file_path, content)
|
626
|
+
puts "Rewrote URLs in #{file_path} to be relative."
|
627
|
+
rescue Errno::ENOENT => e
|
628
|
+
@logger.warn("Error reading file #{file_path}: #{e.message}")
|
629
|
+
end
|
630
|
+
end
|
631
|
+
|
536
632
|
def download_file (file_remote_info, http)
|
537
633
|
current_encoding = "".encoding
|
538
634
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
539
635
|
file_id = file_remote_info[:file_id]
|
540
636
|
file_timestamp = file_remote_info[:timestamp]
|
541
|
-
|
542
|
-
file_path_elements = original_file_id.split('/')
|
637
|
+
file_path_elements = file_id.split('/')
|
543
638
|
|
544
|
-
if
|
639
|
+
if file_id == ""
|
545
640
|
dir_path = backup_path
|
546
641
|
file_path = backup_path + 'index.html'
|
547
642
|
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
@@ -565,10 +660,13 @@ class WaybackMachineDownloader
|
|
565
660
|
begin
|
566
661
|
structure_dir_path dir_path
|
567
662
|
download_with_retry(file_path, file_url, file_timestamp, http)
|
663
|
+
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
664
|
+
rewrite_urls_to_relative(file_path)
|
665
|
+
end
|
568
666
|
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
569
667
|
rescue StandardError => e
|
570
668
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
571
|
-
if
|
669
|
+
if File.exist?(file_path) and File.size(file_path) == 0
|
572
670
|
File.delete(file_path)
|
573
671
|
msg += "\n#{file_path} was empty and was removed."
|
574
672
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader_straw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.4
|
4
|
+
version: 2.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- strawberrymaster
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|
@@ -86,7 +86,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
89
|
+
version: 3.4.3
|
90
90
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
91
|
requirements:
|
92
92
|
- - ">="
|