wayback_machine_downloader_straw 2.3.12 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
|
|
4
|
+
data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
|
|
7
|
+
data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
|
|
@@ -25,7 +25,7 @@ module ArchiveAPI
|
|
|
25
25
|
# Check if the response contains the header ["timestamp", "original"]
|
|
26
26
|
json.shift if json.first == ["timestamp", "original"]
|
|
27
27
|
json
|
|
28
|
-
rescue JSON::ParserError
|
|
28
|
+
rescue JSON::ParserError => e
|
|
29
29
|
warn "Failed to fetch data from API: #{e.message}"
|
|
30
30
|
[]
|
|
31
31
|
end
|
|
@@ -1,74 +1,74 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# essentially, this is for converting a string with a potentially
|
|
4
|
+
# broken or unknown encoding into a valid UTF-8 string
|
|
5
|
+
# @todo: consider using charlock_holmes for this in the future
|
|
3
6
|
module TidyBytes
  # U+FFFD, substituted for bytes that cannot be decoded or converted.
  UNICODE_REPLACEMENT_CHARACTER = "\uFFFD"

  # Common encodings to try, in priority order, for best multilingual
  # compatibility. Entries the running Ruby does not list are dropped.
  COMMON_ENCODINGS = [
    Encoding::UTF_8,
    Encoding::Windows_1251, # Cyrillic/Russian legacy
    Encoding::GB18030,      # Simplified Chinese
    Encoding::Shift_JIS,    # Japanese
    Encoding::EUC_KR,       # Korean
    Encoding::ISO_8859_1,   # Western European
    Encoding::Windows_1252  # Western European/Latin1 superset
  ].select { |enc| Encoding.name_list.include?(enc.name) }.freeze

  # Returns true if the string appears to be binary (contains a NUL byte).
  def binary_data?
    include?("\x00".b)
  end

  # Attempts to return a valid UTF-8 version of the string.
  #
  # * A string that is already valid UTF-8 is returned as-is (same object).
  # * A string containing NUL bytes is treated as binary and returned as a
  #   BINARY-tagged copy, not as UTF-8.
  # * Otherwise the bytes are reinterpreted under each COMMON_ENCODINGS entry
  #   in order; the first encoding under which they are well-formed AND
  #   convert to UTF-8 without loss wins.
  # * If nothing converts cleanly, the conversion is retried accepting
  #   replacement characters.
  #
  # @return [String]
  def tidy_bytes
    return self if encoding == Encoding::UTF_8 && valid_encoding?
    return dup.force_encoding("BINARY") if binary_data?

    # Strict pass. Validity is checked on the source bytes instead of looking
    # for U+FFFD in the output, so text that legitimately contains U+FFFD is
    # not mistaken for a failed decode and mangled by a later encoding.
    COMMON_ENCODINGS.each do |enc|
      candidate = dup.force_encoding(enc)
      next unless candidate.valid_encoding?

      begin
        return candidate.encode(Encoding::UTF_8)
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        # bytes are well-formed but not mappable from this encoding; try next
      end
    end

    # If no clean conversion was found, try again but accept replacement
    # characters.
    lenient = { invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER }
    str = dup
    COMMON_ENCODINGS.each do |enc|
      str.force_encoding(enc)
      begin
        utf8 = str.encode(Encoding::UTF_8, **lenient)
        return utf8 if utf8.valid_encoding?
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        # try next encoding
      end
    end

    # Fallback: replace all invalid/undefined bytes.
    str.encode(Encoding::UTF_8, **lenient)
  end

  # In-place variant of #tidy_bytes: replaces the receiver's contents (and
  # encoding) with the tidied string.
  def tidy_bytes!
    replace(tidy_bytes)
  end

  # Hook run when the module is included: also mixes InstanceMethods into the
  # including class.
  def self.included(base)
    base.send(:include, InstanceMethods)
  end

  # Thin delegators that dispatch to the implementations defined on TidyBytes.
  module InstanceMethods
    def tidy_bytes
      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end

    def tidy_bytes!
      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# URLs in HTML attributes
|
|
4
|
+
# Rewrites Wayback Machine replay URLs found in HTML attributes
# (href, src, action, data-src, data-url) to local relative links.
#
# The replay prefix is matched with any two-letter modifier after the
# timestamp (id_, im_, js_, cs_, if_, ...), not only id_.
#
# @param content [String] document text; modified in place
# @return [String] the same +content+ object
def rewrite_html_attr_urls(content)
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"']+)(["'])/i) do
    prefix, url, suffix = $1, $2, $3
    "#{prefix}#{wayback_target_to_local(url)}#{suffix}"
  end
  content
end

# URLs in CSS
#
# Rewrites url(...) references that point at a Wayback Machine replay URL.
# The result is always emitted in the double-quoted form url("...").
#
# @param content [String] stylesheet or document text; modified in place
# @return [String] the same +content+ object
def rewrite_css_urls(content)
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"'\)]+)["']?\s*\)/i) do
    url = $1
    "url(\"#{wayback_target_to_local(url)}\")"
  end
  content
end

# URLs in JavaScript
#
# Rewrites quoted string literals that hold a Wayback Machine replay URL,
# keeping the original opening and closing quote characters.
#
# @param content [String] script or document text; modified in place
# @return [String] the same +content+ object
def rewrite_js_urls(content)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:[a-z]{2}_)?\/([^"']+)(["'])/i) do
    quote_start, url, quote_end = $1, $2, $3
    "#{quote_start}#{wayback_target_to_local(url)}#{quote_end}"
  end

  content
end

# Shared by the three rewriters above: maps the original-site URL captured
# after the replay prefix to a link relative to the local mirror.
#
# * absolute http(s) URL -> its path component without the leading slash
#   (host, query and fragment are dropped, as before)
# * root-relative path   -> "./" + path
# * anything else        -> returned unchanged
#
# A URL that cannot be parsed, or that has no path component, is returned
# unchanged.
def wayback_target_to_local(url)
  if url.start_with?('http')
    begin
      path = URI.parse(url).path
      # opaque URIs such as "http:foo" carry no path; keep the text as-is
      return url if path.nil?

      path.start_with?('/') ? path[1..-1] : path
    rescue URI::Error
      url
    end
  elsif url.start_with?('/')
    "./#{url[1..-1]}"
  else
    url
  end
end
|
|
@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
|
|
|
15
15
|
require_relative 'wayback_machine_downloader/to_regex'
|
|
16
16
|
require_relative 'wayback_machine_downloader/archive_api'
|
|
17
17
|
require_relative 'wayback_machine_downloader/subdom_processor'
|
|
18
|
+
require_relative 'wayback_machine_downloader/url_rewrite'
|
|
18
19
|
|
|
19
20
|
class ConnectionPool
|
|
20
21
|
MAX_AGE = 300
|
|
@@ -115,7 +116,7 @@ class WaybackMachineDownloader
|
|
|
115
116
|
include ArchiveAPI
|
|
116
117
|
include SubdomainProcessor
|
|
117
118
|
|
|
118
|
-
VERSION = "2.
|
|
119
|
+
VERSION = "2.4.1"
|
|
119
120
|
DEFAULT_TIMEOUT = 30
|
|
120
121
|
MAX_RETRIES = 3
|
|
121
122
|
RETRY_DELAY = 2
|
|
@@ -133,10 +134,11 @@ class WaybackMachineDownloader
|
|
|
133
134
|
|
|
134
135
|
def initialize params
|
|
135
136
|
validate_params(params)
|
|
136
|
-
@base_url = params[:base_url]
|
|
137
|
+
@base_url = params[:base_url]&.tidy_bytes
|
|
137
138
|
@exact_url = params[:exact_url]
|
|
138
139
|
if params[:directory]
|
|
139
|
-
|
|
140
|
+
sanitized_dir = params[:directory].tidy_bytes
|
|
141
|
+
@directory = File.expand_path(sanitized_dir)
|
|
140
142
|
else
|
|
141
143
|
@directory = nil
|
|
142
144
|
end
|
|
@@ -338,15 +340,15 @@ class WaybackMachineDownloader
|
|
|
338
340
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
339
341
|
next unless file_url.include?('/')
|
|
340
342
|
next if file_timestamp.to_i > target_timestamp
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
file_id =
|
|
343
|
+
|
|
344
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
345
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
344
346
|
next if file_id.nil?
|
|
345
347
|
next if match_exclude_filter(file_url)
|
|
346
348
|
next unless match_only_filter(file_url)
|
|
347
|
-
|
|
349
|
+
|
|
348
350
|
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
|
349
|
-
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
|
|
351
|
+
file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
|
|
350
352
|
end
|
|
351
353
|
end
|
|
352
354
|
file_versions.values
|
|
@@ -366,22 +368,27 @@ class WaybackMachineDownloader
|
|
|
366
368
|
file_list_curated = Hash.new
|
|
367
369
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
368
370
|
next unless file_url.include?('/')
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
file_id =
|
|
371
|
+
|
|
372
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
373
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
372
374
|
if file_id.nil?
|
|
373
375
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
376
|
+
next
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
if file_id.include?('<') || file_id.include?('>')
|
|
380
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
374
381
|
else
|
|
375
382
|
if match_exclude_filter(file_url)
|
|
376
383
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
377
|
-
elsif
|
|
384
|
+
elsif !match_only_filter(file_url)
|
|
378
385
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
379
386
|
elsif file_list_curated[file_id]
|
|
380
387
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
|
381
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
388
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
382
389
|
end
|
|
383
390
|
else
|
|
384
|
-
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
|
391
|
+
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
|
385
392
|
end
|
|
386
393
|
end
|
|
387
394
|
end
|
|
@@ -392,21 +399,32 @@ class WaybackMachineDownloader
|
|
|
392
399
|
file_list_curated = Hash.new
|
|
393
400
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
|
394
401
|
next unless file_url.include?('/')
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
|
402
|
+
|
|
403
|
+
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
|
404
|
+
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
|
399
405
|
if file_id.nil?
|
|
400
406
|
puts "Malformed file url, ignoring: #{file_url}"
|
|
407
|
+
next
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
|
411
|
+
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
|
412
|
+
if file_id_and_timestamp.nil?
|
|
413
|
+
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
|
414
|
+
next
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
|
418
|
+
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
|
401
419
|
else
|
|
402
420
|
if match_exclude_filter(file_url)
|
|
403
421
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
|
404
|
-
elsif
|
|
422
|
+
elsif !match_only_filter(file_url)
|
|
405
423
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
|
406
424
|
elsif file_list_curated[file_id_and_timestamp]
|
|
407
|
-
|
|
425
|
+
# duplicate combo, ignore silently (verbose flag not shown here)
|
|
408
426
|
else
|
|
409
|
-
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
|
427
|
+
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
|
410
428
|
end
|
|
411
429
|
end
|
|
412
430
|
end
|
|
@@ -473,6 +491,39 @@ class WaybackMachineDownloader
|
|
|
473
491
|
end
|
|
474
492
|
end
|
|
475
493
|
|
|
494
|
+
# Schedules one download task per entry of +files_to_process+ on +pool+.
#
# Each task borrows a connection from @connection_pool, downloads the file,
# bumps the processed counter and prints the progress line under
# @download_mutex, and records the file id via append_to_db only when the
# download reported success. Every task sleeps RATE_LIMIT before finishing.
#
# @param pool [#post] executor the tasks are posted to
# @param files_to_process [Enumerable<Hash>] entries read via :file_id / :file_url
def processing_files(pool, files_to_process)
  files_to_process.each do |file_remote_info|
    pool.post do
      download_success = false
      # set once this file has been added to @processed_file_count, so a
      # failure after that point (e.g. in append_to_db) is not counted twice
      counted = false
      begin
        @connection_pool.with_connection do |connection|
          result_message = download_file(file_remote_info, connection)
          # assume download success if the result message contains ' -> '
          if result_message && result_message.include?(' -> ')
            download_success = true
          end
          @download_mutex.synchronize do
            @processed_file_count += 1
            counted = true
            # adjust progress message to reflect remaining files
            progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
            puts progress_message if progress_message
          end
        end
        # append to DB only after successful download, outside the connection block
        if download_success
          append_to_db(file_remote_info[:file_id])
        end
      rescue => e
        @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
        unless counted
          @download_mutex.synchronize do
            @processed_file_count += 1
          end
        end
      end
      sleep(RATE_LIMIT)
    end
  end
end
|
|
526
|
+
|
|
476
527
|
def download_files
|
|
477
528
|
start_time = Time.now
|
|
478
529
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
|
@@ -519,36 +570,7 @@ class WaybackMachineDownloader
|
|
|
519
570
|
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
|
520
571
|
pool = Concurrent::FixedThreadPool.new(thread_count)
|
|
521
572
|
|
|
522
|
-
files_to_process
|
|
523
|
-
pool.post do
|
|
524
|
-
download_success = false
|
|
525
|
-
begin
|
|
526
|
-
@connection_pool.with_connection do |connection|
|
|
527
|
-
result_message = download_file(file_remote_info, connection)
|
|
528
|
-
# assume download success if the result message contains ' -> '
|
|
529
|
-
if result_message && result_message.include?(' -> ')
|
|
530
|
-
download_success = true
|
|
531
|
-
end
|
|
532
|
-
@download_mutex.synchronize do
|
|
533
|
-
@processed_file_count += 1
|
|
534
|
-
# adjust progress message to reflect remaining files
|
|
535
|
-
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
|
|
536
|
-
puts progress_message if progress_message
|
|
537
|
-
end
|
|
538
|
-
end
|
|
539
|
-
# sppend to DB only after successful download outside the connection block
|
|
540
|
-
if download_success
|
|
541
|
-
append_to_db(file_remote_info[:file_id])
|
|
542
|
-
end
|
|
543
|
-
rescue => e
|
|
544
|
-
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
|
|
545
|
-
@download_mutex.synchronize do
|
|
546
|
-
@processed_file_count += 1
|
|
547
|
-
end
|
|
548
|
-
end
|
|
549
|
-
sleep(RATE_LIMIT)
|
|
550
|
-
end
|
|
551
|
-
end
|
|
573
|
+
processing_files(pool, files_to_process)
|
|
552
574
|
|
|
553
575
|
pool.shutdown
|
|
554
576
|
pool.wait_for_termination
|
|
@@ -608,64 +630,13 @@ class WaybackMachineDownloader
|
|
|
608
630
|
end
|
|
609
631
|
|
|
610
632
|
# URLs in HTML attributes
|
|
611
|
-
content
|
|
612
|
-
prefix, url, suffix = $1, $2, $3
|
|
613
|
-
|
|
614
|
-
if url.start_with?('http')
|
|
615
|
-
begin
|
|
616
|
-
uri = URI.parse(url)
|
|
617
|
-
path = uri.path
|
|
618
|
-
path = path[1..-1] if path.start_with?('/')
|
|
619
|
-
"#{prefix}#{path}#{suffix}"
|
|
620
|
-
rescue
|
|
621
|
-
"#{prefix}#{url}#{suffix}"
|
|
622
|
-
end
|
|
623
|
-
elsif url.start_with?('/')
|
|
624
|
-
"#{prefix}./#{url[1..-1]}#{suffix}"
|
|
625
|
-
else
|
|
626
|
-
"#{prefix}#{url}#{suffix}"
|
|
627
|
-
end
|
|
628
|
-
end
|
|
633
|
+
rewrite_html_attr_urls(content)
|
|
629
634
|
|
|
630
635
|
# URLs in CSS
|
|
631
|
-
content
|
|
632
|
-
url = $1
|
|
633
|
-
|
|
634
|
-
if url.start_with?('http')
|
|
635
|
-
begin
|
|
636
|
-
uri = URI.parse(url)
|
|
637
|
-
path = uri.path
|
|
638
|
-
path = path[1..-1] if path.start_with?('/')
|
|
639
|
-
"url(\"#{path}\")"
|
|
640
|
-
rescue
|
|
641
|
-
"url(\"#{url}\")"
|
|
642
|
-
end
|
|
643
|
-
elsif url.start_with?('/')
|
|
644
|
-
"url(\"./#{url[1..-1]}\")"
|
|
645
|
-
else
|
|
646
|
-
"url(\"#{url}\")"
|
|
647
|
-
end
|
|
648
|
-
end
|
|
636
|
+
rewrite_css_urls(content)
|
|
649
637
|
|
|
650
638
|
# URLs in JavaScript
|
|
651
|
-
content
|
|
652
|
-
quote_start, url, quote_end = $1, $2, $3
|
|
653
|
-
|
|
654
|
-
if url.start_with?('http')
|
|
655
|
-
begin
|
|
656
|
-
uri = URI.parse(url)
|
|
657
|
-
path = uri.path
|
|
658
|
-
path = path[1..-1] if path.start_with?('/')
|
|
659
|
-
"#{quote_start}#{path}#{quote_end}"
|
|
660
|
-
rescue
|
|
661
|
-
"#{quote_start}#{url}#{quote_end}"
|
|
662
|
-
end
|
|
663
|
-
elsif url.start_with?('/')
|
|
664
|
-
"#{quote_start}./#{url[1..-1]}#{quote_end}"
|
|
665
|
-
else
|
|
666
|
-
"#{quote_start}#{url}#{quote_end}"
|
|
667
|
-
end
|
|
668
|
-
end
|
|
639
|
+
rewrite_js_urls(content)
|
|
669
640
|
|
|
670
641
|
# for URLs in HTML attributes that start with a single slash
|
|
671
642
|
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
|
@@ -794,6 +765,20 @@ class WaybackMachineDownloader
|
|
|
794
765
|
end
|
|
795
766
|
logger
|
|
796
767
|
end
|
|
768
|
+
|
|
769
|
+
# safely sanitize a file id (or id+timestamp)
|
|
770
|
+
# Safely sanitize a file id (or id+timestamp).
#
# Steps: percent-decode, repair the encoding with String#tidy_bytes, then
# strip anything that looks like an HTML tag. The encoding is repaired
# BEFORE the tag regex runs because a regex substitution raises on a string
# containing invalid bytes, which would discard legacy-encoded ids
# (e.g. "%E9") instead of fixing them. The caller's string is never mutated.
#
# NOTE(review): CGI.unescape also turns '+' into a space; confirm that is
# intended for URL paths.
#
# @param raw [String, nil] candidate id taken from the snapshot URL
# @param file_url [String] full snapshot URL, used only in the warning
# @return [String, nil] the sanitized id; nil when +raw+ is nil or
#   sanitizing fails
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?

  decoded =
    begin
      CGI.unescape(raw)
    rescue StandardError
      # best effort: fall back to the undecoded text
      raw
    end
  decoded = decoded.tidy_bytes unless decoded.empty?
  decoded.gsub(/<[^>]*>/, '')
rescue => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  nil
end
|
|
797
782
|
|
|
798
783
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
|
799
784
|
retries = 0
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wayback_machine_downloader_straw
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.
|
|
4
|
+
version: 2.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- strawberrymaster
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2025-
|
|
10
|
+
date: 2025-08-12 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: concurrent-ruby
|
|
@@ -74,6 +74,7 @@ files:
|
|
|
74
74
|
- lib/wayback_machine_downloader/subdom_processor.rb
|
|
75
75
|
- lib/wayback_machine_downloader/tidy_bytes.rb
|
|
76
76
|
- lib/wayback_machine_downloader/to_regex.rb
|
|
77
|
+
- lib/wayback_machine_downloader/url_rewrite.rb
|
|
77
78
|
homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
|
|
78
79
|
licenses:
|
|
79
80
|
- MIT
|