wayback_machine_downloader_straw 2.3.12 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
-  data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
+  metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
+  data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
 SHA512:
-  metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
-  data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
+  metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
+  data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -25,7 +25,7 @@ module ArchiveAPI
     # Check if the response contains the header ["timestamp", "original"]
     json.shift if json.first == ["timestamp", "original"]
     json
-  rescue JSON::ParserError, StandardError => e
+  rescue JSON::ParserError => e
     warn "Failed to fetch data from API: #{e.message}"
     []
   end
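
A note on the rescue change above: listing StandardError next to JSON::ParserError made the clause catch essentially every runtime error and silently return an empty list, while the narrowed clause only swallows malformed JSON. A minimal standalone sketch of the narrowed behaviour (the method name and sample payloads below are illustrative, not the gem's API):

    require 'json'

    def parse_snapshot_list(body)
      json = JSON.parse(body)
      # drop the CDX header row if present
      json.shift if json.first == ["timestamp", "original"]
      json
    rescue JSON::ParserError => e
      warn "Failed to parse API response: #{e.message}"
      []
    end

    parse_snapshot_list('not json')
    # warns, => []
    parse_snapshot_list('[["timestamp","original"],["20240101000000","http://example.com/"]]')
    # => [["20240101000000", "http://example.com/"]]

Other exceptions (for example a timeout raised earlier in the real method) now propagate to the caller instead of being masked.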
lib/wayback_machine_downloader/tidy_bytes.rb CHANGED
@@ -1,74 +1,74 @@
 # frozen_string_literal: true
 
+# essentially, this is for converting a string with a potentially
+# broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
-  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
-  CP1252_MAP = (128..159).map do |byte|
-    case byte
-    when 128 then [226, 130, 172] # EURO SIGN
-    when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
-    when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
-    when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
-    when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
-    when 134 then [226, 128, 160] # DAGGER
-    when 135 then [226, 128, 161] # DOUBLE DAGGER
-    when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
-    when 137 then [226, 128, 176] # PER MILLE SIGN
-    when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
-    when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
-    when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
-    when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
-    when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
-    when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
-    when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
-    when 149 then [226, 128, 162] # BULLET
-    when 150 then [226, 128, 147] # EN DASH
-    when 151 then [226, 128, 148] # EM DASH
-    when 152 then [203, 156] # SMALL TILDE
-    when 153 then [226, 132, 162] # TRADE MARK SIGN
-    when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
-    when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    when 156 then [197, 147] # LATIN SMALL LIGATURE OE
-    when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
-    when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
-    else nil # ANYTHING ELSE...
+  UNICODE_REPLACEMENT_CHARACTER = "�"
+
+  # common encodings to try for best multilingual compatibility
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030,      # Simplified Chinese
+    Encoding::Shift_JIS,    # Japanese
+    Encoding::EUC_KR,       # Korean
+    Encoding::ISO_8859_1,   # Western European
+    Encoding::Windows_1252  # Western European/Latin1 superset
+  ].select { |enc| Encoding.name_list.include?(enc.name) }
+
+  # returns true if the string appears to be binary (has null bytes)
+  def binary_data?
+    self.include?("\x00".b)
+  end
+
+  # attempts to return a valid UTF-8 version of the string
+  def tidy_bytes
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
+    return self.dup.force_encoding("BINARY") if binary_data?
+
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
 
-  # precomputing all possible byte conversions
-  CP1252_TO_UTF8 = Array.new(256) do |b|
-    if (128..159).cover?(b)
-      CP1252_MAP[b - 128]&.pack('C*')
-    elsif b < 128
-      b.chr
-    else
-      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+    # if no clean conversion found, try again but accept replacement characters
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding?
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
+
+    # fallback: replace all invalid/undefined bytes
+    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+  end
+
+  def tidy_bytes!
+    replace(self.tidy_bytes)
+  end
 
   def self.included(base)
-    base.class_eval do
-      def tidy_bytes(force = false)
-        return nil if empty?
-
-        if force
-          buffer = String.new(capacity: bytesize)
-          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
-          return buffer.force_encoding(Encoding::UTF_8)
-        end
+    base.send(:include, InstanceMethods)
+  end
 
-        begin
-          encode('UTF-8')
-        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          buffer = String.new(capacity: bytesize)
-          scrub { |b| CP1252_TO_UTF8[b.ord] }
-        end
-      end
+  module InstanceMethods
+    def tidy_bytes
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end
 
-      def tidy_bytes!(force = false)
-        result = tidy_bytes(force)
-        result ? replace(result) : self
-      end
+    def tidy_bytes!
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
     end
   end
 end
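
The new TidyBytes drops the CP1252 lookup tables and instead walks COMMON_ENCODINGS, keeping the first conversion that yields valid UTF-8 with no replacement characters and only falling back to lossy replacement at the end. A self-contained sketch of that strategy, with an illustrative candidate list and method name (not the gem's API):

    # Try each candidate encoding; keep the first clean UTF-8 conversion.
    CANDIDATES = [Encoding::UTF_8, Encoding::Windows_1251,
                  Encoding::Shift_JIS, Encoding::Windows_1252].freeze

    def to_utf8(bytes)
      CANDIDATES.each do |enc|
        begin
          labeled   = bytes.dup.force_encoding(enc)
          candidate = labeled.encode(Encoding::UTF_8, invalid: :replace,
                                     undef: :replace, replace: "\uFFFD")
          # accept only conversions that are valid and needed no replacements
          return candidate if candidate.valid_encoding? && !candidate.include?("\uFFFD")
        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
          # fall through to the next candidate
        end
      end
      # last resort: force UTF-8 and scrub whatever is still invalid
      bytes.dup.force_encoding(Encoding::UTF_8).scrub("\uFFFD")
    end

    to_utf8("\x93Hello\x94".b)  # => "“Hello”" (decoded here via Windows-1251)

Because several single-byte encodings overlap, the order of COMMON_ENCODINGS decides which interpretation wins for ambiguous input; bytes that an earlier encoding can decode cleanly never reach a later one.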
lib/wayback_machine_downloader/url_rewrite.rb ADDED
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+# URLs in HTML attributes
+def rewrite_html_attr_urls(content)
+
+  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    prefix, url, suffix = $1, $2, $3
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{prefix}#{path}#{suffix}"
+      rescue
+        "#{prefix}#{url}#{suffix}"
+      end
+    elsif url.start_with?('/')
+      "#{prefix}./#{url[1..-1]}#{suffix}"
+    else
+      "#{prefix}#{url}#{suffix}"
+    end
+  end
+  content
+end
+
+# URLs in CSS
+def rewrite_css_urls(content)
+
+  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+    url = $1
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "url(\"#{path}\")"
+      rescue
+        "url(\"#{url}\")"
+      end
+    elsif url.start_with?('/')
+      "url(\"./#{url[1..-1]}\")"
+    else
+      "url(\"#{url}\")"
+    end
+  end
+  content
+end
+
+# URLs in JavaScript
+def rewrite_js_urls(content)
+
+  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    quote_start, url, quote_end = $1, $2, $3
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{quote_start}#{path}#{quote_end}"
+      rescue
+        "#{quote_start}#{url}#{quote_end}"
+      end
+    elsif url.start_with?('/')
+      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    else
+      "#{quote_start}#{url}#{quote_end}"
+    end
+  end
+
+  content
+end
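
These helpers convert absolute web.archive.org snapshot URLs embedded in downloaded pages into site-relative paths. A hedged usage illustration (the markup and paths are made up; assumes the methods above are loaded and 'uri' is required):

    require 'uri'

    html = '<img src="https://web.archive.org/web/20230101000000id_/https://example.com/assets/logo.png">'
    rewrite_html_attr_urls(html)
    # => '<img src="assets/logo.png">'

    css = 'body { background: url("https://web.archive.org/web/20230101000000/https://example.com/bg.png"); }'
    rewrite_css_urls(css)
    # => 'body { background: url("bg.png"); }'

Note that gsub! mutates the string passed in, so the argument must not be frozen; each helper also returns the (possibly rewritten) content.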
lib/wayback_machine_downloader.rb CHANGED
@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
 require_relative 'wayback_machine_downloader/subdom_processor'
+require_relative 'wayback_machine_downloader/url_rewrite'
 
 class ConnectionPool
   MAX_AGE = 300
@@ -115,7 +116,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.3.12"
+  VERSION = "2.4.1"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -133,10 +134,11 @@ class WaybackMachineDownloader
 
   def initialize params
     validate_params(params)
-    @base_url = params[:base_url]
+    @base_url = params[:base_url]&.tidy_bytes
    @exact_url = params[:exact_url]
     if params[:directory]
-      @directory = File.expand_path(params[:directory])
+      sanitized_dir = params[:directory].tidy_bytes
+      @directory = File.expand_path(sanitized_dir)
     else
       @directory = nil
     end
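
The safe-navigation call keeps a missing :base_url as nil instead of raising NoMethodError, and directory names are normalized before File.expand_path. A quick illustration (assumes the TidyBytes mixin is applied to String, as elsewhere in the gem):

    nil&.tidy_bytes     # => nil (no :base_url given)
    "café".tidy_bytes   # => "café" (already valid UTF-8, returned unchanged)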
@@ -338,15 +340,15 @@ class WaybackMachineDownloader
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
       next if file_timestamp.to_i > target_timestamp
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id = CGI::unescape file_id
-      file_id = file_id.tidy_bytes unless file_id == ""
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       next if file_id.nil?
       next if match_exclude_filter(file_url)
       next unless match_only_filter(file_url)
-      # Select the most recent version <= target_timestamp
+
       if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
-        file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+        file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
       end
     end
     file_versions.values
@@ -366,22 +368,27 @@ class WaybackMachineDownloader
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id = CGI::unescape file_id
-      file_id = file_id.tidy_bytes unless file_id == ""
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       if file_id.nil?
         puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+
+      if file_id.include?('<') || file_id.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
       else
         if match_exclude_filter(file_url)
           puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
           puts "File url doesn't match only filter, ignoring: #{file_url}"
         elsif file_list_curated[file_id]
           unless file_list_curated[file_id][:timestamp] > file_timestamp
-            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+            file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
           end
         else
-          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
         end
       end
     end
@@ -392,21 +399,32 @@ class WaybackMachineDownloader
     file_list_curated = Hash.new
     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
       next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id_and_timestamp = [file_timestamp, file_id].join('/')
-      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
-      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
       if file_id.nil?
         puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+
+      file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
+      if file_id_and_timestamp.nil?
+        puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
+        next
+      end
+
+      if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
       else
         if match_exclude_filter(file_url)
           puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
           puts "File url doesn't match only filter, ignoring: #{file_url}"
         elsif file_list_curated[file_id_and_timestamp]
-          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+          # duplicate combo, ignore silently (verbose flag not shown here)
        else
-          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
         end
       end
     end
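
The angle-bracket guard rejects ids that still contain '<' or '>' after sanitization (sanitize_and_prepare_id is defined later in this diff). Hypothetical values showing both paths:

    sanitize_and_prepare_id("page%3Cb%3E.html", "https://example.com/page")
    # CGI.unescape yields "page<b>.html"; the tag-stripping gsub removes "<b>",
    # so the returned id "page.html" passes the guard.

    "report<draft.html".include?('<')  # => true
    # an unmatched "<" is not removed by /<[^>]*>/, so such an id is logged and skipped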
@@ -473,6 +491,39 @@ class WaybackMachineDownloader
     end
   end
 
+  def processing_files(pool, files_to_process)
+    files_to_process.each do |file_remote_info|
+      pool.post do
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # sppend to DB only after successful download outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+  end
+
  def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -519,36 +570,7 @@ class WaybackMachineDownloader
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
 
-    files_to_process.each do |file_remote_info|
-      pool.post do
-        download_success = false
-        begin
-          @connection_pool.with_connection do |connection|
-            result_message = download_file(file_remote_info, connection)
-            # assume download success if the result message contains ' -> '
-            if result_message && result_message.include?(' -> ')
-              download_success = true
-            end
-            @download_mutex.synchronize do
-              @processed_file_count += 1
-              # adjust progress message to reflect remaining files
-              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
-              puts progress_message if progress_message
-            end
-          end
-          # sppend to DB only after successful download outside the connection block
-          if download_success
-            append_to_db(file_remote_info[:file_id])
-          end
-        rescue => e
-          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
-          @download_mutex.synchronize do
-            @processed_file_count += 1
-          end
-        end
-        sleep(RATE_LIMIT)
-      end
-    end
+    processing_files(pool, files_to_process)
 
     pool.shutdown
     pool.wait_for_termination
@@ -608,64 +630,13 @@ class WaybackMachineDownloader
     end
 
     # URLs in HTML attributes
-    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-      prefix, url, suffix = $1, $2, $3
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "#{prefix}#{path}#{suffix}"
-        rescue
-          "#{prefix}#{url}#{suffix}"
-        end
-      elsif url.start_with?('/')
-        "#{prefix}./#{url[1..-1]}#{suffix}"
-      else
-        "#{prefix}#{url}#{suffix}"
-      end
-    end
+    rewrite_html_attr_urls(content)
 
     # URLs in CSS
-    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-      url = $1
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "url(\"#{path}\")"
-        rescue
-          "url(\"#{url}\")"
-        end
-      elsif url.start_with?('/')
-        "url(\"./#{url[1..-1]}\")"
-      else
-        "url(\"#{url}\")"
-      end
-    end
+    rewrite_css_urls(content)
 
     # URLs in JavaScript
-    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-      quote_start, url, quote_end = $1, $2, $3
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "#{quote_start}#{path}#{quote_end}"
-        rescue
-          "#{quote_start}#{url}#{quote_end}"
-        end
-      elsif url.start_with?('/')
-        "#{quote_start}./#{url[1..-1]}#{quote_end}"
-      else
-        "#{quote_start}#{url}#{quote_end}"
-      end
-    end
+    rewrite_js_urls(content)
 
     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
@@ -794,6 +765,20 @@ class WaybackMachineDownloader
     end
     logger
   end
+
+  # safely sanitize a file id (or id+timestamp)
+  def sanitize_and_prepare_id(raw, file_url)
+    return nil if raw.nil?
+    begin
+      raw = CGI.unescape(raw) rescue raw
+      raw.gsub!(/<[^>]*>/, '')
+      raw = raw.tidy_bytes unless raw.empty?
+      raw
+    rescue => e
+      @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
+      nil
+    end
+  end
 
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0
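
sanitize_and_prepare_id centralizes the unescape, strip-tags, tidy_bytes sequence that the three file-list methods previously inlined, returning nil for anything it cannot handle. Hypothetical inputs and results:

    sanitize_and_prepare_id("assets/img%20name.png", "https://example.com/assets/img%20name.png")
    # => "assets/img name.png"
    sanitize_and_prepare_id(nil, "https://example.com/")
    # => nil (e.g. a snapshot URL with no path tail)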
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.12
+  version: 2.4.1
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-07-22 00:00:00.000000000 Z
+date: 2025-08-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -74,6 +74,7 @@ files:
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
+- lib/wayback_machine_downloader/url_rewrite.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
 licenses:
 - MIT