wayback_machine_downloader_straw 2.3.12 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
4
- data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
3
+ metadata.gz: 35a8c4a865a9da5cb45e7f63e2f832f491895f5c69c3d440b9c8b4230b8444f1
4
+ data.tar.gz: a96d746b41f3e3b7a1cf6df38df3b23a79361f57f667eea562be72961bf391c2
5
5
  SHA512:
6
- metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
7
- data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
6
+ metadata.gz: 783bb658ee95bd523fb3dc8c2c11a027947becc4e72902e2fff85eb725bbc8e3ef8e7bb22b08598f015f77e801526354f36b6d920144df9fd6bca440cccf8127
7
+ data.tar.gz: a2e0ce3e4df543574b1c04e349d120b31d900bbbfe3f9bf512706f57094d89c49574290520df25fdd8c920577baf561272af65ca4c36d058a3a4097efa167a83
@@ -25,7 +25,7 @@ module ArchiveAPI
25
25
  # Check if the response contains the header ["timestamp", "original"]
26
26
  json.shift if json.first == ["timestamp", "original"]
27
27
  json
28
- rescue JSON::ParserError, StandardError => e
28
+ rescue JSON::ParserError => e
29
29
  warn "Failed to fetch data from API: #{e.message}"
30
30
  []
31
31
  end
@@ -1,74 +1,74 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # essentially, this is for converting a string with a potentially
4
+ # broken or unknown encoding into a valid UTF-8 string
5
+ # @todo: consider using charlock_holmes for this in the future
3
6
  module TidyBytes
4
- # precomputing CP1252 to UTF-8 mappings for bytes 128-159
5
- CP1252_MAP = (128..159).map do |byte|
6
- case byte
7
- when 128 then [226, 130, 172] # EURO SIGN
8
- when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
9
- when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
10
- when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
11
- when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
12
- when 134 then [226, 128, 160] # DAGGER
13
- when 135 then [226, 128, 161] # DOUBLE DAGGER
14
- when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
15
- when 137 then [226, 128, 176] # PER MILLE SIGN
16
- when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
17
- when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
18
- when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
19
- when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
20
- when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
21
- when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
22
- when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
23
- when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
24
- when 149 then [226, 128, 162] # BULLET
25
- when 150 then [226, 128, 147] # EN DASH
26
- when 151 then [226, 128, 148] # EM DASH
27
- when 152 then [203, 156] # SMALL TILDE
28
- when 153 then [226, 132, 162] # TRADE MARK SIGN
29
- when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
30
- when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
31
- when 156 then [197, 147] # LATIN SMALL LIGATURE OE
32
- when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
33
- when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
34
- else nil # ANYTHING ELSE...
7
+ UNICODE_REPLACEMENT_CHARACTER = "�"
8
+
9
+ # common encodings to try for best multilingual compatibility
10
+ COMMON_ENCODINGS = [
11
+ Encoding::UTF_8,
12
+ Encoding::Windows_1251, # Cyrillic/Russian legacy
13
+ Encoding::GB18030, # Simplified Chinese
14
+ Encoding::Shift_JIS, # Japanese
15
+ Encoding::EUC_KR, # Korean
16
+ Encoding::ISO_8859_1, # Western European
17
+ Encoding::Windows_1252 # Western European/Latin1 superset
18
+ ].select { |enc| Encoding.name_list.include?(enc.name) }
19
+
20
+ # returns true if the string appears to be binary (has null bytes)
21
+ def binary_data?
22
+ self.include?("\x00".b)
23
+ end
24
+
25
+ # attempts to return a valid UTF-8 version of the string
26
+ def tidy_bytes
27
+ return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
28
+ return self.dup.force_encoding("BINARY") if binary_data?
29
+
30
+ str = self.dup
31
+ COMMON_ENCODINGS.each do |enc|
32
+ str.force_encoding(enc)
33
+ begin
34
+ utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
35
+ return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
36
+ rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
37
+ # try next encoding
38
+ end
35
39
  end
36
- end.freeze
37
40
 
38
- # precomputing all possible byte conversions
39
- CP1252_TO_UTF8 = Array.new(256) do |b|
40
- if (128..159).cover?(b)
41
- CP1252_MAP[b - 128]&.pack('C*')
42
- elsif b < 128
43
- b.chr
44
- else
45
- b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
41
+ # if no clean conversion found, try again but accept replacement characters
42
+ str = self.dup
43
+ COMMON_ENCODINGS.each do |enc|
44
+ str.force_encoding(enc)
45
+ begin
46
+ utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
47
+ return utf8 if utf8.valid_encoding?
48
+ rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
49
+ # try next encoding
50
+ end
46
51
  end
47
- end.freeze
52
+
53
+ # fallback: replace all invalid/undefined bytes
54
+ str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
55
+ end
56
+
57
+ def tidy_bytes!
58
+ replace(self.tidy_bytes)
59
+ end
48
60
 
49
61
  def self.included(base)
50
- base.class_eval do
51
- def tidy_bytes(force = false)
52
- return nil if empty?
53
-
54
- if force
55
- buffer = String.new(capacity: bytesize)
56
- each_byte { |b| buffer << CP1252_TO_UTF8[b] }
57
- return buffer.force_encoding(Encoding::UTF_8)
58
- end
62
+ base.send(:include, InstanceMethods)
63
+ end
59
64
 
60
- begin
61
- encode('UTF-8')
62
- rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
63
- buffer = String.new(capacity: bytesize)
64
- scrub { |b| CP1252_TO_UTF8[b.ord] }
65
- end
66
- end
65
+ module InstanceMethods
66
+ def tidy_bytes
67
+ TidyBytes.instance_method(:tidy_bytes).bind(self).call
68
+ end
67
69
 
68
- def tidy_bytes!(force = false)
69
- result = tidy_bytes(force)
70
- result ? replace(result) : self
71
- end
70
+ def tidy_bytes!
71
+ TidyBytes.instance_method(:tidy_bytes!).bind(self).call
72
72
  end
73
73
  end
74
74
  end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ # URLs in HTML attributes
4
+ def rewrite_html_attr_urls(content)
5
+
6
+ content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
7
+ prefix, url, suffix = $1, $2, $3
8
+
9
+ if url.start_with?('http')
10
+ begin
11
+ uri = URI.parse(url)
12
+ path = uri.path
13
+ path = path[1..-1] if path.start_with?('/')
14
+ "#{prefix}#{path}#{suffix}"
15
+ rescue
16
+ "#{prefix}#{url}#{suffix}"
17
+ end
18
+ elsif url.start_with?('/')
19
+ "#{prefix}./#{url[1..-1]}#{suffix}"
20
+ else
21
+ "#{prefix}#{url}#{suffix}"
22
+ end
23
+ end
24
+ content
25
+ end
26
+
27
+ # URLs in CSS
28
+ def rewrite_css_urls(content)
29
+
30
+ content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
31
+ url = $1
32
+
33
+ if url.start_with?('http')
34
+ begin
35
+ uri = URI.parse(url)
36
+ path = uri.path
37
+ path = path[1..-1] if path.start_with?('/')
38
+ "url(\"#{path}\")"
39
+ rescue
40
+ "url(\"#{url}\")"
41
+ end
42
+ elsif url.start_with?('/')
43
+ "url(\"./#{url[1..-1]}\")"
44
+ else
45
+ "url(\"#{url}\")"
46
+ end
47
+ end
48
+ content
49
+ end
50
+
51
+ # URLs in JavaScript
52
+ def rewrite_js_urls(content)
53
+
54
+ content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
55
+ quote_start, url, quote_end = $1, $2, $3
56
+
57
+ if url.start_with?('http')
58
+ begin
59
+ uri = URI.parse(url)
60
+ path = uri.path
61
+ path = path[1..-1] if path.start_with?('/')
62
+ "#{quote_start}#{path}#{quote_end}"
63
+ rescue
64
+ "#{quote_start}#{url}#{quote_end}"
65
+ end
66
+ elsif url.start_with?('/')
67
+ "#{quote_start}./#{url[1..-1]}#{quote_end}"
68
+ else
69
+ "#{quote_start}#{url}#{quote_end}"
70
+ end
71
+ end
72
+
73
+ content
74
+ end
@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
15
15
  require_relative 'wayback_machine_downloader/to_regex'
16
16
  require_relative 'wayback_machine_downloader/archive_api'
17
17
  require_relative 'wayback_machine_downloader/subdom_processor'
18
+ require_relative 'wayback_machine_downloader/url_rewrite'
18
19
 
19
20
  class ConnectionPool
20
21
  MAX_AGE = 300
@@ -115,7 +116,7 @@ class WaybackMachineDownloader
115
116
  include ArchiveAPI
116
117
  include SubdomainProcessor
117
118
 
118
- VERSION = "2.3.12"
119
+ VERSION = "2.4.0"
119
120
  DEFAULT_TIMEOUT = 30
120
121
  MAX_RETRIES = 3
121
122
  RETRY_DELAY = 2
@@ -133,10 +134,11 @@ class WaybackMachineDownloader
133
134
 
134
135
  def initialize params
135
136
  validate_params(params)
136
- @base_url = params[:base_url]
137
+ @base_url = params[:base_url]&.tidy_bytes
137
138
  @exact_url = params[:exact_url]
138
139
  if params[:directory]
139
- @directory = File.expand_path(params[:directory])
140
+ sanitized_dir = params[:directory].tidy_bytes
141
+ @directory = File.expand_path(sanitized_dir)
140
142
  else
141
143
  @directory = nil
142
144
  end
@@ -473,6 +475,39 @@ class WaybackMachineDownloader
473
475
  end
474
476
  end
475
477
 
478
+ def processing_files(pool, files_to_process)
479
+ files_to_process.each do |file_remote_info|
480
+ pool.post do
481
+ download_success = false
482
+ begin
483
+ @connection_pool.with_connection do |connection|
484
+ result_message = download_file(file_remote_info, connection)
485
+ # assume download success if the result message contains ' -> '
486
+ if result_message && result_message.include?(' -> ')
487
+ download_success = true
488
+ end
489
+ @download_mutex.synchronize do
490
+ @processed_file_count += 1
491
+ # adjust progress message to reflect remaining files
492
+ progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
493
+ puts progress_message if progress_message
494
+ end
495
+ end
496
+ # append to DB only after successful download outside the connection block
497
+ if download_success
498
+ append_to_db(file_remote_info[:file_id])
499
+ end
500
+ rescue => e
501
+ @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
502
+ @download_mutex.synchronize do
503
+ @processed_file_count += 1
504
+ end
505
+ end
506
+ sleep(RATE_LIMIT)
507
+ end
508
+ end
509
+ end
510
+
476
511
  def download_files
477
512
  start_time = Time.now
478
513
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -519,36 +554,7 @@ class WaybackMachineDownloader
519
554
  thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
520
555
  pool = Concurrent::FixedThreadPool.new(thread_count)
521
556
 
522
- files_to_process.each do |file_remote_info|
523
- pool.post do
524
- download_success = false
525
- begin
526
- @connection_pool.with_connection do |connection|
527
- result_message = download_file(file_remote_info, connection)
528
- # assume download success if the result message contains ' -> '
529
- if result_message && result_message.include?(' -> ')
530
- download_success = true
531
- end
532
- @download_mutex.synchronize do
533
- @processed_file_count += 1
534
- # adjust progress message to reflect remaining files
535
- progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
536
- puts progress_message if progress_message
537
- end
538
- end
539
- # sppend to DB only after successful download outside the connection block
540
- if download_success
541
- append_to_db(file_remote_info[:file_id])
542
- end
543
- rescue => e
544
- @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
545
- @download_mutex.synchronize do
546
- @processed_file_count += 1
547
- end
548
- end
549
- sleep(RATE_LIMIT)
550
- end
551
- end
557
+ processing_files(pool, files_to_process)
552
558
 
553
559
  pool.shutdown
554
560
  pool.wait_for_termination
@@ -608,64 +614,13 @@ class WaybackMachineDownloader
608
614
  end
609
615
 
610
616
  # URLs in HTML attributes
611
- content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
612
- prefix, url, suffix = $1, $2, $3
613
-
614
- if url.start_with?('http')
615
- begin
616
- uri = URI.parse(url)
617
- path = uri.path
618
- path = path[1..-1] if path.start_with?('/')
619
- "#{prefix}#{path}#{suffix}"
620
- rescue
621
- "#{prefix}#{url}#{suffix}"
622
- end
623
- elsif url.start_with?('/')
624
- "#{prefix}./#{url[1..-1]}#{suffix}"
625
- else
626
- "#{prefix}#{url}#{suffix}"
627
- end
628
- end
617
+ rewrite_html_attr_urls(content)
629
618
 
630
619
  # URLs in CSS
631
- content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
632
- url = $1
633
-
634
- if url.start_with?('http')
635
- begin
636
- uri = URI.parse(url)
637
- path = uri.path
638
- path = path[1..-1] if path.start_with?('/')
639
- "url(\"#{path}\")"
640
- rescue
641
- "url(\"#{url}\")"
642
- end
643
- elsif url.start_with?('/')
644
- "url(\"./#{url[1..-1]}\")"
645
- else
646
- "url(\"#{url}\")"
647
- end
648
- end
620
+ rewrite_css_urls(content)
649
621
 
650
622
  # URLs in JavaScript
651
- content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
652
- quote_start, url, quote_end = $1, $2, $3
653
-
654
- if url.start_with?('http')
655
- begin
656
- uri = URI.parse(url)
657
- path = uri.path
658
- path = path[1..-1] if path.start_with?('/')
659
- "#{quote_start}#{path}#{quote_end}"
660
- rescue
661
- "#{quote_start}#{url}#{quote_end}"
662
- end
663
- elsif url.start_with?('/')
664
- "#{quote_start}./#{url[1..-1]}#{quote_end}"
665
- else
666
- "#{quote_start}#{url}#{quote_end}"
667
- end
668
- end
623
+ rewrite_js_urls(content)
669
624
 
670
625
  # for URLs in HTML attributes that start with a single slash
671
626
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.12
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-07-22 00:00:00.000000000 Z
10
+ date: 2025-08-04 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby
@@ -74,6 +74,7 @@ files:
74
74
  - lib/wayback_machine_downloader/subdom_processor.rb
75
75
  - lib/wayback_machine_downloader/tidy_bytes.rb
76
76
  - lib/wayback_machine_downloader/to_regex.rb
77
+ - lib/wayback_machine_downloader/url_rewrite.rb
77
78
  homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
78
79
  licenses:
79
80
  - MIT