wayback_machine_downloader_straw 2.3.12 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 35a8c4a865a9da5cb45e7f63e2f832f491895f5c69c3d440b9c8b4230b8444f1
+  data.tar.gz: a96d746b41f3e3b7a1cf6df38df3b23a79361f57f667eea562be72961bf391c2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 783bb658ee95bd523fb3dc8c2c11a027947becc4e72902e2fff85eb725bbc8e3ef8e7bb22b08598f015f77e801526354f36b6d920144df9fd6bca440cccf8127
+  data.tar.gz: a2e0ce3e4df543574b1c04e349d120b31d900bbbfe3f9bf512706f57094d89c49574290520df25fdd8c920577baf561272af65ca4c36d058a3a4097efa167a83

lib/wayback_machine_downloader/archive_api.rb
CHANGED

@@ -25,7 +25,7 @@ module ArchiveAPI
     # Check if the response contains the header ["timestamp", "original"]
     json.shift if json.first == ["timestamp", "original"]
     json
-  rescue JSON::ParserError
+  rescue JSON::ParserError => e
     warn "Failed to fetch data from API: #{e.message}"
     []
   end
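
The functional change above is small but meaningful: the rescue now binds the exception (`rescue JSON::ParserError => e`), so the unchanged `warn` line can actually read `e.message`; in 2.3.12 that interpolation referenced an undefined `e` and would presumably have raised a NameError whenever a malformed API response hit this path. A standalone sketch of the fixed pattern (the method name and inputs here are hypothetical, not the gem's API):

require 'json'

# Hypothetical illustration of the fixed rescue: binding the exception
# makes its message available to the warning.
def parse_snapshot_list(body)
  json = JSON.parse(body)
  json.shift if json.first == ["timestamp", "original"]
  json
rescue JSON::ParserError => e
  warn "Failed to fetch data from API: #{e.message}"
  []
end

parse_snapshot_list('[["timestamp","original"],["20240101000000","http://example.com/"]]')
# => [["20240101000000", "http://example.com/"]]
parse_snapshot_list('<html>not JSON</html>')
# warns "Failed to fetch data from API: ..." and returns []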

lib/wayback_machine_downloader/tidy_bytes.rb
CHANGED

@@ -1,74 +1,74 @@
 # frozen_string_literal: true
 
+# essentially, this is for converting a string with a potentially
+# broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  UNICODE_REPLACEMENT_CHARACTER = "�"
+
+  # common encodings to try for best multilingual compatibility
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030, # Simplified Chinese
+    Encoding::Shift_JIS, # Japanese
+    Encoding::EUC_KR, # Korean
+    Encoding::ISO_8859_1, # Western European
+    Encoding::Windows_1252 # Western European/Latin1 superset
+  ].select { |enc| Encoding.name_list.include?(enc.name) }
+
+  # returns true if the string appears to be binary (has null bytes)
+  def binary_data?
+    self.include?("\x00".b)
+  end
+
+  # attempts to return a valid UTF-8 version of the string
+  def tidy_bytes
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
+    return self.dup.force_encoding("BINARY") if binary_data?
+
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-  end.freeze
 
-
-
-
-
-
-
-
-
+    # if no clean conversion found, try again but accept replacement characters
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding?
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
     end
-
+
+    # fallback: replace all invalid/undefined bytes
+    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+  end
+
+  def tidy_bytes!
+    replace(self.tidy_bytes)
+  end
 
   def self.included(base)
-    base.
-
-    return nil if empty?
-
-    if force
-      buffer = String.new(capacity: bytesize)
-      each_byte { |b| buffer << CP1252_TO_UTF8[b] }
-      return buffer.force_encoding(Encoding::UTF_8)
-    end
+    base.send(:include, InstanceMethods)
+  end
 
-
-
-
-
-    scrub { |b| CP1252_TO_UTF8[b.ord] }
-  end
-end
+  module InstanceMethods
+    def tidy_bytes
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
+    end
 
-
-
-    result ? replace(result) : self
-  end
+    def tidy_bytes!
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
     end
   end
 end
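
The rewrite above drops the old CP1252 lookup-table approach in favour of trying a short list of common encodings and keeping the first clean UTF-8 conversion. A rough sketch of how the helpers behave once TidyBytes is mixed into String (which the downloader relies on when it calls params[:base_url]&.tidy_bytes further down); the sample bytes are purely illustrative:

# Assumes something along the lines of String.send(:include, TidyBytes)
# has already happened, as the gem's own code expects.
mojibake = "\xCF\xF0\xE8\xE2\xE5\xF2".b    # "Привет" as raw Windows-1251 bytes
mojibake.encoding                          # => #<Encoding:ASCII-8BIT>

clean = mojibake.tidy_bytes                # UTF-8 attempt yields replacement chars, Windows-1251 converts cleanly
clean                                      # => "Привет"
clean.encoding                             # => #<Encoding:UTF-8>

"\x00\xFF".b.binary_data?                  # => true; strings with null bytes are returned as BINARY
s = "already valid UTF-8"
s.tidy_bytes.equal?(s)                     # => true; valid UTF-8 is returned as-is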

lib/wayback_machine_downloader/url_rewrite.rb
ADDED

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+# URLs in HTML attributes
+def rewrite_html_attr_urls(content)
+
+  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    prefix, url, suffix = $1, $2, $3
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{prefix}#{path}#{suffix}"
+      rescue
+        "#{prefix}#{url}#{suffix}"
+      end
+    elsif url.start_with?('/')
+      "#{prefix}./#{url[1..-1]}#{suffix}"
+    else
+      "#{prefix}#{url}#{suffix}"
+    end
+  end
+  content
+end
+
+# URLs in CSS
+def rewrite_css_urls(content)
+
+  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+    url = $1
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "url(\"#{path}\")"
+      rescue
+        "url(\"#{url}\")"
+      end
+    elsif url.start_with?('/')
+      "url(\"./#{url[1..-1]}\")"
+    else
+      "url(\"#{url}\")"
+    end
+  end
+  content
+end
+
+# URLs in JavaScript
+def rewrite_js_urls(content)
+
+  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    quote_start, url, quote_end = $1, $2, $3
+
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{quote_start}#{path}#{quote_end}"
+      rescue
+        "#{quote_start}#{url}#{quote_end}"
+      end
+    elsif url.start_with?('/')
+      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    else
+      "#{quote_start}#{url}#{quote_end}"
+    end
+  end
+
+  content
+end
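
These three helpers strip the https://web.archive.org/web/<timestamp>/ wrapper so rewritten pages point at relative local paths instead of the live Wayback Machine. A quick sketch of the expected effect; the sample markup is made up, and the methods are assumed to be loaded via the gem (the require_relative added below pulls this file in):

require 'uri'
require 'wayback_machine_downloader'   # loads url_rewrite.rb via the new require_relative

html = '<img src="https://web.archive.org/web/20230101000000/https://example.com/img/logo.png">'
rewrite_html_attr_urls(html)
# => '<img src="img/logo.png">'

css = 'body { background: url("https://web.archive.org/web/20230101000000id_/https://example.com/css/bg.png"); }'
rewrite_css_urls(css)
# => 'body { background: url("css/bg.png"); }'

js = 'fetch("https://web.archive.org/web/20230101000000/https://example.com/api/data.json")'
rewrite_js_urls(js)
# => 'fetch("api/data.json")'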

lib/wayback_machine_downloader.rb
CHANGED

@@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
 require_relative 'wayback_machine_downloader/subdom_processor'
+require_relative 'wayback_machine_downloader/url_rewrite'
 
 class ConnectionPool
   MAX_AGE = 300

@@ -115,7 +116,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.3.12"
+  VERSION = "2.4.0"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2

@@ -133,10 +134,11 @@ class WaybackMachineDownloader
 
   def initialize params
     validate_params(params)
-    @base_url = params[:base_url]
+    @base_url = params[:base_url]&.tidy_bytes
    @exact_url = params[:exact_url]
     if params[:directory]
-
+      sanitized_dir = params[:directory].tidy_bytes
+      @directory = File.expand_path(sanitized_dir)
     else
       @directory = nil
     end

@@ -473,6 +475,39 @@ class WaybackMachineDownloader
     end
   end
 
+  def processing_files(pool, files_to_process)
+    files_to_process.each do |file_remote_info|
+      pool.post do
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # sppend to DB only after successful download outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."

@@ -519,36 +554,7 @@ class WaybackMachineDownloader
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
 
-    files_to_process
-      pool.post do
-        download_success = false
-        begin
-          @connection_pool.with_connection do |connection|
-            result_message = download_file(file_remote_info, connection)
-            # assume download success if the result message contains ' -> '
-            if result_message && result_message.include?(' -> ')
-              download_success = true
-            end
-            @download_mutex.synchronize do
-              @processed_file_count += 1
-              # adjust progress message to reflect remaining files
-              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
-              puts progress_message if progress_message
-            end
-          end
-          # sppend to DB only after successful download outside the connection block
-          if download_success
-            append_to_db(file_remote_info[:file_id])
-          end
-        rescue => e
-          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
-          @download_mutex.synchronize do
-            @processed_file_count += 1
-          end
-        end
-        sleep(RATE_LIMIT)
-      end
-    end
+    processing_files(pool, files_to_process)
 
     pool.shutdown
     pool.wait_for_termination
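
For context, the per-file work that used to live inline here now runs through processing_files on a concurrent-ruby fixed thread pool, and download_files only posts the jobs and drains the pool. A minimal sketch of that pool lifecycle, with placeholder file names instead of the real Wayback Machine records:

require 'concurrent'   # provided by the concurrent-ruby dependency

pool = Concurrent::FixedThreadPool.new(4)

%w[index.html about.html].each do |file|
  pool.post do
    # in the real code, download_file(file_remote_info, connection) runs here
    puts "processing #{file} on thread #{Thread.current.object_id}"
  end
end

pool.shutdown              # stop accepting new work
pool.wait_for_termination  # block until queued jobs finish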

@@ -608,64 +614,13 @@ class WaybackMachineDownloader
     end
 
     # URLs in HTML attributes
-    content
-      prefix, url, suffix = $1, $2, $3
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "#{prefix}#{path}#{suffix}"
-        rescue
-          "#{prefix}#{url}#{suffix}"
-        end
-      elsif url.start_with?('/')
-        "#{prefix}./#{url[1..-1]}#{suffix}"
-      else
-        "#{prefix}#{url}#{suffix}"
-      end
-    end
+    rewrite_html_attr_urls(content)
 
     # URLs in CSS
-    content
-      url = $1
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "url(\"#{path}\")"
-        rescue
-          "url(\"#{url}\")"
-        end
-      elsif url.start_with?('/')
-        "url(\"./#{url[1..-1]}\")"
-      else
-        "url(\"#{url}\")"
-      end
-    end
+    rewrite_css_urls(content)
 
     # URLs in JavaScript
-    content
-      quote_start, url, quote_end = $1, $2, $3
-
-      if url.start_with?('http')
-        begin
-          uri = URI.parse(url)
-          path = uri.path
-          path = path[1..-1] if path.start_with?('/')
-          "#{quote_start}#{path}#{quote_end}"
-        rescue
-          "#{quote_start}#{url}#{quote_end}"
-        end
-      elsif url.start_with?('/')
-        "#{quote_start}./#{url[1..-1]}#{quote_end}"
-      else
-        "#{quote_start}#{url}#{quote_end}"
-      end
-    end
+    rewrite_js_urls(content)
 
     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
metadata
CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.3.12
+  version: 2.4.0
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-08-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby

@@ -74,6 +74,7 @@ files:
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb
+- lib/wayback_machine_downloader/url_rewrite.rb
 homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
 licenses:
 - MIT