wayback_machine_downloader_straw 2.4.1 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +191 -99
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
data/bin/wayback_machine_downloader CHANGED

@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end

+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
   end
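
For context on the new flag: passing two long option names to `opts.on` makes `--rt` an alias for `--retry`, and `Integer` coerces the argument before it reaches the block. A minimal standalone sketch of the same declaration (not the gem's actual bin script; the `parse!` input is a hypothetical command line):

  require 'optparse'

  options = {}
  parser = OptionParser.new do |opts|
    # same declaration as the diff: --rt is an alias, N is a required Integer
    opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
      options[:max_retries] = t
    end
  end

  parser.parse!(["--retry", "5"])
  p options # => {:max_retries=>5}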
data/lib/wayback_machine_downloader.rb CHANGED

@@ -11,6 +11,7 @@ require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
 require 'stringio'
+require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -24,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3

   def initialize(size)
-    @
-    @pool
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end

-  def with_connection
-
+  def with_connection
+    entry = acquire_connection
     begin
-      yield
+      yield entry[:http]
     ensure
-      release_connection(
+      release_connection(entry)
     end
   end

   def shutdown
     @cleanup_thread&.exit
-
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end

   private

   def acquire_connection
-
-
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end

-
-
-
-
-    @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end

-
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end

-  def
-
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end

-  def
-
-
-
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end

-  def
-
-
-
-
-
-
-
-
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end

   def schedule_cleanup
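
The rework replaces per-thread bookkeeping (`@creation_times` keyed by thread id) with a `SizedQueue` of `{http:, created_at:}` entries, so staleness information travels with the connection itself instead of with the thread. A minimal usage sketch, assuming the gem is loaded via its lib file name and that `ConnectionPool.new` takes the pool size as in the diff (the request path is illustrative):

  require 'wayback_machine_downloader'

  pool = ConnectionPool.new(4)   # pre-fills the queue with 4 started connections
  pool.with_connection do |http|
    # http is a started Net::HTTP session against web.archive.org:443
    response = http.get("/web/20230101000000/https://example.com/")
    puts response.code
  end
  pool.shutdown                  # drains the queue and finishes each connection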
@@ -98,16 +111,15 @@ class ConnectionPool
     end
   end

-  def
-
-
-
-
-
-
-
-
-  end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end

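
`create_connection` now returns an already-started keep-alive session, which is what lets `with_connection` hand the same TCP/TLS connection to many requests. A standalone sketch of the same Net::HTTP setup, with the timeout values copied from the gem's constants:

  require 'net/http'

  http = Net::HTTP.new("web.archive.org", 443)
  http.use_ssl = true
  http.read_timeout = 30     # DEFAULT_TIMEOUT in the gem
  http.open_timeout = 30
  http.keep_alive_timeout = 30
  http.max_retries = 3       # Net::HTTP's own retry of idempotent requests
  http.start                 # opens the TCP/TLS session once
  res = http.get("/")        # later requests reuse the same session
  puts res.code
  http.finish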
@@ -116,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor

-  VERSION = "2.4.1"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -162,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES

     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
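
With this line, a `max_retries` passed by the CLI (or by library callers) overrides the `MAX_RETRIES` constant used in `download_with_retry` below. A hypothetical library-side call, assuming the constructor takes a params hash with a `:base_url` key, as the surrounding `params[...]` reads suggest:

  require 'wayback_machine_downloader'

  wmd = WaybackMachineDownloader.new(
    base_url:    "https://example.com",
    max_retries: 5    # omitted -> falls back to MAX_RETRIES (3)
  )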
@@ -171,12 +184,19 @@ class WaybackMachineDownloader

   def backup_name
     url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-
-    if url_to_process.include? '//'
+    raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
       url_to_process
     end
+
+    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
+    if Gem.win_platform?
+      raw = raw.gsub(/[:*?"<>|]/, '_')
+      raw = raw.gsub(/[ .]+\z/, '')
+    end
+    raw = 'site' if raw.nil? || raw.empty?
+    raw
   end

   def backup_path
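
The practical effect of the new Windows branch: a host-plus-port source such as `localhost:8080` previously yielded a directory name containing `:`, which Windows rejects (the ENOTDIR the comment mentions). An illustrative run of the two substitutions:

  raw = "localhost:8080"
  raw = raw.gsub(/[:*?"<>|]/, '_')  # => "localhost_8080"
  raw = raw.gsub(/[ .]+\z/, '')     # trailing dots/spaces are also invalid on Windows
  puts raw                          # => "localhost_8080"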
@@ -185,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end

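
`File.expand_path` makes the default destination absolute regardless of how the tool was invoked, for example:

  # with Dir.pwd == "/home/user" and backup_name == "example.com"
  File.expand_path(File.join("/home/user", "websites", "example.com"))
  # => "/home/user/websites/example.com"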
@@ -269,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-
-
-
-
-
-
-
-
-
-
-
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-            result ||= []
-            [page, result]
         end
-        end

-
+        results = []

-
-
-
-
-
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-        end

-
-
-
-
-
-
-
-
-
-
-
-
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
           end
         end
-        end

-
+        page_index = end_index

-
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end

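
The batching pattern above is plain concurrent-ruby: submit one `Concurrent::Future` per CDX page to a bounded executor, then block on `value` and process pages in order. A self-contained sketch of the same pattern with a stubbed fetch in place of the Wayback API call:

  require 'concurrent-ruby'

  fetch_pool = Concurrent::FixedThreadPool.new(4)
  futures = (0...4).map do |page|
    Concurrent::Future.execute(executor: fetch_pool) do
      [page, ["snapshot-#{page}"]]   # stand-in for get_raw_list_from_api
    end
  end

  results = futures.map(&:value)     # value blocks until each future resolves
  results.sort_by! { |page, _| page }
  p results

  fetch_pool.shutdown
  fetch_pool.wait_for_termination

Note that `Concurrent::Future#value` returns nil when a future fails rather than raising, which is why the nil/empty check on each result afterwards matters.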
@@ -630,13 +656,13 @@ class WaybackMachineDownloader
     end

     # URLs in HTML attributes
-    rewrite_html_attr_urls(content)
+    content = rewrite_html_attr_urls(content)

     # URLs in CSS
-    rewrite_css_urls(content)
+    content = rewrite_css_urls(content)

     # URLs in JavaScript
-    rewrite_js_urls(content)
+    content = rewrite_js_urls(content)

     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
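
The three `content = ...` changes fix dropped return values: if a rewrite helper returns a new String rather than mutating its argument in place, calling it without keeping the result is a no-op. Illustrative only (hypothetical helper, not one of the gem's):

  def add_suffix(str)
    str + "!"                  # returns a new String; str is untouched
  end

  content = "index.html"
  add_suffix(content)          # result discarded; content is still "index.html"
  content = add_suffix(content)
  puts content                 # => "index.html!"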
@@ -769,17 +795,83 @@ class WaybackMachineDownloader
   # safely sanitize a file id (or id+timestamp)
   def sanitize_and_prepare_id(raw, file_url)
     return nil if raw.nil?
+    return "" if raw.empty?
+    original = raw.dup
     begin
-
-      raw.
-
+      # work on a binary copy to avoid premature encoding errors
+      raw = raw.dup.force_encoding(Encoding::BINARY)
+
+      # percent-decode (repeat until stable in case of double-encoding)
+      loop do
+        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
+        break if decoded == raw
+        raw = decoded
+      end
+
+      # try tidy_bytes
+      begin
+        raw = raw.tidy_bytes
+      rescue StandardError
+        # fallback: scrub to UTF-8
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # ensure UTF-8 and scrub again
+      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # strip HTML/comment artifacts & control chars
+      raw.gsub!(/<!--+/, '')
+      raw.gsub!(/[\x00-\x1F]/, '')
+
+      # split query; hash it for stable short name
+      path_part, query_part = raw.split('?', 2)
+      if query_part && !query_part.empty?
+        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
+        if path_part.include?('.')
+          pre, _sep, post = path_part.rpartition('.')
+          path_part = "#{pre}__q#{q_digest}.#{post}"
+        else
+          path_part = "#{path_part}__q#{q_digest}"
+        end
+      end
+      raw = path_part
+
+      # collapse slashes & trim leading slash
+      raw.gsub!(%r{/+}, '/')
+      raw.sub!(%r{\A/}, '')
+
+      # segment-wise sanitation
+      raw = raw.split('/').map do |segment|
+        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
+        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
+        seg.empty? ? '_' : seg
+      end.join('/')
+
+      # remove any remaining angle brackets
+      raw.tr!('<>', '')
+
+      # final fallback if empty
+      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
+
       raw
     rescue => e
       @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
-      nil
+      # deterministic fallback – never return nil so caller won’t mark malformed
+      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
     end
   end

+  # wrap URL in parentheses if it contains characters that commonly break unquoted
+  # Windows CMD usage (e.g., &). This is only for display; user still must quote
+  # when invoking manually.
+  def safe_display_url(url)
+    return url unless url && url.match?(/[&]/)
+    "(#{url})"
+  end
+
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0
     begin
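
The query-string handling gives distinct, filesystem-safe names to URLs that differ only by query. A worked example of the digest naming, mirroring the diff's logic on a hypothetical input:

  require 'digest'

  raw = "search.php?q=ruby&page=2"
  path_part, query_part = raw.split('?', 2)
  q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
  pre, _sep, post = path_part.rpartition('.')
  puts "#{pre}__q#{q_digest}.#{post}"
  # => "search__q<first 12 hex chars of the SHA-256>.php"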
@@ -860,9 +952,9 @@ class WaybackMachineDownloader
       end

     rescue StandardError => e
-      if retries <
+      if retries < @max_retries
         retries += 1
-        @logger.warn("Retry #{retries}/#{
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
         sleep(RETRY_DELAY * retries)
         retry
       else
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.1
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby