wayback_machine_downloader_straw 2.4.3 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +111 -93
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
data/bin/wayback_machine_downloader CHANGED

@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
   end
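The new --rt/--retry switch stores an Integer in options[:max_retries], which the library reads at construction time (see the @max_retries hunk below). A minimal round-trip sketch of the parsing, using only the OptionParser definition shown above:

    require 'optparse'

    options = {}
    OptionParser.new do |opts|
      # Same definition as in the diff: an Integer-typed switch with two aliases
      opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
        options[:max_retries] = t
      end
    end.parse!(["--retry", "5"])

    options[:max_retries]  # => 5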
data/lib/wayback_machine_downloader.rb CHANGED

@@ -25,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3
 
   def initialize(size)
-    @
-    @pool
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end
 
-  def with_connection
-
+  def with_connection
+    entry = acquire_connection
     begin
-      yield
+      yield entry[:http]
     ensure
-      release_connection(
+      release_connection(entry)
     end
   end
 
   def shutdown
     @cleanup_thread&.exit
-
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end
 
   private
 
   def acquire_connection
-
-
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end
 
-
-
-
-
-    @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end
 
-
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end
 
-  def
-
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end
 
-  def
-
-
-
-      false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end
 
-  def
-
-
-
-
-
-
-
-
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end
 
   def schedule_cleanup
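The refactor replaces the per-thread Concurrent::Map bookkeeping with a single SizedQueue of entry hashes, so a connection's creation time travels with the connection itself instead of being keyed by thread id. A minimal, self-contained sketch of the same pattern (hypothetical TinyPool name, generic resource instead of Net::HTTP, MAX_AGE assumed to be a seconds constant as in the gem):

    class TinyPool
      MAX_AGE = 300  # seconds; mirrors the gem's age-based staleness check

      def initialize(size, &factory)
        @factory = factory
        @pool = SizedQueue.new(size)          # blocks consumers when empty
        size.times { @pool << build_entry }
      end

      def with_resource
        entry = @pool.pop                     # waits if all entries are checked out
        entry = build_entry if stale?(entry)  # replace stale entries, don't repair them
        begin
          yield entry[:resource]
        ensure
          @pool << entry                      # always return an entry so the pool never shrinks
        end
      end

      private

      def build_entry
        { resource: @factory.call, created_at: Time.now }
      end

      def stale?(entry)
        Time.now - entry[:created_at] > MAX_AGE
      end
    end

    pool = TinyPool.new(2) { Object.new }
    pool.with_resource { |res| res.object_id }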
@@ -99,16 +111,15 @@ class ConnectionPool
     end
   end
 
-  def
-
-
-
-
-
-
-
-  end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end
 
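create_connection leans on features Net::HTTP ships with: keep_alive_timeout keeps the TCP connection reusable between requests, and max_retries (Ruby 2.5+) makes Net::HTTP transparently retry idempotent requests on transient network errors. A standalone equivalent, with the host taken from the diff:

    require 'net/http'

    http = Net::HTTP.new("web.archive.org", 443)
    http.use_ssl = true
    http.open_timeout = 30        # DEFAULT_TIMEOUT in the gem
    http.read_timeout = 30
    http.keep_alive_timeout = 30  # keep the socket open between requests
    http.max_retries = 3          # built-in retry for idempotent requests (e.g. GET)
    http.start
    response = http.get("/")      # later calls reuse the same connection
    http.finish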
@@ -117,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.4.3"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -163,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
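With this line, a CLI-supplied --retry value overrides the MAX_RETRIES constant per instance. A hedged sketch of direct library use, assuming the constructor accepts the same params hash the CLI assembles and the usual download_files entry point from the upstream gem:

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",  # hypothetical target site
      max_retries: 5                    # omit to fall back to MAX_RETRIES (3)
    )
    downloader.download_files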
@@ -193,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end
 
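Capturing Dir.pwd explicitly and running the join through File.expand_path guarantees an absolute, normalized default destination ("." , ".." and doubled separators resolved). For example:

    Dir.chdir("/tmp") do
      File.expand_path(File.join(Dir.pwd, 'websites', 'example.com'))
      # => "/tmp/websites/example.com"  (absolute, no relative segments)
    end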
@@ -277,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-
-
-
-
-
-
-
-
-
-
-
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-        result ||= []
-        [page, result]
         end
-      end
 
-
+        results = []
 
-
-
-
-
-
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-      end
-
-      # Sort results by page number to maintain order
-      results.sort_by! { |page, _| page }
 
-
-
-
-
-
-
-
-
-
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
+          end
         end
-    end
 
-
+        page_index = end_index
 
-
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end
 
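The rewritten loop fans each batch of CDX pages out to Concurrent::Futures on a bounded FixedThreadPool, then reassembles them in page order. One subtlety when reading the rescue around future.value: in concurrent-ruby, Future#value returns nil when the task failed (the exception is stored in future.reason), while value! is the variant that re-raises. A minimal standalone sketch of the batch pattern, with a hypothetical fetch_page standing in for the API call:

    require 'concurrent'

    def fetch_page(page)   # stand-in for get_raw_list_from_api
      sleep 0.05
      ["record-#{page}"]
    end

    pool = Concurrent::FixedThreadPool.new(4)
    futures = (0...8).map do |page|
      Concurrent::Future.execute(executor: pool) { [page, fetch_page(page)] }
    end

    results = futures.map(&:value)       # blocks until each future resolves
    results.sort_by! { |page, _| page }  # futures may finish out of order
    pool.shutdown
    pool.wait_for_termination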
@@ -638,13 +656,13 @@ class WaybackMachineDownloader
     end
 
     # URLs in HTML attributes
-    rewrite_html_attr_urls(content)
+    content = rewrite_html_attr_urls(content)
 
     # URLs in CSS
-    rewrite_css_urls(content)
+    content = rewrite_css_urls(content)
 
     # URLs in JavaScript
-    rewrite_js_urls(content)
+    content = rewrite_js_urls(content)
 
     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
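The three content = assignments are the substantive fix in this hunk: assuming the rewrite_*_urls helpers build and return a new string rather than mutating their argument, the old code computed each rewrite and discarded the result. A short illustration of the trap:

    def rewrite(content)
      content.gsub(/http:/, "https:")   # gsub returns a NEW string
    end

    content = "<a href='http://example.com'>"
    rewrite(content)                    # return value discarded; content unchanged
    content = rewrite(content)          # fixed: keep the rewritten string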
@@ -934,9 +952,9 @@ class WaybackMachineDownloader
     end
 
   rescue StandardError => e
-    if retries < MAX_RETRIES
+    if retries < @max_retries
       retries += 1
-      @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+      @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
       sleep(RETRY_DELAY * retries)
       retry
     else
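The retry path now reads the per-instance @max_retries and backs off linearly (RETRY_DELAY * retries, i.e. 2 s, then 4 s, then 6 s with the defaults). The same pattern in isolation, with a hypothetical flaky operation:

    RETRY_DELAY = 2
    MAX_RETRIES = 3

    def flaky_download(url)   # hypothetical stand-in for the HTTP fetch
      raise IOError, "connection reset" if rand < 0.5
      "contents of #{url}"
    end

    retries = 0
    begin
      flaky_download("https://example.com/page")
    rescue StandardError => e
      if retries < MAX_RETRIES
        retries += 1
        warn "Retry #{retries}/#{MAX_RETRIES}: #{e.message}"
        sleep(RETRY_DELAY * retries)  # linear backoff: 2, 4, 6 seconds
        retry
      else
        raise                         # give up after the final attempt
      end
    end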
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.3
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
|