wayback_machine_downloader_straw 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
-  data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
-  data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
   end
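
The new --rt/--retry flag stores its value in options[:max_retries]; further down the diff, WaybackMachineDownloader#initialize falls back to MAX_RETRIES (3) when that key is absent. A minimal sketch of how the parsed value could reach the downloader, assuming the executable forwards the options hash to the constructor as it does for the existing flags:

  # Hypothetical wiring; only the :max_retries handling is shown in this diff.
  options = { base_url: "https://example.com", max_retries: 5 }
  downloader = WaybackMachineDownloader.new(options)  # @max_retries => 5
  # Without --retry, params[:max_retries] is nil and @max_retries falls back to MAX_RETRIES (3).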
@@ -25,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3
 
   def initialize(size)
-    @size = size
-    @pool = Concurrent::Map.new
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end
 
-  def with_connection(&block)
-    conn = acquire_connection
+  def with_connection
+    entry = acquire_connection
     begin
-      yield conn
+      yield entry[:http]
     ensure
-      release_connection(conn)
+      release_connection(entry)
     end
   end
 
   def shutdown
     @cleanup_thread&.exit
-    @pool.each_value { |conn| conn.finish if conn&.started? }
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end
 
   private
 
   def acquire_connection
-    thread_id = Thread.current.object_id
-    conn = @pool[thread_id]
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end
 
-    if should_create_new?(conn)
-      conn&.finish if conn&.started?
-      conn = create_connection
-      @pool[thread_id] = conn
-      @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end
 
-    conn
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end
 
-  def release_connection(conn)
-    return unless conn
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end
 
-  def should_create_new?(conn)
-    return true if conn.nil?
-    return true unless conn.started?
-    return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end
 
-  def create_connection
-    http = Net::HTTP.new("web.archive.org", 443)
-    http.use_ssl = true
-    http.read_timeout = DEFAULT_TIMEOUT
-    http.open_timeout = DEFAULT_TIMEOUT
-    http.keep_alive_timeout = 30
-    http.max_retries = MAX_RETRIES
-    http.start
-    http
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end
 
   def schedule_cleanup
@@ -99,16 +111,15 @@ class ConnectionPool
     end
   end
 
-  def cleanup_old_connections
-    current_time = Time.now
-    @creation_times.each do |thread_id, creation_time|
-      if current_time - creation_time > MAX_AGE
-        conn = @pool[thread_id]
-        conn&.finish if conn&.started?
-        @pool.delete(thread_id)
-        @creation_times.delete(thread_id)
-      end
-    end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end
 
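The ConnectionPool rewrite swaps the per-thread Concurrent::Map (keyed by Thread.current.object_id) for a SizedQueue of {http:, created_at:} entries: connections are built up front, checked out with a blocking pop, replaced when stale, and always pushed back in an ensure. A stripped-down sketch of that checkout/checkin pattern, using a plain object instead of Net::HTTP so it runs standalone:

  class TinyPool
    MAX_AGE = 300  # seconds, mirroring the pool's MAX_AGE idea

    def initialize(size)
      @pool = SizedQueue.new(size)               # blocks callers once capacity is reached
      size.times { @pool << build_entry }
    end

    def with_connection
      entry = @pool.pop                          # blocks until an entry is free
      entry = build_entry if Time.now - entry[:created_at] > MAX_AGE
      yield entry[:conn]
    ensure
      @pool << entry if entry                    # always return the slot to the queue
    end

    private

    def build_entry
      { conn: Object.new, created_at: Time.now } # stand-in for a started Net::HTTP
    end
  end

  pool = TinyPool.new(2)
  4.times.map { Thread.new { pool.with_connection { |c| c.object_id } } }.each(&:join)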
@@ -117,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.4.3"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -163,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -193,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-      File.expand_path(File.join('websites', backup_name))
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end
 
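For context, File.expand_path already resolves a relative path against Dir.pwd, so joining the working directory mostly makes that base explicit rather than changing the result; a quick check with a placeholder backup_name:

  backup_name = "example.com"  # placeholder
  a = File.expand_path(File.join('websites', backup_name))
  b = File.expand_path(File.join(Dir.pwd, 'websites', backup_name))
  a == b  # => true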
@@ -277,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-    while continue_fetching && page_index < @maximum_pages
-      # Determine the range of pages to fetch in this batch
-      end_index = [page_index + batch_size, @maximum_pages].min
-      current_batch = (page_index...end_index).to_a
-
-      # Create futures for concurrent API calls
-      futures = current_batch.map do |page|
-        Concurrent::Future.execute do
-          result = nil
-          @connection_pool.with_connection do |connection|
-            result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-          result ||= []
-          [page, result]
         end
-      end
 
-      results = []
+        results = []
 
-      futures.each do |future|
-        begin
-          results << future.value
-        rescue => e
-          puts "\nError fetching page #{future}: #{e.message}"
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-      end
-
-      # Sort results by page number to maintain order
-      results.sort_by! { |page, _| page }
 
-      # Process results and check for empty pages
-      results.each do |page, result|
-        if result.nil? || result.empty?
-          continue_fetching = false
-          break
-        else
-          mutex.synchronize do
-            snapshot_list_to_consider.concat(result)
-            print "."
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
           end
         end
-      end
 
-      page_index = end_index
+        page_index = end_index
 
-      sleep(RATE_LIMIT) if continue_fetching
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end
 
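Snapshot-page fetches now run on an explicitly sized Concurrent::FixedThreadPool, handed to each Concurrent::Future via executor: and shut down in an ensure block, instead of relying on concurrent-ruby's global executor. The same pattern in isolation, with a stand-in for get_raw_list_from_api:

  require 'concurrent'

  pool = Concurrent::FixedThreadPool.new(4)
  begin
    futures = (0...8).map do |page|
      Concurrent::Future.execute(executor: pool) do
        [page, ["stand-in record for page #{page}"]]  # fake API result
      end
    end
    results = futures.map(&:value)       # value blocks until each future resolves
    results.sort_by! { |page, _| page }  # restore page order, as the downloader does
  ensure
    pool.shutdown
    pool.wait_for_termination
  end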
@@ -638,13 +656,13 @@ class WaybackMachineDownloader
     end
 
     # URLs in HTML attributes
-    rewrite_html_attr_urls(content)
+    content = rewrite_html_attr_urls(content)
 
     # URLs in CSS
-    rewrite_css_urls(content)
+    content = rewrite_css_urls(content)
 
     # URLs in JavaScript
-    rewrite_js_urls(content)
+    content = rewrite_js_urls(content)
 
     # for URLs in HTML attributes that start with a single slash
     content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
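
The rewrite_* fix matters because the old code dropped each helper's return value, which is only safe when the helper mutates content in place; reassigning keeps the chain correct even when a helper builds and returns a new string. A toy illustration with a hypothetical helper (not the gem's actual method):

  # Hypothetical helper that returns a new string rather than mutating its argument.
  def strip_archive_prefix(content)
    content.gsub(%r{https://web\.archive\.org/web/\d+/}, '')
  end

  content = "<a href='https://web.archive.org/web/20240101000000/https://example.com/'>x</a>"
  strip_archive_prefix(content)            # return value discarded: content is unchanged
  content = strip_archive_prefix(content)  # reassignment keeps the rewritten markup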
@@ -934,9 +952,9 @@ class WaybackMachineDownloader
       end
 
     rescue StandardError => e
-      if retries < MAX_RETRIES
+      if retries < @max_retries
         retries += 1
-        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
         sleep(RETRY_DELAY * retries)
         retry
       else
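
The per-file rescue now honours the configurable @max_retries instead of the MAX_RETRIES constant, sleeping RETRY_DELAY * retries between attempts (2 s, 4 s, 6 s with the defaults). The retry shape in isolation, with a hypothetical flaky operation standing in for the download:

  RETRY_DELAY = 2
  max_retries = 3   # what @max_retries defaults to when --retry is not given
  retries = 0

  begin
    raise 'transient network error' if retries < 2  # hypothetical flaky download
    puts "succeeded after #{retries} retries"
  rescue StandardError => e
    if retries < max_retries
      retries += 1
      warn "Retry #{retries}/#{max_retries}: #{e.message}"
      sleep(RETRY_DELAY * retries)
      retry
    else
      raise
    end
  end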
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.3
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-08-19 00:00:00.000000000 Z
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby