wayback_machine_downloader_straw 2.4.1 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
-  data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
+  metadata.gz: 1e81619475172540d94968e5d31fbb6b4df7f08a533efce3b200e1ad4ce5035f
+  data.tar.gz: aca008795277ebf489cda7f0d8d1691b1983e2eec6f338b95c9bf11552fdfd94
 SHA512:
-  metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
-  data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
+  metadata.gz: c71eb691ba50308f1f6b29a326c00a0678db1ff94f6ab8620b4d74425c993a64b60dd4d0127f9dd9eb3a72801008672855d901379d870b89aff82862b4f582a3
+  data.tar.gz: 6277c45d37dd02fea219f93906bf790ddede921ef88e928074b9e3da88b9882a64ecebbc99767b4c0eaef34b9a3407e9fee6352c935ea2d9942eed5735c51632
bin/wayback_machine_downloader CHANGED
@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
     options[:recursive_subdomains] = true
  end
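The new switch only records an integer in `options[:max_retries]`; the library side (further down) turns it into `@max_retries`, falling back to the `MAX_RETRIES` constant. A minimal sketch of the parsing behaviour with plain optparse, outside the gem:

require 'optparse'

options = {}
OptionParser.new do |opts|
  # same shape as the new flag: a required Integer argument
  opts.on("--retry N", Integer, "Maximum number of retries") { |n| options[:max_retries] = n }
end.parse!(%w[--retry 5])

options[:max_retries]  # => 5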
lib/wayback_machine_downloader.rb CHANGED
@@ -11,6 +11,7 @@ require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
 require 'stringio'
+require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -24,69 +25,81 @@ class ConnectionPool
   MAX_RETRIES = 3
 
   def initialize(size)
-    @size = size
-    @pool = Concurrent::Map.new
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end
 
-  def with_connection(&block)
-    conn = acquire_connection
+  def with_connection
+    entry = acquire_connection
     begin
-      yield conn
+      yield entry[:http]
     ensure
-      release_connection(conn)
+      release_connection(entry)
     end
   end
 
   def shutdown
     @cleanup_thread&.exit
-    @pool.each_value { |conn| conn.finish if conn&.started? }
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end
 
   private
 
   def acquire_connection
-    thread_id = Thread.current.object_id
-    conn = @pool[thread_id]
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end
 
-    if should_create_new?(conn)
-      conn&.finish if conn&.started?
-      conn = create_connection
-      @pool[thread_id] = conn
-      @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end
 
-    conn
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end
 
-  def release_connection(conn)
-    return unless conn
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end
 
-  def should_create_new?(conn)
-    return true if conn.nil?
-    return true unless conn.started?
-    return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end
 
-  def create_connection
-    http = Net::HTTP.new("web.archive.org", 443)
-    http.use_ssl = true
-    http.read_timeout = DEFAULT_TIMEOUT
-    http.open_timeout = DEFAULT_TIMEOUT
-    http.keep_alive_timeout = 30
-    http.max_retries = MAX_RETRIES
-    http.start
-    http
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end
 
   def schedule_cleanup
@@ -98,16 +111,15 @@ class ConnectionPool
     end
   end
 
-  def cleanup_old_connections
-    current_time = Time.now
-    @creation_times.each do |thread_id, creation_time|
-      if current_time - creation_time > MAX_AGE
-        conn = @pool[thread_id]
-        conn&.finish if conn&.started?
-        @pool.delete(thread_id)
-        @creation_times.delete(thread_id)
-      end
-    end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end
 
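The pool rewrite replaces per-thread caching (a Concurrent::Map keyed by thread id, which leaked connections belonging to dead threads) with a fixed-size SizedQueue that blocks on checkout. A minimal sketch of the pattern, with illustrative names rather than the gem's:

require 'net/http'

class TinyPool
  def initialize(size)
    @queue = SizedQueue.new(size)   # capacity == max live connections
    size.times { @queue << new_http }
  end

  def with_connection
    http = @queue.pop               # blocks while the pool is empty
    yield http
  ensure
    @queue << http if http          # always hand the slot back
  end

  private

  def new_http
    Net::HTTP.start("web.archive.org", 443, use_ssl: true)
  end
end

# TinyPool.new(4).with_connection { |http| http.get("/") }

SizedQueue#pop blocks when the queue is empty and #push blocks when it is full, which is what bounds the number of open connections without any explicit locking.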
@@ -116,7 +128,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
   include SubdomainProcessor
 
-  VERSION = "2.4.1"
+  VERSION = "2.4.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -162,6 +174,7 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -171,12 +184,19 @@ class WaybackMachineDownloader
 
   def backup_name
     url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-
-    if url_to_process.include? '//'
+    raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
       url_to_process
     end
+
+    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
+    if Gem.win_platform?
+      raw = raw.gsub(/[:*?"<>|]/, '_')
+      raw = raw.gsub(/[ .]+\z/, '')
+    end
+    raw = 'site' if raw.nil? || raw.empty?
+    raw
   end
 
   def backup_path
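Windows rejects `:` and several other characters in directory names, so a base URL such as `http://example.com:8080` used to crash the `mkdir` for the default backup folder. Roughly what the new sanitation does to the host part:

raw = "http://example.com:8080/page".split('/')[2]  # => "example.com:8080"
raw = raw.gsub(/[:*?"<>|]/, '_')                    # => "example.com_8080"
raw = raw.gsub(/[ .]+\z/, '')                       # strip trailing dots/spaces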
@@ -185,7 +205,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-      File.expand_path(File.join('websites', backup_name))
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end
 
@@ -269,53 +290,58 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-    while continue_fetching && page_index < @maximum_pages
-      # Determine the range of pages to fetch in this batch
-      end_index = [page_index + batch_size, @maximum_pages].min
-      current_batch = (page_index...end_index).to_a
-
-      # Create futures for concurrent API calls
-      futures = current_batch.map do |page|
-        Concurrent::Future.execute do
-          result = nil
-          @connection_pool.with_connection do |connection|
-            result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-          result ||= []
-          [page, result]
         end
-      end
 
-      results = []
+        results = []
 
-      futures.each do |future|
-        begin
-          results << future.value
-        rescue => e
-          puts "\nError fetching page #{future}: #{e.message}"
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-      end
 
-      # Sort results by page number to maintain order
-      results.sort_by! { |page, _| page }
-
-      # Process results and check for empty pages
-      results.each do |page, result|
-        if result.nil? || result.empty?
-          continue_fetching = false
-          break
-        else
-          mutex.synchronize do
-            snapshot_list_to_consider.concat(result)
-            print "."
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
          end
        end
-      end
 
-      page_index = end_index
+        page_index = end_index
 
-      sleep(RATE_LIMIT) if continue_fetching
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end
 
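Previously each batch spawned futures on concurrent-ruby's global executor; routing them through an explicit `Concurrent::FixedThreadPool` caps in-flight CDX requests at the user's thread count, and the `ensure` shuts the pool down deterministically. A standalone sketch of that pattern with illustrative work in place of the API call:

require 'concurrent'

pool = Concurrent::FixedThreadPool.new(4)      # at most 4 tasks run at once
begin
  futures = (0...10).map do |page|
    Concurrent::Future.execute(executor: pool) { [page, page * page] }
  end
  futures.each { |f| p f.value }               # value blocks until the task is done
ensure
  pool.shutdown
  pool.wait_for_termination                    # no leaked threads on exit
end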
@@ -630,13 +656,13 @@ class WaybackMachineDownloader
   end
 
   # URLs in HTML attributes
-  rewrite_html_attr_urls(content)
+  content = rewrite_html_attr_urls(content)
 
   # URLs in CSS
-  rewrite_css_urls(content)
+  content = rewrite_css_urls(content)
 
   # URLs in JavaScript
-  rewrite_js_urls(content)
+  content = rewrite_js_urls(content)
 
   # for URLs in HTML attributes that start with a single slash
   content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
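These three fixes address a dropped return value: if a helper builds and returns a new string instead of mutating its argument in place, calling it without reassignment silently discards the rewrite. A minimal illustration with a hypothetical helper:

def rewrite(content)
  content.gsub("http://", "https://")   # gsub (unlike gsub!) returns a NEW string
end

content = "see http://example.com"
rewrite(content)                        # result discarded; content is unchanged
content = rewrite(content)              # reassigning keeps the rewritten text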
@@ -769,17 +795,83 @@ class WaybackMachineDownloader
   # safely sanitize a file id (or id+timestamp)
   def sanitize_and_prepare_id(raw, file_url)
     return nil if raw.nil?
+    return "" if raw.empty?
+    original = raw.dup
     begin
-      raw = CGI.unescape(raw) rescue raw
-      raw.gsub!(/<[^>]*>/, '')
-      raw = raw.tidy_bytes unless raw.empty?
+      # work on a binary copy to avoid premature encoding errors
+      raw = raw.dup.force_encoding(Encoding::BINARY)
+
+      # percent-decode (repeat until stable in case of double-encoding)
+      loop do
+        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
+        break if decoded == raw
+        raw = decoded
+      end
+
+      # try tidy_bytes
+      begin
+        raw = raw.tidy_bytes
+      rescue StandardError
+        # fallback: scrub to UTF-8
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # ensure UTF-8 and scrub again
+      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # strip HTML/comment artifacts & control chars
+      raw.gsub!(/<!--+/, '')
+      raw.gsub!(/[\x00-\x1F]/, '')
+
+      # split query; hash it for stable short name
+      path_part, query_part = raw.split('?', 2)
+      if query_part && !query_part.empty?
+        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
+        if path_part.include?('.')
+          pre, _sep, post = path_part.rpartition('.')
+          path_part = "#{pre}__q#{q_digest}.#{post}"
+        else
+          path_part = "#{path_part}__q#{q_digest}"
+        end
+      end
+      raw = path_part
+
+      # collapse slashes & trim leading slash
+      raw.gsub!(%r{/+}, '/')
+      raw.sub!(%r{\A/}, '')
+
+      # segment-wise sanitation
+      raw = raw.split('/').map do |segment|
+        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
+        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
+        seg.empty? ? '_' : seg
+      end.join('/')
+
+      # remove any remaining angle brackets
+      raw.tr!('<>', '')
+
+      # final fallback if empty
+      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
+
       raw
     rescue => e
       @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
-      nil
+      # deterministic fallback – never return nil so caller won’t mark malformed
+      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
     end
   end
 
+  # wrap URL in parentheses if it contains characters that commonly break unquoted
+  # Windows CMD usage (e.g., &). This is only for display; user still must quote
+  # when invoking manually.
+  def safe_display_url(url)
+    return url unless url && url.match?(/[&]/)
+    "(#{url})"
+  end
+
   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
     retries = 0
     begin
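Hashing the query string yields short, stable, filesystem-safe names, so URLs that differ only in their parameters no longer collide or produce illegal paths. Roughly, under the same scheme as the hunk above:

require 'digest'

raw = "search.php?q=cats&page=2"
path_part, query_part = raw.split('?', 2)
digest = Digest::SHA256.hexdigest(query_part)[0, 12]   # 12 hex chars
pre, _sep, post = path_part.rpartition('.')
name = "#{pre}__q#{digest}.#{post}"
# => "search__q<digest>.php"; identical queries always map to the same file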
@@ -860,9 +952,9 @@ class WaybackMachineDownloader
       end
 
     rescue StandardError => e
-      if retries < MAX_RETRIES
+      if retries < @max_retries
        retries += 1
-        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
        sleep(RETRY_DELAY * retries)
        retry
      else
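The retry ceiling is now the user-configurable `@max_retries` rather than the hard-coded constant, with a delay that grows linearly between attempts. A compact standalone sketch of the same begin/rescue/retry shape (illustrative values):

RETRY_DELAY = 2
max_retries = 3
retries = 0
begin
  raise "transient failure" if retries < 2   # simulate two failures
  puts "succeeded after #{retries} retries"
rescue StandardError => e
  if retries < max_retries
    retries += 1
    sleep(RETRY_DELAY * retries)   # delay grows linearly: 2s, 4s, ...
    retry
  else
    raise
  end
end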
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.1
+  version: 2.4.4
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date: 2025-08-12 00:00:00.000000000 Z
+date: 2025-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby