wayback_machine_downloader_straw 2.4.3 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ce7592163165a7f8235bf4a6e1915cf531511fafc7f6874c0d1673fb29db704f
- data.tar.gz: 7d48ffebf130d3b32d1ec233cf5141cc3cf192bcf16751db4380bf62863971c1
+ metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
+ data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
  SHA512:
- metadata.gz: 16d56de1814e36174c47ab5bda6c9d5e02aba15bafa72a1d57056d0ac146e5fff5c6ca43f9198262d90820e4dcbe4e63772f01bd1ee5207c7ab07e9bb959e069
- data.tar.gz: 07602af4f0cfb9927d43239da0c38cb2411aa408d11fe3f91cb4a403fa415ca8de095eee7467e4613d32aadb8c0a13ffea19ac2f93fd5bf005a991d91e8a064a
+ metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
+ data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
bin/wayback_machine_downloader CHANGED
@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
  options[:keep] = true
  end

+ opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+ options[:max_retries] = t
+ end
+
  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
  options[:recursive_subdomains] = true
  end
@@ -82,6 +86,10 @@ option_parser = OptionParser.new do |opts|
  options[:subdomain_depth] = t
  end

+ opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+ options[:page_requisites] = true
+ end
+
  opts.on("-v", "--version", "Display version") do |t|
  options[:version] = t
  end
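Note: the two new flags map onto the :max_retries and :page_requisites params that WaybackMachineDownloader#initialize reads later in this diff. A rough, illustrative sketch of the equivalent programmatic call (the base_url value is made up, and the exact option-to-param wiring in the bin script is not part of this hunk):

    require 'wayback_machine_downloader'

    # roughly what `wayback_machine_downloader https://example.com --retry 5 --page-requisites` sets up
    downloader = WaybackMachineDownloader.new(
      base_url: 'https://example.com',
      max_retries: 5,          # overrides the MAX_RETRIES default of 3
      page_requisites: true    # also queue images/css/js referenced by downloaded HTML pages
    )
    downloader.download_files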
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -16,6 +16,10 @@ module ArchiveAPI
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)

+ retries = 0
+ max_retries = (@max_retries || 3)
+ delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
  begin
  response = http.get(request_url)
  body = response.body.to_s.strip
@@ -26,7 +30,21 @@ module ArchiveAPI
  json.shift if json.first == ["timestamp", "original"]
  json
  rescue JSON::ParserError => e
- warn "Failed to fetch data from API: #{e.message}"
+ warn "Failed to parse JSON from API for #{url}: #{e.message}"
+ []
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
+ if retries < max_retries
+ retries += 1
+ warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+ sleep(delay * retries)
+ retry
+ else
+ warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+ []
+ end
+ rescue StandardError => e
+ # treat any other transient-ish error similarly, though without retries for now
+ warn "Error fetching CDX data for #{url}: #{e.message}"
  []
  end
  end
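The new timeout handling retries with a linear backoff: sleep(delay * retries) means the waits grow as 2s, 4s, 6s with the default RETRY_DELAY of 2 before the method gives up and returns an empty list. A self-contained sketch of the same pattern, with generic names rather than the gem's API:

    require 'net/http'

    # linear-backoff retry loop mirroring the rescue clause above
    def with_linear_backoff(max_retries: 3, delay: 2)
      retries = 0
      begin
        yield
      rescue Net::ReadTimeout, Net::OpenTimeout => e
        raise if retries >= max_retries
        retries += 1
        warn "timeout (#{e.class}), retry #{retries}/#{max_retries}"
        sleep(delay * retries)  # waits 2s, then 4s, then 6s
        retry
      end
    end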
lib/wayback_machine_downloader/page_requisites.rb ADDED
@@ -0,0 +1,33 @@
+ module PageRequisites
+ # regex to find links in href, src, url(), and srcset
+ # this ignores data: URIs, mailto:, and anchors
+ ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+ def self.extract(html_content)
+ assets = []
+
+ html_content.scan(ASSET_REGEX) do |match|
+ # match is an array of capture groups; find the one that matched
+ url = match.compact.first
+ next unless url
+
+ # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+ if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+ url.split(',').each do |src_def|
+ src_url = src_def.strip.split(' ').first
+ assets << src_url if valid_asset?(src_url)
+ end
+ else
+ assets << url if valid_asset?(url)
+ end
+ end
+
+ assets.uniq
+ end
+
+ def self.valid_asset?(url)
+ return false if url.strip.empty?
+ return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+ true
+ end
+ end
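Because the new module is pure string scanning (no network or file I/O), it is easy to exercise on its own. An illustrative call, with made-up sample HTML:

    require 'wayback_machine_downloader'   # loads lib/wayback_machine_downloader/page_requisites.rb

    html = <<~HTML
      <img src="/img/logo.png" srcset="small.jpg 1x, big.jpg 2x">
      <a href="mailto:someone@example.com">write us</a>
      <div style="background: url('assets/bg.jpg')"></div>
    HTML

    PageRequisites.extract(html)
    #=> ["/img/logo.png", "small.jpg", "big.jpg", "assets/bg.jpg"]
    # the mailto: link is rejected by valid_asset?, and the srcset is split into its candidates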
lib/wayback_machine_downloader/url_rewrite.rb CHANGED
@@ -1,74 +1,85 @@
  # frozen_string_literal: true

- # URLs in HTML attributes
- def rewrite_html_attr_urls(content)
-
- content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
- prefix, url, suffix = $1, $2, $3
-
- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "#{prefix}#{path}#{suffix}"
- rescue
- "#{prefix}#{url}#{suffix}"
- end
- elsif url.start_with?('/')
- "#{prefix}./#{url[1..-1]}#{suffix}"
- else
- "#{prefix}#{url}#{suffix}"
+ module URLRewrite
+ # server-side extensions that should work locally
+ SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
+
+ def rewrite_html_attr_urls(content)
+ # rewrite URLs to relative paths
+ content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ prefix, path, suffix = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{prefix}#{path}#{suffix}"
+ end
+
+ # rewrite absolute URLs to same domain as relative
+ content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ prefix, path, suffix = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{prefix}#{path}#{suffix}"
  end
+
+ content
  end
- content
- end

- # URLs in CSS
- def rewrite_css_urls(content)
+ def rewrite_css_urls(content)
+ # rewrite URLs in CSS
+ content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+ path = normalize_path_for_local($1)
+ "url(\"#{path}\")"
+ end

- content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
- url = $1
-
- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "url(\"#{path}\")"
- rescue
- "url(\"#{url}\")"
- end
- elsif url.start_with?('/')
- "url(\"./#{url[1..-1]}\")"
- else
- "url(\"#{url}\")"
+ # rewrite absolute URLs in CSS
+ content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+ path = normalize_path_for_local($1)
+ "url(\"#{path}\")"
  end
+
+ content
  end
- content
- end

- # URLs in JavaScript
- def rewrite_js_urls(content)
-
- content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
- quote_start, url, quote_end = $1, $2, $3
+ def rewrite_js_urls(content)
+ # rewrite archive.org URLs in JavaScript strings
+ content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ quote_start, path, quote_end = $1, $2, $3
+ path = normalize_path_for_local(path)
+ "#{quote_start}#{path}#{quote_end}"
+ end
+
+ # rewrite absolute URLs in JavaScript
+ content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+ quote_start, path, quote_end = $1, $2, $3
+ next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+ path = normalize_path_for_local(path)
+ "#{quote_start}#{path}#{quote_end}"
+ end
+
+ content
+ end
+
+ private
+
+ def normalize_path_for_local(path)
+ return "./index.html" if path.empty? || path == "/"

- if url.start_with?('http')
- begin
- uri = URI.parse(url)
- path = uri.path
- path = path[1..-1] if path.start_with?('/')
- "#{quote_start}#{path}#{quote_end}"
- rescue
- "#{quote_start}#{url}#{quote_end}"
- end
- elsif url.start_with?('/')
- "#{quote_start}./#{url[1..-1]}#{quote_end}"
+ # handle query strings - they're already part of the filename
+ path = path.split('?').first if path.include?('?')
+
+ # check if this is a server-side script
+ ext = File.extname(path).downcase
+ if SERVER_SIDE_EXTS.include?(ext)
+ # keep the path as-is but ensure it starts with ./
+ path = "./#{path}" unless path.start_with?('./', '/')
  else
- "#{quote_start}#{url}#{quote_end}"
+ # regular file handling
+ path = "./#{path}" unless path.start_with?('./', '/')
+
+ # if it looks like a directory, add index.html
+ if path.end_with?('/') || !path.include?('.')
+ path = "#{path.chomp('/')}/index.html"
+ end
  end
+
+ path
  end
-
- content
  end
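Since the rewrite helpers now live in a URLRewrite mixin and funnel everything through normalize_path_for_local, they can be exercised on their own. An illustrative sketch (sample markup made up):

    require 'wayback_machine_downloader'   # loads lib/wayback_machine_downloader/url_rewrite.rb

    class RewriterDemo
      include URLRewrite
    end

    html = %(<a href="https://web.archive.org/web/20240101000000/https://example.com/about">About</a> ) +
           %(<img src="https://cdn.example.com/img/logo.png">)

    RewriterDemo.new.rewrite_html_attr_urls(html)
    # the archive-wrapped link becomes "/about/index.html" (extension-less paths get /index.html),
    # the absolute image URL becomes "/img/logo.png"; the single-slash passes later in
    # rewrite_urls_to_relative then turn such paths into "./..." forms

Note that each rewriter now ends with an explicit `content` return, so callers can rely on the return value rather than on gsub!'s in-place mutation alone (gsub! itself returns nil when nothing matches).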
lib/wayback_machine_downloader.rb CHANGED
@@ -15,6 +15,7 @@ require 'digest'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
+ require_relative 'wayback_machine_downloader/page_requisites'
  require_relative 'wayback_machine_downloader/subdom_processor'
  require_relative 'wayback_machine_downloader/url_rewrite'

@@ -25,69 +26,81 @@ class ConnectionPool
  MAX_RETRIES = 3

  def initialize(size)
- @size = size
- @pool = Concurrent::Map.new
- @creation_times = Concurrent::Map.new
+ @pool = SizedQueue.new(size)
+ size.times { @pool << build_connection_entry }
  @cleanup_thread = schedule_cleanup
  end

- def with_connection(&block)
- conn = acquire_connection
+ def with_connection
+ entry = acquire_connection
  begin
- yield conn
+ yield entry[:http]
  ensure
- release_connection(conn)
+ release_connection(entry)
  end
  end

  def shutdown
  @cleanup_thread&.exit
- @pool.each_value { |conn| conn.finish if conn&.started? }
- @pool.clear
- @creation_times.clear
+ drain_pool { |entry| safe_finish(entry[:http]) }
  end

  private

  def acquire_connection
- thread_id = Thread.current.object_id
- conn = @pool[thread_id]
+ entry = @pool.pop
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
+ end
+ entry
+ end

- if should_create_new?(conn)
- conn&.finish if conn&.started?
- conn = create_connection
- @pool[thread_id] = conn
- @creation_times[thread_id] = Time.now
+ def release_connection(entry)
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
  end
+ @pool << entry
+ end

- conn
+ def stale?(entry)
+ http = entry[:http]
+ !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
  end

- def release_connection(conn)
- return unless conn
- if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
- conn.finish
- @pool.delete(Thread.current.object_id)
- @creation_times.delete(Thread.current.object_id)
- end
+ def build_connection_entry
+ { http: create_connection, created_at: Time.now }
  end

- def should_create_new?(conn)
- return true if conn.nil?
- return true unless conn.started?
- return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
- false
+ def safe_finish(http)
+ http.finish if http&.started?
+ rescue StandardError
+ nil
  end

- def create_connection
- http = Net::HTTP.new("web.archive.org", 443)
- http.use_ssl = true
- http.read_timeout = DEFAULT_TIMEOUT
- http.open_timeout = DEFAULT_TIMEOUT
- http.keep_alive_timeout = 30
- http.max_retries = MAX_RETRIES
- http.start
- http
+ def drain_pool
+ loop do
+ entry = begin
+ @pool.pop(true)
+ rescue ThreadError
+ break
+ end
+ yield(entry)
+ end
+ end
+
+ def cleanup_old_connections
+ entry = begin
+ @pool.pop(true)
+ rescue ThreadError
+ return
+ end
+ if stale?(entry)
+ safe_finish(entry[:http])
+ entry = build_connection_entry
+ end
+ @pool << entry
  end

  def schedule_cleanup
@@ -99,16 +112,15 @@ class ConnectionPool
  end
  end

- def cleanup_old_connections
- current_time = Time.now
- @creation_times.each do |thread_id, creation_time|
- if current_time - creation_time > MAX_AGE
- conn = @pool[thread_id]
- conn&.finish if conn&.started?
- @pool.delete(thread_id)
- @creation_times.delete(thread_id)
- end
- end
+ def create_connection
+ http = Net::HTTP.new("web.archive.org", 443)
+ http.use_ssl = true
+ http.read_timeout = DEFAULT_TIMEOUT
+ http.open_timeout = DEFAULT_TIMEOUT
+ http.keep_alive_timeout = 30
+ http.max_retries = MAX_RETRIES
+ http.start
+ http
  end
  end

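The pool now hands out connections through a blocking SizedQueue instead of a per-thread Concurrent::Map, so at most `size` Net::HTTP sessions exist and a busy pool makes callers wait rather than open extra sockets. An illustrative sketch of how callers use the gem's ConnectionPool class defined above (the pool size here is arbitrary):

    require 'wayback_machine_downloader'

    pool = ConnectionPool.new(4)
    begin
      pool.with_connection do |http|
        # http is a started Net::HTTP session against web.archive.org:443
        response = http.get("/")
        puts response.code
      end
    ensure
      pool.shutdown
    end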
@@ -116,8 +128,9 @@ class WaybackMachineDownloader

  include ArchiveAPI
  include SubdomainProcessor
+ include URLRewrite

- VERSION = "2.4.3"
+ VERSION = "2.4.5"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -131,7 +144,7 @@ class WaybackMachineDownloader
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
  :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
- :snapshot_at
+ :snapshot_at, :page_requisites

  def initialize params
  validate_params(params)
@@ -163,6 +176,9 @@ class WaybackMachineDownloader
  @recursive_subdomains = params[:recursive_subdomains] || false
  @subdomain_depth = params[:subdomain_depth] || 1
  @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+ @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+ @page_requisites = params[:page_requisites] || false
+ @pending_jobs = Concurrent::AtomicFixnum.new(0)

  # URL for rejecting invalid/unencoded wayback urls
  @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -171,18 +187,29 @@ class WaybackMachineDownloader
  end

  def backup_name
- url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+ url_to_process = @base_url
+ url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
  raw = if url_to_process.include?('//')
  url_to_process.split('/')[2]
  else
  url_to_process
  end

+ # if it looks like a wildcard pattern, normalize to a safe host-ish name
+ if raw&.start_with?('*.')
+ raw = raw.sub(/\A\*\./, 'all-')
+ end
+
  # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
  if Gem.win_platform?
  raw = raw.gsub(/[:*?"<>|]/, '_')
  raw = raw.gsub(/[ .]+\z/, '')
+ else
+ # still good practice to strip path separators (and maybe '*' for POSIX too)
+ raw = raw.gsub(/[\/:*?"<>|]/, '_')
  end
+
  raw = 'site' if raw.nil? || raw.empty?
  raw
  end
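A worked example of the new wildcard handling, traced through backup_name (hypothetical target URL):

    # base_url = "http://*.example.com/*"
    #   1. the trailing "/*" is chomped        -> "http://*.example.com"
    #   2. the host segment is extracted       -> "*.example.com"
    #   3. the leading "*." becomes "all-"     -> "all-example.com"
    #   4. the sanitization gsub changes nothing further (no /, :, *, ?, ", <, >, | left)
    # => backup_name returns "all-example.com", so files default to ./websites/all-example.com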
@@ -193,7 +220,8 @@ class WaybackMachineDownloader
  @directory
  else
  # ensure the default path is absolute and normalized
- File.expand_path(File.join('websites', backup_name))
+ cwd = Dir.pwd
+ File.expand_path(File.join(cwd, 'websites', backup_name))
  end
  end

@@ -277,53 +305,62 @@ class WaybackMachineDownloader
  page_index = 0
  batch_size = [@threads_count, 5].min
  continue_fetching = true
-
- while continue_fetching && page_index < @maximum_pages
- # Determine the range of pages to fetch in this batch
- end_index = [page_index + batch_size, @maximum_pages].min
- current_batch = (page_index...end_index).to_a
-
- # Create futures for concurrent API calls
- futures = current_batch.map do |page|
- Concurrent::Future.execute do
- result = nil
- @connection_pool.with_connection do |connection|
- result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+ begin
+ while continue_fetching && page_index < @maximum_pages
+ # Determine the range of pages to fetch in this batch
+ end_index = [page_index + batch_size, @maximum_pages].min
+ current_batch = (page_index...end_index).to_a
+
+ # Create futures for concurrent API calls
+ futures = current_batch.map do |page|
+ Concurrent::Future.execute(executor: fetch_pool) do
+ result = nil
+ @connection_pool.with_connection do |connection|
+ result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+ end
+ result ||= []
+ [page, result]
  end
- result ||= []
- [page, result]
  end
- end

- results = []
+ results = []

- futures.each do |future|
- begin
- results << future.value
- rescue => e
- puts "\nError fetching page #{future}: #{e.message}"
+ futures.each do |future|
+ begin
+ val = future.value
+ # only append if valid
+ if val && val.is_a?(Array) && val.first.is_a?(Integer)
+ results << val
+ end
+ rescue => e
+ puts "\nError fetching page #{future}: #{e.message}"
+ end
  end
- end

- # Sort results by page number to maintain order
- results.sort_by! { |page, _| page }
-
- # Process results and check for empty pages
- results.each do |page, result|
- if result.nil? || result.empty?
- continue_fetching = false
- break
- else
- mutex.synchronize do
- snapshot_list_to_consider.concat(result)
- print "."
+ # Sort results by page number to maintain order
+ results.sort_by! { |page, _| page }
+
+ # Process results and check for empty pages
+ results.each do |page, result|
+ if result.nil? || result.empty?
+ continue_fetching = false
+ break
+ else
+ mutex.synchronize do
+ snapshot_list_to_consider.concat(result)
+ print "."
+ end
  end
  end
- end

- page_index = end_index
+ page_index = end_index

- sleep(RATE_LIMIT) if continue_fetching
+ sleep(RATE_LIMIT) if continue_fetching
+ end
+ ensure
+ fetch_pool.shutdown
+ fetch_pool.wait_for_termination
  end
  end

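The CDX page fetches now run on an explicit Concurrent::FixedThreadPool passed as the :executor of each Concurrent::Future, instead of concurrent-ruby's global executor, and the ensure block guarantees the pool is shut down even if a batch raises. A minimal standalone sketch of that pattern (the work block is a placeholder):

    require 'concurrent'

    pool = Concurrent::FixedThreadPool.new(4)
    begin
      futures = (0...8).map do |page|
        Concurrent::Future.execute(executor: pool) do
          # placeholder for get_raw_list_from_api(...)
          [page, "result for page #{page}"]
        end
      end
      futures.each { |f| p f.value }   # value blocks until each future resolves
    ensure
      pool.shutdown
      pool.wait_for_termination
    end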
@@ -531,7 +568,7 @@ class WaybackMachineDownloader
  end
  end
  end
-
+
  def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -552,6 +589,12 @@ class WaybackMachineDownloader

  # Load IDs of already downloaded files
  downloaded_ids = load_downloaded_ids
+
+ # We use a thread-safe Set to track what we have queued/downloaded in this session
+ # to avoid infinite loops with page requisites
+ @session_downloaded_ids = Concurrent::Set.new
+ downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
  files_to_process = files_to_download.reject do |file_info|
  downloaded_ids.include?(file_info[:file_id])
  end
@@ -562,8 +605,8 @@ class WaybackMachineDownloader
  if skipped_count > 0
  puts "Found #{skipped_count} previously downloaded files, skipping them."
  end
-
- if remaining_count == 0
+
+ if remaining_count == 0 && !@page_requisites
  puts "All matching files have already been downloaded."
  cleanup
  return
@@ -576,12 +619,22 @@ class WaybackMachineDownloader
  @download_mutex = Mutex.new

  thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
- pool = Concurrent::FixedThreadPool.new(thread_count)
+ @worker_pool = Concurrent::FixedThreadPool.new(thread_count)

- processing_files(pool, files_to_process)
+ # initial batch
+ files_to_process.each do |file_remote_info|
+ @session_downloaded_ids.add(file_remote_info[:file_id])
+ submit_download_job(file_remote_info)
+ end
+
+ # wait for all jobs to finish
+ loop do
+ sleep 0.5
+ break if @pending_jobs.value == 0
+ end

- pool.shutdown
- pool.wait_for_termination
+ @worker_pool.shutdown
+ @worker_pool.wait_for_termination

  end_time = Time.now
  puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
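Because page requisites can enqueue new downloads while earlier ones are still running, download_files can no longer just shut the pool down after posting the initial batch; it now polls the @pending_jobs counter (a Concurrent::AtomicFixnum incremented in submit_download_job and decremented in an ensure block) until every job, including late-queued requisites, has finished. A stripped-down sketch of that counting pattern:

    require 'concurrent'

    pool    = Concurrent::FixedThreadPool.new(4)
    pending = Concurrent::AtomicFixnum.new(0)

    submit = lambda do |n|
      pending.increment
      pool.post do
        begin
          sleep(0.1)                      # stand-in for a download
          submit.call(n + 1) if n < 3     # a job may enqueue follow-up jobs
        ensure
          pending.decrement
        end
      end
    end

    3.times { |i| submit.call(i) }
    # a job registers its children before decrementing itself, so 0 really means "all done"
    sleep 0.1 until pending.value.zero?
    pool.shutdown
    pool.wait_for_termination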
@@ -599,6 +652,138 @@ class WaybackMachineDownloader
  cleanup
  end

+ # helper to submit jobs and increment the counter
+ def submit_download_job(file_remote_info)
+ @pending_jobs.increment
+ @worker_pool.post do
+ begin
+ process_single_file(file_remote_info)
+ ensure
+ @pending_jobs.decrement
+ end
+ end
+ end
+
+ def process_single_file(file_remote_info)
+ download_success = false
+ downloaded_path = nil
+
+ @connection_pool.with_connection do |connection|
+ result_message, path = download_file(file_remote_info, connection)
+ downloaded_path = path
+
+ if result_message && result_message.include?(' -> ')
+ download_success = true
+ end
+
+ @download_mutex.synchronize do
+ @processed_file_count += 1 if @processed_file_count < @total_to_download
+ # only print if it's a "User" file or a requisite we found
+ puts result_message if result_message
+ end
+ end
+
+ if download_success
+ append_to_db(file_remote_info[:file_id])
+
+ if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+ process_page_requisites(downloaded_path, file_remote_info)
+ end
+ end
+ rescue => e
+ @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+ end
+
+ def process_page_requisites(file_path, parent_remote_info)
+ return unless File.exist?(file_path)
+
+ content = File.read(file_path)
+ content = content.force_encoding('UTF-8').scrub
+
+ assets = PageRequisites.extract(content)
+
+ # prepare base URI for resolving relative paths
+ parent_raw = parent_remote_info[:file_url]
+ parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+
+ begin
+ base_uri = URI(parent_raw)
+ # calculate the "root" host of the site we are downloading to compare later
+ current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+ rescue URI::InvalidURIError
+ return
+ end
+
+ parent_timestamp = parent_remote_info[:timestamp]
+
+ assets.each do |asset_rel_url|
+ begin
+ # resolve full URL (handles relative paths like "../img/logo.png")
+ resolved_uri = base_uri + asset_rel_url
+
+ # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
+ asset_timestamp = parent_timestamp
+ if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+ embedded_ts = $1
+ begin
+ orig_uri = URI($2)
+ resolved_uri = orig_uri
+ asset_timestamp = embedded_ts.to_i
+ rescue URI::InvalidURIError
+ # fall back to original resolved_uri and parent timestamp
+ end
+ end
+
+ # filter out navigation links (pages) vs assets
+ # skip if extension is empty or looks like an HTML page
+ path = resolved_uri.path
+ ext = File.extname(path).downcase
+ if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+ next
+ end
+
+ # construct the URL for the Wayback API
+ asset_wbm_url = resolved_uri.host + resolved_uri.path
+ asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+ # construct the local file ID
+ # if the asset is on the SAME domain, strip the domain from the folder path
+ # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+ if resolved_uri.host == current_project_host
+ # e.g. /static/css/style.css
+ asset_file_id = resolved_uri.path
+ asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+ else
+ # e.g. cdn.google.com/jquery.js
+ asset_file_id = asset_wbm_url
+ end
+
+ rescue URI::InvalidURIError, StandardError
+ next
+ end
+
+ # sanitize and queue
+ asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+ unless @session_downloaded_ids.include?(asset_id)
+ @session_downloaded_ids.add(asset_id)
+
+ new_file_info = {
+ file_url: asset_wbm_url,
+ timestamp: asset_timestamp,
+ file_id: asset_id
+ }
+
+ @download_mutex.synchronize do
+ @total_to_download += 1
+ puts "Queued requisite: #{asset_file_id}"
+ end
+
+ submit_download_job(new_file_info)
+ end
+ end
+ end
+
  def structure_dir_path dir_path
  begin
  FileUtils::mkdir_p dir_path unless File.exist? dir_path
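Two details in process_page_requisites are worth seeing in isolation: relative asset URLs are resolved against the parent page with URI#+ (so "../img/logo.png" climbs out of the page's directory), and assets whose resolved path is already a Wayback "/web/<timestamp>/<original-url>" embed are unwrapped so the original host and the embedded timestamp are used instead of the parent's. A small sketch of both, using only Ruby's stdlib URI (sample values made up):

    require 'uri'

    base = URI("http://example.com/blog/post.html")

    resolved = base + "../img/logo.png"
    puts resolved                       #=> http://example.com/img/logo.png

    embedded = base + "/web/20200101123456/https://cdn.example.net/lib.js"
    if embedded.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
      puts $1            #=> 20200101123456  (timestamp reused for the asset)
      puts URI($2).host  #=> cdn.example.net (asset is fetched from its original host)
    end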
@@ -630,7 +815,8 @@ class WaybackMachineDownloader
  begin
  content = File.binread(file_path)

- if file_ext == '.html' || file_ext == '.htm'
+ # detect encoding for HTML files
+ if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
  encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
  content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
  else
@@ -638,21 +824,21 @@ class WaybackMachineDownloader
  end

  # URLs in HTML attributes
- rewrite_html_attr_urls(content)
+ content = rewrite_html_attr_urls(content)

  # URLs in CSS
- rewrite_css_urls(content)
+ content = rewrite_css_urls(content)

  # URLs in JavaScript
- rewrite_js_urls(content)
+ content = rewrite_js_urls(content)

- # for URLs in HTML attributes that start with a single slash
+ # for URLs that start with a single slash, make them relative
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
  prefix, path, suffix = $1, $2, $3
  "#{prefix}./#{path}#{suffix}"
  end

- # for URLs in CSS that start with a single slash
+ # for URLs in CSS that start with a single slash, make them relative
  content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
  path = $1
  "url(\"./#{path}\")"
@@ -705,7 +891,7 @@ class WaybackMachineDownloader
  # check existence *before* download attempt
  # this handles cases where a file was created manually or by a previous partial run without a .db entry
  if File.exist? file_path
- return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
  end

  begin
@@ -717,13 +903,13 @@ class WaybackMachineDownloader
  if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
  rewrite_urls_to_relative(file_path)
  end
- "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
  when :skipped_not_found
- "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
  else
  # ideally, this case should not be reached if download_with_retry behaves as expected.
  @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
- "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+ return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
  end
  rescue StandardError => e
  msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -731,7 +917,7 @@ class WaybackMachineDownloader
  File.delete(file_path)
  msg += "\n#{file_path} was empty and was removed."
  end
- msg
+ return [msg, nil]
  end
  end

@@ -934,9 +1120,9 @@ class WaybackMachineDownloader
  end

  rescue StandardError => e
- if retries < MAX_RETRIES
+ if retries < @max_retries
  retries += 1
- @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+ @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
  sleep(RETRY_DELAY * retries)
  retry
  else
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
- version: 2.4.3
+ version: 2.4.5
  platform: ruby
  authors:
  - strawberrymaster
  bindir: bin
  cert_chain: []
- date: 2025-08-19 00:00:00.000000000 Z
+ date: 2026-01-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
  - bin/wayback_machine_downloader
  - lib/wayback_machine_downloader.rb
  - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/page_requisites.rb
  - lib/wayback_machine_downloader/subdom_processor.rb
  - lib/wayback_machine_downloader/tidy_bytes.rb
  - lib/wayback_machine_downloader/to_regex.rb