wayback_machine_downloader_straw 2.3.11 → 2.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 509fe40b83d02f834a549ef921e8c6619c67e0f0cba4bfe9645c91f20d528fff
-   data.tar.gz: b3b3adad5c44a38909685c2573b129de683e3770c333bd5a0660d5c82d437515
+   metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
+   data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
  SHA512:
-   metadata.gz: ac2d2ef9b510e50fa2f6a9bb9a5854d995808e52ec2f910e927c364130050d1722296d4a3362d5562b3c8fe31c6f6fddf95c7ce5efb26517390c46cbfb522e89
-   data.tar.gz: 82e36806ab9a427c20f0aa9efcf540491a7423292f5bbb482c82daad86e4a9259af547ebfe754d2f30556e830e6cd789614e1b6ef50b09af3d763f66855c8a46
+   metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
+   data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
lib/wayback_machine_downloader/subdom_processor.rb ADDED
@@ -0,0 +1,238 @@
+ # frozen_string_literal: true
+
+ module SubdomainProcessor
+   def process_subdomains
+     return unless @recursive_subdomains
+
+     puts "Starting subdomain processing..."
+
+     # extract base domain from the URL for comparison
+     base_domain = extract_base_domain(@base_url)
+     @processed_domains = Set.new([base_domain])
+     @subdomain_queue = Queue.new
+
+     # scan downloaded files for subdomain links
+     initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+     subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+     if subdomains_found.empty?
+       puts "No subdomains found in downloaded content."
+       return
+     end
+
+     puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+     # add found subdomains to the queue
+     subdomains_found.each do |subdomain|
+       full_domain = "#{subdomain}.#{base_domain}"
+       @subdomain_queue << "https://#{full_domain}/"
+     end
+
+     # process the subdomain queue
+     download_subdomains(base_domain)
+
+     # after all downloads, rewrite all URLs to make local references
+     rewrite_subdomain_links(base_domain) if @rewrite
+   end
+
+   private
+
+   def extract_base_domain(url)
+     uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+     return nil unless uri
+
+     host = uri.host || uri.path.split('/').first
+     host = host.downcase
+
+     # extract the base domain (e.g., "example.com" from "sub.example.com")
+     parts = host.split('.')
+     return host if parts.size <= 2
+
+     # for domains like co.uk, we want to keep the last 3 parts
+     if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+       parts.last(3).join('.')
+     else
+       parts.last(2).join('.')
+     end
+   end
+
+   def scan_files_for_subdomains(files, base_domain)
+     return [] unless base_domain
+
+     subdomains = Set.new
+
+     files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+
+         # extract URLs from HTML href/src attributes
+         content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from CSS
+         content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from JavaScript strings
+         content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+       rescue => e
+         puts "Error scanning file #{file_path}: #{e.message}"
+       end
+     end
+
+     subdomains.to_a
+   end
+
+   def download_subdomains(base_domain)
+     puts "Starting subdomain downloads..."
+     depth = 0
+     max_depth = @subdomain_depth || 1
+
+     while depth < max_depth && !@subdomain_queue.empty?
+       current_batch = []
+
+       # get all subdomains at current depth
+       while !@subdomain_queue.empty?
+         current_batch << @subdomain_queue.pop
+       end
+
+       puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+       # download each subdomain
+       current_batch.each do |subdomain_url|
+         download_subdomain(subdomain_url, base_domain)
+       end
+
+       # if we need to go deeper, scan the newly downloaded files
+       if depth + 1 < max_depth
+         # get all files in the subdomains directory
+         new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+         new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+         # filter out already processed subdomains
+         new_subdomains.each do |subdomain|
+           full_domain = "#{subdomain}.#{base_domain}"
+           unless @processed_domains.include?(full_domain)
+             @processed_domains.add(full_domain)
+             @subdomain_queue << "https://#{full_domain}/"
+           end
+         end
+
+         puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+       end
+
+       depth += 1
+     end
+   end
+
+   def download_subdomain(subdomain_url, base_domain)
+     begin
+       uri = URI.parse(subdomain_url)
+       subdomain_host = uri.host
+
+       # skip if already processed
+       if @processed_domains.include?(subdomain_host)
+         puts "Skipping already processed subdomain: #{subdomain_host}"
+         return
+       end
+
+       @processed_domains.add(subdomain_host)
+       puts "Downloading subdomain: #{subdomain_url}"
+
+       # create the directory for this subdomain
+       subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+       FileUtils.mkdir_p(subdomain_dir)
+
+       # create subdomain downloader with appropriate options
+       subdomain_options = {
+         base_url: subdomain_url,
+         directory: subdomain_dir,
+         from_timestamp: @from_timestamp,
+         to_timestamp: @to_timestamp,
+         all: @all,
+         threads_count: @threads_count,
+         maximum_pages: [@maximum_pages / 2, 10].max,
+         rewrite: @rewrite,
+         # don't recursively process subdomains from here
+         recursive_subdomains: false
+       }
+
+       # download the subdomain content
+       subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+       subdomain_downloader.download_files
+
+       puts "Completed download of subdomain: #{subdomain_host}"
+     rescue => e
+       puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+     end
+   end
+
+   def rewrite_subdomain_links(base_domain)
+     puts "Rewriting all files to use local subdomain references..."
+
+     all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+     puts "Found #{all_files.size} files to check for rewriting"
+     puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+     rewritten_count = 0
+
+     all_files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+         original_content = content.dup
+
+         # replace subdomain URLs with local paths
+         subdomains.each do |subdomain_host|
+           # for HTML attributes (href, src, etc.)
+           content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             prefix, path, suffix = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+           end
+
+           # for CSS url()
+           content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+             path = $1
+             path = "/index.html" if path.empty? || path == "/"
+             "url(\"../subdomains/#{subdomain_host}#{path}\")"
+           end
+
+           # for JavaScript strings
+           content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             quote_start, path, quote_end = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+           end
+         end
+
+         # save if modified
+         if content != original_content
+           File.write(file_path, content)
+           rewritten_count += 1
+         end
+       rescue => e
+         puts "Error rewriting file #{file_path}: #{e.message}"
+       end
+     end
+
+     puts "Rewrote links in #{rewritten_count} files"
+   end
+ end
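The new `SubdomainProcessor` module is mixed into `WaybackMachineDownloader` (note the `include SubdomainProcessor` further down) and runs after the main download. A minimal sketch of how the feature might be driven from Ruby, based only on the option names visible in this diff; the site URL and paths are hypothetical, and this is not the gem's documented public API:

```ruby
require 'wayback_machine_downloader'

# Sketch, assuming the option names shown in this diff. With
# :recursive_subdomains, downloaded files are scanned for links to sibling
# hosts (e.g. blog.example.com), each of which is mirrored under
# <directory>/subdomains/<host>/.
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',  # hypothetical site
  directory: 'websites/example.com',
  recursive_subdomains: true,       # triggers process_subdomains after the main download
  subdomain_depth: 2,               # rescan subdomain downloads for further subdomains once
  rewrite: true                     # rewrite links to ../subdomains/<host>/ paths
)
downloader.download_files
```

Note that each subdomain is fetched by a nested `WaybackMachineDownloader` created with `recursive_subdomains: false`, so recursion is bounded by `subdomain_depth` rather than by nesting.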
lib/wayback_machine_downloader/tidy_bytes.rb CHANGED
@@ -31,6 +31,7 @@ module TidyBytes
      when 156 then [197, 147] # LATIN SMALL LIGATURE OE
      when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
      when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
+     else nil # ANYTHING ELSE...
      end
    end.freeze
 
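In Ruby, a `case` expression with no matching `when` branch already evaluates to `nil`, so the explicit `else nil` documents the intent without changing behavior: bytes outside the mapped CP-1252 range still produce `nil` entries in the frozen lookup table. A small sketch of the table's shape (the constant name here is made up):

```ruby
# Sketch: index = CP-1252 byte, value = replacement UTF-8 bytes, or nil for
# bytes that need no special mapping. Constant name is hypothetical.
CP1252_TABLE = (0..255).map do |byte|
  case byte
  when 156 then [197, 147] # LATIN SMALL LIGATURE OE
  when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
  when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
  else nil                 # anything else: no replacement needed
  end
end.freeze

CP1252_TABLE[156].pack('C*').force_encoding('UTF-8') # => "œ"
CP1252_TABLE[65]                                     # => nil ("A" passes through)
```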
lib/wayback_machine_downloader.rb CHANGED
@@ -115,7 +115,7 @@ class WaybackMachineDownloader
    include ArchiveAPI
    include SubdomainProcessor
 
-   VERSION = "2.3.11"
+   VERSION = "2.3.12"
    DEFAULT_TIMEOUT = 30
    MAX_RETRIES = 3
    RETRY_DELAY = 2
@@ -128,7 +128,8 @@ class WaybackMachineDownloader
 
    attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
      :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+     :snapshot_at
 
    def initialize params
      validate_params(params)
@@ -158,6 +159,7 @@ class WaybackMachineDownloader
      @rewrite = params[:rewrite] || false
      @recursive_subdomains = params[:recursive_subdomains] || false
      @subdomain_depth = params[:subdomain_depth] || 1
+     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
 
      # URL for rejecting invalid/unencoded wayback urls
      @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -330,6 +332,36 @@ class WaybackMachineDownloader
      snapshot_list_to_consider
    end
 
+   # Get a composite snapshot file list for a specific timestamp
+   def get_composite_snapshot_file_list(target_timestamp)
+     file_versions = {}
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       next if file_timestamp.to_i > target_timestamp
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       next if file_id.nil?
+       next if match_exclude_filter(file_url)
+       next unless match_only_filter(file_url)
+       # Select the most recent version <= target_timestamp
+       if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+         file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+       end
+     end
+     file_versions.values
+   end
+
+   # Returns a list of files for the composite snapshot
+   def get_file_list_composite_snapshot(target_timestamp)
+     file_list = get_composite_snapshot_file_list(target_timestamp)
+     file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+     file_list.map do |file_remote_info|
+       file_remote_info[1][:file_id] = file_remote_info[0]
+       file_remote_info[1]
+     end
+   end
+
    def get_file_list_curated
      file_list_curated = Hash.new
      get_all_snapshots_to_consider.each do |file_timestamp, file_url|
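`get_composite_snapshot_file_list` builds a "composite" snapshot: for every file path it keeps the newest capture whose timestamp does not exceed the target, so the result approximates the whole site as it existed at that moment. A self-contained illustration of the selection rule, using made-up capture data:

```ruby
# Made-up CDX-style rows: [timestamp, original URL]. The loop mirrors the
# selection in get_composite_snapshot_file_list above.
captures = [
  ['20240101000000', 'https://example.com/page.html'],
  ['20240601000000', 'https://example.com/page.html'],
  ['20250101000000', 'https://example.com/page.html']
]

target = 20240701000000
best = {}
captures.each do |timestamp, file_url|
  next if timestamp.to_i > target                 # ignore captures after the target
  file_id = file_url.split('/')[3..-1].join('/')  # => "page.html"
  if !best[file_id] || best[file_id][:timestamp].to_i < timestamp.to_i
    best[file_id] = { file_url: file_url, timestamp: timestamp }
  end
end

best['page.html'][:timestamp] # => "20240601000000" (newest capture <= target)
```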
@@ -384,7 +416,9 @@ class WaybackMachineDownloader
 
 
    def get_file_list_by_timestamp
-     if @all_timestamps
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
        file_list_curated = get_file_list_all_timestamps
        file_list_curated.map do |file_remote_info|
          file_remote_info[1][:file_id] = file_remote_info[0]
@@ -727,7 +761,22 @@ class WaybackMachineDownloader
    end
 
    def file_list_by_timestamp
-     @file_list_by_timestamp ||= get_file_list_by_timestamp
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
    end
 
    private
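Both `get_file_list_by_timestamp` and `file_list_by_timestamp` now short-circuit to the composite list when `@snapshot_at` is set, and the initializer coerces the parameter with `to_i`, so it is naturally expressed as a Wayback-style 14-digit timestamp (YYYYMMDDHHMMSS). A hedged usage sketch at the library level; the CLI flag that feeds `:snapshot_at` is not part of this diff, so only the Ruby call is shown, and the site and path are hypothetical:

```ruby
# Sketch: reconstruct example.com as it looked on 2024-07-01, taking for each
# file the newest capture at or before that moment.
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',
  directory: 'websites/example.com-20240701',
  snapshot_at: 20240701000000 # 14-digit Wayback timestamp, coerced with to_i
)
downloader.download_files
```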
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
-   version: 2.3.11
+   version: 2.3.12
  platform: ruby
  authors:
  - strawberrymaster
  bindir: bin
  cert_chain: []
- date: 2025-07-21 00:00:00.000000000 Z
+ date: 2025-07-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
  - bin/wayback_machine_downloader
  - lib/wayback_machine_downloader.rb
  - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/subdom_processor.rb
  - lib/wayback_machine_downloader/tidy_bytes.rb
  - lib/wayback_machine_downloader/to_regex.rb
  homepage: https://github.com/StrawberryMaster/wayback-machine-downloader