wayback_machine_downloader_straw 2.3.10 → 2.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: ef661bf573b09f79453cf6343d737c24715f343b6593cf313f2502ecd9a650cb
-   data.tar.gz: b80be4aaae7ab4ff695af6cc85273ac437fab1e6a68d3d8bdad67a9661be17e4
+   metadata.gz: 67f774a5476a54ad0224e11f0c9a24b8df6b0d418f5b3c8886277c286bbe3043
+   data.tar.gz: a881ccdac84cd8e4da13edd9fc8117bfdba8c7d432959ef81c85bc95072a0dd9
  SHA512:
-   metadata.gz: 3dfb6477b142eebb45741e1b5a4552dd33feac34baa1eae5453baaa08a9a5be242ba46d4f1162e2dd2b68e8903e6de8402d6b6fa86128f312defac74f2e8da29
-   data.tar.gz: 39758aef4bda77babb81d479ef9f266e3fa328af163c7c3c053290796fda95ccb8ec8d3725a9dae5164b79debc6530919cd79df3f7421842f951b0ee6ef79e60
+   metadata.gz: 01bdc9142820719c1ab17a50067fc478975627f414a29bdca32ea5fedf23227f33fb331f9470bb002af80cc50a6a74c7c8361f214d162c537d100860bdb664bc
+   data.tar.gz: f47436ecd1d4b8a4062d8689dac0d9fc4d73c743d5f84bd96764aa2a186eaae607fcee6c7b9e72f9fd3befd1fadfe9006354a43bfd134c892fbf5dfdd736ee28
bin/wayback_machine_downloader CHANGED
@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
    options[:keep] = true
  end
 
+ opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+   options[:recursive_subdomains] = true
+ end
+
+ opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+   options[:subdomain_depth] = t
+ end
+
  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
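As a quick illustration of what the two new flags produce, here is a minimal, runnable sketch of the parsing step in isolation. It assumes (as the rest of this diff suggests, though the tail of the bin script is not shown here) that the resulting options hash is handed to WaybackMachineDownloader.new unchanged.

    require 'optparse'

    options = {}
    OptionParser.new do |opts|
      opts.on("--recursive-subdomains", "Recursively download content from subdomains") do
        options[:recursive_subdomains] = true
      end
      opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
        options[:subdomain_depth] = t
      end
    end.parse!(["--recursive-subdomains", "--subdomain-depth", "2"])

    p options  #=> {:recursive_subdomains=>true, :subdomain_depth=>2}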
lib/wayback_machine_downloader/subdom_processor.rb ADDED
@@ -0,0 +1,238 @@
+ # frozen_string_literal: true
+
+ module SubdomainProcessor
+   def process_subdomains
+     return unless @recursive_subdomains
+
+     puts "Starting subdomain processing..."
+
+     # extract base domain from the URL for comparison
+     base_domain = extract_base_domain(@base_url)
+     @processed_domains = Set.new([base_domain])
+     @subdomain_queue = Queue.new
+
+     # scan downloaded files for subdomain links
+     initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+     subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+     if subdomains_found.empty?
+       puts "No subdomains found in downloaded content."
+       return
+     end
+
+     puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+     # add found subdomains to the queue
+     subdomains_found.each do |subdomain|
+       full_domain = "#{subdomain}.#{base_domain}"
+       @subdomain_queue << "https://#{full_domain}/"
+     end
+
+     # process the subdomain queue
+     download_subdomains(base_domain)
+
+     # after all downloads, rewrite all URLs to make local references
+     rewrite_subdomain_links(base_domain) if @rewrite
+   end
+
+   private
+
+   def extract_base_domain(url)
+     uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+     return nil unless uri
+
+     host = uri.host || uri.path.split('/').first
+     host = host.downcase
+
+     # extract the base domain (e.g., "example.com" from "sub.example.com")
+     parts = host.split('.')
+     return host if parts.size <= 2
+
+     # for domains like co.uk, we want to keep the last 3 parts
+     if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+       parts.last(3).join('.')
+     else
+       parts.last(2).join('.')
+     end
+   end
+
+   def scan_files_for_subdomains(files, base_domain)
+     return [] unless base_domain
+
+     subdomains = Set.new
+
+     files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+
+         # extract URLs from HTML href/src attributes
+         content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from CSS
+         content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+
+         # extract URLs from JavaScript strings
+         content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+           subdomain = match[0].downcase
+           next if subdomain == 'www' # skip www subdomain
+           subdomains.add(subdomain)
+         end
+       rescue => e
+         puts "Error scanning file #{file_path}: #{e.message}"
+       end
+     end
+
+     subdomains.to_a
+   end
+
+   def download_subdomains(base_domain)
+     puts "Starting subdomain downloads..."
+     depth = 0
+     max_depth = @subdomain_depth || 1
+
+     while depth < max_depth && !@subdomain_queue.empty?
+       current_batch = []
+
+       # get all subdomains at current depth
+       while !@subdomain_queue.empty?
+         current_batch << @subdomain_queue.pop
+       end
+
+       puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+       # download each subdomain
+       current_batch.each do |subdomain_url|
+         download_subdomain(subdomain_url, base_domain)
+       end
+
+       # if we need to go deeper, scan the newly downloaded files
+       if depth + 1 < max_depth
+         # get all files in the subdomains directory
+         new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+         new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+         # filter out already processed subdomains
+         new_subdomains.each do |subdomain|
+           full_domain = "#{subdomain}.#{base_domain}"
+           unless @processed_domains.include?(full_domain)
+             @processed_domains.add(full_domain)
+             @subdomain_queue << "https://#{full_domain}/"
+           end
+         end
+
+         puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+       end
+
+       depth += 1
+     end
+   end
+
+   def download_subdomain(subdomain_url, base_domain)
+     begin
+       uri = URI.parse(subdomain_url)
+       subdomain_host = uri.host
+
+       # skip if already processed
+       if @processed_domains.include?(subdomain_host)
+         puts "Skipping already processed subdomain: #{subdomain_host}"
+         return
+       end
+
+       @processed_domains.add(subdomain_host)
+       puts "Downloading subdomain: #{subdomain_url}"
+
+       # create the directory for this subdomain
+       subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+       FileUtils.mkdir_p(subdomain_dir)
+
+       # create subdomain downloader with appropriate options
+       subdomain_options = {
+         base_url: subdomain_url,
+         directory: subdomain_dir,
+         from_timestamp: @from_timestamp,
+         to_timestamp: @to_timestamp,
+         all: @all,
+         threads_count: @threads_count,
+         maximum_pages: [@maximum_pages / 2, 10].max,
+         rewrite: @rewrite,
+         # don't recursively process subdomains from here
+         recursive_subdomains: false
+       }
+
+       # download the subdomain content
+       subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+       subdomain_downloader.download_files
+
+       puts "Completed download of subdomain: #{subdomain_host}"
+     rescue => e
+       puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+     end
+   end
+
+   def rewrite_subdomain_links(base_domain)
+     puts "Rewriting all files to use local subdomain references..."
+
+     all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+     subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+     puts "Found #{all_files.size} files to check for rewriting"
+     puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+     rewritten_count = 0
+
+     all_files.each do |file_path|
+       next unless File.exist?(file_path)
+
+       begin
+         content = File.read(file_path)
+         original_content = content.dup
+
+         # replace subdomain URLs with local paths
+         subdomains.each do |subdomain_host|
+           # for HTML attributes (href, src, etc.)
+           content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             prefix, path, suffix = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+           end
+
+           # for CSS url()
+           content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+             path = $1
+             path = "/index.html" if path.empty? || path == "/"
+             "url(\"../subdomains/#{subdomain_host}#{path}\")"
+           end
+
+           # for JavaScript strings
+           content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+             quote_start, path, quote_end = $1, $2, $3
+             path = "/index.html" if path.empty? || path == "/"
+             "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+           end
+         end
+
+         # save if modified
+         if content != original_content
+           File.write(file_path, content)
+           rewritten_count += 1
+         end
+       rescue => e
+         puts "Error rewriting file #{file_path}: #{e.message}"
+       end
+     end
+
+     puts "Rewrote links in #{rewritten_count} files"
+   end
+ end
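To make the base-domain heuristic above concrete, here is a small, hedged probe script. The require_relative path and the DomainProbe class are inventions for this example, and the expected values come from tracing extract_base_domain by hand:

    require 'uri'
    require_relative 'lib/wayback_machine_downloader/subdom_processor'  # adjust path to where the gem is checked out

    class DomainProbe  # hypothetical wrapper, not part of the gem
      include SubdomainProcessor
    end

    probe = DomainProbe.new
    # extract_base_domain is private, so use send for this demonstration
    p probe.send(:extract_base_domain, "https://blog.example.com/post")  #=> "example.com"
    p probe.send(:extract_base_domain, "https://shop.example.co.uk/")    #=> "example.co.uk"

The second case shows the short-suffix special case: when the last two labels are both three characters or fewer (co.uk, com.au, and the like), the last three labels are kept.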
lib/wayback_machine_downloader/tidy_bytes.rb CHANGED
@@ -31,6 +31,7 @@ module TidyBytes
    when 156 then [197, 147] # LATIN SMALL LIGATURE OE
    when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
    when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
+   else nil # ANYTHING ELSE...
    end
  end.freeze
 
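A side note on the table this touches: each entry maps a CP-1252 byte to its UTF-8 byte sequence, and a Ruby case with no matching when branch already evaluates to nil, so the explicit else mainly documents that unmapped bytes intentionally yield nil. For instance, the pair for byte 156 decodes as:

    p [197, 147].pack("C*").force_encoding("UTF-8")  #=> "œ"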
lib/wayback_machine_downloader.rb CHANGED
@@ -14,6 +14,7 @@ require 'stringio'
  require_relative 'wayback_machine_downloader/tidy_bytes'
  require_relative 'wayback_machine_downloader/to_regex'
  require_relative 'wayback_machine_downloader/archive_api'
+ require_relative 'wayback_machine_downloader/subdom_processor'
 
  class ConnectionPool
    MAX_AGE = 300
@@ -112,8 +113,9 @@ end
  class WaybackMachineDownloader
 
    include ArchiveAPI
+   include SubdomainProcessor
 
-   VERSION = "2.3.10"
+   VERSION = "2.3.12"
    DEFAULT_TIMEOUT = 30
    MAX_RETRIES = 3
    RETRY_DELAY = 2
@@ -123,9 +125,11 @@ class WaybackMachineDownloader
    STATE_CDX_FILENAME = ".cdx.json"
    STATE_DB_FILENAME = ".downloaded.txt"
 
+
    attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
      :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+     :snapshot_at
 
    def initialize params
      validate_params(params)
@@ -153,6 +157,12 @@ class WaybackMachineDownloader
      @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
      @db_mutex = Mutex.new
      @rewrite = params[:rewrite] || false
+     @recursive_subdomains = params[:recursive_subdomains] || false
+     @subdomain_depth = params[:subdomain_depth] || 1
+     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+
+     # URL for rejecting invalid/unencoded wayback urls
+     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
 
      handle_reset
    end
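For orientation, a hedged sketch of how these new params might be supplied when driving the class from Ruby rather than from the CLI. The URL, directory, and timestamp below are invented; the keys mirror the assignments above:

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",
      directory: "websites/example.com",
      recursive_subdomains: true,    # run the SubdomainProcessor pass after the main download
      subdomain_depth: 2,            # follow subdomain links up to two levels
      snapshot_at: 20240101000000    # per file, prefer the newest capture at or before this timestamp
    )
    downloader.download_files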
@@ -196,7 +206,7 @@ class WaybackMachineDownloader
 
    def match_only_filter file_url
      if @only_filter
-       only_filter_regex = @only_filter.to_regex
+       only_filter_regex = @only_filter.to_regex(detect: true)
        if only_filter_regex
          only_filter_regex =~ file_url
        else
@@ -209,7 +219,7 @@ class WaybackMachineDownloader
 
    def match_exclude_filter file_url
      if @exclude_filter
-       exclude_filter_regex = @exclude_filter.to_regex
+       exclude_filter_regex = @exclude_filter.to_regex(detect: true)
        if exclude_filter_regex
          exclude_filter_regex =~ file_url
        else
@@ -322,6 +332,36 @@ class WaybackMachineDownloader
      snapshot_list_to_consider
    end
 
+   # Get a composite snapshot file list for a specific timestamp
+   def get_composite_snapshot_file_list(target_timestamp)
+     file_versions = {}
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       next if file_timestamp.to_i > target_timestamp
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       next if file_id.nil?
+       next if match_exclude_filter(file_url)
+       next unless match_only_filter(file_url)
+       # Select the most recent version <= target_timestamp
+       if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+         file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
+       end
+     end
+     file_versions.values
+   end
+
+   # Returns a list of files for the composite snapshot
+   def get_file_list_composite_snapshot(target_timestamp)
+     file_list = get_composite_snapshot_file_list(target_timestamp)
+     file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+     file_list.map do |file_remote_info|
+       file_remote_info[1][:file_id] = file_remote_info[0]
+       file_remote_info[1]
+     end
+   end
+
    def get_file_list_curated
      file_list_curated = Hash.new
      get_all_snapshots_to_consider.each do |file_timestamp, file_url|
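The selection rule in get_composite_snapshot_file_list reduces to "for each file, keep the newest capture at or before the target timestamp". A self-contained toy illustration of just that rule (the snapshot tuples are invented; the real list comes from the CDX API):

    snapshots = [
      ["20230101000000", "https://example.com/index.html"],
      ["20230601000000", "https://example.com/index.html"],
      ["20240101000000", "https://example.com/index.html"],
    ]
    target = 20230701000000

    chosen = snapshots
      .select { |timestamp, _url| timestamp.to_i <= target }
      .max_by { |timestamp, _url| timestamp.to_i }
    p chosen  #=> ["20230601000000", "https://example.com/index.html"]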
@@ -376,7 +416,9 @@ class WaybackMachineDownloader
 
 
    def get_file_list_by_timestamp
-     if @all_timestamps
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
        file_list_curated = get_file_list_all_timestamps
        file_list_curated.map do |file_remote_info|
          file_remote_info[1][:file_id] = file_remote_info[0]
@@ -513,6 +555,16 @@ class WaybackMachineDownloader
 
      end_time = Time.now
      puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+     # process subdomains if enabled
+     if @recursive_subdomains
+       subdomain_start_time = Time.now
+       process_subdomains
+       subdomain_end_time = Time.now
+       subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+       puts "Subdomain processing finished in #{subdomain_time}s."
+     end
+
      puts "Results saved in #{backup_path}"
      cleanup
    end
@@ -709,7 +761,22 @@ class WaybackMachineDownloader
    end
 
    def file_list_by_timestamp
-     @file_list_by_timestamp ||= get_file_list_by_timestamp
+     if @snapshot_at
+       @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+     elsif @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
    end
 
    private
@@ -740,6 +807,12 @@ class WaybackMachineDownloader
      # Escape square brackets because they are not valid in URI()
      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
 
+     # reject invalid/unencoded wayback_url, behaving as if the resource weren't found
+     if not @url_regexp.match?(wayback_url)
+       @logger.warn("Skipped #{file_url}: invalid URL")
+       return :skipped_not_found
+     end
+
      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: wayback_machine_downloader_straw
  version: !ruby/object:Gem::Version
-   version: 2.3.10
+   version: 2.3.12
  platform: ruby
  authors:
  - strawberrymaster
  bindir: bin
  cert_chain: []
- date: 2025-06-27 00:00:00.000000000 Z
+ date: 2025-07-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
  - bin/wayback_machine_downloader
  - lib/wayback_machine_downloader.rb
  - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/subdom_processor.rb
  - lib/wayback_machine_downloader/tidy_bytes.rb
  - lib/wayback_machine_downloader/to_regex.rb
  homepage: https://github.com/StrawberryMaster/wayback-machine-downloader