wayback_machine_downloader_straw 2.4.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35a8c4a865a9da5cb45e7f63e2f832f491895f5c69c3d440b9c8b4230b8444f1
4
- data.tar.gz: a96d746b41f3e3b7a1cf6df38df3b23a79361f57f667eea562be72961bf391c2
3
+ metadata.gz: f6650c4217f2630db6307bc50ae2d6cefcbc38afc18b5701cc90a956af5cf1cf
4
+ data.tar.gz: 0ad44d7daa4c69b75d319c3518c4b801810be071545d5eded4497073caab4667
5
5
  SHA512:
6
- metadata.gz: 783bb658ee95bd523fb3dc8c2c11a027947becc4e72902e2fff85eb725bbc8e3ef8e7bb22b08598f015f77e801526354f36b6d920144df9fd6bca440cccf8127
7
- data.tar.gz: a2e0ce3e4df543574b1c04e349d120b31d900bbbfe3f9bf512706f57094d89c49574290520df25fdd8c920577baf561272af65ca4c36d058a3a4097efa167a83
6
+ metadata.gz: 7a8cfd1cda19bc3ff2db8859e03877395eaf44092ffbe9f5334218fbd6293ff1aecc60e2bf272f875a67ecd086a209c56640db221f4d13739669a27eada1c826
7
+ data.tar.gz: 877436af63fa205add55ebeb55bafcd39fec0afa56707ee742871014dac48998e8028ef4616a0b611bee5f9a93ed0d8d136375d457503a3e34b9a37f87321787
@@ -116,7 +116,7 @@ class WaybackMachineDownloader
116
116
  include ArchiveAPI
117
117
  include SubdomainProcessor
118
118
 
119
- VERSION = "2.4.0"
119
+ VERSION = "2.4.1"
120
120
  DEFAULT_TIMEOUT = 30
121
121
  MAX_RETRIES = 3
122
122
  RETRY_DELAY = 2
@@ -340,15 +340,15 @@ class WaybackMachineDownloader
340
340
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
341
341
  next unless file_url.include?('/')
342
342
  next if file_timestamp.to_i > target_timestamp
343
- file_id = file_url.split('/')[3..-1].join('/')
344
- file_id = CGI::unescape file_id
345
- file_id = file_id.tidy_bytes unless file_id == ""
343
+
344
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
345
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
346
346
  next if file_id.nil?
347
347
  next if match_exclude_filter(file_url)
348
348
  next unless match_only_filter(file_url)
349
- # Select the most recent version <= target_timestamp
349
+
350
350
  if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
351
- file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
351
+ file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
352
352
  end
353
353
  end
354
354
  file_versions.values
@@ -368,22 +368,27 @@ class WaybackMachineDownloader
368
368
  file_list_curated = Hash.new
369
369
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
370
370
  next unless file_url.include?('/')
371
- file_id = file_url.split('/')[3..-1].join('/')
372
- file_id = CGI::unescape file_id
373
- file_id = file_id.tidy_bytes unless file_id == ""
371
+
372
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
373
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
374
374
  if file_id.nil?
375
375
  puts "Malformed file url, ignoring: #{file_url}"
376
+ next
377
+ end
378
+
379
+ if file_id.include?('<') || file_id.include?('>')
380
+ puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
376
381
  else
377
382
  if match_exclude_filter(file_url)
378
383
  puts "File url matches exclude filter, ignoring: #{file_url}"
379
- elsif not match_only_filter(file_url)
384
+ elsif !match_only_filter(file_url)
380
385
  puts "File url doesn't match only filter, ignoring: #{file_url}"
381
386
  elsif file_list_curated[file_id]
382
387
  unless file_list_curated[file_id][:timestamp] > file_timestamp
383
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
388
+ file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
384
389
  end
385
390
  else
386
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
391
+ file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
387
392
  end
388
393
  end
389
394
  end
@@ -394,21 +399,32 @@ class WaybackMachineDownloader
394
399
  file_list_curated = Hash.new
395
400
  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
396
401
  next unless file_url.include?('/')
397
- file_id = file_url.split('/')[3..-1].join('/')
398
- file_id_and_timestamp = [file_timestamp, file_id].join('/')
399
- file_id_and_timestamp = CGI::unescape file_id_and_timestamp
400
- file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
402
+
403
+ raw_tail = file_url.split('/')[3..-1]&.join('/')
404
+ file_id = sanitize_and_prepare_id(raw_tail, file_url)
401
405
  if file_id.nil?
402
406
  puts "Malformed file url, ignoring: #{file_url}"
407
+ next
408
+ end
409
+
410
+ file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
411
+ file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
412
+ if file_id_and_timestamp.nil?
413
+ puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
414
+ next
415
+ end
416
+
417
+ if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
418
+ puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
403
419
  else
404
420
  if match_exclude_filter(file_url)
405
421
  puts "File url matches exclude filter, ignoring: #{file_url}"
406
- elsif not match_only_filter(file_url)
422
+ elsif !match_only_filter(file_url)
407
423
  puts "File url doesn't match only filter, ignoring: #{file_url}"
408
424
  elsif file_list_curated[file_id_and_timestamp]
409
- puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
425
+ # duplicate combo, ignore silently (verbose flag not shown here)
410
426
  else
411
- file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
427
+ file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
412
428
  end
413
429
  end
414
430
  end
@@ -749,6 +765,20 @@ class WaybackMachineDownloader
749
765
  end
750
766
  logger
751
767
  end
768
+
769
# Safely normalise a file id (or a "timestamp/id" combination) taken from a
# snapshot URL.
#
# Steps, in order: URL-decode the value, strip every "<...>" tag-like span,
# then call +tidy_bytes+ on the result unless it is empty.
# NOTE(review): +tidy_bytes+ is not defined in this block; presumably a String
# extension shipped with the gem - confirm.
#
# @param raw [String, nil] path portion of the snapshot URL
# @param file_url [String] full snapshot URL; only used in the warning message
# @return [String, nil] the sanitized id, or nil when +raw+ is nil or any
#   sanitization step raised a StandardError
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?

  # Decoding is best-effort: if it fails, carry on with the undecoded value.
  decoded =
    begin
      CGI.unescape(raw)
    rescue StandardError
      raw
    end

  # Non-destructive gsub: when decoding fell back, +decoded+ is the caller's
  # own (possibly frozen) string and must not be modified in place.
  cleaned = decoded.gsub(/<[^>]*>/, '')
  cleaned.empty? ? cleaned : cleaned.tidy_bytes
rescue StandardError => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  nil
end
752
782
 
753
783
  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
754
784
  retries = 0
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.0
4
+ version: 2.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-08-04 00:00:00.000000000 Z
10
+ date: 2025-08-12 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby