wayback_machine_downloader_straw 2.3.7 → 2.3.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
4
- data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
3
+ metadata.gz: 1df8b0394ed1e39ab2d1ef050c51c809afbc0005533e286b83389bab99a7507c
4
+ data.tar.gz: 500212ad8353f9252ade813c5e405e9bc0fa1f6ae4967dcfee36686caecff0fa
5
5
  SHA512:
6
- metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
7
- data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
6
+ metadata.gz: c5b855e47b617217591f385b6aeb5afd55631e10acaa39d1acf0929603717e68f350457fea2e184f17e338baeb4fd1cb6ba9ce6632967c56a80c8a66c2fbae87
7
+ data.tar.gz: b2c13318213a68827d47e73b8fba54ad2d52f49f73ea76726207f80d2db3bb9b77bb349980b823ac12a42f1ec1ba7705eb06affdf579970945eb655c07c955f2
@@ -4,6 +4,14 @@ require 'uri'
4
4
  module ArchiveAPI
5
5
 
6
6
  def get_raw_list_from_api(url, page_index, http)
7
+ # Automatically append /* if the URL doesn't contain a path after the domain
8
+ # This is a workaround for an issue with the API and *some* domains.
9
+ # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
10
+ # But don't do this when exact_url flag is set
11
+ if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
12
+ url = "#{url}/*"
13
+ end
14
+
7
15
  request_url = URI("https://web.archive.org/cdx/search/cdx")
8
16
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
9
17
  request_url.query = URI.encode_www_form(params)
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
113
113
 
114
114
  include ArchiveAPI
115
115
 
116
- VERSION = "2.3.7"
116
+ VERSION = "2.3.9"
117
117
  DEFAULT_TIMEOUT = 30
118
118
  MAX_RETRIES = 3
119
119
  RETRY_DELAY = 2
@@ -131,7 +131,11 @@ class WaybackMachineDownloader
131
131
  validate_params(params)
132
132
  @base_url = params[:base_url]
133
133
  @exact_url = params[:exact_url]
134
- @directory = params[:directory]
134
+ if params[:directory]
135
+ @directory = File.expand_path(params[:directory])
136
+ else
137
+ @directory = nil
138
+ end
135
139
  @all_timestamps = params[:all_timestamps]
136
140
  @from_timestamp = params[:from_timestamp].to_i
137
141
  @to_timestamp = params[:to_timestamp].to_i
@@ -154,22 +158,22 @@ class WaybackMachineDownloader
154
158
  end
155
159
 
156
160
  def backup_name
157
- if @base_url.include? '//'
158
- @base_url.split('/')[2]
161
+ url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
162
+
163
+ if url_to_process.include? '//'
164
+ url_to_process.split('/')[2]
159
165
  else
160
- @base_url
166
+ url_to_process
161
167
  end
162
168
  end
163
169
 
164
170
  def backup_path
165
171
  if @directory
166
- if @directory[-1] == '/'
167
- @directory
168
- else
169
- @directory + '/'
170
- end
172
+ # because @directory is already an absolute path, we just ensure it exists
173
+ @directory
171
174
  else
172
- 'websites/' + backup_name + '/'
175
+ # ensure the default path is absolute and normalized
176
+ File.expand_path(File.join('websites', backup_name))
173
177
  end
174
178
  end
175
179
 
@@ -241,6 +245,7 @@ class WaybackMachineDownloader
241
245
  # Fetch the initial set of snapshots, sequentially
242
246
  @connection_pool.with_connection do |connection|
243
247
  initial_list = get_raw_list_from_api(@base_url, nil, connection)
248
+ initial_list ||= []
244
249
  mutex.synchronize do
245
250
  snapshot_list_to_consider.concat(initial_list)
246
251
  print "."
@@ -265,6 +270,7 @@ class WaybackMachineDownloader
265
270
  @connection_pool.with_connection do |connection|
266
271
  result = get_raw_list_from_api("#{@base_url}/*", page, connection)
267
272
  end
273
+ result ||= []
268
274
  [page, result]
269
275
  end
270
276
  end
@@ -284,7 +290,7 @@ class WaybackMachineDownloader
284
290
 
285
291
  # Process results and check for empty pages
286
292
  results.each do |page, result|
287
- if result.empty?
293
+ if result.nil? || result.empty?
288
294
  continue_fetching = false
289
295
  break
290
296
  else
@@ -634,21 +640,35 @@ class WaybackMachineDownloader
634
640
  file_url = file_remote_info[:file_url].encode(current_encoding)
635
641
  file_id = file_remote_info[:file_id]
636
642
  file_timestamp = file_remote_info[:timestamp]
637
- file_path_elements = file_id.split('/')
643
+
644
+ # sanitize file_id to ensure it is a valid path component
645
+ raw_path_elements = file_id.split('/')
646
+
647
+ sanitized_path_elements = raw_path_elements.map do |element|
648
+ if Gem.win_platform?
649
+ # for Windows, we need to sanitize path components to avoid invalid characters
650
+ # this prevents issues with file names that contain characters not allowed in
651
+ # Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
652
+ element.gsub(/[:\*?"<>\|\&\=\/\\]/, ->(match) { '%' + match.ord.to_s(16).upcase })
653
+ else
654
+ element
655
+ end
656
+ end
657
+
658
+ current_backup_path = backup_path
638
659
 
639
660
  if file_id == ""
640
- dir_path = backup_path
641
- file_path = backup_path + 'index.html'
642
- elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
643
- dir_path = backup_path + file_path_elements[0..-1].join('/')
644
- file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
661
+ dir_path = current_backup_path
662
+ file_path = File.join(dir_path, 'index.html')
663
+ elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
664
+ # if file_id is a directory, we treat it as such
665
+ dir_path = File.join(current_backup_path, *sanitized_path_elements)
666
+ file_path = File.join(dir_path, 'index.html')
645
667
  else
646
- dir_path = backup_path + file_path_elements[0..-2].join('/')
647
- file_path = backup_path + file_path_elements[0..-1].join('/')
648
- end
649
- if Gem.win_platform?
650
- dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
651
- file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
668
+ # if file_id is a file, we treat it as such
669
+ filename = sanitized_path_elements.pop
670
+ dir_path = File.join(current_backup_path, *sanitized_path_elements)
671
+ file_path = File.join(dir_path, filename)
652
672
  end
653
673
 
654
674
  # check existence *before* download attempt
@@ -717,6 +737,9 @@ class WaybackMachineDownloader
717
737
  "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
718
738
  end
719
739
 
740
+ # Escape square brackets because they are not valid in URI()
741
+ wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
742
+
720
743
  request = Net::HTTP::Get.new(URI(wayback_url))
721
744
  request["Connection"] = "keep-alive"
722
745
  request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.7
4
+ version: 2.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-05-27 00:00:00.000000000 Z
10
+ date: 2025-06-18 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: concurrent-ruby
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
78
77
  licenses:
79
78
  - MIT
80
79
  metadata: {}
81
- post_install_message:
82
80
  rdoc_options: []
83
81
  require_paths:
84
82
  - lib
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
91
  - !ruby/object:Gem::Version
94
92
  version: '0'
95
93
  requirements: []
96
- rubygems_version: 3.5.11
97
- signing_key:
94
+ rubygems_version: 3.6.2
98
95
  specification_version: 4
99
96
  summary: Download an entire website from the Wayback Machine.
100
97
  test_files: []