wayback_machine_downloader_straw 2.3.7 → 2.3.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b739c4ecda1e325f9d5a33872fa71a8a5103f1770cc18c7e1b46516c96c8fef6
4
- data.tar.gz: 991cf1f67783f35a8da233e6d9e82edc4d933ef0229d5ecffbe8963c5d049c98
3
+ metadata.gz: df42d96c68c19fd39b6da3c9e9d51934197484ccb1ceb7a9387116622b0214a7
4
+ data.tar.gz: d6f04e3dc44c9f216b9d3dc631275fac5e48447ebd963a33818e82baf1ff79b3
5
5
  SHA512:
6
- metadata.gz: f9b71d59d4c5c5bdb82f58fceacd848242a34b12d15abf93c101e4d61ab8fcab46e60011b80f966b0851474160af153c92ab46db5ed2c2e80b0fec3afdc53f8c
7
- data.tar.gz: 88f39d47bb8405f682ddca4236bd2e3ce93ffbfd426c2430532b904c98e7cb1593406271fa4453847ab95615adbffc36049072bd7c8b45b171e2cecb77bb41ab
6
+ metadata.gz: b9654877bb591082e1ef1c5dfdacff0bf887ed68f8ae1b2d995a99b87232523aa3350aede2d8cbb4045dbb15b380a1e93451004a45f881ad323615c0f66632c5
7
+ data.tar.gz: eb8753d3ceb689e9b8c3f3dbaeeac7c9dd818497f916882d5d3271f1901c099f8b7103e7b49bcef51d71aab86b2607174ac2eece768a092242b0d5e0dcec9b28
@@ -4,6 +4,13 @@ require 'uri'
4
4
  module ArchiveAPI
5
5
 
6
6
  def get_raw_list_from_api(url, page_index, http)
7
+ # Automatically append /* if the URL doesn't contain a path after the domain
8
+ # This is a workaround for an issue with the API and *some* domains.
9
+ # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
10
+ if url && !url.match(/^https?:\/\/.*\//i)
11
+ url = "#{url}/*"
12
+ end
13
+
7
14
  request_url = URI("https://web.archive.org/cdx/search/cdx")
8
15
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
9
16
  request_url.query = URI.encode_www_form(params)
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
113
113
 
114
114
  include ArchiveAPI
115
115
 
116
- VERSION = "2.3.7"
116
+ VERSION = "2.3.8"
117
117
  DEFAULT_TIMEOUT = 30
118
118
  MAX_RETRIES = 3
119
119
  RETRY_DELAY = 2
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
154
154
  end
155
155
 
156
156
  def backup_name
157
- if @base_url.include? '//'
158
- @base_url.split('/')[2]
157
+ url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
158
+
159
+ if url_to_process.include? '//'
160
+ url_to_process.split('/')[2]
159
161
  else
160
- @base_url
162
+ url_to_process
161
163
  end
162
164
  end
163
165
 
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
241
243
  # Fetch the initial set of snapshots, sequentially
242
244
  @connection_pool.with_connection do |connection|
243
245
  initial_list = get_raw_list_from_api(@base_url, nil, connection)
246
+ initial_list ||= []
244
247
  mutex.synchronize do
245
248
  snapshot_list_to_consider.concat(initial_list)
246
249
  print "."
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
265
268
  @connection_pool.with_connection do |connection|
266
269
  result = get_raw_list_from_api("#{@base_url}/*", page, connection)
267
270
  end
271
+ result ||= []
268
272
  [page, result]
269
273
  end
270
274
  end
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
284
288
 
285
289
  # Process results and check for empty pages
286
290
  results.each do |page, result|
287
- if result.empty?
291
+ if result.nil? || result.empty?
288
292
  continue_fetching = false
289
293
  break
290
294
  else
@@ -717,6 +721,9 @@ class WaybackMachineDownloader
717
721
  "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
718
722
  end
719
723
 
724
+ # Escape square brackets because they are not valid in URI()
725
+ wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
726
+
720
727
  request = Net::HTTP::Get.new(URI(wayback_url))
721
728
  request["Connection"] = "keep-alive"
722
729
  request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.7
4
+ version: 2.3.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-05-27 00:00:00.000000000 Z
10
+ date: 2025-06-05 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: concurrent-ruby
@@ -78,7 +77,6 @@ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
78
77
  licenses:
79
78
  - MIT
80
79
  metadata: {}
81
- post_install_message:
82
80
  rdoc_options: []
83
81
  require_paths:
84
82
  - lib
@@ -93,8 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
91
  - !ruby/object:Gem::Version
94
92
  version: '0'
95
93
  requirements: []
96
- rubygems_version: 3.5.11
97
- signing_key:
94
+ rubygems_version: 3.6.2
98
95
  specification_version: 4
99
96
  summary: Download an entire website from the Wayback Machine.
100
97
  test_files: []