wayback_machine_downloader_straw 2.4.6 → 2.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e52df092b59b0eec27b390b5b00fcfc17fab271acd6cd9df774912f57cfc4dc1
4
- data.tar.gz: ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047
3
+ metadata.gz: 558d3187ee31faeadb08cf83e32a87307ae9d55a3327206598f27a78fb715e08
4
+ data.tar.gz: 9845999e0e618afde419869bb01b04277aca318aa80b238feb8252540fc16315
5
5
  SHA512:
6
- metadata.gz: 558e9cdfc3d7d4d2081ccb49b12a96bdb64b7768697eb0a1b9a431ed1ad3017ce894975e046a6e50766928c3863797715c7b45d013b6ab7ad78bca59ea86c6d0
7
- data.tar.gz: af3064f1489d32cf078fd5d87d2773700e9dfa498075f089029e0e7ec47c500c7815e84d51f426bb6fc3067bf02c9a9404da3b6f74d263c99b4ae96fc32dab35
6
+ metadata.gz: b323c1065ea1ab1d3c5909458cae726462ba1b88fd89effe8cd1efbdd1301d2022363c56b191f8ddbe28c0143fe87dfc94ca847865bb08e3564e29ba36f231a4
7
+ data.tar.gz: 00c4e775ee05e176e1048d6e5ddd9ddc436edaf9bf7fea61dbab33480e4adc499ea8a3a584e591858aad3a4c2f10096a3b6eb1ff6c8146f26ba6a3d42f104e32
@@ -48,6 +48,10 @@ option_parser = OptionParser.new do |opts|
48
48
  options[:all] = true
49
49
  end
50
50
 
51
+ opts.on("--keep-duplicates", "Do not collapse duplicate CDX captures by digest") do |t|
52
+ options[:keep_duplicates] = true
53
+ end
54
+
51
55
  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
52
56
  options[:threads_count] = t
53
57
  end
@@ -4,12 +4,22 @@ require 'uri'
4
4
  module ArchiveAPI
5
5
 
6
6
  def get_raw_list_from_api(url, page_index, http)
7
- # Automatically append /* if the URL doesn't contain a path after the domain
7
+ # Automatically append /* for host-only URLs
8
8
  # This is a workaround for an issue with the API and *some* domains.
9
9
  # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
10
- # But don't do this when exact_url flag is set
11
- if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
12
- url = "#{url}/*"
10
+ # But don't do this when exact_url flag is set, and never append twice
11
+ if url && !@exact_url
12
+ normalized_url = url.to_s
13
+ has_wildcard = normalized_url.include?('*')
14
+ host_and_rest = normalized_url
15
+ .sub(/\Ahttps?:\/\//i, '')
16
+ .split(/[?#]/, 2)
17
+ .first
18
+ has_path = host_and_rest.include?('/')
19
+
20
+ unless has_wildcard || has_path
21
+ url = "#{normalized_url}/*"
22
+ end
13
23
  end
14
24
 
15
25
  request_url = URI("https://web.archive.org/cdx/search/cdx")
@@ -63,8 +73,9 @@ module ArchiveAPI
63
73
  end
64
74
 
65
75
  def parameters_for_api(page_index)
66
- parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "true"]]
67
- parameters.push(["filter", "statuscode:200"]) unless @all
76
+ parameters = [["fl", "timestamp,original"], ["gzip", "true"]]
77
+ parameters.push(["collapse", "digest"]) unless @keep_duplicates || @all_timestamps
78
+ parameters.push(["filter", "statuscode:2..|30[12378]"]) unless @all
68
79
  parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
69
80
  parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
70
81
  parameters.push(["page", page_index]) if page_index
@@ -2,11 +2,8 @@
2
2
 
3
3
  require 'thread'
4
4
  require 'net/http'
5
- require 'open-uri'
6
5
  require 'fileutils'
7
- require 'cgi'
8
6
  require 'json'
9
- require 'time'
10
7
  require 'concurrent-ruby'
11
8
  require 'logger'
12
9
  require 'zlib'
@@ -133,7 +130,7 @@ class WaybackMachineDownloader
133
130
  include SubdomainProcessor
134
131
  include URLRewrite
135
132
 
136
- VERSION = "2.4.6"
133
+ VERSION = "2.4.7"
137
134
  DEFAULT_TIMEOUT = 30
138
135
  MAX_RETRIES = 3
139
136
  RETRY_DELAY = 2
@@ -146,7 +143,7 @@ class WaybackMachineDownloader
146
143
 
147
144
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
148
145
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
149
- :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
146
+ :all, :keep_duplicates, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
150
147
  :snapshot_at, :page_requisites
151
148
 
152
149
  def initialize params
@@ -165,6 +162,7 @@ class WaybackMachineDownloader
165
162
  @only_filter = params[:only_filter]
166
163
  @exclude_filter = params[:exclude_filter]
167
164
  @all = params[:all]
165
+ @keep_duplicates = params[:keep_duplicates] || false
168
166
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
169
167
  @threads_count = [params[:threads_count].to_i, 1].max
170
168
  @rewritten = params[:rewritten]
@@ -293,10 +291,8 @@ class WaybackMachineDownloader
293
291
 
294
292
  # if snapshot_at is set, limit CDX queries to snapshots at or before that timestamp
295
293
  original_to = @to_timestamp
296
- skip_cache = false
297
294
  if @snapshot_at
298
295
  @to_timestamp = @snapshot_at
299
- skip_cache = true
300
296
  end
301
297
 
302
298
  puts "Getting snapshot pages from Wayback Machine API..."
@@ -329,7 +325,7 @@ class WaybackMachineDownloader
329
325
  Concurrent::Future.execute(executor: fetch_pool) do
330
326
  result = nil
331
327
  @connection_pool.with_connection do |connection|
332
- result = get_raw_list_from_api("#{@base_url}/*", page, connection)
328
+ result = get_raw_list_from_api(@base_url, page, connection)
333
329
  end
334
330
  result ||= []
335
331
  [page, result]
@@ -382,10 +378,8 @@ class WaybackMachineDownloader
382
378
  # save the fetched list to the cache file
383
379
  begin
384
380
  FileUtils.mkdir_p(File.dirname(cdx_path))
385
- unless skip_cache
386
- File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
387
- puts "Saved snapshot list to #{cdx_path}"
388
- end
381
+ File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
382
+ puts "Saved snapshot list to #{cdx_path}"
389
383
  rescue => e
390
384
  puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
391
385
  ensure
@@ -532,7 +526,16 @@ class WaybackMachineDownloader
532
526
  if File.exist?(db_path) && !@reset
533
527
  puts "Loading list of already downloaded files from #{db_path}"
534
528
  begin
535
- File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
529
+ File.foreach(db_path) do |line|
530
+ id = line.strip
531
+ # only trust DB entries that actually exist on disk; this helps when resuming
532
+ path = local_path_for_file_id(id)
533
+ if path && File.exist?(path)
534
+ downloaded_ids.add(id)
535
+ else
536
+ puts "Found DB entry but file missing, will requeue: #{id}" if @logger && @logger.level == Logger::DEBUG
537
+ end
538
+ end
536
539
  rescue => e
537
540
  puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
538
541
  downloaded_ids.clear
@@ -687,6 +690,23 @@ class WaybackMachineDownloader
687
690
  def process_single_file(file_remote_info)
688
691
  download_success = false
689
692
  downloaded_path = nil
693
+
694
+ # fast-path for resumed runs: if file already exists locally, avoid HTTP work entirely
695
+ existing_path = local_path_for_file_id(file_remote_info[:file_id])
696
+ if existing_path && File.exist?(existing_path)
697
+ result_message = "#{color("[EXISTS]", :cyan)} #{file_remote_info[:file_url]} (#{@processed_file_count + 1}/#{@total_to_download})"
698
+ @download_mutex.synchronize do
699
+ @processed_file_count += 1 if @processed_file_count < @total_to_download
700
+ puts result_message
701
+ end
702
+
703
+ append_to_db(file_remote_info[:file_id])
704
+
705
+ if @page_requisites && File.extname(existing_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
706
+ process_page_requisites(existing_path, file_remote_info)
707
+ end
708
+ return
709
+ end
690
710
 
691
711
  @connection_pool.with_connection do |connection|
692
712
  result_message, downloaded_path = download_file(file_remote_info, connection)
@@ -994,6 +1014,27 @@ class WaybackMachineDownloader
994
1014
  end
995
1015
  end
996
1016
 
1017
# Map a sanitized `file_id` (as stored in the DB) to the local path the file
# would occupy under `backup_path`.
#
# Returns nil when `file_id` is nil. An empty id, an id ending in "/", or an
# id whose last segment contains no "." is treated as a directory and resolves
# to an `index.html` inside it; anything else resolves to the file itself.
def local_path_for_file_id(file_id)
  return nil if file_id.nil?

  root = backup_path
  # the empty id denotes the site root
  return File.join(root, 'index.html') if file_id == ""

  # file_id coming from DB is expected to already be sanitized
  segments = file_id.split('/')
  leaf = segments.last
  directory_like = file_id[-1] == '/' || (leaf && !leaf.include?('.'))

  if directory_like
    File.join(File.join(root, *segments), 'index.html')
  else
    basename = segments.pop
    File.join(File.join(root, *segments), basename)
  end
end
1037
+
997
1038
  def color(text, color_code)
998
1039
  return text if Gem.win_platform? && !ENV['ENABLE_ANSI']
999
1040
  codes = { red: 31, green: 32, yellow: 33, blue: 34, magenta: 35, cyan: 36, white: 37 }
@@ -1111,6 +1152,46 @@ class WaybackMachineDownloader
1111
1152
  end
1112
1153
  end
1113
1154
 
1155
# Build the web.archive.org URL to request for `source_url` at `file_timestamp`.
#
# - A URL that already points at web.archive.org/web/ is returned unchanged.
# - A path starting with "/web/" is prefixed with the archive host.
# - Anything else is wrapped in a snapshot URL; the "id_" modifier is added
#   to the timestamp unless @rewritten is set.
def build_wayback_url(source_url, file_timestamp)
  target = source_url.to_s

  if wayback_archive_url?(target)
    target
  elsif target.start_with?('/web/')
    "https://web.archive.org#{target}"
  elsif @rewritten
    "https://web.archive.org/web/#{file_timestamp}/#{target}"
  else
    "https://web.archive.org/web/#{file_timestamp}id_/#{target}"
  end
end
1169
+
1170
# True when `url` (coerced with to_s) starts with an http(s)
# web.archive.org/web/ prefix.
def wayback_archive_url?(url)
  candidate = url.to_s
  %r{\Ahttps?://web\.archive\.org/web/}.match?(candidate)
end
1173
+
1174
# Pull the archived (original) http(s) URL out of a web.archive.org snapshot
# URL. Returns nil when `url` is not a snapshot URL of that shape.
def extract_original_url(url)
  # String#[] with a capture index yields the group, or nil when nothing matches
  url.to_s[%r{\Ahttps?://web\.archive\.org/web/\d{1,14}(?:[a-z_]*)/(https?://.+)\z}, 1]
end
1178
+
1179
# Turn a redirect `location` header into the source URL to fetch next.
#
# current_source_url - the URL whose response carried the redirect
# location           - the raw Location header value (may be nil)
#
# Returns nil for a nil/empty location. A location that is already a
# web.archive.org/web/ URL is returned as-is, and a "/web/..." path is
# prefixed with the archive host. Any other location is resolved against the
# original URL embedded in `current_source_url` (or `current_source_url`
# itself when it is not a snapshot URL). If that resolution fails, the raw
# location is returned unchanged.
def resolve_redirect_source(current_source_url, location)
  return nil if location.nil? || location.empty?

  location = location.to_s
  return location if wayback_archive_url?(location)
  return "https://web.archive.org#{location}" if location.start_with?('/web/')

  base_url = extract_original_url(current_source_url) || current_source_url.to_s
  URI.join(base_url, location).to_s
rescue URI::Error
  # URI.join raises InvalidURIError for unparsable input and BadURIError when
  # the base is scheme-less ("both URI are relative"); both fall back to the
  # raw location instead of aborting the download.
  location
end
1194
+
1114
1195
  # wrap URL in parentheses if it contains characters that commonly break unquoted
1115
1196
  # Windows CMD usage (e.g., &). This is only for display; user still must quote
1116
1197
  # when invoking manually.
@@ -1122,11 +1203,7 @@ class WaybackMachineDownloader
1122
1203
  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
1123
1204
  retries = 0
1124
1205
  begin
1125
- wayback_url = if @rewritten
1126
- "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
1127
- else
1128
- "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
1129
- end
1206
+ wayback_url = build_wayback_url(file_url, file_timestamp)
1130
1207
 
1131
1208
  # Escape characters that are not valid in URI()
1132
1209
  wayback_url = wayback_url.gsub(' ', '%20').gsub('[', '%5B').gsub(']', '%5D')
@@ -1193,7 +1270,8 @@ class WaybackMachineDownloader
1193
1270
  raise "Too many redirects for #{file_url}" if redirect_count >= 5
1194
1271
  location = response['location']
1195
1272
  @logger.warn("Redirect found for #{file_url} -> #{location}")
1196
- return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
1273
+ redirected_source = resolve_redirect_source(file_url, location)
1274
+ return download_with_retry(file_path, redirected_source, file_timestamp, connection, redirect_count + 1)
1197
1275
  when Net::HTTPTooManyRequests
1198
1276
  sleep(RATE_LIMIT * 2)
1199
1277
  raise "Rate limited, retrying..."
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader_straw
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.6
4
+ version: 2.4.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - strawberrymaster
@@ -18,7 +18,7 @@ dependencies:
18
18
  version: '1.3'
19
19
  - - ">="
20
20
  - !ruby/object:Gem::Version
21
- version: 1.3.4
21
+ version: 1.3.6
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -28,7 +28,7 @@ dependencies:
28
28
  version: '1.3'
29
29
  - - ">="
30
30
  - !ruby/object:Gem::Version
31
- version: 1.3.4
31
+ version: 1.3.6
32
32
  - !ruby/object:Gem::Dependency
33
33
  name: rake
34
34
  requirement: !ruby/object:Gem::Requirement
@@ -94,7 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
94
  - !ruby/object:Gem::Version
95
95
  version: '0'
96
96
  requirements: []
97
- rubygems_version: 4.0.3
97
+ rubygems_version: 4.0.6
98
98
  specification_version: 4
99
99
  summary: Download an entire website from the Wayback Machine.
100
100
  test_files: []