wayback_machine_downloader_straw 2.4.6 → 2.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader/archive_api.rb +17 -6
- data/lib/wayback_machine_downloader.rb +97 -19
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 558d3187ee31faeadb08cf83e32a87307ae9d55a3327206598f27a78fb715e08
|
|
4
|
+
data.tar.gz: 9845999e0e618afde419869bb01b04277aca318aa80b238feb8252540fc16315
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b323c1065ea1ab1d3c5909458cae726462ba1b88fd89effe8cd1efbdd1301d2022363c56b191f8ddbe28c0143fe87dfc94ca847865bb08e3564e29ba36f231a4
|
|
7
|
+
data.tar.gz: 00c4e775ee05e176e1048d6e5ddd9ddc436edaf9bf7fea61dbab33480e4adc499ea8a3a584e591858aad3a4c2f10096a3b6eb1ff6c8146f26ba6a3d42f104e32
|
|
@@ -48,6 +48,10 @@ option_parser = OptionParser.new do |opts|
|
|
|
48
48
|
options[:all] = true
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
opts.on("--keep-duplicates", "Do not collapse duplicate CDX captures by digest") do |t|
|
|
52
|
+
options[:keep_duplicates] = true
|
|
53
|
+
end
|
|
54
|
+
|
|
51
55
|
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
|
|
52
56
|
options[:threads_count] = t
|
|
53
57
|
end
|
|
@@ -4,12 +4,22 @@ require 'uri'
|
|
|
4
4
|
module ArchiveAPI
|
|
5
5
|
|
|
6
6
|
def get_raw_list_from_api(url, page_index, http)
|
|
7
|
-
# Automatically append /*
|
|
7
|
+
# Automatically append /* for host-only URLs
|
|
8
8
|
# This is a workaround for an issue with the API and *some* domains.
|
|
9
9
|
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
|
10
|
-
# But don't do this when exact_url flag is set
|
|
11
|
-
if url &&
|
|
12
|
-
|
|
10
|
+
# But don't do this when exact_url flag is set, and never append twice
|
|
11
|
+
if url && !@exact_url
|
|
12
|
+
normalized_url = url.to_s
|
|
13
|
+
has_wildcard = normalized_url.include?('*')
|
|
14
|
+
host_and_rest = normalized_url
|
|
15
|
+
.sub(/\Ahttps?:\/\//i, '')
|
|
16
|
+
.split(/[?#]/, 2)
|
|
17
|
+
.first
|
|
18
|
+
has_path = host_and_rest.include?('/')
|
|
19
|
+
|
|
20
|
+
unless has_wildcard || has_path
|
|
21
|
+
url = "#{normalized_url}/*"
|
|
22
|
+
end
|
|
13
23
|
end
|
|
14
24
|
|
|
15
25
|
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
|
@@ -63,8 +73,9 @@ module ArchiveAPI
|
|
|
63
73
|
end
|
|
64
74
|
|
|
65
75
|
def parameters_for_api(page_index)
|
|
66
|
-
parameters = [["fl", "timestamp,original"], ["
|
|
67
|
-
parameters.push(["
|
|
76
|
+
parameters = [["fl", "timestamp,original"], ["gzip", "true"]]
|
|
77
|
+
parameters.push(["collapse", "digest"]) unless @keep_duplicates || @all_timestamps
|
|
78
|
+
parameters.push(["filter", "statuscode:2..|30[12378]"]) unless @all
|
|
68
79
|
parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
|
|
69
80
|
parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
|
|
70
81
|
parameters.push(["page", page_index]) if page_index
|
|
@@ -2,11 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require 'thread'
|
|
4
4
|
require 'net/http'
|
|
5
|
-
require 'open-uri'
|
|
6
5
|
require 'fileutils'
|
|
7
|
-
require 'cgi'
|
|
8
6
|
require 'json'
|
|
9
|
-
require 'time'
|
|
10
7
|
require 'concurrent-ruby'
|
|
11
8
|
require 'logger'
|
|
12
9
|
require 'zlib'
|
|
@@ -133,7 +130,7 @@ class WaybackMachineDownloader
|
|
|
133
130
|
include SubdomainProcessor
|
|
134
131
|
include URLRewrite
|
|
135
132
|
|
|
136
|
-
VERSION = "2.4.
|
|
133
|
+
VERSION = "2.4.7"
|
|
137
134
|
DEFAULT_TIMEOUT = 30
|
|
138
135
|
MAX_RETRIES = 3
|
|
139
136
|
RETRY_DELAY = 2
|
|
@@ -146,7 +143,7 @@ class WaybackMachineDownloader
|
|
|
146
143
|
|
|
147
144
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
|
148
145
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
|
149
|
-
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
|
146
|
+
:all, :keep_duplicates, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
|
150
147
|
:snapshot_at, :page_requisites
|
|
151
148
|
|
|
152
149
|
def initialize params
|
|
@@ -165,6 +162,7 @@ class WaybackMachineDownloader
|
|
|
165
162
|
@only_filter = params[:only_filter]
|
|
166
163
|
@exclude_filter = params[:exclude_filter]
|
|
167
164
|
@all = params[:all]
|
|
165
|
+
@keep_duplicates = params[:keep_duplicates] || false
|
|
168
166
|
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
|
169
167
|
@threads_count = [params[:threads_count].to_i, 1].max
|
|
170
168
|
@rewritten = params[:rewritten]
|
|
@@ -293,10 +291,8 @@ class WaybackMachineDownloader
|
|
|
293
291
|
|
|
294
292
|
# if snapshot_at is set, limit CDX queries to snapshots at or before that timestamp
|
|
295
293
|
original_to = @to_timestamp
|
|
296
|
-
skip_cache = false
|
|
297
294
|
if @snapshot_at
|
|
298
295
|
@to_timestamp = @snapshot_at
|
|
299
|
-
skip_cache = true
|
|
300
296
|
end
|
|
301
297
|
|
|
302
298
|
puts "Getting snapshot pages from Wayback Machine API..."
|
|
@@ -329,7 +325,7 @@ class WaybackMachineDownloader
|
|
|
329
325
|
Concurrent::Future.execute(executor: fetch_pool) do
|
|
330
326
|
result = nil
|
|
331
327
|
@connection_pool.with_connection do |connection|
|
|
332
|
-
result = get_raw_list_from_api(
|
|
328
|
+
result = get_raw_list_from_api(@base_url, page, connection)
|
|
333
329
|
end
|
|
334
330
|
result ||= []
|
|
335
331
|
[page, result]
|
|
@@ -382,10 +378,8 @@ class WaybackMachineDownloader
|
|
|
382
378
|
# save the fetched list to the cache file
|
|
383
379
|
begin
|
|
384
380
|
FileUtils.mkdir_p(File.dirname(cdx_path))
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
puts "Saved snapshot list to #{cdx_path}"
|
|
388
|
-
end
|
|
381
|
+
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
|
|
382
|
+
puts "Saved snapshot list to #{cdx_path}"
|
|
389
383
|
rescue => e
|
|
390
384
|
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
|
|
391
385
|
ensure
|
|
@@ -532,7 +526,16 @@ class WaybackMachineDownloader
|
|
|
532
526
|
if File.exist?(db_path) && !@reset
|
|
533
527
|
puts "Loading list of already downloaded files from #{db_path}"
|
|
534
528
|
begin
|
|
535
|
-
File.foreach(db_path)
|
|
529
|
+
File.foreach(db_path) do |line|
|
|
530
|
+
id = line.strip
|
|
531
|
+
# only trust DB entries that actually exist on disk; this helps when resuming
|
|
532
|
+
path = local_path_for_file_id(id)
|
|
533
|
+
if path && File.exist?(path)
|
|
534
|
+
downloaded_ids.add(id)
|
|
535
|
+
else
|
|
536
|
+
puts "Found DB entry but file missing, will requeue: #{id}" if @logger && @logger.level == Logger::DEBUG
|
|
537
|
+
end
|
|
538
|
+
end
|
|
536
539
|
rescue => e
|
|
537
540
|
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
|
|
538
541
|
downloaded_ids.clear
|
|
@@ -687,6 +690,23 @@ class WaybackMachineDownloader
|
|
|
687
690
|
def process_single_file(file_remote_info)
|
|
688
691
|
download_success = false
|
|
689
692
|
downloaded_path = nil
|
|
693
|
+
|
|
694
|
+
# fast-path for resumed runs: if file already exists locally, avoid HTTP work entirely
|
|
695
|
+
existing_path = local_path_for_file_id(file_remote_info[:file_id])
|
|
696
|
+
if existing_path && File.exist?(existing_path)
|
|
697
|
+
result_message = "#{color("[EXISTS]", :cyan)} #{file_remote_info[:file_url]} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
698
|
+
@download_mutex.synchronize do
|
|
699
|
+
@processed_file_count += 1 if @processed_file_count < @total_to_download
|
|
700
|
+
puts result_message
|
|
701
|
+
end
|
|
702
|
+
|
|
703
|
+
append_to_db(file_remote_info[:file_id])
|
|
704
|
+
|
|
705
|
+
if @page_requisites && File.extname(existing_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
|
|
706
|
+
process_page_requisites(existing_path, file_remote_info)
|
|
707
|
+
end
|
|
708
|
+
return
|
|
709
|
+
end
|
|
690
710
|
|
|
691
711
|
@connection_pool.with_connection do |connection|
|
|
692
712
|
result_message, downloaded_path = download_file(file_remote_info, connection)
|
|
@@ -994,6 +1014,27 @@ class WaybackMachineDownloader
|
|
|
994
1014
|
end
|
|
995
1015
|
end
|
|
996
1016
|
|
|
1017
|
+
# derive the local filesystem path for a sanitized `file_id` stored in the DB
|
|
1018
|
+
# Derive the local filesystem path for a `file_id` stored in the download DB.
#
# The id is split on '/' and joined under `backup_path` (defined elsewhere in
# this class). An id is treated as a directory, and mapped to the
# `index.html` inside it, when it is empty, ends with '/', or its last
# segment contains no '.'. Otherwise the last segment is used as the file name.
#
# The id is expected to be sanitized already; no sanitizing is done here.
#
# @param file_id [String, nil] path-like identifier
# @return [String, nil] local path, or nil when `file_id` is nil
def local_path_for_file_id(file_id)
  return nil if file_id.nil?

  segments = file_id.split('/')
  leaf = segments.last

  # `leaf` is nil for "" and for ids made only of slashes, because
  # String#split drops trailing empty fields; both mean "the root document".
  directory_like = file_id.end_with?('/') || leaf.nil? || !leaf.include?('.')
  segments << 'index.html' if directory_like

  File.join(backup_path, *segments)
end
|
|
1037
|
+
|
|
997
1038
|
def color(text, color_code)
|
|
998
1039
|
return text if Gem.win_platform? && !ENV['ENABLE_ANSI']
|
|
999
1040
|
codes = { red: 31, green: 32, yellow: 33, blue: 34, magenta: 35, cyan: 36, white: 37 }
|
|
@@ -1111,6 +1152,46 @@ class WaybackMachineDownloader
|
|
|
1111
1152
|
end
|
|
1112
1153
|
end
|
|
1113
1154
|
|
|
1155
|
+
# Build the web.archive.org URL to request for `source_url`.
#
# - A source that is already an absolute archive URL is returned unchanged.
# - A source starting with "/web/" is prefixed with the archive origin.
# - Anything else is embedded after the timestamp; when `@rewritten` is
#   falsy the timestamp carries the "id_" suffix (presumably the modifier
#   that requests the unmodified capture; confirm against Wayback docs).
#
# @param source_url [#to_s] original or archive URL
# @param file_timestamp [#to_s] capture timestamp interpolated into the URL
# @return [String]
def build_wayback_url(source_url, file_timestamp)
  target = source_url.to_s

  return target if wayback_archive_url?(target)
  return "https://web.archive.org#{target}" if target.start_with?('/web/')

  modifier = @rewritten ? '' : 'id_'
  "https://web.archive.org/web/#{file_timestamp}#{modifier}/#{target}"
end
|
|
1169
|
+
|
|
1170
|
+
# Tell whether `url` is already an absolute web.archive.org "/web/" URL.
#
# Scheme and host are compared case-insensitively (URI scheme and host are
# case-insensitive), so "HTTPS://Web.Archive.org/web/..." is recognised as
# well. The "/web/" path prefix stays case-sensitive.
#
# @param url [#to_s] candidate URL; nil is treated as an empty string
# @return [Boolean]
def wayback_archive_url?(url)
  url.to_s.match?(%r{\A(?i:https?://web\.archive\.org)/web/})
end
|
|
1173
|
+
|
|
1174
|
+
# Pull the embedded original URL out of an absolute Wayback archive URL.
#
# Expected shape:
#   http(s)://web.archive.org/web/<1-14 digits><optional [a-z_] modifier>/<http(s)://...>
#
# The archive scheme and host, and the scheme of the embedded URL, are matched
# case-insensitively; everything else is matched as written.
#
# @param url [#to_s] candidate archive URL
# @return [String, nil] the embedded URL exactly as it appears, or nil when
#   `url` does not have the expected shape
def extract_original_url(url)
  # String#[] with a capture index yields the capture, or nil on no match.
  url.to_s[%r{\A(?i:https?://web\.archive\.org)/web/\d{1,14}[a-z_]*/((?i:https?)://.+)\z}, 1]
end
|
|
1178
|
+
|
|
1179
|
+
# Turn a redirect `Location` value into the next source URL to fetch.
#
# - nil or empty location             -> nil
# - absolute archive URL              -> returned unchanged
# - location starting with "/web/"    -> prefixed with https://web.archive.org
# - anything else                     -> resolved with URI.join against the
#   original URL embedded in `current_source_url` (or against
#   `current_source_url` itself when nothing is embedded)
#
# Resolution is best-effort: if the URI library rejects the base or the
# location, the location string is returned as-is.
#
# @param current_source_url [#to_s] URL that produced the redirect
# @param location [#to_s, nil] value of the Location header
# @return [String, nil]
def resolve_redirect_source(current_source_url, location)
  # Coerce first so a non-String value cannot fail on `empty?`.
  target = location.to_s
  return nil if target.empty?
  return target if wayback_archive_url?(target)
  return "https://web.archive.org#{target}" if target.start_with?('/web/')

  base = extract_original_url(current_source_url) || current_source_url.to_s
  URI.join(base, target).to_s
rescue URI::Error
  # URI::Error covers InvalidURIError (unparsable input) and BadURIError
  # (raised by URI.join when the base has no scheme and the location is
  # relative); both fall back to the location string.
  target
end
|
|
1194
|
+
|
|
1114
1195
|
# wrap URL in parentheses if it contains characters that commonly break unquoted
|
|
1115
1196
|
# Windows CMD usage (e.g., &). This is only for display; user still must quote
|
|
1116
1197
|
# when invoking manually.
|
|
@@ -1122,11 +1203,7 @@ class WaybackMachineDownloader
|
|
|
1122
1203
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
|
1123
1204
|
retries = 0
|
|
1124
1205
|
begin
|
|
1125
|
-
wayback_url =
|
|
1126
|
-
"https://web.archive.org/web/#{file_timestamp}/#{file_url}"
|
|
1127
|
-
else
|
|
1128
|
-
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
|
1129
|
-
end
|
|
1206
|
+
wayback_url = build_wayback_url(file_url, file_timestamp)
|
|
1130
1207
|
|
|
1131
1208
|
# Escape characters that are not valid in URI()
|
|
1132
1209
|
wayback_url = wayback_url.gsub(' ', '%20').gsub('[', '%5B').gsub(']', '%5D')
|
|
@@ -1193,7 +1270,8 @@ class WaybackMachineDownloader
|
|
|
1193
1270
|
raise "Too many redirects for #{file_url}" if redirect_count >= 5
|
|
1194
1271
|
location = response['location']
|
|
1195
1272
|
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
|
1196
|
-
|
|
1273
|
+
redirected_source = resolve_redirect_source(file_url, location)
|
|
1274
|
+
return download_with_retry(file_path, redirected_source, file_timestamp, connection, redirect_count + 1)
|
|
1197
1275
|
when Net::HTTPTooManyRequests
|
|
1198
1276
|
sleep(RATE_LIMIT * 2)
|
|
1199
1277
|
raise "Rate limited, retrying..."
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wayback_machine_downloader_straw
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.4.
|
|
4
|
+
version: 2.4.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- strawberrymaster
|
|
@@ -18,7 +18,7 @@ dependencies:
|
|
|
18
18
|
version: '1.3'
|
|
19
19
|
- - ">="
|
|
20
20
|
- !ruby/object:Gem::Version
|
|
21
|
-
version: 1.3.
|
|
21
|
+
version: 1.3.6
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -28,7 +28,7 @@ dependencies:
|
|
|
28
28
|
version: '1.3'
|
|
29
29
|
- - ">="
|
|
30
30
|
- !ruby/object:Gem::Version
|
|
31
|
-
version: 1.3.
|
|
31
|
+
version: 1.3.6
|
|
32
32
|
- !ruby/object:Gem::Dependency
|
|
33
33
|
name: rake
|
|
34
34
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -94,7 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
94
94
|
- !ruby/object:Gem::Version
|
|
95
95
|
version: '0'
|
|
96
96
|
requirements: []
|
|
97
|
-
rubygems_version: 4.0.
|
|
97
|
+
rubygems_version: 4.0.6
|
|
98
98
|
specification_version: 4
|
|
99
99
|
summary: Download an entire website from the Wayback Machine.
|
|
100
100
|
test_files: []
|