wayback_machine_downloader_straw 2.4.3 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +8 -0
- data/lib/wayback_machine_downloader/archive_api.rb +19 -1
- data/lib/wayback_machine_downloader/page_requisites.rb +33 -0
- data/lib/wayback_machine_downloader/url_rewrite.rb +71 -60
- data/lib/wayback_machine_downloader.rb +296 -110
- metadata +3 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
+  data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
+  data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
data/bin/wayback_machine_downloader
CHANGED

@@ -74,6 +74,10 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
   opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
    options[:recursive_subdomains] = true
  end
@@ -82,6 +86,10 @@ option_parser = OptionParser.new do |opts|
     options[:subdomain_depth] = t
   end
 
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
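The two new flags map straight onto library options. A minimal usage sketch of the equivalent programmatic call, assuming the constructor accepts these keys as the `attr_accessor` list and `params` reads further down suggest (the site URL is illustrative, not from this diff):

```ruby
require 'wayback_machine_downloader'

wmd = WaybackMachineDownloader.new(
  base_url: "https://example.com",  # illustrative target site
  max_retries: 5,                   # what --rt / --retry N sets
  page_requisites: true             # what --page-requisites sets
)
wmd.download_files
```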
data/lib/wayback_machine_downloader/archive_api.rb
CHANGED

@@ -16,6 +16,10 @@ module ArchiveAPI
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
     begin
       response = http.get(request_url)
       body = response.body.to_s.strip
@@ -26,7 +30,21 @@ module ArchiveAPI
       json.shift if json.first == ["timestamp", "original"]
       json
     rescue JSON::ParserError => e
-      warn "Failed to
+      warn "Failed to parse JSON from API for #{url}: #{e.message}"
+      []
+    rescue Net::ReadTimeout, Net::OpenTimeout => e
+      if retries < max_retries
+        retries += 1
+        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+        []
+      end
+    rescue StandardError => e
+      # treat any other transient-ish error similarly, though without retries for now
+      warn "Error fetching CDX data for #{url}: #{e.message}"
       []
     end
   end
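The retry wiring above uses linear backoff: sleep `delay * retries` between attempts, then give up with an empty result. A standalone sketch of the same policy (the `fetch_cdx_page` helper is hypothetical, not part of the gem):

```ruby
require 'net/http'

# Linear backoff, mirroring the hunk above: wait delay * attempt between
# attempts and return [] after max_retries consecutive timeouts.
def fetch_cdx_page(max_retries: 3, delay: 2)
  attempt = 0
  begin
    yield
  rescue Net::ReadTimeout, Net::OpenTimeout => e
    if attempt < max_retries
      attempt += 1
      warn "Timeout (#{e.class}), retry #{attempt}/#{max_retries}..."
      sleep(delay * attempt)
      retry
    else
      []  # the caller treats an empty page as "stop fetching"
    end
  end
end
```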
data/lib/wayback_machine_downloader/page_requisites.rb
ADDED

@@ -0,0 +1,33 @@
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
+end
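Assuming the regex and the srcset splitting behave as written above, extraction works like this (sample HTML is illustrative; the require path is relative to the gem's lib directory):

```ruby
require_relative 'wayback_machine_downloader/page_requisites'

html = <<~HTML
  <link rel="stylesheet" href="/css/site.css">
  <img src="logo.png" srcset="logo.png 1x, logo@2x.png 2x">
  <a href="mailto:admin@example.com">contact</a>
HTML

PageRequisites.extract(html)
# => ["/css/site.css", "logo.png", "logo@2x.png"]
# data:, mailto:, javascript: and #anchor URLs are rejected by valid_asset?
```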
data/lib/wayback_machine_downloader/url_rewrite.rb
CHANGED

@@ -1,74 +1,85 @@
 # frozen_string_literal: true
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      "#{prefix}
-    else
-      "#{prefix}#{url}#{suffix}"
+module URLRewrite
+  # server-side extensions that should work locally
+  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
+
+  def rewrite_html_attr_urls(content)
+    # rewrite URLs to relative paths
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
+    end
+
+    # rewrite absolute URLs to same domain as relative
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
     end
+
+    content
   end
-    content
-  end
 
-
-
+  def rewrite_css_urls(content)
+    # rewrite URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
+    end
 
-
-    url
-
-
-    begin
-      uri = URI.parse(url)
-      path = uri.path
-      path = path[1..-1] if path.start_with?('/')
-      "url(\"#{path}\")"
-    rescue
-      "url(\"#{url}\")"
-    end
-    elsif url.start_with?('/')
-      "url(\"./#{url[1..-1]}\")"
-    else
-      "url(\"#{url}\")"
+    # rewrite absolute URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
     end
+
+    content
   end
-    content
-  end
 
-
-
-
-
-
+  def rewrite_js_urls(content)
+    # rewrite archive.org URLs in JavaScript strings
+    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    # rewrite absolute URLs in JavaScript
+    content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    content
+  end
+
+  private
+
+  def normalize_path_for_local(path)
+    return "./index.html" if path.empty? || path == "/"
 
-
-
-
-
-
-
-
-
-      end
-    elsif url.start_with?('/')
-      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    # handle query strings - they're already part of the filename
+    path = path.split('?').first if path.include?('?')
+
+    # check if this is a server-side script
+    ext = File.extname(path).downcase
+    if SERVER_SIDE_EXTS.include?(ext)
+      # keep the path as-is but ensure it starts with ./
+      path = "./#{path}" unless path.start_with?('./', '/')
     else
-
+      # regular file handling
+      path = "./#{path}" unless path.start_with?('./', '/')
+
+      # if it looks like a directory, add index.html
+      if path.end_with?('/') || !path.include?('.')
+        path = "#{path.chomp('/')}/index.html"
+      end
     end
+
+    path
   end
-
-    content
 end
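A few input/output pairs one would expect from `normalize_path_for_local` under the logic above (the anonymous harness is only there to reach the private helper):

```ruby
require_relative 'wayback_machine_downloader/url_rewrite'

harness = Class.new { include URLRewrite }.new

harness.send(:normalize_path_for_local, "")                # => "./index.html"
harness.send(:normalize_path_for_local, "/blog/")          # => "/blog/index.html"
harness.send(:normalize_path_for_local, "/about")          # => "/about/index.html"
harness.send(:normalize_path_for_local, "img/logo.png")    # => "./img/logo.png"
harness.send(:normalize_path_for_local, "/cart.php?id=42") # => "/cart.php"
```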
data/lib/wayback_machine_downloader.rb
CHANGED

@@ -15,6 +15,7 @@ require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/page_requisites'
 require_relative 'wayback_machine_downloader/subdom_processor'
 require_relative 'wayback_machine_downloader/url_rewrite'
 
@@ -25,69 +26,81 @@ class ConnectionPool
   MAX_RETRIES = 3
 
   def initialize(size)
-    @
-    @pool
-    @creation_times = Concurrent::Map.new
+    @pool = SizedQueue.new(size)
+    size.times { @pool << build_connection_entry }
     @cleanup_thread = schedule_cleanup
   end
 
-  def with_connection
-
+  def with_connection
+    entry = acquire_connection
     begin
-      yield
+      yield entry[:http]
     ensure
-      release_connection(
+      release_connection(entry)
     end
   end
 
   def shutdown
     @cleanup_thread&.exit
-
-    @pool.clear
-    @creation_times.clear
+    drain_pool { |entry| safe_finish(entry[:http]) }
   end
 
   private
 
   def acquire_connection
-
-
+    entry = @pool.pop
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    entry
+  end
 
-
-
-
-
-    @creation_times[thread_id] = Time.now
+  def release_connection(entry)
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
     end
+    @pool << entry
+  end
 
-
+  def stale?(entry)
+    http = entry[:http]
+    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
   end
 
-  def
-
-    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
-      conn.finish
-      @pool.delete(Thread.current.object_id)
-      @creation_times.delete(Thread.current.object_id)
-    end
+  def build_connection_entry
+    { http: create_connection, created_at: Time.now }
   end
 
-  def
-
-
-
-    false
+  def safe_finish(http)
+    http.finish if http&.started?
+  rescue StandardError
+    nil
   end
 
-  def
-
-
-
-
-
-
-
-
+  def drain_pool
+    loop do
+      entry = begin
+        @pool.pop(true)
+      rescue ThreadError
+        break
+      end
+      yield(entry)
+    end
+  end
+
+  def cleanup_old_connections
+    entry = begin
+      @pool.pop(true)
+    rescue ThreadError
+      return
+    end
+    if stale?(entry)
+      safe_finish(entry[:http])
+      entry = build_connection_entry
+    end
+    @pool << entry
   end
 
   def schedule_cleanup
@@ -99,16 +112,15 @@ class ConnectionPool
     end
   end
 
-  def
-
-
-
-
-
-
-
-
-  end
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
   end
 end
 
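Usage stays the same from the caller's side; only the internals moved from per-thread bookkeeping in a Concurrent::Map to a SizedQueue of `{http:, created_at:}` entries that are rebuilt when stale. A sketch (note this opens a real connection to web.archive.org):

```ruby
require 'wayback_machine_downloader'

pool = ConnectionPool.new(4)

pool.with_connection do |http|
  # http is a started Net::HTTP session; stale entries are replaced before reuse
  response = http.get("/web/20230101000000/https://example.com/")
  puts response.code
end

pool.shutdown  # drains the queue and finishes every connection
```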
@@ -116,8 +128,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
   include SubdomainProcessor
+  include URLRewrite
 
-  VERSION = "2.4.3"
+  VERSION = "2.4.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -131,7 +144,7 @@ class WaybackMachineDownloader
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
     :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
-    :snapshot_at
+    :snapshot_at, :page_requisites
 
   def initialize params
     validate_params(params)
@@ -163,6 +176,9 @@ class WaybackMachineDownloader
     @recursive_subdomains = params[:recursive_subdomains] || false
     @subdomain_depth = params[:subdomain_depth] || 1
     @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
+    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+    @page_requisites = params[:page_requisites] || false
+    @pending_jobs = Concurrent::AtomicFixnum.new(0)
 
     # URL for rejecting invalid/unencoded wayback urls
     @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -171,18 +187,29 @@ class WaybackMachineDownloader
   end
 
   def backup_name
-    url_to_process = @base_url
+    url_to_process = @base_url
+    url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
     raw = if url_to_process.include?('//')
       url_to_process.split('/')[2]
     else
       url_to_process
     end
 
+    # if it looks like a wildcard pattern, normalize to a safe host-ish name
+    if raw&.start_with?('*.')
+      raw = raw.sub(/\A\*\./, 'all-')
+    end
+
     # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
     if Gem.win_platform?
       raw = raw.gsub(/[:*?"<>|]/, '_')
       raw = raw.gsub(/[ .]+\z/, '')
+    else
+      # still good practice to strip path separators (and maybe '*' for POSIX too)
+      raw = raw.gsub(/[\/:*?"<>|]/, '_')
     end
+
     raw = 'site' if raw.nil? || raw.empty?
     raw
   end
@@ -193,7 +220,8 @@ class WaybackMachineDownloader
       @directory
     else
       # ensure the default path is absolute and normalized
-
+      cwd = Dir.pwd
+      File.expand_path(File.join(cwd, 'websites', backup_name))
     end
   end
 
@@ -277,53 +305,62 @@ class WaybackMachineDownloader
     page_index = 0
     batch_size = [@threads_count, 5].min
     continue_fetching = true
-
-
-
-
-
-
-
-
-
-
-
-
+    fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
+    begin
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute(executor: fetch_pool) do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            result ||= []
+            [page, result]
           end
-      result ||= []
-      [page, result]
         end
-    end
 
-
+        results = []
 
-
-
-
-
-
+        futures.each do |future|
+          begin
+            val = future.value
+            # only append if valid
+            if val && val.is_a?(Array) && val.first.is_a?(Integer)
+              results << val
+            end
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
         end
-      end
 
-
-
-
-
-
-
-
-
-
-
-
-
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.nil? || result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
          end
        end
-      end
 
-
+        page_index = end_index
 
-
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    ensure
+      fetch_pool.shutdown
+      fetch_pool.wait_for_termination
     end
   end
 
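The batching above is the usual future-per-page pattern from concurrent-ruby; condensed, it looks like this (`fetch_page` is a hypothetical stand-in for the CDX call, and the pool size and page count are illustrative):

```ruby
require 'concurrent'

# Hypothetical stand-in for get_raw_list_from_api; sleeps to fake latency.
def fetch_page(page)
  sleep(rand / 10)
  ["row-for-page-#{page}"]
end

fetch_pool = Concurrent::FixedThreadPool.new(4)

futures = (0...8).map do |page|
  Concurrent::Future.execute(executor: fetch_pool) { [page, fetch_page(page)] }
end

# Future#value blocks until the job finishes and is nil on failure,
# hence the compact; sorting restores page order.
results = futures.map(&:value).compact.sort_by(&:first)

fetch_pool.shutdown
fetch_pool.wait_for_termination
```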
@@ -531,7 +568,7 @@ class WaybackMachineDownloader
       end
     end
   end
-
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -552,6 +589,12 @@ class WaybackMachineDownloader
 
     # Load IDs of already downloaded files
     downloaded_ids = load_downloaded_ids
+
+    # We use a thread-safe Set to track what we have queued/downloaded in this session
+    # to avoid infinite loops with page requisites
+    @session_downloaded_ids = Concurrent::Set.new
+    downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
     files_to_process = files_to_download.reject do |file_info|
       downloaded_ids.include?(file_info[:file_id])
     end
@@ -562,8 +605,8 @@ class WaybackMachineDownloader
     if skipped_count > 0
       puts "Found #{skipped_count} previously downloaded files, skipping them."
     end
-
-    if remaining_count == 0
+
+    if remaining_count == 0 && !@page_requisites
       puts "All matching files have already been downloaded."
       cleanup
       return
@@ -576,12 +619,22 @@ class WaybackMachineDownloader
     @download_mutex = Mutex.new
 
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
-
+    @worker_pool = Concurrent::FixedThreadPool.new(thread_count)
 
-
+    # initial batch
+    files_to_process.each do |file_remote_info|
+      @session_downloaded_ids.add(file_remote_info[:file_id])
+      submit_download_job(file_remote_info)
+    end
+
+    # wait for all jobs to finish
+    loop do
+      sleep 0.5
+      break if @pending_jobs.value == 0
+    end
 
-
-
+    @worker_pool.shutdown
+    @worker_pool.wait_for_termination
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
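The `@pending_jobs` counter is what lets the wait loop cope with jobs that enqueue more jobs (page requisites): the increment happens before `post`, so the count can only reach zero once every queued and not-yet-queued descendant job has finished. A self-contained sketch of that invariant:

```ruby
require 'concurrent'

pool    = Concurrent::FixedThreadPool.new(4)
pending = Concurrent::AtomicFixnum.new(0)

# Mirrors submit_download_job below: count before posting, decrement in ensure.
submit = lambda do |work|
  pending.increment
  pool.post do
    begin
      work.call          # a job may itself call submit, like a requisite does
    ensure
      pending.decrement  # runs even if the job raises
    end
  end
end

10.times { submit.call(-> { sleep 0.01 }) }

sleep 0.05 until pending.value.zero?
pool.shutdown
pool.wait_for_termination
```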
@@ -599,6 +652,138 @@ class WaybackMachineDownloader
     cleanup
   end
 
+  # helper to submit jobs and increment the counter
+  def submit_download_job(file_remote_info)
+    @pending_jobs.increment
+    @worker_pool.post do
+      begin
+        process_single_file(file_remote_info)
+      ensure
+        @pending_jobs.decrement
+      end
+    end
+  end
+
+  def process_single_file(file_remote_info)
+    download_success = false
+    downloaded_path = nil
+
+    @connection_pool.with_connection do |connection|
+      result_message, path = download_file(file_remote_info, connection)
+      downloaded_path = path
+
+      if result_message && result_message.include?(' -> ')
+        download_success = true
+      end
+
+      @download_mutex.synchronize do
+        @processed_file_count += 1 if @processed_file_count < @total_to_download
+        # only print if it's a "User" file or a requisite we found
+        puts result_message if result_message
+      end
+    end
+
+    if download_success
+      append_to_db(file_remote_info[:file_id])
+
+      if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+        process_page_requisites(downloaded_path, file_remote_info)
+      end
+    end
+  rescue => e
+    @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+  end
+
+  def process_page_requisites(file_path, parent_remote_info)
+    return unless File.exist?(file_path)
+
+    content = File.read(file_path)
+    content = content.force_encoding('UTF-8').scrub
+
+    assets = PageRequisites.extract(content)
+
+    # prepare base URI for resolving relative paths
+    parent_raw = parent_remote_info[:file_url]
+    parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+
+    begin
+      base_uri = URI(parent_raw)
+      # calculate the "root" host of the site we are downloading to compare later
+      current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+    rescue URI::InvalidURIError
+      return
+    end
+
+    parent_timestamp = parent_remote_info[:timestamp]
+
+    assets.each do |asset_rel_url|
+      begin
+        # resolve full URL (handles relative paths like "../img/logo.png")
+        resolved_uri = base_uri + asset_rel_url
+
+        # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
+        asset_timestamp = parent_timestamp
+        if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+          embedded_ts = $1
+          begin
+            orig_uri = URI($2)
+            resolved_uri = orig_uri
+            asset_timestamp = embedded_ts.to_i
+          rescue URI::InvalidURIError
+            # fall back to original resolved_uri and parent timestamp
+          end
+        end
+
+        # filter out navigation links (pages) vs assets
+        # skip if extension is empty or looks like an HTML page
+        path = resolved_uri.path
+        ext = File.extname(path).downcase
+        if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+          next
+        end
+
+        # construct the URL for the Wayback API
+        asset_wbm_url = resolved_uri.host + resolved_uri.path
+        asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+        # construct the local file ID
+        # if the asset is on the SAME domain, strip the domain from the folder path
+        # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+        if resolved_uri.host == current_project_host
+          # e.g. /static/css/style.css
+          asset_file_id = resolved_uri.path
+          asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+        else
+          # e.g. cdn.google.com/jquery.js
+          asset_file_id = asset_wbm_url
+        end
+
+      rescue URI::InvalidURIError, StandardError
+        next
+      end
+
+      # sanitize and queue
+      asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+      unless @session_downloaded_ids.include?(asset_id)
+        @session_downloaded_ids.add(asset_id)
+
+        new_file_info = {
+          file_url: asset_wbm_url,
+          timestamp: asset_timestamp,
+          file_id: asset_id
+        }
+
+        @download_mutex.synchronize do
+          @total_to_download += 1
+          puts "Queued requisite: #{asset_file_id}"
+        end
+
+        submit_download_job(new_file_info)
+      end
+    end
+  end
+
   def structure_dir_path dir_path
     begin
       FileUtils::mkdir_p dir_path unless File.exist? dir_path
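The `base_uri + asset_rel_url` step above relies on standard `URI` merge semantics, which is what makes `../`-style references land in the right place (values are illustrative):

```ruby
require 'uri'

base = URI("http://example.com/blog/post.html")
(base + "../img/logo.png").to_s  # => "http://example.com/img/logo.png"
(base + "style.css").to_s        # => "http://example.com/blog/style.css"
(base + "/favicon.ico").to_s     # => "http://example.com/favicon.ico"
```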
@@ -630,7 +815,8 @@ class WaybackMachineDownloader
     begin
       content = File.binread(file_path)
 
-
+      # detect encoding for HTML files
+      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
         encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
         content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
       else
@@ -638,21 +824,21 @@ class WaybackMachineDownloader
       end
 
       # URLs in HTML attributes
-      rewrite_html_attr_urls(content)
+      content = rewrite_html_attr_urls(content)
 
       # URLs in CSS
-      rewrite_css_urls(content)
+      content = rewrite_css_urls(content)
 
       # URLs in JavaScript
-      rewrite_js_urls(content)
+      content = rewrite_js_urls(content)
 
-      # for URLs
+      # for URLs that start with a single slash, make them relative
       content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
         prefix, path, suffix = $1, $2, $3
         "#{prefix}./#{path}#{suffix}"
       end
 
-      # for URLs in CSS that start with a single slash
+      # for URLs in CSS that start with a single slash, make them relative
       content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
         path = $1
         "url(\"./#{path}\")"
@@ -705,7 +891,7 @@ class WaybackMachineDownloader
     # check existence *before* download attempt
     # this handles cases where a file was created manually or by a previous partial run without a .db entry
     if File.exist? file_path
-      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+      return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
     end
 
     begin
@@ -717,13 +903,13 @@ class WaybackMachineDownloader
         if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
           rewrite_urls_to_relative(file_path)
         end
-        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
       when :skipped_not_found
-        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
       else
         # ideally, this case should not be reached if download_with_retry behaves as expected.
         @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
-        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
       end
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -731,7 +917,7 @@ class WaybackMachineDownloader
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
-      msg
+      return [msg, nil]
     end
   end
 
@@ -934,9 +1120,9 @@ class WaybackMachineDownloader
       end
 
     rescue StandardError => e
-      if retries <
+      if retries < @max_retries
         retries += 1
-        @logger.warn("Retry #{retries}/#{
+        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
         sleep(RETRY_DELAY * retries)
         retry
       else
metadata
CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader_straw
 version: !ruby/object:Gem::Version
-  version: 2.4.3
+  version: 2.4.5
 platform: ruby
 authors:
 - strawberrymaster
 bindir: bin
 cert_chain: []
-date:
+date: 2026-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
@@ -71,6 +71,7 @@ files:
 - bin/wayback_machine_downloader
 - lib/wayback_machine_downloader.rb
 - lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/page_requisites.rb
 - lib/wayback_machine_downloader/subdom_processor.rb
 - lib/wayback_machine_downloader/tidy_bytes.rb
 - lib/wayback_machine_downloader/to_regex.rb