archaeo 0.2.7 → 0.2.8
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_health_check.rb +77 -0
- data/lib/archaeo/bulk_downloader.rb +82 -24
- data/lib/archaeo/cdx_api.rb +39 -7
- data/lib/archaeo/cdx_cache.rb +105 -0
- data/lib/archaeo/cli.rb +109 -8
- data/lib/archaeo/download_state.rb +35 -0
- data/lib/archaeo/encoding_detector.rb +91 -0
- data/lib/archaeo/page.rb +1 -1
- data/lib/archaeo/path_sanitizer.rb +152 -0
- data/lib/archaeo/pattern_filter.rb +80 -0
- data/lib/archaeo/rate_limiter.rb +86 -0
- data/lib/archaeo/save_api.rb +7 -2
- data/lib/archaeo/save_result.rb +26 -8
- data/lib/archaeo/subdomain_discovery.rb +117 -0
- data/lib/archaeo/url_rewriter.rb +64 -7
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +7 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67239af7cc927c495c67a849ecefb1cdc886ce8d95ddd6e27a2decdde6a93cd3
+  data.tar.gz: 8ce4a0f786c2e7db3268b6660a1aa9e2f3b913ff99c22c85c3c2190457defc90
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ca0a9cc2bf0ad33a0d3dfd88e3228fd79fc3291a42fd3d13bbfbe4e37e744b0e3a5dadcec1cab48c0e13b6af872a8e3f4e80ce3e6593b18f024416b9cf7370fa
+  data.tar.gz: bb4b1d9e720dfdcc18c7c4ccb73cc55e29a3e31fb6ffb5bf3b8c0fce1548a63a06da4a710ab5fc5020f142ede69dbdfbef5451944183e280e86e018379a792eb

data/lib/archaeo/archive_health_check.rb
ADDED
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Verifies that archived snapshots are still accessible.
+  #
+  # Checks each snapshot by performing HEAD requests to the
+  # archive URL and reporting accessibility status.
+  HealthReport = Struct.new(
+    :total, :accessible, :missing, :errors, :details,
+    keyword_init: true
+  )
+
+  HealthDetail = Struct.new(
+    :snapshot, :status, :error,
+    keyword_init: true
+  )
+
+  class ArchiveHealthCheck
+    def initialize(client: HttpClient.new, cdx_api: nil)
+      @client = client
+      @cdx_api = cdx_api
+    end
+
+    def check(url, from: nil, to: nil, sample: nil)
+      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = sample_snapshots(snapshots, sample) if sample
+
+      details = check_snapshots(snapshots)
+      build_report(details)
+    end
+
+    private
+
+    def fetch_snapshots(url, from:, to:)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      opts = {}
+      opts[:from] = from if from
+      opts[:to] = to if to
+      cdx.snapshots(url, **opts)
+         .select(&:success?).to_a
+    end
+
+    def sample_snapshots(snapshots, count)
+      return snapshots if count.nil? || count >= snapshots.size
+
+      step = snapshots.size.to_f / count
+      (0...count).map { |i| snapshots[(i * step).to_i] }
+    end
+
+    def check_snapshots(snapshots)
+      snapshots.map do |snap|
+        check_single(snap)
+      end
+    end
+
+    def check_single(snapshot)
+      response = @client.head(snapshot.archive_url)
+      status = response.status.between?(200, 399) ? :accessible : :missing
+      HealthDetail.new(snapshot: snapshot, status: status, error: nil)
+    rescue StandardError => e
+      HealthDetail.new(snapshot: snapshot, status: :error, error: e.message)
+    end
+
+    def build_report(details)
+      total = details.size
+      accessible = details.count { |d| d.status == :accessible }
+      missing = details.count { |d| d.status == :missing }
+      errors = details.count { |d| d.status == :error }
+
+      HealthReport.new(
+        total: total, accessible: accessible,
+        missing: missing, errors: errors,
+        details: details
+      )
+    end
+  end
+end
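
A quick usage sketch of the new class (not from the gem's documentation), assuming the library is loaded with require "archaeo" and that from: accepts a partial CDX timestamp such as "2020":

    require "archaeo"

    # Spot-check 25 evenly spaced snapshots instead of every capture.
    checker = Archaeo::ArchiveHealthCheck.new
    report = checker.check("example.com/page", from: "2020", sample: 25)

    puts "#{report.accessible}/#{report.total} snapshots still resolve"
    report.details.each do |d|
      warn "#{d.snapshot.timestamp}: #{d.error}" if d.status == :error
    end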

data/lib/archaeo/bulk_downloader.rb
CHANGED
@@ -15,23 +15,31 @@ module Archaeo
   # for interrupted download recovery.
   class BulkDownloader
     def initialize(client: HttpClient.new, output_dir: "archive",
-                   cdx_api: nil, concurrency: 1, on_error: nil)
+                   cdx_api: nil, concurrency: 1, on_error: nil,
+                   rate_limiter: nil, path_sanitizer: nil)
       @client = client
       @output_dir = output_dir
       @cdx_api = cdx_api
       @concurrency = [1, concurrency.to_i].max
       @on_error = on_error
+      @rate_limiter = rate_limiter || RateLimiter.new
+      @path_sanitizer = path_sanitizer || PathSanitizer.new
     end

     def download(url, from: nil, to: nil, resume: false,
-                 dry_run: false, &block)
+                 dry_run: false, all_timestamps: false,
+                 filter: nil, page_requisites: false,
+                 snapshot_at: nil, &block)
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir) unless dry_run

-      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = fetch_snapshots(url, from: from, to: to,
+                                  all_timestamps: all_timestamps,
+                                  snapshot_at: snapshot_at)
+      snapshots = apply_filter(snapshots, filter)
       downloaded, skipped, bytes, failed =
-        run_download(snapshots, resume, dry_run, block)
+        run_download(snapshots, resume, dry_run, page_requisites, block)

       build_summary(start_time, snapshots.size, downloaded,
                     skipped, bytes, failed: failed)

@@ -39,25 +47,39 @@ module Archaeo

     private

-    def fetch_snapshots(url, from:, to:)
+    def fetch_snapshots(url, from:, to:, all_timestamps:, snapshot_at:)
       cdx = @cdx_api || CdxApi.new(client: @client)
+
+      if snapshot_at
+        ts = Timestamp.coerce(snapshot_at)
+        return cdx.composite_snapshot(url, timestamp: ts, collapse: ["digest"])
+      end
+
       options = {}
       options[:from] = from if from
       options[:to] = to if to
+      options[:collapse] = ["digest"] unless all_timestamps
+
       cdx.snapshots(url, **options)
          .select { |snap| !snap.blocked? && snap.status_code == 200 }
     end

-    def run_download(snapshots, resume, dry_run, progress)
+    def apply_filter(snapshots, filter)
+      return snapshots unless filter
+
+      snapshots.select { |snap| filter.match?(snap.original_url) }
+    end
+
+    def run_download(snapshots, resume, dry_run, page_requisites, progress)
       state = DownloadState.new(@output_dir)
       total = snapshots.size

       if @concurrency == 1
         download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       else
         download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       end
     end

@@ -71,11 +93,12 @@ module Archaeo
     end

     def download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }

       snapshots.each_with_index do |snap, index|
         process_sequential(snap, state, resume, dry_run, counters)
+        fetch_requisites(snap, dry_run, counters) if page_requisites
         progress&.call(index + 1, total, snap)
       end

@@ -96,6 +119,47 @@ module Archaeo
       @on_error&.call(snap, e)
     end

+    def fetch_requisites(snap, dry_run, counters)
+      return if dry_run
+
+      begin
+        bundle = snap.fetch_with_assets(client: @client)
+        bundle.assets.downloadable.all.each do |asset_url|
+          asset_snap = find_asset_snapshot(asset_url)
+          next unless asset_snap
+
+          counters[:bytes] += write_asset(asset_snap)
+          counters[:downloaded] += 1
+        end
+      rescue StandardError
+        nil
+      end
+    end
+
+    def find_asset_snapshot(asset_url)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      cdx.near(asset_url, timestamp: Timestamp.now)
+    rescue NoSnapshotFound, StandardError
+      nil
+    end
+
+    def write_asset(snapshot)
+      content = fetch_content(snapshot)
+      filename = build_filename(snapshot)
+      FileUtils.mkdir_p(File.dirname(filename))
+      tmp_path = "#{filename}.tmp"
+      File.binwrite(tmp_path, content)
+      File.rename(tmp_path, filename)
+      content.bytesize
+    end
+
+    def fetch_content(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
+      Fetcher.new(client: @client).fetch(
+        snapshot.original_url, timestamp: snapshot.timestamp
+      ).content
+    end
+
     def download_snapshot(snap, state)
       content = fetch_and_save(snap)
       state.mark_completed(snap.timestamp, url: snap.original_url,

@@ -104,7 +168,7 @@ module Archaeo
     end

     def download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       queue = snapshots.each_with_index.to_a
       shared = { mutex: Mutex.new, errors: [],
                  downloaded: 0, skipped: 0, bytes: 0, failed: 0 }

@@ -112,7 +176,7 @@ module Archaeo
       threads = Array.new(@concurrency) do
         Thread.new do
           process_queue(queue, total, state, resume,
-                        dry_run, progress, shared)
+                        dry_run, page_requisites, progress, shared)
         end
       end
       threads.each(&:join)

@@ -122,7 +186,7 @@ module Archaeo
     end

     def process_queue(queue, total, state, resume, dry_run,
-                      progress, shared)
+                      _page_requisites, progress, shared)
       loop do
         snap, index = shared[:mutex].synchronize { queue.shift }
         break unless snap

@@ -177,6 +241,7 @@ module Archaeo
     end

     def fetch_page(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
       Fetcher.new(client: @client).fetch(
         snapshot.original_url, timestamp: snapshot.timestamp
       )

@@ -231,21 +296,14 @@ module Archaeo
     end

     def build_filename(snapshot)
+      safe_path = @path_sanitizer.sanitize(snapshot.original_url)
       ts = snapshot.timestamp.to_s
-      safe_path = snapshot.original_url
-                  .sub(%r{\Ahttps?://}, "")
-                  .gsub(%r{[<>:"|?*#]}, "_")
-                  .gsub(%r{[/\\]}, File::SEPARATOR)
-                  .gsub(%r{[?&=]}, "_")
-
-      safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
-      safe_path = "#{safe_path}index" if safe_path.empty?

-      segments = safe_path.split(File::SEPARATOR)
-
-      end
+      segments = safe_path.split(File::SEPARATOR)
+      last = segments.pop || "index"

-      File.join(@output_dir, *segments,
+      File.join(@output_dir, *segments,
+                "#{last}_#{ts}#{extension_for(snapshot)}")
     end
   end
 end
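
In code, the new knobs combine roughly as below. This is a sketch, not from the gem's docs: the glob-style patterns passed to PatternFilter are an assumption (pattern_filter.rb is not shown in this excerpt), and note that process_queue discards page_requisites (the _page_requisites parameter above), so asset fetching only takes effect at concurrency: 1.

    require "archaeo"

    # Assumed glob-style patterns; see data/lib/archaeo/pattern_filter.rb.
    filter = Archaeo::PatternFilter.new(only: "*/blog/*", exclude: "*print*")

    downloader = Archaeo::BulkDownloader.new(
      output_dir: "archive",
      concurrency: 1, # page requisites are ignored on the concurrent path
      rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0)
    )

    summary = downloader.download(
      "example.com",
      snapshot_at: "20200601", # newest capture of each URL at or before this time
      filter: filter,
      page_requisites: true
    ) { |done, total, snap| puts "#{done}/#{total} #{snap.original_url}" }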
data/lib/archaeo/cdx_api.rb
CHANGED

@@ -40,8 +40,9 @@ module Archaeo
       last_skip_timestamp: "lastSkipTimestamp",
     }.freeze

-    def initialize(client: HttpClient.new)
+    def initialize(client: HttpClient.new, cache_dir: nil)
       @client = client
+      @cache = cache_dir ? CdxCache.new(cache_dir) : nil
     end

     # Returns an Enumerator of Snapshot objects, auto-paginating

@@ -50,13 +51,26 @@ module Archaeo
       url = UrlNormalizer.normalize(url)
       validate_options!(options)

-      Enumerator.new do |yielder|
-        if options.key?(:page)
-          fetch_page(url, options, yielder)
-        else
-          fetch_with_resume_key(url, options, yielder)
-        end
+      if @cache && !options.key?(:page)
+        return cached_snapshots(url, options)
       end
+
+      build_enumerator(url, options)
+    end
+
+    # Returns one snapshot per unique URL, picking the newest at or before
+    # the given timestamp for point-in-time site reconstruction.
+    def composite_snapshot(url, timestamp:, collapse: [])
+      ts = Timestamp.coerce(timestamp)
+      options = { to: ts.to_s, sort: "reverse" }
+      options[:collapse] = collapse unless collapse.empty?
+
+      seen = {}
+      snapshots(url, **options).each do |snap|
+        key = snap.original_url
+        seen[key] = snap unless seen.key?(key)
+      end
+      seen.values
     end

     def near(url, timestamp:)

@@ -153,6 +167,24 @@ module Archaeo

     private

+    def cached_snapshots(url, options)
+      Enumerator.new do |yielder|
+        @cache.fetch(url, **options) do
+          build_enumerator(url, options).to_a
+        end.each { |s| yielder << s }
+      end
+    end
+
+    def build_enumerator(url, options)
+      Enumerator.new do |yielder|
+        if options.key?(:page)
+          fetch_page(url, options, yielder)
+        else
+          fetch_with_resume_key(url, options, yielder)
+        end
+      end
+    end
+
     def fetch_with_resume_key(url, options, yielder)
       params = build_params(url, options)
       loop do

data/lib/archaeo/cdx_cache.rb
ADDED
@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+
+require "json"
+require "digest"
+
+module Archaeo
+  # Persists CDX API query results to disk for resume support.
+  #
+  # Caches snapshot lists keyed by query parameters so that
+  # interrupted downloads can resume without re-querying CDX.
+  class CdxCache
+    CACHE_DIR = ".cache"
+
+    def initialize(base_dir)
+      @base_dir = base_dir
+      @cache_dir = File.join(base_dir, CACHE_DIR)
+    end
+
+    def fetch(url, **options)
+      key = cache_key(url, options)
+      path = cache_path(key)
+
+      if File.exist?(path)
+        load_cache(path)
+      else
+        snapshots = yield
+        save_cache(path, url, options, snapshots)
+        snapshots
+      end
+    end
+
+    def cached?(url, **options)
+      File.exist?(cache_path(cache_key(url, options)))
+    end
+
+    def cache_key(url, options = {})
+      parts = [url.to_s]
+      parts << options[:from].to_s if options[:from]
+      parts << options[:to].to_s if options[:to]
+      parts << options[:match_type].to_s if options[:match_type]
+      parts += Array(options[:filters]).map(&:to_s) if options[:filters]
+      parts += Array(options[:collapse]).map(&:to_s) if options[:collapse]
+      parts << options[:sort].to_s if options[:sort]
+      Digest::SHA256.hexdigest(parts.join("|"))[0, 16]
+    end
+
+    def clear(url = nil, **options)
+      if url
+        FileUtils.rm_f(cache_path(cache_key(url, options)))
+      else
+        FileUtils.rm_rf(@cache_dir)
+      end
+    end
+
+    private
+
+    def cache_path(key)
+      FileUtils.mkdir_p(@cache_dir)
+      File.join(@cache_dir, "#{key}.cdx.json")
+    end
+
+    def load_cache(path)
+      data = JSON.parse(File.read(path))
+      data["snapshots"].map { |row| build_snapshot(row) }
+    end
+
+    def save_cache(path, url, options, snapshots)
+      data = {
+        "url" => url.to_s,
+        "options" => serialize_options(options),
+        "cached_at" => Time.now.utc.iso8601,
+        "snapshots" => snapshots.map(&:as_json),
+      }
+      tmp_path = "#{path}.tmp"
+      File.write(tmp_path, JSON.generate(data))
+      File.rename(tmp_path, path)
+    end
+
+    def serialize_options(options)
+      h = {}
+      h["from"] = options[:from].to_s if options[:from]
+      h["to"] = options[:to].to_s if options[:to]
+      h["match_type"] = options[:match_type].to_s if options[:match_type]
+      h["filters"] = Array(options[:filters]).map(&:to_s) if options[:filters]
+      if options[:collapse]
+        h["collapse"] =
+          Array(options[:collapse]).map(&:to_s)
+      end
+      h["sort"] = options[:sort].to_s if options[:sort]
+      h
+    end
+
+    def build_snapshot(row)
+      Snapshot.new(
+        urlkey: row["urlkey"],
+        timestamp: row["timestamp"],
+        original_url: row["original_url"],
+        mimetype: row["mimetype"],
+        status_code: row["status_code"],
+        digest: row["digest"],
+        length: row["length"],
+      )
+    end
+  end
+end
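
The cache can also be driven directly; a sketch assuming Snapshot responds to as_json (it is referenced in save_cache above):

    require "archaeo"

    cache = Archaeo::CdxCache.new("archive")
    opts = { from: "2019", to: "2021" }

    # Computes the key from URL + options; runs the block only on a miss.
    snapshots = cache.fetch("example.com", **opts) do
      Archaeo::CdxApi.new.snapshots("example.com", **opts).to_a
    end

    cache.cached?("example.com", **opts) # => true
    cache.clear("example.com", **opts)   # drop this one entry
    cache.clear                          # or remove the whole .cache directory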
data/lib/archaeo/cli.rb
CHANGED

@@ -37,6 +37,8 @@ module Archaeo
     option :limit, type: :numeric, desc: "Max snapshots to return"
     option :format, desc: "Output format (table, json, csv)",
            default: "table"
+    option :fields, type: :array,
+           desc: "Specific fields to print (timestamp,original,etc)"
     def snapshots(url)
       fmt = validate_output_format
       handle_errors do

@@ -123,11 +125,21 @@ module Archaeo
     end

     desc "save URL", "Save a URL to the Wayback Machine"
+    option :headers, type: :boolean, default: false,
+           desc: "Show response headers"
     def save(url)
       handle_errors do
         result = SaveApi.new.save(url)
         label = result.cached? ? "Cached" : "Saved"
         puts "#{label}: #{result.archive_url}"
+        if options[:headers] && result.response_headers
+          puts "Status: #{result.status_code}"
+          puts "Response URL: #{result.response_url}" if result.response_url
+          puts "Headers:"
+          result.response_headers.each do |k, v|
+            puts "  #{k}: #{v}"
+          end
+        end
       end
     end

@@ -162,11 +174,16 @@ module Archaeo
          "Fetch a page and rewrite archive URLs to local paths"
     option :prefix, desc: "Local path prefix", default: "local"
     option :output, desc: "Write rewritten HTML to file"
+    option :rewrite_js, type: :boolean, default: false,
+           desc: "Rewrite URLs in JavaScript strings"
+    option :rewrite_absolute, type: :boolean, default: false,
+           desc: "Rewrite all absolute archive URLs"
     def rewrite(url, timestamp)
       handle_errors do
         coerced = Timestamp.coerce(timestamp)
         page = Fetcher.new.fetch(url, timestamp: coerced)
-        rewritten = build_rewriter(url, coerced).rewrite_html(page.content)
+        rewriter = build_rewriter(url, coerced)
+        rewritten = rewriter.rewrite_html(page.content)
         output_rewritten(rewritten)
       end
     end

@@ -215,22 +232,61 @@ module Archaeo
            desc: "Number of parallel downloads"
     option :dry_run, type: :boolean, default: false,
            desc: "Preview downloads without fetching"
+    option :all_timestamps, type: :boolean, default: false,
+           desc: "Download all timestamps, not just latest"
+    option :only, desc: "Only download URLs matching this pattern"
+    option :exclude, desc: "Exclude URLs matching this pattern"
+    option :page_requisites, type: :boolean, default: false,
+           desc: "Download linked assets (CSS/JS/images)"
+    option :snapshot_at, desc: "Download composite snapshot at timestamp"
+    option :rate_limit, type: :numeric, default: 0,
+           desc: "Min seconds between requests"
     def download(url)
       handle_errors do
+        rate_limiter = RateLimiter.new(
+          min_interval: options[:rate_limit].to_f,
+        )
+        filter = build_filter
         downloader = BulkDownloader.new(
           output_dir: options[:output],
           concurrency: options[:concurrency],
+          rate_limiter: rate_limiter,
         )
-        download_with_progress(downloader, url)
+        download_with_progress(downloader, url, filter)
+      end
+    end
+
+    desc "health URL", "Check health of archived snapshots"
+    option :from, desc: "Start timestamp"
+    option :to, desc: "End timestamp"
+    option :sample, type: :numeric, desc: "Check only N snapshots"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def health(url)
+      handle_errors do
+        checker = ArchiveHealthCheck.new
+        report = checker.check(
+          url,
+          from: options[:from],
+          to: options[:to],
+          sample: options[:sample],
+        )
+        output_health(report)
       end
     end

     desc "known_urls DOMAIN",
          "List all known URLs for a domain"
+    option :subdomain, type: :boolean, default: false,
+           desc: "Include subdomain URLs"
+    option :file, desc: "Save URLs to file"
     def known_urls(domain)
       handle_errors do
-        CdxApi.new.known_urls(domain).each do |u|
-          puts u
+        match_type = options[:subdomain] ? "domain" : "prefix"
+        urls = CdxApi.new.known_urls(domain, match_type: match_type)
+        if options[:file]
+          save_urls_to_file(urls, options[:file])
+        else
+          urls.each { |u| puts u }
         end
       end
     end

@@ -331,7 +387,11 @@ module Archaeo
     def build_rewriter(url, timestamp)
       normalized = UrlNormalizer.normalize(url)
       archive_prefix = ArchiveUrl.new(normalized, timestamp: timestamp).to_s
-      UrlRewriter.new(archive_prefix, options[:prefix])
+      UrlRewriter.new(
+        archive_prefix, options[:prefix],
+        rewrite_js: options[:rewrite_js],
+        rewrite_absolute: options[:rewrite_absolute]
+      )
     end

     def output_rewritten(content)

@@ -366,14 +426,55 @@ module Archaeo
       end
     end

-    def download_with_progress(downloader, url)
+    def build_filter
+      only = options[:only]
+      exclude = options[:exclude]
+      return nil unless only || exclude
+
+      PatternFilter.new(only: only, exclude: exclude)
+    end
+
+    def download_with_progress(downloader, url, filter)
       summary = downloader.download(
-        url,
-        from: options[:from], to: options[:to], resume: options[:resume], dry_run: options[:dry_run]
+        url,
+        from: options[:from], to: options[:to],
+        resume: options[:resume], dry_run: options[:dry_run],
+        all_timestamps: options[:all_timestamps],
+        filter: filter,
+        page_requisites: options[:page_requisites],
+        snapshot_at: options[:snapshot_at]
       ) { |c, t, s| print_progress(c, t, s) }
       print_summary(summary)
     end

+    def output_health(report)
+      case options[:format]
+      when "json"
+        data = {
+          total: report.total,
+          accessible: report.accessible,
+          missing: report.missing,
+          errors: report.errors,
+        }
+        puts JSON.generate(data)
+      else
+        puts "Total: #{report.total}"
+        puts "Accessible: #{report.accessible}"
+        puts "Missing: #{report.missing}"
+        puts "Errors: #{report.errors}"
+      end
+    end
+
+    def save_urls_to_file(urls, file_path)
+      FileUtils.mkdir_p(File.dirname(file_path)) unless File.dirname(file_path) == "."
+      File.open(file_path, "w") do |f|
+        urls.each do |url|
+          f.puts(url)
+        end
+      end
+      warn "Saved #{urls.size} URLs to #{file_path}" unless quiet?
+    end
+
     def print_progress(current, total, snap)
       return if quiet?

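Assuming the gem installs an archaeo executable (the exe is not part of this excerpt), the new commands and flags combine like so, with Thor mapping the underscored option names to hyphenated flags:

    archaeo health example.com --from 2020 --sample 25 --format json
    archaeo download example.com --snapshot-at 20200601 --only "*/blog/*" --page-requisites --rate-limit 1.5
    archaeo known_urls example.com --subdomain --file urls.txt
    archaeo save https://example.com --headers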

data/lib/archaeo/download_state.rb
CHANGED
@@ -62,6 +62,36 @@ module Archaeo
       end
     end

+    def file_exists?(timestamp, base_dir: @output_dir)
+      entry = entry_for(timestamp)
+      return false unless entry
+
+      file_path = find_file(base_dir, timestamp.to_s)
+      File.exist?(file_path)
+    end
+
+    def stale_entries(base_dir: @output_dir)
+      @mutex.synchronize do
+        entries.reject do |e|
+          find_file(base_dir,
+                    e["ts"]) && File.exist?(find_file(base_dir, e["ts"]))
+        end
+      end
+    end
+
+    def cleanup_stale(base_dir: @output_dir)
+      @mutex.synchronize do
+        stale = entries.reject do |e|
+          path = find_file(base_dir, e["ts"])
+          path && File.exist?(path)
+        end
+        @entries = entries - stale
+        @entries_key = nil
+        save
+        stale.size
+      end
+    end
+
     private

     def entries

@@ -103,5 +133,10 @@ module Archaeo
       File.write(tmp_path, content)
       File.rename(tmp_path, @path)
     end
+
+    def find_file(base_dir, timestamp)
+      pattern = File.join(base_dir, "**", "*#{timestamp}*")
+      Dir.glob(pattern).first
+    end
   end
 end
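
A sketch of the new stale-entry maintenance, assuming DownloadState.new takes the output directory as it does in BulkDownloader#run_download above:

    require "archaeo"

    state = Archaeo::DownloadState.new("archive")

    # Entries marked completed whose files have since been deleted.
    state.stale_entries.each { |e| warn "missing file for #{e['ts']}" }

    # Prune them so a later resume: true run re-fetches those snapshots.
    pruned = state.cleanup_stale
    puts "pruned #{pruned} stale entries"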