archaeo 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +95 -1
- data/lib/archaeo/asset_extractor.rb +10 -0
- data/lib/archaeo/asset_list.rb +23 -0
- data/lib/archaeo/availability_api.rb +31 -0
- data/lib/archaeo/availability_result.rb +11 -0
- data/lib/archaeo/bulk_downloader.rb +56 -33
- data/lib/archaeo/cdx_api.rb +18 -0
- data/lib/archaeo/cdx_filter.rb +11 -0
- data/lib/archaeo/cdx_timeline.rb +66 -0
- data/lib/archaeo/cli.rb +181 -4
- data/lib/archaeo/download_state.rb +28 -15
- data/lib/archaeo/fetcher.rb +13 -0
- data/lib/archaeo/http_client.rb +24 -13
- data/lib/archaeo/page.rb +56 -0
- data/lib/archaeo/save_api.rb +16 -0
- data/lib/archaeo/save_result.rb +5 -1
- data/lib/archaeo/snapshot.rb +12 -0
- data/lib/archaeo/timestamp.rb +46 -0
- data/lib/archaeo/url_rewriter.rb +25 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +12 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ecdcd994fa61efa836a5224a5e329b40b72694c27a79cbb6eb4f91bf57c0f2c9
|
|
4
|
+
data.tar.gz: 03ad557eb55ce9946a2936e3beec8cad13db2ecd4b2fc49b0996131d35e6ddba
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a2859d1738f4f4a9fa0f0ed89d118dacfc24a2f75d3237ad3bdd31cf26c041e8aa5f47c998d8b5e61907c34d491ed48b6df22d2784702d60a15820ba8d8a2a27
|
|
7
|
+
data.tar.gz: e2df62b1077c90d8b04173f9aa713f590f9692cdfde3d3b656796308810341cd5b052321f97a28513ebe98143cdb8f28fa7e182720555db231983fa3d2a6d4be
|
data/README.adoc
CHANGED
|
@@ -57,6 +57,21 @@ end
|
|
|
57
57
|
# Count snapshots
|
|
58
58
|
cdx.count("example.com") # => Integer
|
|
59
59
|
|
|
60
|
+
# Deduplicated snapshots (collapse by digest)
|
|
61
|
+
cdx.unique_snapshots("example.com").each do |snap|
|
|
62
|
+
puts snap.timestamp
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Timeline analysis (time-bucketed frequency)
|
|
66
|
+
timeline = cdx.timeline("example.com",
|
|
67
|
+
from: "20220101", to: "20221231",
|
|
68
|
+
bucket_size: :month)
|
|
69
|
+
timeline.to_h # => { "202201" => 5, "202202" => 3, ... }
|
|
70
|
+
timeline.peak # => ["202201", 5]
|
|
71
|
+
timeline.total # => 42
|
|
72
|
+
timeline.span # => ["202201", "202212"]
|
|
73
|
+
timeline.size # => 12 (number of buckets)
|
|
74
|
+
|
|
60
75
|
# Filter by status code, mimetype, or URL pattern
|
|
61
76
|
cdx.snapshots("example.com",
|
|
62
77
|
filters: [Archaeo::CdxFilter.by_status(200)],
|
|
@@ -102,6 +117,10 @@ result.to_h # => Hash representation
|
|
|
102
117
|
result.as_json # => JSON-serializable Hash
|
|
103
118
|
|
|
104
119
|
api.available?("example.com") # => true/false
|
|
120
|
+
|
|
121
|
+
# Batch availability check
|
|
122
|
+
results = api.batch_available?(%w[example.com other.com])
|
|
123
|
+
# => { "example.com" => AvailabilityResult, ... }
|
|
105
124
|
----
|
|
106
125
|
|
|
107
126
|
=== Save a URL (SavePageNow)
|
|
@@ -114,8 +133,14 @@ result.url # => "https://example.com/"
|
|
|
114
133
|
result.archive_url # => "https://web.archive.org/web/..."
|
|
115
134
|
result.timestamp # => Archaeo::Timestamp
|
|
116
135
|
result.cached? # => true if already archived
|
|
136
|
+
result.success? # => true if archive_url is present
|
|
117
137
|
result.to_h # => Hash representation
|
|
118
138
|
result.as_json # => JSON-serializable Hash
|
|
139
|
+
|
|
140
|
+
# Batch save multiple URLs
|
|
141
|
+
results = save.batch_save(%w[https://a.com https://b.com],
|
|
142
|
+
delay: 2, stop_on_error: false)
|
|
143
|
+
results.each { |r| puts "#{r.url}: #{r.success?}" }
|
|
119
144
|
----
|
|
120
145
|
|
|
121
146
|
=== Fetch Archived Content
|
|
@@ -148,6 +173,15 @@ page = fetcher.fetch("https://example.com/",
|
|
|
148
173
|
page = fetcher.fetch("https://example.com/",
|
|
149
174
|
timestamp: "20220615000000",
|
|
150
175
|
snapshot: snap)
|
|
176
|
+
|
|
177
|
+
# Raise on error status (raises FetchError with page attached)
|
|
178
|
+
page = fetcher.fetch!("https://example.com/",
|
|
179
|
+
timestamp: "20220615000000")
|
|
180
|
+
# FetchError includes: .status_code, .url, .page
|
|
181
|
+
|
|
182
|
+
# Page links and meta extraction
|
|
183
|
+
page.links # => [{ href: "...", text: "...", external: true/false }]
|
|
184
|
+
page.meta_tags # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
|
|
151
185
|
----
|
|
152
186
|
|
|
153
187
|
=== Fetch Page with Assets
|
|
@@ -186,6 +220,13 @@ restored = Archaeo::AssetList.from_json(json_string)
|
|
|
186
220
|
|
|
187
221
|
# Safe type access
|
|
188
222
|
bundle.assets.urls_by_type(:image) # works for any type key
|
|
223
|
+
|
|
224
|
+
# Domain analysis
|
|
225
|
+
bundle.assets.domain_counts
|
|
226
|
+
# => { "cdn.example.com" => 3, "fonts.googleapis.com" => 1 }
|
|
227
|
+
|
|
228
|
+
# Filter downloadable assets (excludes data: and fragment URLs)
|
|
229
|
+
downloadable = bundle.assets.downloadable
|
|
189
230
|
----
|
|
190
231
|
|
|
191
232
|
=== Bulk Download with Resume
|
|
@@ -200,6 +241,7 @@ end
|
|
|
200
241
|
summary.total # => total snapshots found
|
|
201
242
|
summary.downloaded # => successfully downloaded
|
|
202
243
|
summary.skipped # => skipped (already downloaded with resume)
|
|
244
|
+
summary.failed # => failed downloads
|
|
203
245
|
summary.bytes_written # => total bytes written
|
|
204
246
|
summary.elapsed # => seconds elapsed
|
|
205
247
|
|
|
@@ -237,6 +279,10 @@ entry = state.entry_for("20220615000000")
|
|
|
237
279
|
# Total bytes downloaded
|
|
238
280
|
state.total_bytes # => Integer
|
|
239
281
|
|
|
282
|
+
# List all completed timestamps
|
|
283
|
+
state.size # => number of completed entries
|
|
284
|
+
state.timestamps # => ["20220101000000", "20220102000000"]
|
|
285
|
+
|
|
240
286
|
# Clear state for a fresh download
|
|
241
287
|
state.clear
|
|
242
288
|
----
|
|
@@ -279,6 +325,14 @@ Archaeo::CdxFilter.by_mimetype_prefix("image") # => matches image/*
|
|
|
279
325
|
# Convenience factories
|
|
280
326
|
Archaeo::CdxFilter.only_html # => text/html only
|
|
281
327
|
Archaeo::CdxFilter.excluding_redirects # => excludes 3xx statuses
|
|
328
|
+
|
|
329
|
+
# Introspection
|
|
330
|
+
filter = Archaeo::CdxFilter.by_status(200)
|
|
331
|
+
filter.field # => "statuscode"
|
|
332
|
+
filter.pattern # => "200"
|
|
333
|
+
filter.matches?("200") # => true
|
|
334
|
+
filter.matches?("404") # => false
|
|
335
|
+
filter.negated? # => false
|
|
282
336
|
----
|
|
283
337
|
|
|
284
338
|
=== URL Rewriting
|
|
@@ -298,6 +352,7 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
|
|
|
298
352
|
rewriter.rewrite_batch(["url1", "url2"])
|
|
299
353
|
|
|
300
354
|
# Rewrite URLs within HTML (src, href, srcset, data-src, poster)
|
|
355
|
+
# Also rewrites inline style url() and <style> element url()
|
|
301
356
|
rewritten_html = rewriter.rewrite_html(html_content)
|
|
302
357
|
----
|
|
303
358
|
|
|
@@ -319,6 +374,10 @@ snap.age # => seconds since capture
|
|
|
319
374
|
snap.older_than?(3600) # => true if older than 1 hour
|
|
320
375
|
snap.newer_than?(3600) # => true if newer than 1 hour
|
|
321
376
|
|
|
377
|
+
# Content comparison (by digest)
|
|
378
|
+
snap1.same_content_as?(snap2) # => true if same digest
|
|
379
|
+
snap1.duplicate_of?(snap2) # => true if same digest AND different timestamp
|
|
380
|
+
|
|
322
381
|
# Identity URL (raw content, no Wayback rewriting)
|
|
323
382
|
snap.identity_url
|
|
324
383
|
|
|
@@ -370,6 +429,18 @@ ts1 < ts2 # => true/false
|
|
|
370
429
|
|
|
371
430
|
# Immutable -- frozen on creation
|
|
372
431
|
ts.frozen? # => true
|
|
432
|
+
|
|
433
|
+
# Date/time helpers
|
|
434
|
+
ts.quarter # => 1..4
|
|
435
|
+
ts.wday # => 0..6 (Sunday = 0)
|
|
436
|
+
ts.human_readable # => "2022-06-15 00:00:00 UTC"
|
|
437
|
+
ts.to_date # => Date object
|
|
438
|
+
|
|
439
|
+
# Date ranges for coverage analysis
|
|
440
|
+
range = ts.date_range(:month)
|
|
441
|
+
# => Timestamp(Jun 1)..Timestamp(Jun 30 23:59:59)
|
|
442
|
+
ts.date_range(:day) # => single day range
|
|
443
|
+
ts.date_range(:year) # => full year range
|
|
373
444
|
----
|
|
374
445
|
|
|
375
446
|
=== HTTP Client Observability
|
|
@@ -383,6 +454,13 @@ client = Archaeo::HttpClient.new(
|
|
|
383
454
|
},
|
|
384
455
|
)
|
|
385
456
|
|
|
457
|
+
# Intercept requests before they are sent
|
|
458
|
+
client = Archaeo::HttpClient.new(
|
|
459
|
+
before_request: ->(uri, request) {
|
|
460
|
+
request["X-Custom-Header"] = "value"
|
|
461
|
+
},
|
|
462
|
+
)
|
|
463
|
+
|
|
386
464
|
# Inspect connection pool state
|
|
387
465
|
client.pool_stats
|
|
388
466
|
# => { active_connections: 2, max_pool_size: 8,
|
|
@@ -401,6 +479,7 @@ archaeo --version
|
|
|
401
479
|
archaeo snapshots example.com
|
|
402
480
|
archaeo snapshots --format json example.com
|
|
403
481
|
archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
|
|
482
|
+
archaeo snapshots --filter-status 200 --filter-type text/html example.com
|
|
404
483
|
|
|
405
484
|
# Find closest snapshot
|
|
406
485
|
archaeo near example.com 20220101
|
|
@@ -440,6 +519,18 @@ archaeo fetch --identity https://example.com/ 20220615120000
|
|
|
440
519
|
archaeo fetch-assets https://example.com/ 20220615120000
|
|
441
520
|
archaeo fetch-assets --format json https://example.com/ 20220615120000
|
|
442
521
|
|
|
522
|
+
# Rewrite archive URLs to local paths
|
|
523
|
+
archaeo rewrite https://example.com/ 20220615120000
|
|
524
|
+
archaeo rewrite --output page.html --prefix local https://example.com/ 20220615120000
|
|
525
|
+
|
|
526
|
+
# Compare assets between two snapshots
|
|
527
|
+
archaeo diff https://example.com/ 20220101 20220615
|
|
528
|
+
archaeo diff --format json https://example.com/ 20220101 20220615
|
|
529
|
+
|
|
530
|
+
# Audit assets for an archived page
|
|
531
|
+
archaeo asset-audit https://example.com/ 20220615120000
|
|
532
|
+
archaeo asset-audit --format json https://example.com/ 20220615120000
|
|
533
|
+
|
|
443
534
|
# Download all snapshots
|
|
444
535
|
archaeo download example.com --output ./archive
|
|
445
536
|
|
|
@@ -480,6 +571,9 @@ Archaeo::SaveFailed
|
|
|
480
571
|
|
|
481
572
|
# Content digest mismatch
|
|
482
573
|
Archaeo::IntegrityError
|
|
574
|
+
|
|
575
|
+
# HTTP error during fetch (includes .page, .url, .status_code)
|
|
576
|
+
Archaeo::FetchError
|
|
483
577
|
----
|
|
484
578
|
|
|
485
579
|
== Architecture
|
|
@@ -491,7 +585,7 @@ Archaeo follows a model-driven, OOP design:
|
|
|
491
585
|
| Layer | Classes | Purpose
|
|
492
586
|
|
|
493
587
|
| *Models*
|
|
494
|
-
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
|
|
588
|
+
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
|
|
495
589
|
| Domain value objects with `to_h`, `as_json`, `inspect` support
|
|
496
590
|
|
|
497
591
|
| *URL Processing*
|
|
@@ -120,6 +120,7 @@ module Archaeo
|
|
|
120
120
|
extract_media_sources(list)
|
|
121
121
|
extract_video_posters(list)
|
|
122
122
|
extract_embeds(list)
|
|
123
|
+
extract_tracks(list)
|
|
123
124
|
end
|
|
124
125
|
|
|
125
126
|
def extract_media_sources(list)
|
|
@@ -138,6 +139,15 @@ module Archaeo
|
|
|
138
139
|
@doc.css("iframe[src], embed[src]").each do |el|
|
|
139
140
|
list.add(resolve(el["src"]), type: :media)
|
|
140
141
|
end
|
|
142
|
+
@doc.css("object[data]").each do |el|
|
|
143
|
+
list.add(resolve(el["data"]), type: :media)
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def extract_tracks(list)
|
|
148
|
+
@doc.css("track[src]").each do |el|
|
|
149
|
+
list.add(resolve(el["src"]), type: :media)
|
|
150
|
+
end
|
|
141
151
|
end
|
|
142
152
|
|
|
143
153
|
def extract_inline_css(list)
|
data/lib/archaeo/asset_list.rb
CHANGED
|
@@ -102,5 +102,28 @@ module Archaeo
|
|
|
102
102
|
end
|
|
103
103
|
list
|
|
104
104
|
end
|
|
105
|
+
|
|
106
|
+
def domain_counts
|
|
107
|
+
all.each_with_object(Hash.new(0)) do |url, counts|
|
|
108
|
+
host = begin
|
|
109
|
+
URI.parse(url).host
|
|
110
|
+
rescue URI::InvalidURIError
|
|
111
|
+
"(invalid)"
|
|
112
|
+
end
|
|
113
|
+
counts[host || "(relative)"] += 1
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def downloadable
|
|
118
|
+
filtered = self.class.new
|
|
119
|
+
CATEGORIES.each do |type|
|
|
120
|
+
@urls_by_type[type].each do |url|
|
|
121
|
+
next if url.start_with?("data:", "#")
|
|
122
|
+
|
|
123
|
+
filtered.add(url, type: type)
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
filtered
|
|
127
|
+
end
|
|
105
128
|
end
|
|
106
129
|
end
|
|
@@ -38,6 +38,16 @@ module Archaeo
|
|
|
38
38
|
near(url).available?
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
+
def batch_available?(urls, concurrency: 1)
|
|
42
|
+
if concurrency <= 1
|
|
43
|
+
urls.to_h do |u|
|
|
44
|
+
[u, near(u)]
|
|
45
|
+
end
|
|
46
|
+
else
|
|
47
|
+
batch_concurrent(urls, concurrency)
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
41
51
|
private
|
|
42
52
|
|
|
43
53
|
def parse_response(response, url)
|
|
@@ -78,5 +88,26 @@ module Archaeo
|
|
|
78
88
|
archived_status: archived_status,
|
|
79
89
|
)
|
|
80
90
|
end
|
|
91
|
+
|
|
92
|
+
def batch_concurrent(urls, concurrency)
|
|
93
|
+
results = {}
|
|
94
|
+
mutex = Mutex.new
|
|
95
|
+
queue = urls.dup
|
|
96
|
+
threads = Array.new(concurrency) do
|
|
97
|
+
Thread.new { drain_queue(queue, results, mutex) }
|
|
98
|
+
end
|
|
99
|
+
threads.each(&:join)
|
|
100
|
+
results
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def drain_queue(queue, results, mutex)
|
|
104
|
+
loop do
|
|
105
|
+
url = mutex.synchronize { queue.shift }
|
|
106
|
+
break unless url
|
|
107
|
+
|
|
108
|
+
result = near(url)
|
|
109
|
+
mutex.synchronize { results[url] = result }
|
|
110
|
+
end
|
|
111
|
+
end
|
|
81
112
|
end
|
|
82
113
|
end
|
|
@@ -56,5 +56,16 @@ module Archaeo
|
|
|
56
56
|
def inspect
|
|
57
57
|
"#<#{self.class.name} #{@url} available=#{@available}>"
|
|
58
58
|
end
|
|
59
|
+
|
|
60
|
+
def to_snapshot
|
|
61
|
+
return nil unless available?
|
|
62
|
+
|
|
63
|
+
Snapshot.new(
|
|
64
|
+
urlkey: UrlNormalizer.normalize(url).downcase,
|
|
65
|
+
timestamp: timestamp,
|
|
66
|
+
original_url: url,
|
|
67
|
+
status_code: archived_status || 200,
|
|
68
|
+
)
|
|
69
|
+
end
|
|
59
70
|
end
|
|
60
71
|
end
|
|
@@ -15,11 +15,12 @@ module Archaeo
|
|
|
15
15
|
# for interrupted download recovery.
|
|
16
16
|
class BulkDownloader
|
|
17
17
|
def initialize(client: HttpClient.new, output_dir: "archive",
|
|
18
|
-
cdx_api: nil, concurrency: 1)
|
|
18
|
+
cdx_api: nil, concurrency: 1, on_error: nil)
|
|
19
19
|
@client = client
|
|
20
20
|
@output_dir = output_dir
|
|
21
21
|
@cdx_api = cdx_api
|
|
22
22
|
@concurrency = [1, concurrency.to_i].max
|
|
23
|
+
@on_error = on_error
|
|
23
24
|
end
|
|
24
25
|
|
|
25
26
|
def download(url, from: nil, to: nil, resume: false,
|
|
@@ -29,10 +30,11 @@ module Archaeo
|
|
|
29
30
|
FileUtils.mkdir_p(@output_dir) unless dry_run
|
|
30
31
|
|
|
31
32
|
snapshots = fetch_snapshots(url, from: from, to: to)
|
|
32
|
-
downloaded, skipped, bytes =
|
|
33
|
+
downloaded, skipped, bytes, failed =
|
|
33
34
|
run_download(snapshots, resume, dry_run, block)
|
|
34
35
|
|
|
35
|
-
build_summary(start_time, snapshots.size, downloaded,
|
|
36
|
+
build_summary(start_time, snapshots.size, downloaded,
|
|
37
|
+
skipped, bytes, failed: failed)
|
|
36
38
|
end
|
|
37
39
|
|
|
38
40
|
private
|
|
@@ -59,24 +61,26 @@ module Archaeo
|
|
|
59
61
|
end
|
|
60
62
|
end
|
|
61
63
|
|
|
62
|
-
def build_summary(start_time, total, downloaded, skipped,
|
|
64
|
+
def build_summary(start_time, total, downloaded, skipped,
|
|
65
|
+
bytes, failed: 0)
|
|
63
66
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
64
67
|
DownloadSummary.new(
|
|
65
68
|
total: total, downloaded: downloaded, skipped: skipped,
|
|
66
|
-
failed:
|
|
69
|
+
failed: failed, bytes_written: bytes, elapsed: elapsed
|
|
67
70
|
)
|
|
68
71
|
end
|
|
69
72
|
|
|
70
73
|
def download_sequential(snapshots, total, state, resume,
|
|
71
74
|
dry_run, progress)
|
|
72
|
-
counters = { downloaded: 0, skipped: 0, bytes: 0 }
|
|
75
|
+
counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
|
|
73
76
|
|
|
74
77
|
snapshots.each_with_index do |snap, index|
|
|
75
78
|
process_sequential(snap, state, resume, dry_run, counters)
|
|
76
79
|
progress&.call(index + 1, total, snap)
|
|
77
80
|
end
|
|
78
81
|
|
|
79
|
-
[counters[:downloaded], counters[:skipped],
|
|
82
|
+
[counters[:downloaded], counters[:skipped],
|
|
83
|
+
counters[:bytes], counters[:failed]]
|
|
80
84
|
end
|
|
81
85
|
|
|
82
86
|
def process_sequential(snap, state, resume, dry_run, counters)
|
|
@@ -87,6 +91,9 @@ module Archaeo
|
|
|
87
91
|
|
|
88
92
|
counters[:bytes] += download_snapshot(snap, state) unless dry_run
|
|
89
93
|
counters[:downloaded] += 1
|
|
94
|
+
rescue StandardError => e
|
|
95
|
+
counters[:failed] += 1
|
|
96
|
+
@on_error&.call(snap, e)
|
|
90
97
|
end
|
|
91
98
|
|
|
92
99
|
def download_snapshot(snap, state)
|
|
@@ -100,7 +107,7 @@ module Archaeo
|
|
|
100
107
|
dry_run, progress)
|
|
101
108
|
queue = snapshots.each_with_index.to_a
|
|
102
109
|
shared = { mutex: Mutex.new, errors: [],
|
|
103
|
-
downloaded: 0, skipped: 0, bytes: 0 }
|
|
110
|
+
downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
|
|
104
111
|
|
|
105
112
|
threads = Array.new(@concurrency) do
|
|
106
113
|
Thread.new do
|
|
@@ -109,17 +116,9 @@ module Archaeo
|
|
|
109
116
|
end
|
|
110
117
|
end
|
|
111
118
|
threads.each(&:join)
|
|
112
|
-
raise_on_errors(shared[:errors])
|
|
113
|
-
|
|
114
|
-
[shared[:downloaded], shared[:skipped], shared[:bytes]]
|
|
115
|
-
end
|
|
116
119
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
raise Error,
|
|
121
|
-
"#{errors.size} download(s) failed: " \
|
|
122
|
-
"#{errors.map { |s, _| s.timestamp }.join(', ')}"
|
|
120
|
+
[shared[:downloaded], shared[:skipped],
|
|
121
|
+
shared[:bytes], shared[:failed]]
|
|
123
122
|
end
|
|
124
123
|
|
|
125
124
|
def process_queue(queue, total, state, resume, dry_run,
|
|
@@ -133,7 +132,7 @@ module Archaeo
|
|
|
133
132
|
next
|
|
134
133
|
end
|
|
135
134
|
|
|
136
|
-
concurrent_fetch(snap,
|
|
135
|
+
concurrent_fetch(snap, dry_run, shared)
|
|
137
136
|
progress&.call(index + 1, total, snap)
|
|
138
137
|
end
|
|
139
138
|
end
|
|
@@ -145,35 +144,59 @@ module Archaeo
|
|
|
145
144
|
true
|
|
146
145
|
end
|
|
147
146
|
|
|
148
|
-
def concurrent_fetch(snap,
|
|
147
|
+
def concurrent_fetch(snap, dry_run, shared)
|
|
149
148
|
unless dry_run
|
|
150
149
|
content = fetch_and_save(snap)
|
|
151
|
-
shared
|
|
152
|
-
state.mark_completed(snap.timestamp,
|
|
153
|
-
url: snap.original_url,
|
|
154
|
-
bytes: content.bytesize)
|
|
155
|
-
shared[:bytes] += content.bytesize
|
|
156
|
-
end
|
|
150
|
+
record_completed(snap, content, shared)
|
|
157
151
|
end
|
|
158
152
|
shared[:mutex].synchronize { shared[:downloaded] += 1 }
|
|
159
153
|
rescue StandardError => e
|
|
160
|
-
shared[:mutex].synchronize
|
|
154
|
+
shared[:mutex].synchronize do
|
|
155
|
+
shared[:failed] += 1
|
|
156
|
+
shared[:errors] << [snap, e]
|
|
157
|
+
end
|
|
158
|
+
@on_error&.call(snap, e)
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
def record_completed(snap, content, shared)
|
|
162
|
+
shared[:mutex].synchronize do
|
|
163
|
+
state.mark_completed(snap.timestamp,
|
|
164
|
+
url: snap.original_url,
|
|
165
|
+
bytes: content.bytesize)
|
|
166
|
+
shared[:bytes] += content.bytesize
|
|
167
|
+
end
|
|
161
168
|
end
|
|
162
169
|
|
|
163
170
|
def fetch_and_save(snapshot)
|
|
164
|
-
|
|
165
|
-
page
|
|
166
|
-
|
|
171
|
+
page = fetch_page(snapshot)
|
|
172
|
+
validate_page_status(page, snapshot)
|
|
173
|
+
write_page_file(page, snapshot)
|
|
174
|
+
rescue StandardError
|
|
175
|
+
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
176
|
+
raise
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def fetch_page(snapshot)
|
|
180
|
+
Fetcher.new(client: @client).fetch(
|
|
181
|
+
snapshot.original_url, timestamp: snapshot.timestamp
|
|
182
|
+
)
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def validate_page_status(page, snapshot)
|
|
186
|
+
return if page.status_code.between?(200, 299)
|
|
167
187
|
|
|
188
|
+
raise Error,
|
|
189
|
+
"HTTP #{page.status_code} for " \
|
|
190
|
+
"#{snapshot.original_url} at #{snapshot.timestamp}"
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def write_page_file(page, snapshot)
|
|
168
194
|
filename = build_filename(snapshot)
|
|
169
195
|
FileUtils.mkdir_p(File.dirname(filename))
|
|
170
196
|
tmp_path = "#{filename}.tmp"
|
|
171
197
|
File.binwrite(tmp_path, page.content)
|
|
172
198
|
File.rename(tmp_path, filename)
|
|
173
199
|
page.content
|
|
174
|
-
rescue StandardError
|
|
175
|
-
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
176
|
-
raise
|
|
177
200
|
end
|
|
178
201
|
|
|
179
202
|
EXTENSION_MAP = {
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -110,6 +110,24 @@ module Archaeo
|
|
|
110
110
|
snapshots(url, **options).count
|
|
111
111
|
end
|
|
112
112
|
|
|
113
|
+
def unique_snapshots(url, resolve_revisits: true, **options)
|
|
114
|
+
snapshots(url,
|
|
115
|
+
collapse: ["digest"],
|
|
116
|
+
resolve_revisits: resolve_revisits,
|
|
117
|
+
**options)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def timeline(url, from: nil, to: nil,
|
|
121
|
+
bucket_size: :month, status: 200)
|
|
122
|
+
options = {}
|
|
123
|
+
options[:from] = Timestamp.coerce(from).to_s if from
|
|
124
|
+
options[:to] = Timestamp.coerce(to).to_s if to
|
|
125
|
+
options[:filters] = [CdxFilter.by_status(status)] if status
|
|
126
|
+
|
|
127
|
+
snaps = snapshots(url, **options).to_a
|
|
128
|
+
CdxTimeline.new(snaps, bucket_size: bucket_size)
|
|
129
|
+
end
|
|
130
|
+
|
|
113
131
|
# Returns the number of pages for a paginated query.
|
|
114
132
|
def num_pages(url, **options)
|
|
115
133
|
url = UrlNormalizer.normalize(url)
|
data/lib/archaeo/cdx_filter.rb
CHANGED
|
@@ -31,6 +31,17 @@ module Archaeo
|
|
|
31
31
|
stripped.split(":", 2).first.to_s
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
+
def pattern
|
|
35
|
+
stripped = @expression.delete_prefix("!")
|
|
36
|
+
stripped.split(":", 2).last.to_s
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def matches?(value)
|
|
40
|
+
regex = Regexp.new(pattern)
|
|
41
|
+
result = regex.match?(value.to_s)
|
|
42
|
+
negated? ? !result : result
|
|
43
|
+
end
|
|
44
|
+
|
|
34
45
|
def self.by_status(code)
|
|
35
46
|
new("statuscode:#{code}")
|
|
36
47
|
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Time-bucketed snapshot frequency analysis.
|
|
5
|
+
#
|
|
6
|
+
# Groups snapshots by configurable time buckets (day, week, month, year)
|
|
7
|
+
# for frequency analysis and coverage reporting.
|
|
8
|
+
class CdxTimeline
|
|
9
|
+
BUCKET_FORMATS = {
|
|
10
|
+
day: "%Y%m%d",
|
|
11
|
+
week: "%YW%V",
|
|
12
|
+
month: "%Y%m",
|
|
13
|
+
year: "%Y",
|
|
14
|
+
}.freeze
|
|
15
|
+
|
|
16
|
+
def initialize(snapshots, bucket_size: :month)
|
|
17
|
+
@bucket_size = bucket_size
|
|
18
|
+
@buckets = build_buckets(snapshots)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def to_a
|
|
22
|
+
@buckets.sort_by(&:first)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def to_h
|
|
26
|
+
@buckets.dup
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def peak
|
|
30
|
+
@buckets.max_by(&:last)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def total
|
|
34
|
+
@buckets.values.sum
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def span
|
|
38
|
+
keys = @buckets.keys
|
|
39
|
+
return nil if keys.empty?
|
|
40
|
+
|
|
41
|
+
[keys.first, keys.last]
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def empty?
|
|
45
|
+
@buckets.empty?
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def size
|
|
49
|
+
@buckets.size
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def inspect
|
|
53
|
+
"#<#{self.class.name} #{total} snapshots in #{@buckets.size} buckets>"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
private
|
|
57
|
+
|
|
58
|
+
def build_buckets(snapshots)
|
|
59
|
+
fmt = BUCKET_FORMATS[@bucket_size] || BUCKET_FORMATS[:month]
|
|
60
|
+
snapshots.each_with_object(Hash.new(0)) do |snap, counts|
|
|
61
|
+
key = snap.timestamp.to_time.strftime(fmt)
|
|
62
|
+
counts[key] += 1
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "csv"
|
|
4
4
|
require "json"
|
|
5
|
+
require "set"
|
|
5
6
|
require "thor"
|
|
6
7
|
|
|
7
8
|
module Archaeo
|
|
@@ -27,6 +28,10 @@ module Archaeo
|
|
|
27
28
|
option :match_type,
|
|
28
29
|
desc: "Match type (exact, prefix, host, domain)"
|
|
29
30
|
option :filter, type: :array, desc: "CDX filter expressions"
|
|
31
|
+
option :filter_status, type: :array,
|
|
32
|
+
desc: "Only include these status codes"
|
|
33
|
+
option :filter_type, type: :array,
|
|
34
|
+
desc: "MIME type prefixes (e.g. image, text/html)"
|
|
30
35
|
option :collapse, type: :array, desc: "CDX collapse fields"
|
|
31
36
|
option :sort, desc: "Sort order (default, closest, reverse)"
|
|
32
37
|
option :limit, type: :numeric, desc: "Max snapshots to return"
|
|
@@ -153,6 +158,53 @@ module Archaeo
|
|
|
153
158
|
end
|
|
154
159
|
end
|
|
155
160
|
|
|
161
|
+
desc "rewrite URL TIMESTAMP",
|
|
162
|
+
"Fetch a page and rewrite archive URLs to local paths"
|
|
163
|
+
option :prefix, desc: "Local path prefix", default: "local"
|
|
164
|
+
option :output, desc: "Write rewritten HTML to file"
|
|
165
|
+
def rewrite(url, timestamp)
|
|
166
|
+
handle_errors do
|
|
167
|
+
coerced = Timestamp.coerce(timestamp)
|
|
168
|
+
page = Fetcher.new.fetch(url, timestamp: coerced)
|
|
169
|
+
rewritten = build_rewriter(url, coerced).rewrite_html(page.content)
|
|
170
|
+
output_rewritten(rewritten)
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
desc "diff URL TIMESTAMP_A TIMESTAMP_B",
|
|
175
|
+
"Compare assets of two archived snapshots"
|
|
176
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
177
|
+
def diff(url, timestamp_a, timestamp_b)
|
|
178
|
+
handle_errors do
|
|
179
|
+
bundle_a = Fetcher.new.fetch_page_with_assets(
|
|
180
|
+
url, timestamp: timestamp_a
|
|
181
|
+
)
|
|
182
|
+
bundle_b = Fetcher.new.fetch_page_with_assets(
|
|
183
|
+
url, timestamp: timestamp_b
|
|
184
|
+
)
|
|
185
|
+
output_diff(bundle_a.assets, bundle_b.assets,
|
|
186
|
+
timestamp_a, timestamp_b)
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
desc "asset-audit URL TIMESTAMP",
|
|
191
|
+
"Audit assets for an archived page"
|
|
192
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
193
|
+
def asset_audit(url, timestamp)
|
|
194
|
+
handle_errors do
|
|
195
|
+
bundle = Fetcher.new.fetch_page_with_assets(
|
|
196
|
+
url, timestamp: timestamp
|
|
197
|
+
)
|
|
198
|
+
report = build_audit_report(bundle)
|
|
199
|
+
case options[:format]
|
|
200
|
+
when "json"
|
|
201
|
+
puts JSON.generate(report)
|
|
202
|
+
else
|
|
203
|
+
print_audit_report(report)
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
|
|
156
208
|
desc "download URL", "Download all archived snapshots of a URL"
|
|
157
209
|
option :output, desc: "Output directory", default: "archive"
|
|
158
210
|
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
@@ -276,6 +328,30 @@ module Archaeo
|
|
|
276
328
|
end
|
|
277
329
|
end
|
|
278
330
|
|
|
331
|
+
def build_rewriter(url, timestamp)
|
|
332
|
+
normalized = UrlNormalizer.normalize(url)
|
|
333
|
+
archive_prefix = ArchiveUrl.new(normalized, timestamp: timestamp).to_s
|
|
334
|
+
UrlRewriter.new(archive_prefix, options[:prefix])
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
def output_rewritten(content)
|
|
338
|
+
if options[:output]
|
|
339
|
+
write_output(options[:output], content)
|
|
340
|
+
else
|
|
341
|
+
$stdout.write(content)
|
|
342
|
+
end
|
|
343
|
+
end
|
|
344
|
+
|
|
345
|
+
def output_diff(assets_a, assets_b, ts_a, ts_b)
|
|
346
|
+
comparison = compare_asset_lists(assets_a, assets_b)
|
|
347
|
+
case options[:format]
|
|
348
|
+
when "json"
|
|
349
|
+
puts JSON.generate(comparison)
|
|
350
|
+
else
|
|
351
|
+
print_diff_report(comparison, ts_a, ts_b)
|
|
352
|
+
end
|
|
353
|
+
end
|
|
354
|
+
|
|
279
355
|
def output_assets(bundle)
|
|
280
356
|
case options[:format]
|
|
281
357
|
when "json"
|
|
@@ -307,16 +383,36 @@ module Archaeo
|
|
|
307
383
|
def print_summary(summary)
|
|
308
384
|
return if quiet?
|
|
309
385
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
386
|
+
parts = ["Downloaded #{summary.downloaded}/#{summary.total}"]
|
|
387
|
+
parts << "#{summary.failed} failed" if summary.failed.positive?
|
|
388
|
+
parts << "(#{summary.bytes_written} bytes)"
|
|
389
|
+
parts << "in #{summary.elapsed.round(1)}s"
|
|
390
|
+
warn parts.join(" ")
|
|
313
391
|
end
|
|
314
392
|
|
|
315
393
|
def build_cdx_options(opts)
|
|
316
|
-
|
|
394
|
+
result = {}
|
|
395
|
+
CDX_OPTION_MAP.each do |cli_key, api_key|
|
|
317
396
|
value = opts[cli_key]
|
|
318
397
|
result[api_key] = value if value
|
|
319
398
|
end
|
|
399
|
+
append_convenience_filters!(result, opts)
|
|
400
|
+
result
|
|
401
|
+
end
|
|
402
|
+
|
|
403
|
+
def append_convenience_filters!(result, opts)
|
|
404
|
+
filters = Array(result[:filters])
|
|
405
|
+
filters += status_filters(opts[:filter_status])
|
|
406
|
+
filters += type_filters(opts[:filter_type])
|
|
407
|
+
result[:filters] = filters unless filters.empty?
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
def status_filters(codes)
|
|
411
|
+
Array(codes).map { |code| CdxFilter.by_status(code).to_s }
|
|
412
|
+
end
|
|
413
|
+
|
|
414
|
+
def type_filters(prefixes)
|
|
415
|
+
Array(prefixes).map { |p| CdxFilter.by_mimetype_prefix(p).to_s }
|
|
320
416
|
end
|
|
321
417
|
|
|
322
418
|
def output_table(snaps)
|
|
@@ -347,5 +443,86 @@ module Archaeo
|
|
|
347
443
|
File.binwrite(path, content)
|
|
348
444
|
warn "Written to #{path}" unless quiet?
|
|
349
445
|
end
|
|
446
|
+
|
|
447
|
+
def compare_asset_lists(assets_a, assets_b)
|
|
448
|
+
all_a = assets_a.all.to_set
|
|
449
|
+
all_b = assets_b.all.to_set
|
|
450
|
+
build_diff(all_a, all_b, assets_a.counts, assets_b.counts)
|
|
451
|
+
end
|
|
452
|
+
|
|
453
|
+
def build_diff(set_a, set_b, counts_a, counts_b)
|
|
454
|
+
{
|
|
455
|
+
only_in_a: (set_a - set_b).to_a.sort,
|
|
456
|
+
only_in_b: (set_b - set_a).to_a.sort,
|
|
457
|
+
unchanged: (set_a & set_b).to_a.sort,
|
|
458
|
+
counts_a: counts_a,
|
|
459
|
+
counts_b: counts_b,
|
|
460
|
+
}
|
|
461
|
+
end
|
|
462
|
+
|
|
463
|
+
def print_diff_report(comparison, ts_a, ts_b)
|
|
464
|
+
puts "Comparing #{ts_a} vs #{ts_b}"
|
|
465
|
+
puts
|
|
466
|
+
print_url_list("Removed:", comparison[:only_in_a], " - ")
|
|
467
|
+
print_url_list("Added:", comparison[:only_in_b], " + ")
|
|
468
|
+
puts "Unchanged: #{comparison[:unchanged].size}"
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
def print_url_list(header, urls, prefix)
|
|
472
|
+
return unless urls.any?
|
|
473
|
+
|
|
474
|
+
puts header
|
|
475
|
+
urls.each { |url| puts "#{prefix}#{url}" }
|
|
476
|
+
puts
|
|
477
|
+
end
|
|
478
|
+
|
|
479
|
+
def build_audit_report(bundle)
|
|
480
|
+
assets = bundle.assets
|
|
481
|
+
downloadable = assets.downloadable
|
|
482
|
+
{
|
|
483
|
+
page_url: bundle.page.archive_url,
|
|
484
|
+
total_assets: assets.size,
|
|
485
|
+
downloadable: downloadable.size,
|
|
486
|
+
counts: assets.counts,
|
|
487
|
+
domains: assets.domain_counts,
|
|
488
|
+
duplicates: find_duplicate_urls(assets),
|
|
489
|
+
}
|
|
490
|
+
end
|
|
491
|
+
|
|
492
|
+
def print_audit_report(report)
|
|
493
|
+
puts "Page: #{report[:page_url]}"
|
|
494
|
+
puts "Total assets: #{report[:total_assets]}"
|
|
495
|
+
puts "Downloadable: #{report[:downloadable]}"
|
|
496
|
+
puts
|
|
497
|
+
print_type_counts(report[:counts])
|
|
498
|
+
print_domain_counts(report[:domains])
|
|
499
|
+
print_url_list("Duplicates:", report[:duplicates], " ")
|
|
500
|
+
end
|
|
501
|
+
|
|
502
|
+
def print_type_counts(counts)
|
|
503
|
+
puts "By type:"
|
|
504
|
+
counts.each { |type, count| puts " #{type}: #{count}" }
|
|
505
|
+
puts
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
def print_domain_counts(domains)
|
|
509
|
+
puts "By domain:"
|
|
510
|
+
domains.sort_by { |_, v| -v }.each do |domain, count|
|
|
511
|
+
puts " #{domain}: #{count}"
|
|
512
|
+
end
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
def find_duplicate_urls(assets)
|
|
516
|
+
seen = {}
|
|
517
|
+
dupes = []
|
|
518
|
+
assets.all.each do |url|
|
|
519
|
+
if seen[url]
|
|
520
|
+
dupes << url unless dupes.include?(url)
|
|
521
|
+
else
|
|
522
|
+
seen[url] = true
|
|
523
|
+
end
|
|
524
|
+
end
|
|
525
|
+
dupes
|
|
526
|
+
end
|
|
350
527
|
end
|
|
351
528
|
end
|
|
@@ -17,36 +17,49 @@ module Archaeo
|
|
|
17
17
|
def initialize(output_dir)
|
|
18
18
|
@output_dir = output_dir
|
|
19
19
|
@path = File.join(output_dir, STATE_FILE)
|
|
20
|
+
@mutex = Mutex.new
|
|
20
21
|
end
|
|
21
22
|
|
|
22
23
|
def completed?(timestamp)
|
|
23
|
-
entries_key.include?(timestamp.to_s)
|
|
24
|
+
@mutex.synchronize { entries_key.include?(timestamp.to_s) }
|
|
24
25
|
end
|
|
25
26
|
|
|
26
27
|
def mark_completed(timestamp, url: nil, bytes: nil)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
28
|
+
@mutex.synchronize do
|
|
29
|
+
ts = timestamp.to_s
|
|
30
|
+
return if entries_key.include?(ts)
|
|
31
|
+
|
|
32
|
+
entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
|
|
33
|
+
entry["url"] = url if url
|
|
34
|
+
entry["bytes"] = bytes if bytes
|
|
35
|
+
entries << entry
|
|
36
|
+
@entries_key = nil
|
|
37
|
+
save
|
|
38
|
+
end
|
|
36
39
|
end
|
|
37
40
|
|
|
38
41
|
def entry_for(timestamp)
|
|
39
|
-
entries.find { |e| e["ts"] == timestamp.to_s }
|
|
42
|
+
@mutex.synchronize { entries.find { |e| e["ts"] == timestamp.to_s } }
|
|
40
43
|
end
|
|
41
44
|
|
|
42
45
|
def total_bytes
|
|
43
|
-
entries.sum { |e| e["bytes"].to_i }
|
|
46
|
+
@mutex.synchronize { entries.sum { |e| e["bytes"].to_i } }
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def size
|
|
50
|
+
@mutex.synchronize { entries.size }
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def timestamps
|
|
54
|
+
@mutex.synchronize { entries.map { |e| e["ts"] } }
|
|
44
55
|
end
|
|
45
56
|
|
|
46
57
|
def clear
|
|
47
|
-
@
|
|
48
|
-
|
|
49
|
-
|
|
58
|
+
@mutex.synchronize do
|
|
59
|
+
@entries = []
|
|
60
|
+
@entries_key = nil
|
|
61
|
+
FileUtils.rm_f(@path)
|
|
62
|
+
end
|
|
50
63
|
end
|
|
51
64
|
|
|
52
65
|
private
|
data/lib/archaeo/fetcher.rb
CHANGED
|
@@ -25,6 +25,19 @@ module Archaeo
|
|
|
25
25
|
build_page(response, archive_url.to_s, url, ts)
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
def fetch!(url, timestamp:, identity: false, snapshot: nil)
|
|
29
|
+
page = fetch(url, timestamp: timestamp, identity: identity,
|
|
30
|
+
snapshot: snapshot)
|
|
31
|
+
return page if page.status_code.between?(200, 299)
|
|
32
|
+
|
|
33
|
+
raise FetchError.new(
|
|
34
|
+
"HTTP #{page.status_code} for #{page.original_url}",
|
|
35
|
+
status_code: page.status_code,
|
|
36
|
+
url: page.original_url,
|
|
37
|
+
page: page,
|
|
38
|
+
)
|
|
39
|
+
end
|
|
40
|
+
|
|
28
41
|
def fetch_page_with_assets(url, timestamp:)
|
|
29
42
|
page = fetch(url, timestamp: timestamp)
|
|
30
43
|
assets = AssetExtractor.new(page.content,
|
data/lib/archaeo/http_client.rb
CHANGED
|
@@ -63,12 +63,14 @@ module Archaeo
|
|
|
63
63
|
max_retries: DEFAULT_MAX_RETRIES,
|
|
64
64
|
retry_delay: DEFAULT_RETRY_DELAY,
|
|
65
65
|
user_agent: nil,
|
|
66
|
-
on_request: nil
|
|
66
|
+
on_request: nil,
|
|
67
|
+
before_request: nil)
|
|
67
68
|
@timeout = timeout
|
|
68
69
|
@max_retries = max_retries
|
|
69
70
|
@retry_delay = retry_delay
|
|
70
71
|
@user_agent = user_agent
|
|
71
72
|
@on_request = on_request
|
|
73
|
+
@before_request = before_request
|
|
72
74
|
@connections = {}
|
|
73
75
|
@last_used = {}
|
|
74
76
|
@mutex = Mutex.new
|
|
@@ -203,7 +205,7 @@ module Archaeo
|
|
|
203
205
|
def attempt_with_retries(uri, headers, request_class)
|
|
204
206
|
retries = 0
|
|
205
207
|
begin
|
|
206
|
-
execute_and_check(uri, headers, request_class)
|
|
208
|
+
execute_and_check(uri, headers, request_class, retries)
|
|
207
209
|
rescue RetriableStatusError => e
|
|
208
210
|
retry_status(e, retries += 1) && retry
|
|
209
211
|
rescue *TRANSIENT_ERRORS => e
|
|
@@ -223,8 +225,9 @@ module Archaeo
|
|
|
223
225
|
sleep(@retry_delay * retries)
|
|
224
226
|
end
|
|
225
227
|
|
|
226
|
-
def execute_and_check(uri, headers, request_class)
|
|
227
|
-
response = execute_with_connection(uri, headers, request_class
|
|
228
|
+
def execute_and_check(uri, headers, request_class, retry_count)
|
|
229
|
+
response = execute_with_connection(uri, headers, request_class,
|
|
230
|
+
retry_count)
|
|
228
231
|
if RETRIABLE_STATUSES.include?(response.status)
|
|
229
232
|
raise RetriableStatusError, response
|
|
230
233
|
end
|
|
@@ -255,9 +258,9 @@ module Archaeo
|
|
|
255
258
|
"Failed after #{retries} retries: #{error.message}"
|
|
256
259
|
end
|
|
257
260
|
|
|
258
|
-
def execute_with_connection(uri, headers, request_class)
|
|
261
|
+
def execute_with_connection(uri, headers, request_class, retry_count)
|
|
259
262
|
request = build_request(uri, headers, request_class)
|
|
260
|
-
execute_tracked_request(uri, request)
|
|
263
|
+
execute_tracked_request(uri, request, retry_count)
|
|
261
264
|
rescue *TRANSIENT_ERRORS
|
|
262
265
|
raise
|
|
263
266
|
rescue StandardError
|
|
@@ -268,16 +271,17 @@ module Archaeo
|
|
|
268
271
|
def build_request(uri, headers, request_class)
|
|
269
272
|
request = request_class.new(uri)
|
|
270
273
|
headers.each { |k, v| request[k] = v }
|
|
274
|
+
@before_request&.call(uri, request)
|
|
271
275
|
request
|
|
272
276
|
end
|
|
273
277
|
|
|
274
|
-
def execute_tracked_request(uri, request)
|
|
278
|
+
def execute_tracked_request(uri, request, retry_count)
|
|
275
279
|
http = connection_for(uri)
|
|
276
280
|
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
277
281
|
raw = http.request(request)
|
|
278
282
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
|
|
279
283
|
response = build_response(raw)
|
|
280
|
-
@on_request&.call(uri, elapsed, response.status,
|
|
284
|
+
@on_request&.call(uri, elapsed, response.status, retry_count)
|
|
281
285
|
response
|
|
282
286
|
end
|
|
283
287
|
|
|
@@ -286,7 +290,7 @@ module Archaeo
|
|
|
286
290
|
"User-Agent" => select_user_agent,
|
|
287
291
|
"Accept" => "text/html,application/xhtml+xml," \
|
|
288
292
|
"application/xml;q=0.9,*/*;q=0.8",
|
|
289
|
-
"Accept-Encoding" => "gzip",
|
|
293
|
+
"Accept-Encoding" => "gzip, deflate",
|
|
290
294
|
"Accept-Language" => "en-US,en;q=0.9",
|
|
291
295
|
"Connection" => "keep-alive",
|
|
292
296
|
}
|
|
@@ -303,10 +307,17 @@ module Archaeo
|
|
|
303
307
|
|
|
304
308
|
def decompress_body(raw)
|
|
305
309
|
body = raw.body.to_s
|
|
306
|
-
return body
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
+
return body if body.empty?
|
|
311
|
+
|
|
312
|
+
case raw["content-encoding"]
|
|
313
|
+
when "gzip"
|
|
314
|
+
Zlib::GzipReader.new(StringIO.new(body)).read
|
|
315
|
+
when "deflate"
|
|
316
|
+
Zlib::Inflate.inflate(body)
|
|
317
|
+
else
|
|
318
|
+
body
|
|
319
|
+
end
|
|
320
|
+
rescue Zlib::GzipFile::Error, Zlib::DataError
|
|
310
321
|
body
|
|
311
322
|
end
|
|
312
323
|
end
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -67,6 +67,32 @@ module Archaeo
|
|
|
67
67
|
end
|
|
68
68
|
end
|
|
69
69
|
|
|
70
|
+
def links
|
|
71
|
+
return [] unless html?
|
|
72
|
+
|
|
73
|
+
@links ||= begin
|
|
74
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
75
|
+
base = @archive_url || @original_url
|
|
76
|
+
doc.css("a[href]").map do |anchor|
|
|
77
|
+
href = resolve_page_url(anchor["href"], base)
|
|
78
|
+
{ href: href, text: anchor.text.strip,
|
|
79
|
+
external: href && !href.include?(original_domain) }
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def meta_tags
|
|
85
|
+
return {} unless html?
|
|
86
|
+
|
|
87
|
+
@meta_tags ||= begin
|
|
88
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
89
|
+
result = extract_meta_entries(doc)
|
|
90
|
+
canonical = doc.at_css('link[rel="canonical"]')
|
|
91
|
+
result["canonical"] = canonical["href"].to_s if canonical
|
|
92
|
+
result
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
70
96
|
def to_h
|
|
71
97
|
{
|
|
72
98
|
content_type: @content_type,
|
|
@@ -146,5 +172,35 @@ module Archaeo
|
|
|
146
172
|
invalid: :replace, undef: :replace,
|
|
147
173
|
replace: "?")
|
|
148
174
|
end
|
|
175
|
+
|
|
176
|
+
def original_domain
|
|
177
|
+
@original_domain ||= begin
|
|
178
|
+
URI.parse(@original_url).host
|
|
179
|
+
rescue URI::InvalidURIError
|
|
180
|
+
nil
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def extract_meta_entries(doc)
|
|
185
|
+
result = {}
|
|
186
|
+
doc.css("meta[name], meta[property], meta[http-equiv]").each do |meta|
|
|
187
|
+
key = meta["name"] || meta["property"] || meta["http-equiv"]
|
|
188
|
+
next unless key
|
|
189
|
+
|
|
190
|
+
result[key.downcase] = meta["content"].to_s
|
|
191
|
+
end
|
|
192
|
+
result
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
def resolve_page_url(href, base)
|
|
196
|
+
return href unless href
|
|
197
|
+
return href if href.start_with?("http", "//", "data:", "#",
|
|
198
|
+
"javascript:")
|
|
199
|
+
return nil unless base
|
|
200
|
+
|
|
201
|
+
URI.join(base, href).to_s
|
|
202
|
+
rescue URI::InvalidURIError
|
|
203
|
+
nil
|
|
204
|
+
end
|
|
149
205
|
end
|
|
150
206
|
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -23,6 +23,22 @@ module Archaeo
|
|
|
23
23
|
attempt_save(save_url, start_time, url)
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
+
def batch_save(urls, delay: 2, stop_on_error: false)
|
|
27
|
+
results = []
|
|
28
|
+
urls.each_with_index do |url, i|
|
|
29
|
+
sleep(delay) if i.positive?
|
|
30
|
+
result = save(url)
|
|
31
|
+
results << result
|
|
32
|
+
rescue RateLimitError, SaveFailed => e
|
|
33
|
+
raise e if stop_on_error
|
|
34
|
+
|
|
35
|
+
results << SaveResult.new(
|
|
36
|
+
url: url, archive_url: nil, timestamp: nil, cached: false,
|
|
37
|
+
)
|
|
38
|
+
end
|
|
39
|
+
results
|
|
40
|
+
end
|
|
41
|
+
|
|
26
42
|
private
|
|
27
43
|
|
|
28
44
|
def attempt_save(save_url, start_time, url)
|
data/lib/archaeo/save_result.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Archaeo
|
|
|
11
11
|
def initialize(url:, archive_url:, timestamp:, cached:)
|
|
12
12
|
@url = url
|
|
13
13
|
@archive_url = archive_url
|
|
14
|
-
@timestamp = Timestamp.coerce(timestamp)
|
|
14
|
+
@timestamp = timestamp ? Timestamp.coerce(timestamp) : nil
|
|
15
15
|
@cached = cached
|
|
16
16
|
end
|
|
17
17
|
|
|
@@ -19,6 +19,10 @@ module Archaeo
|
|
|
19
19
|
@cached
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
+
def success?
|
|
23
|
+
!@archive_url.nil?
|
|
24
|
+
end
|
|
25
|
+
|
|
22
26
|
def to_h
|
|
23
27
|
{ url: @url, archive_url: @archive_url,
|
|
24
28
|
timestamp: @timestamp, cached: @cached }
|
data/lib/archaeo/snapshot.rb
CHANGED
|
@@ -70,6 +70,18 @@ module Archaeo
|
|
|
70
70
|
age <= seconds
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
+
def same_content_as?(other)
|
|
74
|
+
return false unless other.is_a?(self.class)
|
|
75
|
+
return false if digest.nil? || digest.empty?
|
|
76
|
+
return false if other.digest.nil? || other.digest.empty?
|
|
77
|
+
|
|
78
|
+
digest == other.digest
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def duplicate_of?(other)
|
|
82
|
+
same_content_as?(other) && timestamp != other.timestamp
|
|
83
|
+
end
|
|
84
|
+
|
|
73
85
|
def fetch(client: HttpClient.new, identity: false)
|
|
74
86
|
Fetcher.new(client: client).fetch(
|
|
75
87
|
original_url, timestamp: @timestamp, identity: identity
|
data/lib/archaeo/timestamp.rb
CHANGED
|
@@ -140,8 +140,54 @@ module Archaeo
|
|
|
140
140
|
[year, month, day, hour, minute, second]
|
|
141
141
|
end
|
|
142
142
|
|
|
143
|
+
def quarter
|
|
144
|
+
((month - 1) / 3) + 1
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def wday
|
|
148
|
+
@to_time.wday
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def human_readable
|
|
152
|
+
@to_time.strftime("%Y-%m-%d %H:%M:%S UTC")
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def date_range(granularity = :day)
|
|
156
|
+
start_ts = range_start(granularity)
|
|
157
|
+
end_ts = range_end(start_ts, granularity)
|
|
158
|
+
start_ts..end_ts
|
|
159
|
+
end
|
|
160
|
+
|
|
143
161
|
def inspect
|
|
144
162
|
"#<#{self.class.name} #{self}>"
|
|
145
163
|
end
|
|
164
|
+
|
|
165
|
+
private
|
|
166
|
+
|
|
167
|
+
def range_start(granularity)
|
|
168
|
+
case granularity
|
|
169
|
+
when :month then self.class.new(year: year, month: month)
|
|
170
|
+
when :year then self.class.new(year: year)
|
|
171
|
+
else self.class.new(year: year, month: month, day: day)
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
def range_end(start_ts, granularity)
|
|
176
|
+
case granularity
|
|
177
|
+
when :month then next_month_start - 1
|
|
178
|
+
when :year
|
|
179
|
+
self.class.new(year: year, month: 12, day: 31,
|
|
180
|
+
hour: 23, minute: 59, second: 59)
|
|
181
|
+
else start_ts + 86_399
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def next_month_start
|
|
186
|
+
if month == 12
|
|
187
|
+
self.class.new(year: year + 1, month: 1)
|
|
188
|
+
else
|
|
189
|
+
self.class.new(year: year, month: month + 1)
|
|
190
|
+
end
|
|
191
|
+
end
|
|
146
192
|
end
|
|
147
193
|
end
|
data/lib/archaeo/url_rewriter.rb
CHANGED
|
@@ -10,6 +10,7 @@ module Archaeo
|
|
|
10
10
|
# rooted at a configurable local directory.
|
|
11
11
|
class UrlRewriter
|
|
12
12
|
URL_ATTRS = %w[src href data-src poster].freeze
|
|
13
|
+
CSS_URL_RE = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
|
|
13
14
|
|
|
14
15
|
def initialize(archive_prefix, local_prefix)
|
|
15
16
|
@archive_prefix = archive_prefix.to_s
|
|
@@ -31,6 +32,8 @@ module Archaeo
|
|
|
31
32
|
doc = Nokogiri::HTML(html_content)
|
|
32
33
|
rewrite_url_attrs(doc)
|
|
33
34
|
rewrite_srcset_attrs(doc)
|
|
35
|
+
rewrite_inline_style_attrs(doc)
|
|
36
|
+
rewrite_style_elements(doc)
|
|
34
37
|
doc.to_html
|
|
35
38
|
end
|
|
36
39
|
|
|
@@ -53,6 +56,28 @@ module Archaeo
|
|
|
53
56
|
|
|
54
57
|
private
|
|
55
58
|
|
|
59
|
+
def rewrite_inline_style_attrs(doc)
|
|
60
|
+
doc.css("[style]").each do |el|
|
|
61
|
+
next unless el["style"]
|
|
62
|
+
|
|
63
|
+
el["style"] = rewrite_css_urls(el["style"])
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def rewrite_style_elements(doc)
|
|
68
|
+
doc.css("style").each do |el|
|
|
69
|
+
el.content = rewrite_css_urls(el.text)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def rewrite_css_urls(css_text)
|
|
74
|
+
css_text.gsub(CSS_URL_RE) do
|
|
75
|
+
url = Regexp.last_match[1]
|
|
76
|
+
rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
|
|
77
|
+
"url('#{rewritten}')"
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
56
81
|
def rewrite_srcset(srcset)
|
|
57
82
|
return srcset unless srcset
|
|
58
83
|
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -16,6 +16,17 @@ module Archaeo
|
|
|
16
16
|
class SaveFailed < Error; end
|
|
17
17
|
class IntegrityError < Error; end
|
|
18
18
|
|
|
19
|
+
class FetchError < Error
|
|
20
|
+
attr_reader :status_code, :url, :page
|
|
21
|
+
|
|
22
|
+
def initialize(message, status_code:, url:, page:)
|
|
23
|
+
super(message)
|
|
24
|
+
@status_code = status_code
|
|
25
|
+
@url = url
|
|
26
|
+
@page = page
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
19
30
|
autoload :Timestamp, "archaeo/timestamp"
|
|
20
31
|
autoload :ArchiveUrl, "archaeo/archive_url"
|
|
21
32
|
autoload :Snapshot, "archaeo/snapshot"
|
|
@@ -25,6 +36,7 @@ module Archaeo
|
|
|
25
36
|
autoload :AvailabilityResult, "archaeo/availability_result"
|
|
26
37
|
autoload :UrlNormalizer, "archaeo/url_normalizer"
|
|
27
38
|
autoload :CdxFilter, "archaeo/cdx_filter"
|
|
39
|
+
autoload :CdxTimeline, "archaeo/cdx_timeline"
|
|
28
40
|
autoload :AssetList, "archaeo/asset_list"
|
|
29
41
|
autoload :AssetExtractor, "archaeo/asset_extractor"
|
|
30
42
|
autoload :UrlRewriter, "archaeo/url_rewriter"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: csv
|
|
@@ -79,6 +79,7 @@ files:
|
|
|
79
79
|
- lib/archaeo/bulk_downloader.rb
|
|
80
80
|
- lib/archaeo/cdx_api.rb
|
|
81
81
|
- lib/archaeo/cdx_filter.rb
|
|
82
|
+
- lib/archaeo/cdx_timeline.rb
|
|
82
83
|
- lib/archaeo/cli.rb
|
|
83
84
|
- lib/archaeo/download_state.rb
|
|
84
85
|
- lib/archaeo/fetcher.rb
|