archaeo 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +160 -6
- data/lib/archaeo/archive_url.rb +16 -0
- data/lib/archaeo/asset_extractor.rb +19 -0
- data/lib/archaeo/asset_list.rb +31 -0
- data/lib/archaeo/availability_result.rb +24 -0
- data/lib/archaeo/bulk_downloader.rb +97 -28
- data/lib/archaeo/cdx_api.rb +4 -0
- data/lib/archaeo/cdx_filter.rb +12 -0
- data/lib/archaeo/cli.rb +96 -10
- data/lib/archaeo/download_state.rb +46 -15
- data/lib/archaeo/fetcher.rb +16 -1
- data/lib/archaeo/http_client.rb +43 -11
- data/lib/archaeo/page.rb +32 -0
- data/lib/archaeo/page_bundle.rb +28 -0
- data/lib/archaeo/save_result.rb +19 -0
- data/lib/archaeo/snapshot.rb +22 -0
- data/lib/archaeo/timestamp.rb +14 -0
- data/lib/archaeo/url_normalizer.rb +7 -1
- data/lib/archaeo/url_rewriter.rb +46 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +1 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dff73d8ab14a3b75bf98281d20b5427b55757b330d57e8899a2ffb04d9046c6d
|
|
4
|
+
data.tar.gz: d92c2f8e77d6ba5c51283f0581bb51182ebe84aab74ffe4e4971e0d405eab2cc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ed1a823e5f358e53ec653e5eee902f95787b9bdedc5670a214f3c1784f4c1829f705d5a53ded5cfb6777f16ed33e32430f2aca13cb9c1fad628885daa55a60a1
|
|
7
|
+
data.tar.gz: 3b809e1aad60db5e04a356dff5ab450333ff88f56bf481630887f45b3b3f09035ee1ba959807d22129b827b30324b7730e7cf5c248656b62c38a0beab8ad2581
|
data/README.adoc
CHANGED
|
@@ -54,6 +54,9 @@ cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
|
|
|
54
54
|
puts snap.timestamp
|
|
55
55
|
end
|
|
56
56
|
|
|
57
|
+
# Count snapshots
|
|
58
|
+
cdx.count("example.com") # => Integer
|
|
59
|
+
|
|
57
60
|
# Filter by status code, mimetype, or URL pattern
|
|
58
61
|
cdx.snapshots("example.com",
|
|
59
62
|
filters: [Archaeo::CdxFilter.by_status(200)],
|
|
@@ -69,6 +72,11 @@ filters = Archaeo::CdxFilter.combine(
|
|
|
69
72
|
)
|
|
70
73
|
cdx.snapshots("example.com", filters: filters)
|
|
71
74
|
|
|
75
|
+
# Convenience filter factories
|
|
76
|
+
Archaeo::CdxFilter.only_html # text/html only
|
|
77
|
+
Archaeo::CdxFilter.by_mimetype_prefix("image") # any image/*
|
|
78
|
+
Archaeo::CdxFilter.excluding_redirects # exclude 3xx
|
|
79
|
+
|
|
72
80
|
# Page-based pagination
|
|
73
81
|
cdx.snapshots("example.com", page: 0)
|
|
74
82
|
|
|
@@ -90,6 +98,8 @@ result.available? # => true/false
|
|
|
90
98
|
result.archive_url # => "https://web.archive.org/web/..."
|
|
91
99
|
result.timestamp # => Archaeo::Timestamp
|
|
92
100
|
result.archived_status # => HTTP status code of the archived page
|
|
101
|
+
result.to_h # => Hash representation
|
|
102
|
+
result.as_json # => JSON-serializable Hash
|
|
93
103
|
|
|
94
104
|
api.available?("example.com") # => true/false
|
|
95
105
|
----
|
|
@@ -104,6 +114,8 @@ result.url # => "https://example.com/"
|
|
|
104
114
|
result.archive_url # => "https://web.archive.org/web/..."
|
|
105
115
|
result.timestamp # => Archaeo::Timestamp
|
|
106
116
|
result.cached? # => true if already archived
|
|
117
|
+
result.to_h # => Hash representation
|
|
118
|
+
result.as_json # => JSON-serializable Hash
|
|
107
119
|
----
|
|
108
120
|
|
|
109
121
|
=== Fetch Archived Content
|
|
@@ -120,13 +132,22 @@ page.status_code # => 200
|
|
|
120
132
|
page.archive_url # => full archive URL
|
|
121
133
|
page.title # => "Example Domain"
|
|
122
134
|
page.html? # => true
|
|
135
|
+
page.css? # => true for text/css
|
|
123
136
|
page.json? # => false
|
|
124
137
|
page.size # => content length in bytes
|
|
138
|
+
page.to_h # => Hash with all fields
|
|
139
|
+
page.as_json # => JSON-serializable Hash
|
|
140
|
+
page.inspect # => "#<Archaeo::Page text/html 1234 bytes>"
|
|
125
141
|
|
|
126
142
|
# Raw (identity) mode -- no Wayback Machine rewriting
|
|
127
143
|
page = fetcher.fetch("https://example.com/",
|
|
128
144
|
timestamp: "20220615000000",
|
|
129
145
|
identity: true)
|
|
146
|
+
|
|
147
|
+
# With digest verification (raises IntegrityError on mismatch)
|
|
148
|
+
page = fetcher.fetch("https://example.com/",
|
|
149
|
+
timestamp: "20220615000000",
|
|
150
|
+
snapshot: snap)
|
|
130
151
|
----
|
|
131
152
|
|
|
132
153
|
=== Fetch Page with Assets
|
|
@@ -146,10 +167,25 @@ bundle.assets.fonts
|
|
|
146
167
|
bundle.assets.media
|
|
147
168
|
bundle.size # => total count (page + assets)
|
|
148
169
|
bundle.asset_count # => number of assets
|
|
170
|
+
bundle.to_h # => Hash representation
|
|
171
|
+
bundle.to_json # => JSON string
|
|
149
172
|
|
|
150
173
|
# Serialize asset list
|
|
151
174
|
bundle.assets.to_json
|
|
152
175
|
bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
|
|
176
|
+
|
|
177
|
+
# Filter assets by type
|
|
178
|
+
css_only = bundle.assets.filter(:css)
|
|
179
|
+
images_and_fonts = bundle.assets.filter(:image, :font)
|
|
180
|
+
|
|
181
|
+
# Merge asset lists (deduplicates)
|
|
182
|
+
merged = bundle.assets.merge(other_assets)
|
|
183
|
+
|
|
184
|
+
# Reconstruct from JSON
|
|
185
|
+
restored = Archaeo::AssetList.from_json(json_string)
|
|
186
|
+
|
|
187
|
+
# Safe type access
|
|
188
|
+
bundle.assets.urls_by_type(:image) # works for any type key
|
|
153
189
|
----
|
|
154
190
|
|
|
155
191
|
=== Bulk Download with Resume
|
|
@@ -157,13 +193,22 @@ bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
|
|
|
157
193
|
[source,ruby]
|
|
158
194
|
----
|
|
159
195
|
downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
|
|
160
|
-
downloader.download("example.com") do |current, total, snapshot|
|
|
196
|
+
summary = downloader.download("example.com") do |current, total, snapshot|
|
|
161
197
|
puts "[#{current}/#{total}] #{snapshot.original_url}"
|
|
162
198
|
end
|
|
163
199
|
|
|
200
|
+
summary.total # => total snapshots found
|
|
201
|
+
summary.downloaded # => successfully downloaded
|
|
202
|
+
summary.skipped # => skipped (already downloaded with resume)
|
|
203
|
+
summary.bytes_written # => total bytes written
|
|
204
|
+
summary.elapsed # => seconds elapsed
|
|
205
|
+
|
|
164
206
|
# Resume interrupted download
|
|
165
207
|
downloader.download("example.com", resume: true)
|
|
166
208
|
|
|
209
|
+
# Dry run (preview without fetching)
|
|
210
|
+
summary = downloader.download("example.com", dry_run: true)
|
|
211
|
+
|
|
167
212
|
# Filter by date range
|
|
168
213
|
downloader.download("example.com",
|
|
169
214
|
from: "20220101", to: "20221231")
|
|
@@ -175,6 +220,27 @@ downloader = Archaeo::BulkDownloader.new(
|
|
|
175
220
|
downloader.download("example.com")
|
|
176
221
|
----
|
|
177
222
|
|
|
223
|
+
=== Download State (Resume Tracking)
|
|
224
|
+
|
|
225
|
+
[source,ruby]
|
|
226
|
+
----
|
|
227
|
+
state = Archaeo::DownloadState.new("archive")
|
|
228
|
+
|
|
229
|
+
# Check if a snapshot was already downloaded
|
|
230
|
+
state.completed?("20220615000000") # => true/false
|
|
231
|
+
|
|
232
|
+
# Get metadata for a completed snapshot
|
|
233
|
+
entry = state.entry_for("20220615000000")
|
|
234
|
+
# => { "ts" => "20220615000000", "at" => "2022-06-15T12:00:00Z",
|
|
235
|
+
# "url" => "https://example.com/", "bytes" => 12345 }
|
|
236
|
+
|
|
237
|
+
# Total bytes downloaded
|
|
238
|
+
state.total_bytes # => Integer
|
|
239
|
+
|
|
240
|
+
# Clear state for a fresh download
|
|
241
|
+
state.clear
|
|
242
|
+
----
|
|
243
|
+
|
|
178
244
|
=== URL Normalization
|
|
179
245
|
|
|
180
246
|
[source,ruby]
|
|
@@ -187,6 +253,10 @@ Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
|
|
|
187
253
|
|
|
188
254
|
Archaeo::UrlNormalizer.with_scheme("example.com")
|
|
189
255
|
# => "https://example.com"
|
|
256
|
+
|
|
257
|
+
# Default ports are stripped
|
|
258
|
+
Archaeo::UrlNormalizer.normalize("https://example.com:443/path")
|
|
259
|
+
# => "https://example.com/path"
|
|
190
260
|
----
|
|
191
261
|
|
|
192
262
|
=== CDX Filters
|
|
@@ -202,6 +272,33 @@ Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
|
|
|
202
272
|
# Compose filters
|
|
203
273
|
filters = Archaeo::CdxFilter.only_successful
|
|
204
274
|
error_filters = Archaeo::CdxFilter.excluding_errors
|
|
275
|
+
|
|
276
|
+
# Mimetype prefix matching
|
|
277
|
+
Archaeo::CdxFilter.by_mimetype_prefix("image") # => matches image/*
|
|
278
|
+
|
|
279
|
+
# Convenience factories
|
|
280
|
+
Archaeo::CdxFilter.only_html # => text/html only
|
|
281
|
+
Archaeo::CdxFilter.excluding_redirects # => excludes 3xx statuses
|
|
282
|
+
----
|
|
283
|
+
|
|
284
|
+
=== URL Rewriting
|
|
285
|
+
|
|
286
|
+
[source,ruby]
|
|
287
|
+
----
|
|
288
|
+
rewriter = Archaeo::UrlRewriter.new(
|
|
289
|
+
"https://web.archive.org/web/20220615000000/",
|
|
290
|
+
"local",
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
# Rewrite single URL
|
|
294
|
+
rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
|
|
295
|
+
# => "local/style.css"
|
|
296
|
+
|
|
297
|
+
# Rewrite batch
|
|
298
|
+
rewriter.rewrite_batch(["url1", "url2"])
|
|
299
|
+
|
|
300
|
+
# Rewrite URLs within HTML (src, href, srcset, data-src, poster)
|
|
301
|
+
rewritten_html = rewriter.rewrite_html(html_content)
|
|
205
302
|
----
|
|
206
303
|
|
|
207
304
|
=== Snapshot Convenience
|
|
@@ -217,6 +314,14 @@ snap.client_error? # => true for 4xx
|
|
|
217
314
|
snap.server_error? # => true for 5xx
|
|
218
315
|
snap.error? # => true for 4xx/5xx
|
|
219
316
|
|
|
317
|
+
# Age helpers
|
|
318
|
+
snap.age # => seconds since capture
|
|
319
|
+
snap.older_than?(3600) # => true if older than 1 hour
|
|
320
|
+
snap.newer_than?(3600) # => true if newer than 1 hour
|
|
321
|
+
|
|
322
|
+
# Identity URL (raw content, no Wayback rewriting)
|
|
323
|
+
snap.identity_url
|
|
324
|
+
|
|
220
325
|
# Fetch content directly from a snapshot
|
|
221
326
|
page = snap.fetch
|
|
222
327
|
|
|
@@ -225,6 +330,7 @@ bundle = snap.fetch_with_assets
|
|
|
225
330
|
|
|
226
331
|
# JSON-serializable representation
|
|
227
332
|
snap.as_json # => Hash with primitive values only
|
|
333
|
+
snap.inspect # => "#<Archaeo::Snapshot 20220101 ...>"
|
|
228
334
|
----
|
|
229
335
|
|
|
230
336
|
=== Timestamps
|
|
@@ -250,6 +356,10 @@ ts.to_s # => "20220615000000"
|
|
|
250
356
|
ts.to_iso8601 # => "2022-06-15T00:00:00Z"
|
|
251
357
|
ts.to_rfc3339 # => "2022-06-15T00:00:00+00:00"
|
|
252
358
|
|
|
359
|
+
# Decompose
|
|
360
|
+
ts.to_h # => { year: 2022, month: 6, day: 15, hour: 0, minute: 0, second: 0 }
|
|
361
|
+
ts.to_a # => [2022, 6, 15, 0, 0, 0]
|
|
362
|
+
|
|
253
363
|
# Arithmetic
|
|
254
364
|
ts + 3600 # => Timestamp one hour later
|
|
255
365
|
ts - 3600 # => Timestamp one hour earlier
|
|
@@ -257,6 +367,27 @@ ts1 - ts2 # => seconds between timestamps
|
|
|
257
367
|
|
|
258
368
|
# Comparison
|
|
259
369
|
ts1 < ts2 # => true/false
|
|
370
|
+
|
|
371
|
+
# Immutable -- frozen on creation
|
|
372
|
+
ts.frozen? # => true
|
|
373
|
+
----
|
|
374
|
+
|
|
375
|
+
=== HTTP Client Observability
|
|
376
|
+
|
|
377
|
+
[source,ruby]
|
|
378
|
+
----
|
|
379
|
+
# Track every request with a callback
|
|
380
|
+
client = Archaeo::HttpClient.new(
|
|
381
|
+
on_request: ->(uri, elapsed, status, retries) {
|
|
382
|
+
puts "#{status} #{uri} (#{elapsed.round(3)}s, #{retries} retries)"
|
|
383
|
+
},
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
# Inspect connection pool state
|
|
387
|
+
client.pool_stats
|
|
388
|
+
# => { active_connections: 2, max_pool_size: 8,
|
|
389
|
+
# hosts: ["web.archive.org"],
|
|
390
|
+
# idle_times: { "web.archive.org": 12 } }
|
|
260
391
|
----
|
|
261
392
|
|
|
262
393
|
=== Command-Line Interface
|
|
@@ -279,6 +410,16 @@ archaeo near --format json example.com 20220101
|
|
|
279
410
|
archaeo oldest example.com
|
|
280
411
|
archaeo newest --format json example.com
|
|
281
412
|
|
|
413
|
+
# Find before/after a timestamp
|
|
414
|
+
archaeo before example.com 20220101
|
|
415
|
+
archaeo after example.com 20220101
|
|
416
|
+
|
|
417
|
+
# List snapshots in a date range
|
|
418
|
+
archaeo between example.com 20220101 20221231
|
|
419
|
+
|
|
420
|
+
# Count snapshots
|
|
421
|
+
archaeo count example.com
|
|
422
|
+
|
|
282
423
|
# Check availability (with optional timestamp)
|
|
283
424
|
archaeo available example.com
|
|
284
425
|
archaeo available --timestamp 20220101 example.com
|
|
@@ -295,15 +436,25 @@ archaeo fetch --output page.html https://example.com/ 20220615120000
|
|
|
295
436
|
# Fetch raw (identity) content
|
|
296
437
|
archaeo fetch --identity https://example.com/ 20220615120000
|
|
297
438
|
|
|
439
|
+
# Fetch a page and list its extracted assets
|
|
440
|
+
archaeo fetch-assets https://example.com/ 20220615120000
|
|
441
|
+
archaeo fetch-assets --format json https://example.com/ 20220615120000
|
|
442
|
+
|
|
298
443
|
# Download all snapshots
|
|
299
444
|
archaeo download example.com --output ./archive
|
|
300
445
|
|
|
446
|
+
# Dry run (preview without fetching)
|
|
447
|
+
archaeo download --dry_run example.com
|
|
448
|
+
|
|
301
449
|
# Parallel downloads
|
|
302
450
|
archaeo download --concurrency 4 example.com --output ./archive
|
|
303
451
|
|
|
304
452
|
# Resume interrupted download
|
|
305
453
|
archaeo download example.com --resume
|
|
306
454
|
|
|
455
|
+
# Suppress progress messages
|
|
456
|
+
archaeo --quiet download example.com
|
|
457
|
+
|
|
307
458
|
# Discover all known URLs for a domain
|
|
308
459
|
archaeo known_urls example.com
|
|
309
460
|
----
|
|
@@ -326,6 +477,9 @@ Archaeo::MaximumRetriesExceeded
|
|
|
326
477
|
|
|
327
478
|
# SavePageNow session limit
|
|
328
479
|
Archaeo::SaveFailed
|
|
480
|
+
|
|
481
|
+
# Content digest mismatch
|
|
482
|
+
Archaeo::IntegrityError
|
|
329
483
|
----
|
|
330
484
|
|
|
331
485
|
== Architecture
|
|
@@ -338,15 +492,15 @@ Archaeo follows a model-driven, OOP design:
|
|
|
338
492
|
|
|
339
493
|
| *Models*
|
|
340
494
|
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
|
|
341
|
-
| Domain value objects
|
|
495
|
+
| Domain value objects with `to_h`, `as_json`, `inspect` support
|
|
342
496
|
|
|
343
497
|
| *URL Processing*
|
|
344
498
|
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
|
|
345
|
-
| URL sanitization, validated filtering with composition, and rewriting
|
|
499
|
+
| URL sanitization, validated filtering with composition, and HTML URL rewriting
|
|
346
500
|
|
|
347
501
|
| *Asset Extraction*
|
|
348
502
|
| `AssetExtractor`, `AssetList`
|
|
349
|
-
| Parse HTML for resource URLs
|
|
503
|
+
| Parse HTML for resource URLs including preloads and modulepreload
|
|
350
504
|
|
|
351
505
|
| *APIs*
|
|
352
506
|
| `CdxApi`, `AvailabilityApi`, `SaveApi`
|
|
@@ -354,11 +508,11 @@ Archaeo follows a model-driven, OOP design:
|
|
|
354
508
|
|
|
355
509
|
| *Operations*
|
|
356
510
|
| `Fetcher`, `BulkDownloader`, `DownloadState`
|
|
357
|
-
| Download content with resume
|
|
511
|
+
| Download content with resume, dry-run, digest verification, and download summaries
|
|
358
512
|
|
|
359
513
|
| *Infrastructure*
|
|
360
514
|
| `HttpClient`
|
|
361
|
-
| HTTP transport with retries, gzip, 429/503 handling, connection pooling
|
|
515
|
+
| HTTP transport with retries, gzip, 429/503 handling, connection pooling, and per-request observability
|
|
362
516
|
|===
|
|
363
517
|
|
|
364
518
|
All API classes accept an `HttpClient` via dependency injection for testability.
|
data/lib/archaeo/archive_url.rb
CHANGED
|
@@ -53,6 +53,22 @@ module Archaeo
|
|
|
53
53
|
"#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
+
def identity_url
|
|
57
|
+
return to_s if identity?
|
|
58
|
+
|
|
59
|
+
self.class.new(@original_url, timestamp: @timestamp, identity: true).to_s
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def to_h
|
|
63
|
+
{ original_url: @original_url, timestamp: @timestamp,
|
|
64
|
+
identity: @identity }
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def as_json(*)
|
|
68
|
+
{ original_url: @original_url, timestamp: @timestamp.to_s,
|
|
69
|
+
identity: @identity, url: to_s }
|
|
70
|
+
end
|
|
71
|
+
|
|
56
72
|
def self.extract_original_url(string, ts_str, identity)
|
|
57
73
|
marker = identity ? "#{ts_str}id_/" : "#{ts_str}/"
|
|
58
74
|
idx = string.index(marker)
|
|
data/lib/archaeo/asset_extractor.rb
CHANGED
|
@@ -24,6 +24,12 @@ module Archaeo
|
|
|
24
24
|
"\\s*:[^;]*#{CSS_URL_PATTERN.source}",
|
|
25
25
|
)
|
|
26
26
|
|
|
27
|
+
PRELOAD_TYPE_MAP = {
|
|
28
|
+
"style" => :css,
|
|
29
|
+
"script" => :js,
|
|
30
|
+
"image" => :image,
|
|
31
|
+
}.freeze
|
|
32
|
+
|
|
27
33
|
def initialize(html, base_url: nil)
|
|
28
34
|
@doc = Nokogiri::HTML(html.to_s)
|
|
29
35
|
@base_url = base_url
|
|
@@ -38,6 +44,7 @@ module Archaeo
|
|
|
38
44
|
extract_media(list)
|
|
39
45
|
extract_inline_css(list)
|
|
40
46
|
extract_inline_styles(list)
|
|
47
|
+
extract_preloads(list)
|
|
41
48
|
list
|
|
42
49
|
end
|
|
43
50
|
|
|
@@ -53,6 +60,9 @@ module Archaeo
|
|
|
53
60
|
@doc.css("script[src]").each do |el|
|
|
54
61
|
list.add(resolve(el["src"]), type: :js)
|
|
55
62
|
end
|
|
63
|
+
@doc.css('link[rel="modulepreload"]').each do |el|
|
|
64
|
+
list.add(resolve(el["href"]), type: :js)
|
|
65
|
+
end
|
|
56
66
|
end
|
|
57
67
|
|
|
58
68
|
def extract_images(list)
|
|
@@ -202,5 +212,14 @@ module Archaeo
|
|
|
202
212
|
rescue URI::InvalidURIError
|
|
203
213
|
url
|
|
204
214
|
end
|
|
215
|
+
|
|
216
|
+
def extract_preloads(list)
|
|
217
|
+
@doc.css('link[rel="preload"][as]').each do |el|
|
|
218
|
+
type = PRELOAD_TYPE_MAP[el["as"]]
|
|
219
|
+
next unless type
|
|
220
|
+
|
|
221
|
+
list.add(resolve(el["href"]), type: type)
|
|
222
|
+
end
|
|
223
|
+
end
|
|
205
224
|
end
|
|
206
225
|
end
|
data/lib/archaeo/asset_list.rb
CHANGED
|
@@ -40,6 +40,10 @@ module Archaeo
|
|
|
40
40
|
@urls_by_type[:image]
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
def urls_by_type(type)
|
|
44
|
+
@urls_by_type[type] || []
|
|
45
|
+
end
|
|
46
|
+
|
|
43
47
|
def fonts
|
|
44
48
|
@urls_by_type[:font]
|
|
45
49
|
end
|
|
@@ -71,5 +75,32 @@ module Archaeo
|
|
|
71
75
|
def counts
|
|
72
76
|
@urls_by_type.transform_values(&:size)
|
|
73
77
|
end
|
|
78
|
+
|
|
79
|
+
def filter(*types)
|
|
80
|
+
result = self.class.new
|
|
81
|
+
types.each do |type|
|
|
82
|
+
@urls_by_type[type]&.each { |url| result.add(url, type: type) }
|
|
83
|
+
end
|
|
84
|
+
result
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def merge(other)
|
|
88
|
+
CATEGORIES.each do |type|
|
|
89
|
+
other.urls_by_type(type).each { |url| add(url, type: type) }
|
|
90
|
+
end
|
|
91
|
+
self
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def self.from_json(json_string)
|
|
95
|
+
data = JSON.parse(json_string)
|
|
96
|
+
list = new
|
|
97
|
+
data.each do |type, urls|
|
|
98
|
+
sym = type.to_sym
|
|
99
|
+
next unless CATEGORIES.include?(sym)
|
|
100
|
+
|
|
101
|
+
Array(urls).each { |url| list.add(url, type: sym) }
|
|
102
|
+
end
|
|
103
|
+
list
|
|
104
|
+
end
|
|
74
105
|
end
|
|
75
106
|
end
|
|
data/lib/archaeo/availability_result.rb
CHANGED
|
@@ -32,5 +32,29 @@ module Archaeo
|
|
|
32
32
|
"#{url} -> not available"
|
|
33
33
|
end
|
|
34
34
|
end
|
|
35
|
+
|
|
36
|
+
def to_h
|
|
37
|
+
{
|
|
38
|
+
url: @url,
|
|
39
|
+
available: @available,
|
|
40
|
+
archive_url: @archive_url,
|
|
41
|
+
timestamp: @timestamp,
|
|
42
|
+
archived_status: @archived_status,
|
|
43
|
+
}
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def as_json(*)
|
|
47
|
+
{
|
|
48
|
+
url: @url,
|
|
49
|
+
available: @available,
|
|
50
|
+
archive_url: @archive_url,
|
|
51
|
+
timestamp: @timestamp&.to_s,
|
|
52
|
+
archived_status: @archived_status,
|
|
53
|
+
}
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def inspect
|
|
57
|
+
"#<#{self.class.name} #{@url} available=#{@available}>"
|
|
58
|
+
end
|
|
35
59
|
end
|
|
36
60
|
end
|
|
data/lib/archaeo/bulk_downloader.rb
CHANGED
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
require "fileutils"
|
|
4
4
|
|
|
5
5
|
module Archaeo
|
|
6
|
+
DownloadSummary = Struct.new(
|
|
7
|
+
:total, :downloaded, :skipped, :failed, :bytes_written, :elapsed,
|
|
8
|
+
keyword_init: true
|
|
9
|
+
)
|
|
10
|
+
|
|
6
11
|
# Downloads all archived snapshots of a URL with resume support.
|
|
7
12
|
#
|
|
8
13
|
# Queries the CDX API for matching snapshots, fetches each page,
|
|
@@ -17,20 +22,17 @@ module Archaeo
|
|
|
17
22
|
@concurrency = [1, concurrency.to_i].max
|
|
18
23
|
end
|
|
19
24
|
|
|
20
|
-
def download(url, from: nil, to: nil, resume: false,
|
|
25
|
+
def download(url, from: nil, to: nil, resume: false,
|
|
26
|
+
dry_run: false, &block)
|
|
27
|
+
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
21
28
|
url = UrlNormalizer.normalize(url)
|
|
22
|
-
FileUtils.mkdir_p(@output_dir)
|
|
23
|
-
state = DownloadState.new(@output_dir)
|
|
29
|
+
FileUtils.mkdir_p(@output_dir) unless dry_run
|
|
24
30
|
|
|
25
31
|
snapshots = fetch_snapshots(url, from: from, to: to)
|
|
26
|
-
|
|
27
|
-
|
|
32
|
+
downloaded, skipped, bytes =
|
|
33
|
+
run_download(snapshots, resume, dry_run, block)
|
|
28
34
|
|
|
29
|
-
|
|
30
|
-
download_sequential(snapshots, total, state, resume, progress)
|
|
31
|
-
else
|
|
32
|
-
download_concurrent(snapshots, total, state, resume, progress)
|
|
33
|
-
end
|
|
35
|
+
build_summary(start_time, snapshots.size, downloaded, skipped, bytes)
|
|
34
36
|
end
|
|
35
37
|
|
|
36
38
|
private
|
|
@@ -44,29 +46,75 @@ module Archaeo
|
|
|
44
46
|
.select { |snap| !snap.blocked? && snap.status_code == 200 }
|
|
45
47
|
end
|
|
46
48
|
|
|
47
|
-
def
|
|
48
|
-
|
|
49
|
-
|
|
49
|
+
def run_download(snapshots, resume, dry_run, progress)
|
|
50
|
+
state = DownloadState.new(@output_dir)
|
|
51
|
+
total = snapshots.size
|
|
50
52
|
|
|
51
|
-
|
|
52
|
-
state
|
|
53
|
+
if @concurrency == 1
|
|
54
|
+
download_sequential(snapshots, total, state, resume,
|
|
55
|
+
dry_run, progress)
|
|
56
|
+
else
|
|
57
|
+
download_concurrent(snapshots, total, state, resume,
|
|
58
|
+
dry_run, progress)
|
|
59
|
+
end
|
|
60
|
+
end
|
|
53
61
|
|
|
62
|
+
def build_summary(start_time, total, downloaded, skipped, bytes)
|
|
63
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
64
|
+
DownloadSummary.new(
|
|
65
|
+
total: total, downloaded: downloaded, skipped: skipped,
|
|
66
|
+
failed: 0, bytes_written: bytes, elapsed: elapsed
|
|
67
|
+
)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def download_sequential(snapshots, total, state, resume,
|
|
71
|
+
dry_run, progress)
|
|
72
|
+
counters = { downloaded: 0, skipped: 0, bytes: 0 }
|
|
73
|
+
|
|
74
|
+
snapshots.each_with_index do |snap, index|
|
|
75
|
+
process_sequential(snap, state, resume, dry_run, counters)
|
|
54
76
|
progress&.call(index + 1, total, snap)
|
|
55
77
|
end
|
|
78
|
+
|
|
79
|
+
[counters[:downloaded], counters[:skipped], counters[:bytes]]
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def process_sequential(snap, state, resume, dry_run, counters)
|
|
83
|
+
if resume && state.completed?(snap.timestamp)
|
|
84
|
+
counters[:skipped] += 1
|
|
85
|
+
return
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
counters[:bytes] += download_snapshot(snap, state) unless dry_run
|
|
89
|
+
counters[:downloaded] += 1
|
|
56
90
|
end
|
|
57
91
|
|
|
58
|
-
def
|
|
92
|
+
def download_snapshot(snap, state)
|
|
93
|
+
content = fetch_and_save(snap)
|
|
94
|
+
state.mark_completed(snap.timestamp, url: snap.original_url,
|
|
95
|
+
bytes: content.bytesize)
|
|
96
|
+
content.bytesize
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def download_concurrent(snapshots, total, state, resume,
|
|
100
|
+
dry_run, progress)
|
|
59
101
|
queue = snapshots.each_with_index.to_a
|
|
60
|
-
|
|
61
|
-
|
|
102
|
+
shared = { mutex: Mutex.new, errors: [],
|
|
103
|
+
downloaded: 0, skipped: 0, bytes: 0 }
|
|
62
104
|
|
|
63
105
|
threads = Array.new(@concurrency) do
|
|
64
106
|
Thread.new do
|
|
65
|
-
process_queue(queue, total, state, resume,
|
|
107
|
+
process_queue(queue, total, state, resume,
|
|
108
|
+
dry_run, progress, shared)
|
|
66
109
|
end
|
|
67
110
|
end
|
|
68
111
|
threads.each(&:join)
|
|
112
|
+
raise_on_errors(shared[:errors])
|
|
113
|
+
|
|
114
|
+
[shared[:downloaded], shared[:skipped], shared[:bytes]]
|
|
115
|
+
end
|
|
69
116
|
|
|
117
|
+
def raise_on_errors(errors)
|
|
70
118
|
return unless errors.any?
|
|
71
119
|
|
|
72
120
|
raise Error,
|
|
@@ -74,24 +122,44 @@ module Archaeo
|
|
|
74
122
|
"#{errors.map { |s, _| s.timestamp }.join(', ')}"
|
|
75
123
|
end
|
|
76
124
|
|
|
77
|
-
def process_queue(queue, total, state, resume,
|
|
125
|
+
def process_queue(queue, total, state, resume, dry_run,
|
|
126
|
+
progress, shared)
|
|
78
127
|
loop do
|
|
79
|
-
snap, index = mutex.synchronize { queue.shift }
|
|
128
|
+
snap, index = shared[:mutex].synchronize { queue.shift }
|
|
80
129
|
break unless snap
|
|
81
130
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
fetch_and_save(snap)
|
|
86
|
-
state.mark_completed(snap.timestamp)
|
|
87
|
-
rescue StandardError => e
|
|
88
|
-
mutex.synchronize { errors << [snap, e] }
|
|
131
|
+
if skip_snapshot?(snap, state, resume, shared)
|
|
132
|
+
progress&.call(index + 1, total, snap)
|
|
133
|
+
next
|
|
89
134
|
end
|
|
90
135
|
|
|
136
|
+
concurrent_fetch(snap, state, dry_run, shared)
|
|
91
137
|
progress&.call(index + 1, total, snap)
|
|
92
138
|
end
|
|
93
139
|
end
|
|
94
140
|
|
|
141
|
+
def skip_snapshot?(snap, state, resume, shared)
|
|
142
|
+
return false unless resume && state.completed?(snap.timestamp)
|
|
143
|
+
|
|
144
|
+
shared[:mutex].synchronize { shared[:skipped] += 1 }
|
|
145
|
+
true
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def concurrent_fetch(snap, state, dry_run, shared)
|
|
149
|
+
unless dry_run
|
|
150
|
+
content = fetch_and_save(snap)
|
|
151
|
+
shared[:mutex].synchronize do
|
|
152
|
+
state.mark_completed(snap.timestamp,
|
|
153
|
+
url: snap.original_url,
|
|
154
|
+
bytes: content.bytesize)
|
|
155
|
+
shared[:bytes] += content.bytesize
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
shared[:mutex].synchronize { shared[:downloaded] += 1 }
|
|
159
|
+
rescue StandardError => e
|
|
160
|
+
shared[:mutex].synchronize { shared[:errors] << [snap, e] }
|
|
161
|
+
end
|
|
162
|
+
|
|
95
163
|
def fetch_and_save(snapshot)
|
|
96
164
|
fetcher = Fetcher.new(client: @client)
|
|
97
165
|
page = fetcher.fetch(snapshot.original_url,
|
|
@@ -102,6 +170,7 @@ module Archaeo
|
|
|
102
170
|
tmp_path = "#{filename}.tmp"
|
|
103
171
|
File.binwrite(tmp_path, page.content)
|
|
104
172
|
File.rename(tmp_path, filename)
|
|
173
|
+
page.content
|
|
105
174
|
rescue StandardError
|
|
106
175
|
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
107
176
|
raise
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -106,6 +106,10 @@ module Archaeo
|
|
|
106
106
|
**options)
|
|
107
107
|
end
|
|
108
108
|
|
|
109
|
+
def count(url, **options)
|
|
110
|
+
snapshots(url, **options).count
|
|
111
|
+
end
|
|
112
|
+
|
|
109
113
|
# Returns the number of pages for a paginated query.
|
|
110
114
|
def num_pages(url, **options)
|
|
111
115
|
url = UrlNormalizer.normalize(url)
|
data/lib/archaeo/cdx_filter.rb
CHANGED
|
@@ -76,6 +76,18 @@ module Archaeo
|
|
|
76
76
|
excluding_status(502), excluding_status(503)]
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
+
def self.only_html
|
|
80
|
+
[by_mimetype("text/html")]
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def self.by_mimetype_prefix(prefix)
|
|
84
|
+
new("mimetype:#{Regexp.escape(prefix)}.*")
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def self.excluding_redirects
|
|
88
|
+
%w[301 302 303 307 308].map { |c| excluding_status(c) }
|
|
89
|
+
end
|
|
90
|
+
|
|
79
91
|
private
|
|
80
92
|
|
|
81
93
|
def validate!
|