archaeo 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +128 -7
- data/lib/archaeo/availability_api.rb +1 -0
- data/lib/archaeo/bulk_downloader.rb +1 -0
- data/lib/archaeo/cdx_api.rb +9 -0
- data/lib/archaeo/cli.rb +2 -1
- data/lib/archaeo/fetcher.rb +8 -0
- data/lib/archaeo/page_bundle.rb +16 -0
- data/lib/archaeo/save_api.rb +1 -0
- data/lib/archaeo/url_normalizer.rb +17 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cac7a475384c04aaa8a1879a207ac2bb5fad40347f21142de904cc78f5525717
|
|
4
|
+
data.tar.gz: 4282ca2795d7d8baefd750e0c283302ba5c8138d3105fcc94b66162d2671dceb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e1802542e03ca5f467d383897297593dc3e20232774d71c1fca57a56b8610d6b3e0985d8692554c7a7e4532604f66088c50782433cee3730276e8ac1c4de9e4
|
|
7
|
+
data.tar.gz: d763469fabd810b6c81deca4f2463982ec1e92016a029edf33871ea62e105c231ed43eb7dab0524a374a5782d0ab3cd2e8f53f9a6e0a01f61c5cfdcbcb3a3724
|
data/README.adoc
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
|
|
6
6
|
|
|
7
|
-
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs,
|
|
7
|
+
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
|
|
8
8
|
|
|
9
9
|
== Installation
|
|
10
10
|
|
|
@@ -33,7 +33,7 @@ require "archaeo"
|
|
|
33
33
|
----
|
|
34
34
|
cdx = Archaeo::CdxApi.new
|
|
35
35
|
|
|
36
|
-
# Enumerate all snapshots
|
|
36
|
+
# Enumerate all snapshots (auto-paginates via resume key)
|
|
37
37
|
cdx.snapshots("example.com").each do |snapshot|
|
|
38
38
|
puts snapshot.timestamp
|
|
39
39
|
puts snapshot.original_url
|
|
@@ -48,6 +48,23 @@ near = cdx.near("example.com", timestamp: "20220101")
|
|
|
48
48
|
# Filter by time
|
|
49
49
|
before = cdx.before("example.com", timestamp: "20220101")
|
|
50
50
|
after = cdx.after("example.com", timestamp: "20220101")
|
|
51
|
+
|
|
52
|
+
# Filter by status code, mimetype, or URL pattern
|
|
53
|
+
cdx.snapshots("example.com",
|
|
54
|
+
filters: [Archaeo::CdxFilter.by_status(200)],
|
|
55
|
+
collapse: ["digest"],
|
|
56
|
+
match_type: "domain",
|
|
57
|
+
sort: "reverse",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Page-based pagination
|
|
61
|
+
cdx.snapshots("example.com", page: 0)
|
|
62
|
+
|
|
63
|
+
# Count pages
|
|
64
|
+
cdx.num_pages("example.com")
|
|
65
|
+
|
|
66
|
+
# Discover all known URLs for a domain
|
|
67
|
+
cdx.known_urls("example.com")
|
|
51
68
|
----
|
|
52
69
|
|
|
53
70
|
=== Check Availability
|
|
@@ -94,6 +111,65 @@ page = fetcher.fetch("https://example.com/",
|
|
|
94
111
|
identity: true)
|
|
95
112
|
----
|
|
96
113
|
|
|
114
|
+
=== Fetch Page with Assets
|
|
115
|
+
|
|
116
|
+
[source,ruby]
|
|
117
|
+
----
|
|
118
|
+
fetcher = Archaeo::Fetcher.new
|
|
119
|
+
bundle = fetcher.fetch_page_with_assets("https://example.com/",
|
|
120
|
+
timestamp: "20220615000000")
|
|
121
|
+
|
|
122
|
+
bundle.page # => Archaeo::Page
|
|
123
|
+
bundle.assets # => Archaeo::AssetList
|
|
124
|
+
bundle.assets.css # => ["https://example.com/style.css", ...]
|
|
125
|
+
bundle.assets.js # => ["https://example.com/app.js", ...]
|
|
126
|
+
bundle.assets.images
|
|
127
|
+
bundle.assets.fonts
|
|
128
|
+
bundle.assets.media
|
|
129
|
+
----
|
|
130
|
+
|
|
131
|
+
=== Bulk Download with Resume
|
|
132
|
+
|
|
133
|
+
[source,ruby]
|
|
134
|
+
----
|
|
135
|
+
downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
|
|
136
|
+
downloader.download("example.com") do |current, total, snapshot|
|
|
137
|
+
puts "[#{current}/#{total}] #{snapshot.original_url}"
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# Resume interrupted download
|
|
141
|
+
downloader.download("example.com", resume: true)
|
|
142
|
+
|
|
143
|
+
# Filter by date range
|
|
144
|
+
downloader.download("example.com",
|
|
145
|
+
from: "20220101", to: "20221231")
|
|
146
|
+
----
|
|
147
|
+
|
|
148
|
+
=== URL Normalization
|
|
149
|
+
|
|
150
|
+
[source,ruby]
|
|
151
|
+
----
|
|
152
|
+
Archaeo::UrlNormalizer.normalize(" https://example.com/ ")
|
|
153
|
+
# => "https://example.com/"
|
|
154
|
+
|
|
155
|
+
Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
|
|
156
|
+
# => "https://example.com/%2F"
|
|
157
|
+
|
|
158
|
+
Archaeo::UrlNormalizer.with_scheme("example.com")
|
|
159
|
+
# => "https://example.com"
|
|
160
|
+
----
|
|
161
|
+
|
|
162
|
+
=== CDX Filters
|
|
163
|
+
|
|
164
|
+
[source,ruby]
|
|
165
|
+
----
|
|
166
|
+
# Build validated filter expressions
|
|
167
|
+
Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
|
|
168
|
+
Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
|
|
169
|
+
Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
|
|
170
|
+
Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
|
|
171
|
+
----
|
|
172
|
+
|
|
97
173
|
=== Timestamps
|
|
98
174
|
|
|
99
175
|
[source,ruby]
|
|
@@ -121,8 +197,13 @@ ts1 < ts2 # => true/false
|
|
|
121
197
|
|
|
122
198
|
[source,bash]
|
|
123
199
|
----
|
|
124
|
-
#
|
|
200
|
+
# Show version
|
|
201
|
+
archaeo --version
|
|
202
|
+
|
|
203
|
+
# List snapshots (table, json, or csv format)
|
|
125
204
|
archaeo snapshots example.com
|
|
205
|
+
archaeo snapshots --format json example.com
|
|
206
|
+
archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
|
|
126
207
|
|
|
127
208
|
# Find closest snapshot
|
|
128
209
|
archaeo near example.com 20220101
|
|
@@ -136,8 +217,40 @@ archaeo save https://example.com/
|
|
|
136
217
|
# Fetch archived content
|
|
137
218
|
archaeo fetch https://example.com/ 20220615120000
|
|
138
219
|
|
|
220
|
+
# Fetch and save to file
|
|
221
|
+
archaeo fetch --output page.html https://example.com/ 20220615120000
|
|
222
|
+
|
|
139
223
|
# Fetch raw (identity) content
|
|
140
224
|
archaeo fetch --identity https://example.com/ 20220615120000
|
|
225
|
+
|
|
226
|
+
# Download all snapshots
|
|
227
|
+
archaeo download example.com --output ./archive
|
|
228
|
+
|
|
229
|
+
# Resume interrupted download
|
|
230
|
+
archaeo download example.com --resume
|
|
231
|
+
|
|
232
|
+
# Discover all known URLs for a domain
|
|
233
|
+
archaeo known_urls example.com
|
|
234
|
+
----
|
|
235
|
+
|
|
236
|
+
=== Error Handling
|
|
237
|
+
|
|
238
|
+
[source,ruby]
|
|
239
|
+
----
|
|
240
|
+
# Blocked site (robots.txt)
|
|
241
|
+
Archaeo::BlockedSiteError
|
|
242
|
+
|
|
243
|
+
# No snapshot found
|
|
244
|
+
Archaeo::NoSnapshotFound
|
|
245
|
+
|
|
246
|
+
# Rate limited by Wayback Machine
|
|
247
|
+
Archaeo::RateLimitError
|
|
248
|
+
|
|
249
|
+
# Maximum retries exceeded
|
|
250
|
+
Archaeo::MaximumRetriesExceeded
|
|
251
|
+
|
|
252
|
+
# SavePageNow session limit
|
|
253
|
+
Archaeo::SaveFailed
|
|
141
254
|
----
|
|
142
255
|
|
|
143
256
|
== Architecture
|
|
@@ -149,20 +262,28 @@ Archaeo follows a model-driven, OOP design:
|
|
|
149
262
|
| Layer | Classes | Purpose
|
|
150
263
|
|
|
151
264
|
| *Models*
|
|
152
|
-
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `SaveResult`, `AvailabilityResult`
|
|
265
|
+
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
|
|
153
266
|
| Domain value objects
|
|
154
267
|
|
|
268
|
+
| *URL Processing*
|
|
269
|
+
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
|
|
270
|
+
| URL sanitization, filtering, and rewriting
|
|
271
|
+
|
|
272
|
+
| *Asset Extraction*
|
|
273
|
+
| `AssetExtractor`, `AssetList`
|
|
274
|
+
| Parse HTML for resource URLs
|
|
275
|
+
|
|
155
276
|
| *APIs*
|
|
156
277
|
| `CdxApi`, `AvailabilityApi`, `SaveApi`
|
|
157
278
|
| Query and mutate the archive
|
|
158
279
|
|
|
159
280
|
| *Operations*
|
|
160
|
-
| `Fetcher`
|
|
161
|
-
| Download
|
|
281
|
+
| `Fetcher`, `BulkDownloader`, `DownloadState`
|
|
282
|
+
| Download content with resume support
|
|
162
283
|
|
|
163
284
|
| *Infrastructure*
|
|
164
285
|
| `HttpClient`
|
|
165
|
-
| HTTP transport with retries
|
|
286
|
+
| HTTP transport with retries, gzip, connection pooling
|
|
166
287
|
|===
|
|
167
288
|
|
|
168
289
|
All API classes accept an `HttpClient` via dependency injection for testability.
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -47,6 +47,7 @@ module Archaeo
|
|
|
47
47
|
# Returns an Enumerator of Snapshot objects, auto-paginating
|
|
48
48
|
# via resume key unless an explicit page is requested.
|
|
49
49
|
def snapshots(url, **options)
|
|
50
|
+
url = UrlNormalizer.normalize(url)
|
|
50
51
|
validate_options!(options)
|
|
51
52
|
|
|
52
53
|
Enumerator.new do |yielder|
|
|
@@ -59,9 +60,15 @@ module Archaeo
|
|
|
59
60
|
end
|
|
60
61
|
|
|
61
62
|
def near(url, timestamp:)
|
|
63
|
+
url = UrlNormalizer.normalize(url)
|
|
62
64
|
ts = Timestamp.coerce(timestamp)
|
|
63
65
|
result = snapshots(url, sort: "closest",
|
|
64
66
|
closest: ts.to_s, limit: 1).first
|
|
67
|
+
if result&.blocked?
|
|
68
|
+
raise BlockedSiteError,
|
|
69
|
+
"Site is blocked: #{url}"
|
|
70
|
+
end
|
|
71
|
+
|
|
65
72
|
result || raise(NoSnapshotFound,
|
|
66
73
|
"No snapshot found near #{ts} for #{url}")
|
|
67
74
|
end
|
|
@@ -94,6 +101,7 @@ module Archaeo
|
|
|
94
101
|
|
|
95
102
|
# Returns the number of pages for a paginated query.
|
|
96
103
|
def num_pages(url, **options)
|
|
104
|
+
url = UrlNormalizer.normalize(url)
|
|
97
105
|
params = { "url" => url, "showNumPages" => "true" }
|
|
98
106
|
merge_scalar_params!(params, options)
|
|
99
107
|
response = @client.get(
|
|
@@ -109,6 +117,7 @@ module Archaeo
|
|
|
109
117
|
|
|
110
118
|
# Returns all unique original URLs under a domain.
|
|
111
119
|
def known_urls(domain, match_type: "domain")
|
|
120
|
+
domain = UrlNormalizer.normalize(domain)
|
|
112
121
|
snapshots(domain, match_type: match_type,
|
|
113
122
|
collapse: ["urlkey"]).map(&:original_url).uniq
|
|
114
123
|
end
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -158,13 +158,14 @@ module Archaeo
|
|
|
158
158
|
end
|
|
159
159
|
|
|
160
160
|
def output_csv(snaps)
|
|
161
|
-
|
|
161
|
+
csv = CSV.generate do |csv|
|
|
162
162
|
csv << %w[timestamp status_code url archive_url]
|
|
163
163
|
snaps.each do |snap|
|
|
164
164
|
csv << [snap.timestamp.to_s, snap.status_code,
|
|
165
165
|
snap.original_url, snap.archive_url]
|
|
166
166
|
end
|
|
167
167
|
end
|
|
168
|
+
puts csv
|
|
168
169
|
end
|
|
169
170
|
|
|
170
171
|
def write_output(path, content)
|
data/lib/archaeo/fetcher.rb
CHANGED
|
@@ -14,6 +14,7 @@ module Archaeo
|
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def fetch(url, timestamp:, identity: false)
|
|
17
|
+
url = UrlNormalizer.normalize(url)
|
|
17
18
|
ts = Timestamp.coerce(timestamp)
|
|
18
19
|
archive_url = ArchiveUrl.new(url, timestamp: ts,
|
|
19
20
|
identity: identity)
|
|
@@ -21,6 +22,13 @@ module Archaeo
|
|
|
21
22
|
build_page(response, archive_url.to_s, url, ts)
|
|
22
23
|
end
|
|
23
24
|
|
|
25
|
+
def fetch_page_with_assets(url, timestamp:)
|
|
26
|
+
page = fetch(url, timestamp: timestamp)
|
|
27
|
+
assets = AssetExtractor.new(page.content,
|
|
28
|
+
base_url: page.archive_url).extract
|
|
29
|
+
PageBundle.new(page: page, assets: assets)
|
|
30
|
+
end
|
|
31
|
+
|
|
24
32
|
private
|
|
25
33
|
|
|
26
34
|
def build_page(response, archive_url, url, timestamp)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# A fetched page together with all its extracted asset URLs.
|
|
5
|
+
#
|
|
6
|
+
# Bundles a Page with the AssetList discovered from its HTML,
|
|
7
|
+
# providing a single object for complete page archival.
|
|
8
|
+
class PageBundle
|
|
9
|
+
attr_reader :page, :assets
|
|
10
|
+
|
|
11
|
+
def initialize(page:, assets:)
|
|
12
|
+
@page = page
|
|
13
|
+
@assets = assets
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -22,6 +22,23 @@ module Archaeo
|
|
|
22
22
|
normalized.match?(%r{\A[a-z][a-z0-9+\-.]*://}) ? normalized : "https://#{normalized}"
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
+
VALID_URL_RE = %r{\A([a-z][a-z0-9+\-.]*://)?[^\s]+\z}
|
|
26
|
+
|
|
27
|
+
def self.valid?(url)
|
|
28
|
+
normalized = normalize(url)
|
|
29
|
+
return false if normalized.empty?
|
|
30
|
+
|
|
31
|
+
normalized.match?(VALID_URL_RE)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def self.validate!(url)
|
|
35
|
+
normalized = normalize(url)
|
|
36
|
+
raise ArgumentError, "URL cannot be empty" if normalized.empty?
|
|
37
|
+
raise ArgumentError, "Invalid URL: #{url}" unless valid?(url)
|
|
38
|
+
|
|
39
|
+
normalized
|
|
40
|
+
end
|
|
41
|
+
|
|
25
42
|
def to_s
|
|
26
43
|
@normalized
|
|
27
44
|
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -19,6 +19,7 @@ module Archaeo
|
|
|
19
19
|
autoload :ArchiveUrl, "archaeo/archive_url"
|
|
20
20
|
autoload :Snapshot, "archaeo/snapshot"
|
|
21
21
|
autoload :Page, "archaeo/page"
|
|
22
|
+
autoload :PageBundle, "archaeo/page_bundle"
|
|
22
23
|
autoload :SaveResult, "archaeo/save_result"
|
|
23
24
|
autoload :AvailabilityResult, "archaeo/availability_result"
|
|
24
25
|
autoload :UrlNormalizer, "archaeo/url_normalizer"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -84,6 +84,7 @@ files:
|
|
|
84
84
|
- lib/archaeo/fetcher.rb
|
|
85
85
|
- lib/archaeo/http_client.rb
|
|
86
86
|
- lib/archaeo/page.rb
|
|
87
|
+
- lib/archaeo/page_bundle.rb
|
|
87
88
|
- lib/archaeo/save_api.rb
|
|
88
89
|
- lib/archaeo/save_result.rb
|
|
89
90
|
- lib/archaeo/snapshot.rb
|