archaeo 0.2.1 → 0.2.2

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b82c7c8f2e568697056587556ac01bee92f51376af390aa26ed9c76221200d8
4
- data.tar.gz: 411cf04b07219a02e5b049120bb74528dc17cf5fcb8424acd3c7c5ee69659bea
3
+ metadata.gz: cac7a475384c04aaa8a1879a207ac2bb5fad40347f21142de904cc78f5525717
4
+ data.tar.gz: 4282ca2795d7d8baefd750e0c283302ba5c8138d3105fcc94b66162d2671dceb
5
5
  SHA512:
6
- metadata.gz: ec87c66817c1ce3f318e45d5b26221ca44c1958dfa9009fbe1541a1b3bbdb8e4585f6baf66e73c6a839a10d938e189b61dbea43afccf74a6f55476ad8719e18e
7
- data.tar.gz: 0c33454fa368f009a7383ebc8a6a79f3638615740bd312eaab568c21d90cb20a0c4c6615324b20634f00614021016cf6d464910679fcf61fdf8d663097935cd7
6
+ metadata.gz: 4e1802542e03ca5f467d383897297593dc3e20232774d71c1fca57a56b8610d6b3e0985d8692554c7a7e4532604f66088c50782433cee3730276e8ac1c4de9e4
7
+ data.tar.gz: d763469fabd810b6c81deca4f2463982ec1e92016a029edf33871ea62e105c231ed43eb7dab0524a374a5782d0ab3cd2e8f53f9a6e0a01f61c5cfdcbcb3a3724
data/README.adoc CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
6
6
 
7
- It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, and fetching archived content.
7
+ It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
8
8
 
9
9
  == Installation
10
10
 
@@ -33,7 +33,7 @@ require "archaeo"
33
33
  ----
34
34
  cdx = Archaeo::CdxApi.new
35
35
 
36
- # Enumerate all snapshots
36
+ # Enumerate all snapshots (auto-paginates via resume key)
37
37
  cdx.snapshots("example.com").each do |snapshot|
38
38
  puts snapshot.timestamp
39
39
  puts snapshot.original_url
@@ -48,6 +48,23 @@ near = cdx.near("example.com", timestamp: "20220101")
48
48
  # Filter by time
49
49
  before = cdx.before("example.com", timestamp: "20220101")
50
50
  after = cdx.after("example.com", timestamp: "20220101")
51
+
52
+ # Filter by status code, mimetype, or URL pattern
53
+ cdx.snapshots("example.com",
54
+ filters: [Archaeo::CdxFilter.by_status(200)],
55
+ collapse: ["digest"],
56
+ match_type: "domain",
57
+ sort: "reverse",
58
+ )
59
+
60
+ # Page-based pagination
61
+ cdx.snapshots("example.com", page: 0)
62
+
63
+ # Count pages
64
+ cdx.num_pages("example.com")
65
+
66
+ # Discover all known URLs for a domain
67
+ cdx.known_urls("example.com")
51
68
  ----
52
69
 
53
70
  === Check Availability
@@ -94,6 +111,65 @@ page = fetcher.fetch("https://example.com/",
94
111
  identity: true)
95
112
  ----
96
113
 
114
+ === Fetch Page with Assets
115
+
116
+ [source,ruby]
117
+ ----
118
+ fetcher = Archaeo::Fetcher.new
119
+ bundle = fetcher.fetch_page_with_assets("https://example.com/",
120
+ timestamp: "20220615000000")
121
+
122
+ bundle.page # => Archaeo::Page
123
+ bundle.assets # => Archaeo::AssetList
124
+ bundle.assets.css # => ["https://example.com/style.css", ...]
125
+ bundle.assets.js # => ["https://example.com/app.js", ...]
126
+ bundle.assets.images
127
+ bundle.assets.fonts
128
+ bundle.assets.media
129
+ ----
130
+
131
+ === Bulk Download with Resume
132
+
133
+ [source,ruby]
134
+ ----
135
+ downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
136
+ downloader.download("example.com") do |current, total, snapshot|
137
+ puts "[#{current}/#{total}] #{snapshot.original_url}"
138
+ end
139
+
140
+ # Resume interrupted download
141
+ downloader.download("example.com", resume: true)
142
+
143
+ # Filter by date range
144
+ downloader.download("example.com",
145
+ from: "20220101", to: "20221231")
146
+ ----
147
+
148
+ === URL Normalization
149
+
150
+ [source,ruby]
151
+ ----
152
+ Archaeo::UrlNormalizer.normalize(" https://example.com/ ")
153
+ # => "https://example.com/"
154
+
155
+ Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
156
+ # => "https://example.com/%2F"
157
+
158
+ Archaeo::UrlNormalizer.with_scheme("example.com")
159
+ # => "https://example.com"
160
+ ----
161
+
162
+ === CDX Filters
163
+
164
+ [source,ruby]
165
+ ----
166
+ # Build validated filter expressions
167
+ Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
168
+ Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
169
+ Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
170
+ Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
171
+ ----
172
+
97
173
  === Timestamps
98
174
 
99
175
  [source,ruby]
@@ -121,8 +197,13 @@ ts1 < ts2 # => true/false
121
197
 
122
198
  [source,bash]
123
199
  ----
124
- # List snapshots
200
+ # Show version
201
+ archaeo --version
202
+
203
+ # List snapshots (table, json, or csv format)
125
204
  archaeo snapshots example.com
205
+ archaeo snapshots --format json example.com
206
+ archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
126
207
 
127
208
  # Find closest snapshot
128
209
  archaeo near example.com 20220101
@@ -136,8 +217,40 @@ archaeo save https://example.com/
136
217
  # Fetch archived content
137
218
  archaeo fetch https://example.com/ 20220615120000
138
219
 
220
+ # Fetch and save to file
221
+ archaeo fetch --output page.html https://example.com/ 20220615120000
222
+
139
223
  # Fetch raw (identity) content
140
224
  archaeo fetch --identity https://example.com/ 20220615120000
225
+
226
+ # Download all snapshots
227
+ archaeo download example.com --output ./archive
228
+
229
+ # Resume interrupted download
230
+ archaeo download example.com --resume
231
+
232
+ # Discover all known URLs for a domain
233
+ archaeo known_urls example.com
234
+ ----
235
+
236
+ === Error Handling
237
+
238
+ [source,ruby]
239
+ ----
240
+ # Blocked site (robots.txt)
241
+ Archaeo::BlockedSiteError
242
+
243
+ # No snapshot found
244
+ Archaeo::NoSnapshotFound
245
+
246
+ # Rate limited by Wayback Machine
247
+ Archaeo::RateLimitError
248
+
249
+ # Maximum retries exceeded
250
+ Archaeo::MaximumRetriesExceeded
251
+
252
+ # SavePageNow session limit
253
+ Archaeo::SaveFailed
141
254
  ----
142
255
 
143
256
  == Architecture
@@ -149,20 +262,28 @@ Archaeo follows a model-driven, OOP design:
149
262
  | Layer | Classes | Purpose
150
263
 
151
264
  | *Models*
152
- | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `SaveResult`, `AvailabilityResult`
265
+ | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
153
266
  | Domain value objects
154
267
 
268
+ | *URL Processing*
269
+ | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
270
+ | URL sanitization, filtering, and rewriting
271
+
272
+ | *Asset Extraction*
273
+ | `AssetExtractor`, `AssetList`
274
+ | Parse HTML for resource URLs
275
+
155
276
  | *APIs*
156
277
  | `CdxApi`, `AvailabilityApi`, `SaveApi`
157
278
  | Query and mutate the archive
158
279
 
159
280
  | *Operations*
160
- | `Fetcher`
161
- | Download archived content
281
+ | `Fetcher`, `BulkDownloader`, `DownloadState`
282
+ | Download content with resume support
162
283
 
163
284
  | *Infrastructure*
164
285
  | `HttpClient`
165
- | HTTP transport with retries and gzip
286
+ | HTTP transport with retries, gzip, connection pooling
166
287
  |===
167
288
 
168
289
  All API classes accept an `HttpClient` via dependency injection for testability.
data/lib/archaeo/cli.rb CHANGED
@@ -158,13 +158,14 @@ module Archaeo
158
158
  end
159
159
 
160
160
  def output_csv(snaps)
161
- puts CSV.generate do |csv|
161
+ csv = CSV.generate do |csv|
162
162
  csv << %w[timestamp status_code url archive_url]
163
163
  snaps.each do |snap|
164
164
  csv << [snap.timestamp.to_s, snap.status_code,
165
165
  snap.original_url, snap.archive_url]
166
166
  end
167
167
  end
168
+ puts csv
168
169
  end
169
170
 
170
171
  def write_output(path, content)
@@ -22,6 +22,23 @@ module Archaeo
22
22
  normalized.match?(%r{\A[a-z][a-z0-9+\-.]*://}) ? normalized : "https://#{normalized}"
23
23
  end
24
24
 
25
+ VALID_URL_RE = %r{\A([a-z][a-z0-9+\-.]*://)?[^\s]+\z}
26
+
27
+ def self.valid?(url)
28
+ normalized = normalize(url)
29
+ return false if normalized.empty?
30
+
31
+ normalized.match?(VALID_URL_RE)
32
+ end
33
+
34
+ def self.validate!(url)
35
+ normalized = normalize(url)
36
+ raise ArgumentError, "URL cannot be empty" if normalized.empty?
37
+ raise ArgumentError, "Invalid URL: #{url}" unless valid?(url)
38
+
39
+ normalized
40
+ end
41
+
25
42
  def to_s
26
43
  @normalized
27
44
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.1"
4
+ VERSION = "0.2.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.