archaeo 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24c6b37575e8f673a8e6acb7aba38264ac811236cb91e905663914d08a283289
4
- data.tar.gz: 632f36d31ee83b23f727dd8eb8217929d3dad26c94c6948fb0621aff2b937701
3
+ metadata.gz: cac7a475384c04aaa8a1879a207ac2bb5fad40347f21142de904cc78f5525717
4
+ data.tar.gz: 4282ca2795d7d8baefd750e0c283302ba5c8138d3105fcc94b66162d2671dceb
5
5
  SHA512:
6
- metadata.gz: 33cf0ea6c5317be5aafba988e8652555a8e7a77620e93571a51c1537ae4cdd455e76d170fe78549a5e3068ec82db12138159cb87a31c1aafda5c036a6ca2511e
7
- data.tar.gz: 2c0ae7a6a461913ed475dd3084e495f057db995ebf9ea8a1e6c9595aae74cd6bc65a0cdc6f2b2a3a831d9cc0e45e92e06a13a4f7653e89c0a8bbcd7226207677
6
+ metadata.gz: 4e1802542e03ca5f467d383897297593dc3e20232774d71c1fca57a56b8610d6b3e0985d8692554c7a7e4532604f66088c50782433cee3730276e8ac1c4de9e4
7
+ data.tar.gz: d763469fabd810b6c81deca4f2463982ec1e92016a029edf33871ea62e105c231ed43eb7dab0524a374a5782d0ab3cd2e8f53f9a6e0a01f61c5cfdcbcb3a3724
data/README.adoc CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
6
6
 
7
- It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, and fetching archived content.
7
+ It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
8
8
 
9
9
  == Installation
10
10
 
@@ -33,7 +33,7 @@ require "archaeo"
33
33
  ----
34
34
  cdx = Archaeo::CdxApi.new
35
35
 
36
- # Enumerate all snapshots
36
+ # Enumerate all snapshots (auto-paginates via resume key)
37
37
  cdx.snapshots("example.com").each do |snapshot|
38
38
  puts snapshot.timestamp
39
39
  puts snapshot.original_url
@@ -48,6 +48,23 @@ near = cdx.near("example.com", timestamp: "20220101")
48
48
  # Filter by time
49
49
  before = cdx.before("example.com", timestamp: "20220101")
50
50
  after = cdx.after("example.com", timestamp: "20220101")
51
+
52
+ # Filter by status code, mimetype, or URL pattern
53
+ cdx.snapshots("example.com",
54
+ filters: [Archaeo::CdxFilter.by_status(200)],
55
+ collapse: ["digest"],
56
+ match_type: "domain",
57
+ sort: "reverse",
58
+ )
59
+
60
+ # Page-based pagination
61
+ cdx.snapshots("example.com", page: 0)
62
+
63
+ # Count pages
64
+ cdx.num_pages("example.com")
65
+
66
+ # Discover all known URLs for a domain
67
+ cdx.known_urls("example.com")
51
68
  ----
52
69
 
53
70
  === Check Availability
@@ -94,6 +111,65 @@ page = fetcher.fetch("https://example.com/",
94
111
  identity: true)
95
112
  ----
96
113
 
114
+ === Fetch Page with Assets
115
+
116
+ [source,ruby]
117
+ ----
118
+ fetcher = Archaeo::Fetcher.new
119
+ bundle = fetcher.fetch_page_with_assets("https://example.com/",
120
+ timestamp: "20220615000000")
121
+
122
+ bundle.page # => Archaeo::Page
123
+ bundle.assets # => Archaeo::AssetList
124
+ bundle.assets.css # => ["https://example.com/style.css", ...]
125
+ bundle.assets.js # => ["https://example.com/app.js", ...]
126
+ bundle.assets.images
127
+ bundle.assets.fonts
128
+ bundle.assets.media
129
+ ----
130
+
131
+ === Bulk Download with Resume
132
+
133
+ [source,ruby]
134
+ ----
135
+ downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
136
+ downloader.download("example.com") do |current, total, snapshot|
137
+ puts "[#{current}/#{total}] #{snapshot.original_url}"
138
+ end
139
+
140
+ # Resume interrupted download
141
+ downloader.download("example.com", resume: true)
142
+
143
+ # Filter by date range
144
+ downloader.download("example.com",
145
+ from: "20220101", to: "20221231")
146
+ ----
147
+
148
+ === URL Normalization
149
+
150
+ [source,ruby]
151
+ ----
152
+ Archaeo::UrlNormalizer.normalize(" https://example.com/ ")
153
+ # => "https://example.com/"
154
+
155
+ Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
156
+ # => "https://example.com/%2F"
157
+
158
+ Archaeo::UrlNormalizer.with_scheme("example.com")
159
+ # => "https://example.com"
160
+ ----
161
+
162
+ === CDX Filters
163
+
164
+ [source,ruby]
165
+ ----
166
+ # Build validated filter expressions
167
+ Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
168
+ Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
169
+ Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
170
+ Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
171
+ ----
172
+
97
173
  === Timestamps
98
174
 
99
175
  [source,ruby]
@@ -121,8 +197,13 @@ ts1 < ts2 # => true/false
121
197
 
122
198
  [source,bash]
123
199
  ----
124
- # List snapshots
200
+ # Show version
201
+ archaeo --version
202
+
203
+ # List snapshots (table, json, or csv format)
125
204
  archaeo snapshots example.com
205
+ archaeo snapshots --format json example.com
206
+ archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
126
207
 
127
208
  # Find closest snapshot
128
209
  archaeo near example.com 20220101
@@ -136,8 +217,40 @@ archaeo save https://example.com/
136
217
  # Fetch archived content
137
218
  archaeo fetch https://example.com/ 20220615120000
138
219
 
220
+ # Fetch and save to file
221
+ archaeo fetch --output page.html https://example.com/ 20220615120000
222
+
139
223
  # Fetch raw (identity) content
140
224
  archaeo fetch --identity https://example.com/ 20220615120000
225
+
226
+ # Download all snapshots
227
+ archaeo download example.com --output ./archive
228
+
229
+ # Resume interrupted download
230
+ archaeo download example.com --resume
231
+
232
+ # Discover all known URLs for a domain
233
+ archaeo known_urls example.com
234
+ ----
235
+
236
+ === Error Handling
237
+
238
+ [source,ruby]
239
+ ----
240
+ # Blocked site (robots.txt)
241
+ Archaeo::BlockedSiteError
242
+
243
+ # No snapshot found
244
+ Archaeo::NoSnapshotFound
245
+
246
+ # Rate limited by Wayback Machine
247
+ Archaeo::RateLimitError
248
+
249
+ # Maximum retries exceeded
250
+ Archaeo::MaximumRetriesExceeded
251
+
252
+ # SavePageNow session limit
253
+ Archaeo::SaveFailed
141
254
  ----
142
255
 
143
256
  == Architecture
@@ -149,20 +262,28 @@ Archaeo follows a model-driven, OOP design:
149
262
  | Layer | Classes | Purpose
150
263
 
151
264
  | *Models*
152
- | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `SaveResult`, `AvailabilityResult`
265
+ | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
153
266
  | Domain value objects
154
267
 
268
+ | *URL Processing*
269
+ | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
270
+ | URL sanitization, filtering, and rewriting
271
+
272
+ | *Asset Extraction*
273
+ | `AssetExtractor`, `AssetList`
274
+ | Parse HTML for resource URLs
275
+
155
276
  | *APIs*
156
277
  | `CdxApi`, `AvailabilityApi`, `SaveApi`
157
278
  | Query and mutate the archive
158
279
 
159
280
  | *Operations*
160
- | `Fetcher`
161
- | Download archived content
281
+ | `Fetcher`, `BulkDownloader`, `DownloadState`
282
+ | Download content with resume support
162
283
 
163
284
  | *Infrastructure*
164
285
  | `HttpClient`
165
- | HTTP transport with retries and gzip
286
+ | HTTP transport with retries, gzip, connection pooling
166
287
  |===
167
288
 
168
289
  All API classes accept an `HttpClient` via dependency injection for testability.
data/lib/archaeo/availability_api.rb CHANGED
@@ -16,6 +16,7 @@ module Archaeo
16
16
  end
17
17
 
18
18
  def near(url, timestamp: nil)
19
+ url = UrlNormalizer.normalize(url)
19
20
  params = { "url" => url }
20
21
  params["timestamp"] = timestamp.to_s if timestamp
21
22
 
data/lib/archaeo/bulk_downloader.rb CHANGED
@@ -15,6 +15,7 @@ module Archaeo
15
15
  end
16
16
 
17
17
  def download(url, from: nil, to: nil, resume: false)
18
+ url = UrlNormalizer.normalize(url)
18
19
  FileUtils.mkdir_p(@output_dir)
19
20
  state = DownloadState.new(@output_dir)
20
21
 
data/lib/archaeo/cdx_api.rb CHANGED
@@ -47,6 +47,7 @@ module Archaeo
47
47
  # Returns an Enumerator of Snapshot objects, auto-paginating
48
48
  # via resume key unless an explicit page is requested.
49
49
  def snapshots(url, **options)
50
+ url = UrlNormalizer.normalize(url)
50
51
  validate_options!(options)
51
52
 
52
53
  Enumerator.new do |yielder|
@@ -59,9 +60,15 @@ module Archaeo
59
60
  end
60
61
 
61
62
  def near(url, timestamp:)
63
+ url = UrlNormalizer.normalize(url)
62
64
  ts = Timestamp.coerce(timestamp)
63
65
  result = snapshots(url, sort: "closest",
64
66
  closest: ts.to_s, limit: 1).first
67
+ if result&.blocked?
68
+ raise BlockedSiteError,
69
+ "Site is blocked: #{url}"
70
+ end
71
+
65
72
  result || raise(NoSnapshotFound,
66
73
  "No snapshot found near #{ts} for #{url}")
67
74
  end
@@ -94,6 +101,7 @@ module Archaeo
94
101
 
95
102
  # Returns the number of pages for a paginated query.
96
103
  def num_pages(url, **options)
104
+ url = UrlNormalizer.normalize(url)
97
105
  params = { "url" => url, "showNumPages" => "true" }
98
106
  merge_scalar_params!(params, options)
99
107
  response = @client.get(
@@ -109,6 +117,7 @@ module Archaeo
109
117
 
110
118
  # Returns all unique original URLs under a domain.
111
119
  def known_urls(domain, match_type: "domain")
120
+ domain = UrlNormalizer.normalize(domain)
112
121
  snapshots(domain, match_type: match_type,
113
122
  collapse: ["urlkey"]).map(&:original_url).uniq
114
123
  end
data/lib/archaeo/cli.rb CHANGED
@@ -158,13 +158,14 @@ module Archaeo
158
158
  end
159
159
 
160
160
  def output_csv(snaps)
161
- puts CSV.generate do |csv|
161
+ csv = CSV.generate do |csv|
162
162
  csv << %w[timestamp status_code url archive_url]
163
163
  snaps.each do |snap|
164
164
  csv << [snap.timestamp.to_s, snap.status_code,
165
165
  snap.original_url, snap.archive_url]
166
166
  end
167
167
  end
168
+ puts csv
168
169
  end
169
170
 
170
171
  def write_output(path, content)
data/lib/archaeo/fetcher.rb CHANGED
@@ -14,6 +14,7 @@ module Archaeo
14
14
  end
15
15
 
16
16
  def fetch(url, timestamp:, identity: false)
17
+ url = UrlNormalizer.normalize(url)
17
18
  ts = Timestamp.coerce(timestamp)
18
19
  archive_url = ArchiveUrl.new(url, timestamp: ts,
19
20
  identity: identity)
@@ -21,6 +22,13 @@ module Archaeo
21
22
  build_page(response, archive_url.to_s, url, ts)
22
23
  end
23
24
 
25
+ def fetch_page_with_assets(url, timestamp:)
26
+ page = fetch(url, timestamp: timestamp)
27
+ assets = AssetExtractor.new(page.content,
28
+ base_url: page.archive_url).extract
29
+ PageBundle.new(page: page, assets: assets)
30
+ end
31
+
24
32
  private
25
33
 
26
34
  def build_page(response, archive_url, url, timestamp)
data/lib/archaeo/page_bundle.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
+ # A fetched page together with all its extracted asset URLs.
5
+ #
6
+ # Bundles a Page with the AssetList discovered from its HTML,
7
+ # providing a single object for complete page archival.
8
+ class PageBundle
9
+ attr_reader :page, :assets
10
+
11
+ def initialize(page:, assets:)
12
+ @page = page
13
+ @assets = assets
14
+ end
15
+ end
16
+ end
data/lib/archaeo/save_api.rb CHANGED
@@ -17,6 +17,7 @@ module Archaeo
17
17
  end
18
18
 
19
19
  def save(url)
20
+ url = UrlNormalizer.normalize(url)
20
21
  save_url = "#{ENDPOINT}/#{url}"
21
22
  start_time = Time.now.utc
22
23
  attempt_save(save_url, start_time, url)
data/lib/archaeo/url_normalizer.rb CHANGED
@@ -22,6 +22,23 @@ module Archaeo
22
22
  normalized.match?(%r{\A[a-z][a-z0-9+\-.]*://}) ? normalized : "https://#{normalized}"
23
23
  end
24
24
 
25
+ VALID_URL_RE = %r{\A([a-z][a-z0-9+\-.]*://)?[^\s]+\z}
26
+
27
+ def self.valid?(url)
28
+ normalized = normalize(url)
29
+ return false if normalized.empty?
30
+
31
+ normalized.match?(VALID_URL_RE)
32
+ end
33
+
34
+ def self.validate!(url)
35
+ normalized = normalize(url)
36
+ raise ArgumentError, "URL cannot be empty" if normalized.empty?
37
+ raise ArgumentError, "Invalid URL: #{url}" unless valid?(url)
38
+
39
+ normalized
40
+ end
41
+
25
42
  def to_s
26
43
  @normalized
27
44
  end
data/lib/archaeo/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.2"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -19,6 +19,7 @@ module Archaeo
19
19
  autoload :ArchiveUrl, "archaeo/archive_url"
20
20
  autoload :Snapshot, "archaeo/snapshot"
21
21
  autoload :Page, "archaeo/page"
22
+ autoload :PageBundle, "archaeo/page_bundle"
22
23
  autoload :SaveResult, "archaeo/save_result"
23
24
  autoload :AvailabilityResult, "archaeo/availability_result"
24
25
  autoload :UrlNormalizer, "archaeo/url_normalizer"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -84,6 +84,7 @@ files:
84
84
  - lib/archaeo/fetcher.rb
85
85
  - lib/archaeo/http_client.rb
86
86
  - lib/archaeo/page.rb
87
+ - lib/archaeo/page_bundle.rb
87
88
  - lib/archaeo/save_api.rb
88
89
  - lib/archaeo/save_result.rb
89
90
  - lib/archaeo/snapshot.rb