archaeo 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dff73d8ab14a3b75bf98281d20b5427b55757b330d57e8899a2ffb04d9046c6d
4
- data.tar.gz: d92c2f8e77d6ba5c51283f0581bb51182ebe84aab74ffe4e4971e0d405eab2cc
3
+ metadata.gz: ecdcd994fa61efa836a5224a5e329b40b72694c27a79cbb6eb4f91bf57c0f2c9
4
+ data.tar.gz: 03ad557eb55ce9946a2936e3beec8cad13db2ecd4b2fc49b0996131d35e6ddba
5
5
  SHA512:
6
- metadata.gz: ed1a823e5f358e53ec653e5eee902f95787b9bdedc5670a214f3c1784f4c1829f705d5a53ded5cfb6777f16ed33e32430f2aca13cb9c1fad628885daa55a60a1
7
- data.tar.gz: 3b809e1aad60db5e04a356dff5ab450333ff88f56bf481630887f45b3b3f09035ee1ba959807d22129b827b30324b7730e7cf5c248656b62c38a0beab8ad2581
6
+ metadata.gz: a2859d1738f4f4a9fa0f0ed89d118dacfc24a2f75d3237ad3bdd31cf26c041e8aa5f47c998d8b5e61907c34d491ed48b6df22d2784702d60a15820ba8d8a2a27
7
+ data.tar.gz: e2df62b1077c90d8b04173f9aa713f590f9692cdfde3d3b656796308810341cd5b052321f97a28513ebe98143cdb8f28fa7e182720555db231983fa3d2a6d4be
data/README.adoc CHANGED
@@ -57,6 +57,21 @@ end
57
57
  # Count snapshots
58
58
  cdx.count("example.com") # => Integer
59
59
 
60
+ # Deduplicated snapshots (collapse by digest)
61
+ cdx.unique_snapshots("example.com").each do |snap|
62
+ puts snap.timestamp
63
+ end
64
+
65
+ # Timeline analysis (time-bucketed frequency)
66
+ timeline = cdx.timeline("example.com",
67
+ from: "20220101", to: "20221231",
68
+ bucket_size: :month)
69
+ timeline.to_h # => { "202201" => 5, "202202" => 3, ... }
70
+ timeline.peak # => ["202201", 5]
71
+ timeline.total # => 42
72
+ timeline.span # => ["202201", "202212"]
73
+ timeline.size # => 12 (number of buckets)
74
+
60
75
  # Filter by status code, mimetype, or URL pattern
61
76
  cdx.snapshots("example.com",
62
77
  filters: [Archaeo::CdxFilter.by_status(200)],
@@ -102,6 +117,10 @@ result.to_h # => Hash representation
102
117
  result.as_json # => JSON-serializable Hash
103
118
 
104
119
  api.available?("example.com") # => true/false
120
+
121
+ # Batch availability check
122
+ results = api.batch_available?(%w[example.com other.com])
123
+ # => { "example.com" => AvailabilityResult, ... }
105
124
  ----
106
125
 
107
126
  === Save a URL (SavePageNow)
@@ -114,8 +133,14 @@ result.url # => "https://example.com/"
114
133
  result.archive_url # => "https://web.archive.org/web/..."
115
134
  result.timestamp # => Archaeo::Timestamp
116
135
  result.cached? # => true if already archived
136
+ result.success? # => true if archive_url is present
117
137
  result.to_h # => Hash representation
118
138
  result.as_json # => JSON-serializable Hash
139
+
140
+ # Batch save multiple URLs
141
+ results = save.batch_save(%w[https://a.com https://b.com],
142
+ delay: 2, stop_on_error: false)
143
+ results.each { |r| puts "#{r.url}: #{r.success?}" }
119
144
  ----
120
145
 
121
146
  === Fetch Archived Content
@@ -148,6 +173,15 @@ page = fetcher.fetch("https://example.com/",
148
173
  page = fetcher.fetch("https://example.com/",
149
174
  timestamp: "20220615000000",
150
175
  snapshot: snap)
176
+
177
+ # Raise on error status (raises FetchError with page attached)
178
+ page = fetcher.fetch!("https://example.com/",
179
+ timestamp: "20220615000000")
180
+ # FetchError includes: .status_code, .url, .page
181
+
182
+ # Page links and meta extraction
183
+ page.links # => [{ href: "...", text: "...", external: true/false }]
184
+ page.meta_tags # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
151
185
  ----
152
186
 
153
187
  === Fetch Page with Assets
@@ -186,6 +220,13 @@ restored = Archaeo::AssetList.from_json(json_string)
186
220
 
187
221
  # Safe type access
188
222
  bundle.assets.urls_by_type(:image) # works for any type key
223
+
224
+ # Domain analysis
225
+ bundle.assets.domain_counts
226
+ # => { "cdn.example.com" => 3, "fonts.googleapis.com" => 1 }
227
+
228
+ # Filter downloadable assets (excludes data: and fragment URLs)
229
+ downloadable = bundle.assets.downloadable
189
230
  ----
190
231
 
191
232
  === Bulk Download with Resume
@@ -200,6 +241,7 @@ end
200
241
  summary.total # => total snapshots found
201
242
  summary.downloaded # => successfully downloaded
202
243
  summary.skipped # => skipped (already downloaded with resume)
244
+ summary.failed # => failed downloads
203
245
  summary.bytes_written # => total bytes written
204
246
  summary.elapsed # => seconds elapsed
205
247
 
@@ -237,6 +279,10 @@ entry = state.entry_for("20220615000000")
237
279
  # Total bytes downloaded
238
280
  state.total_bytes # => Integer
239
281
 
282
+ # List all completed timestamps
283
+ state.size # => number of completed entries
284
+ state.timestamps # => ["20220101000000", "20220102000000"]
285
+
240
286
  # Clear state for a fresh download
241
287
  state.clear
242
288
  ----
@@ -279,6 +325,14 @@ Archaeo::CdxFilter.by_mimetype_prefix("image") # => matches image/*
279
325
  # Convenience factories
280
326
  Archaeo::CdxFilter.only_html # => text/html only
281
327
  Archaeo::CdxFilter.excluding_redirects # => excludes 3xx statuses
328
+
329
+ # Introspection
330
+ filter = Archaeo::CdxFilter.by_status(200)
331
+ filter.field # => "statuscode"
332
+ filter.pattern # => "200"
333
+ filter.matches?("200") # => true
334
+ filter.matches?("404") # => false
335
+ filter.negated? # => false
282
336
  ----
283
337
 
284
338
  === URL Rewriting
@@ -298,6 +352,7 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
298
352
  rewriter.rewrite_batch(["url1", "url2"])
299
353
 
300
354
  # Rewrite URLs within HTML (src, href, srcset, data-src, poster)
355
+ # Also rewrites inline style url() and <style> element url()
301
356
  rewritten_html = rewriter.rewrite_html(html_content)
302
357
  ----
303
358
 
@@ -319,6 +374,10 @@ snap.age # => seconds since capture
319
374
  snap.older_than?(3600) # => true if older than 1 hour
320
375
  snap.newer_than?(3600) # => true if newer than 1 hour
321
376
 
377
+ # Content comparison (by digest)
378
+ snap1.same_content_as?(snap2) # => true if same digest
379
+ snap1.duplicate_of?(snap2) # => true if same digest AND different timestamp
380
+
322
381
  # Identity URL (raw content, no Wayback rewriting)
323
382
  snap.identity_url
324
383
 
@@ -370,6 +429,18 @@ ts1 < ts2 # => true/false
370
429
 
371
430
  # Immutable -- frozen on creation
372
431
  ts.frozen? # => true
432
+
433
+ # Date/time helpers
434
+ ts.quarter # => 1..4
435
+ ts.wday # => 0..6 (Sunday = 0)
436
+ ts.human_readable # => "2022-06-15 00:00:00 UTC"
437
+ ts.to_date # => Date object
438
+
439
+ # Date ranges for coverage analysis
440
+ range = ts.date_range(:month)
441
+ # => Timestamp(Jun 1)..Timestamp(Jun 30 23:59:59)
442
+ ts.date_range(:day) # => single day range
443
+ ts.date_range(:year) # => full year range
373
444
  ----
374
445
 
375
446
  === HTTP Client Observability
@@ -383,6 +454,13 @@ client = Archaeo::HttpClient.new(
383
454
  },
384
455
  )
385
456
 
457
+ # Intercept requests before they are sent
458
+ client = Archaeo::HttpClient.new(
459
+ before_request: ->(uri, request) {
460
+ request["X-Custom-Header"] = "value"
461
+ },
462
+ )
463
+
386
464
  # Inspect connection pool state
387
465
  client.pool_stats
388
466
  # => { active_connections: 2, max_pool_size: 8,
@@ -401,6 +479,7 @@ archaeo --version
401
479
  archaeo snapshots example.com
402
480
  archaeo snapshots --format json example.com
403
481
  archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
482
+ archaeo snapshots --filter-status 200 --filter-type text/html example.com
404
483
 
405
484
  # Find closest snapshot
406
485
  archaeo near example.com 20220101
@@ -440,6 +519,18 @@ archaeo fetch --identity https://example.com/ 20220615120000
440
519
  archaeo fetch-assets https://example.com/ 20220615120000
441
520
  archaeo fetch-assets --format json https://example.com/ 20220615120000
442
521
 
522
+ # Rewrite archive URLs to local paths
523
+ archaeo rewrite https://example.com/ 20220615120000
524
+ archaeo rewrite --output page.html --prefix local https://example.com/ 20220615120000
525
+
526
+ # Compare assets between two snapshots
527
+ archaeo diff https://example.com/ 20220101 20220615
528
+ archaeo diff --format json https://example.com/ 20220101 20220615
529
+
530
+ # Audit assets for an archived page
531
+ archaeo asset-audit https://example.com/ 20220615120000
532
+ archaeo asset-audit --format json https://example.com/ 20220615120000
533
+
443
534
  # Download all snapshots
444
535
  archaeo download example.com --output ./archive
445
536
 
@@ -480,6 +571,9 @@ Archaeo::SaveFailed
480
571
 
481
572
  # Content digest mismatch
482
573
  Archaeo::IntegrityError
574
+
575
+ # HTTP error during fetch (includes .page, .url, .status_code)
576
+ Archaeo::FetchError
483
577
  ----
484
578
 
485
579
  == Architecture
@@ -491,7 +585,7 @@ Archaeo follows a model-driven, OOP design:
491
585
  | Layer | Classes | Purpose
492
586
 
493
587
  | *Models*
494
- | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
588
+ | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
495
589
  | Domain value objects with `to_h`, `as_json`, `inspect` support
496
590
 
497
591
  | *URL Processing*
@@ -120,6 +120,7 @@ module Archaeo
120
120
  extract_media_sources(list)
121
121
  extract_video_posters(list)
122
122
  extract_embeds(list)
123
+ extract_tracks(list)
123
124
  end
124
125
 
125
126
  def extract_media_sources(list)
@@ -138,6 +139,15 @@ module Archaeo
138
139
  @doc.css("iframe[src], embed[src]").each do |el|
139
140
  list.add(resolve(el["src"]), type: :media)
140
141
  end
142
+ @doc.css("object[data]").each do |el|
143
+ list.add(resolve(el["data"]), type: :media)
144
+ end
145
+ end
146
+
147
+ def extract_tracks(list)
148
+ @doc.css("track[src]").each do |el|
149
+ list.add(resolve(el["src"]), type: :media)
150
+ end
141
151
  end
142
152
 
143
153
  def extract_inline_css(list)
@@ -102,5 +102,28 @@ module Archaeo
102
102
  end
103
103
  list
104
104
  end
105
+
106
+ def domain_counts
107
+ all.each_with_object(Hash.new(0)) do |url, counts|
108
+ host = begin
109
+ URI.parse(url).host
110
+ rescue URI::InvalidURIError
111
+ "(invalid)"
112
+ end
113
+ counts[host || "(relative)"] += 1
114
+ end
115
+ end
116
+
117
+ def downloadable
118
+ filtered = self.class.new
119
+ CATEGORIES.each do |type|
120
+ @urls_by_type[type].each do |url|
121
+ next if url.start_with?("data:", "#")
122
+
123
+ filtered.add(url, type: type)
124
+ end
125
+ end
126
+ filtered
127
+ end
105
128
  end
106
129
  end
@@ -38,6 +38,16 @@ module Archaeo
38
38
  near(url).available?
39
39
  end
40
40
 
41
+ def batch_available?(urls, concurrency: 1)
42
+ if concurrency <= 1
43
+ urls.to_h do |u|
44
+ [u, near(u)]
45
+ end
46
+ else
47
+ batch_concurrent(urls, concurrency)
48
+ end
49
+ end
50
+
41
51
  private
42
52
 
43
53
  def parse_response(response, url)
@@ -78,5 +88,26 @@ module Archaeo
78
88
  archived_status: archived_status,
79
89
  )
80
90
  end
91
+
92
+ def batch_concurrent(urls, concurrency)
93
+ results = {}
94
+ mutex = Mutex.new
95
+ queue = urls.dup
96
+ threads = Array.new(concurrency) do
97
+ Thread.new { drain_queue(queue, results, mutex) }
98
+ end
99
+ threads.each(&:join)
100
+ results
101
+ end
102
+
103
+ def drain_queue(queue, results, mutex)
104
+ loop do
105
+ url = mutex.synchronize { queue.shift }
106
+ break unless url
107
+
108
+ result = near(url)
109
+ mutex.synchronize { results[url] = result }
110
+ end
111
+ end
81
112
  end
82
113
  end
@@ -56,5 +56,16 @@ module Archaeo
56
56
  def inspect
57
57
  "#<#{self.class.name} #{@url} available=#{@available}>"
58
58
  end
59
+
60
+ def to_snapshot
61
+ return nil unless available?
62
+
63
+ Snapshot.new(
64
+ urlkey: UrlNormalizer.normalize(url).downcase,
65
+ timestamp: timestamp,
66
+ original_url: url,
67
+ status_code: archived_status || 200,
68
+ )
69
+ end
59
70
  end
60
71
  end
@@ -15,11 +15,12 @@ module Archaeo
15
15
  # for interrupted download recovery.
16
16
  class BulkDownloader
17
17
  def initialize(client: HttpClient.new, output_dir: "archive",
18
- cdx_api: nil, concurrency: 1)
18
+ cdx_api: nil, concurrency: 1, on_error: nil)
19
19
  @client = client
20
20
  @output_dir = output_dir
21
21
  @cdx_api = cdx_api
22
22
  @concurrency = [1, concurrency.to_i].max
23
+ @on_error = on_error
23
24
  end
24
25
 
25
26
  def download(url, from: nil, to: nil, resume: false,
@@ -29,10 +30,11 @@ module Archaeo
29
30
  FileUtils.mkdir_p(@output_dir) unless dry_run
30
31
 
31
32
  snapshots = fetch_snapshots(url, from: from, to: to)
32
- downloaded, skipped, bytes =
33
+ downloaded, skipped, bytes, failed =
33
34
  run_download(snapshots, resume, dry_run, block)
34
35
 
35
- build_summary(start_time, snapshots.size, downloaded, skipped, bytes)
36
+ build_summary(start_time, snapshots.size, downloaded,
37
+ skipped, bytes, failed: failed)
36
38
  end
37
39
 
38
40
  private
@@ -59,24 +61,26 @@ module Archaeo
59
61
  end
60
62
  end
61
63
 
62
- def build_summary(start_time, total, downloaded, skipped, bytes)
64
+ def build_summary(start_time, total, downloaded, skipped,
65
+ bytes, failed: 0)
63
66
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
64
67
  DownloadSummary.new(
65
68
  total: total, downloaded: downloaded, skipped: skipped,
66
- failed: 0, bytes_written: bytes, elapsed: elapsed
69
+ failed: failed, bytes_written: bytes, elapsed: elapsed
67
70
  )
68
71
  end
69
72
 
70
73
  def download_sequential(snapshots, total, state, resume,
71
74
  dry_run, progress)
72
- counters = { downloaded: 0, skipped: 0, bytes: 0 }
75
+ counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
73
76
 
74
77
  snapshots.each_with_index do |snap, index|
75
78
  process_sequential(snap, state, resume, dry_run, counters)
76
79
  progress&.call(index + 1, total, snap)
77
80
  end
78
81
 
79
- [counters[:downloaded], counters[:skipped], counters[:bytes]]
82
+ [counters[:downloaded], counters[:skipped],
83
+ counters[:bytes], counters[:failed]]
80
84
  end
81
85
 
82
86
  def process_sequential(snap, state, resume, dry_run, counters)
@@ -87,6 +91,9 @@ module Archaeo
87
91
 
88
92
  counters[:bytes] += download_snapshot(snap, state) unless dry_run
89
93
  counters[:downloaded] += 1
94
+ rescue StandardError => e
95
+ counters[:failed] += 1
96
+ @on_error&.call(snap, e)
90
97
  end
91
98
 
92
99
  def download_snapshot(snap, state)
@@ -100,7 +107,7 @@ module Archaeo
100
107
  dry_run, progress)
101
108
  queue = snapshots.each_with_index.to_a
102
109
  shared = { mutex: Mutex.new, errors: [],
103
- downloaded: 0, skipped: 0, bytes: 0 }
110
+ downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
104
111
 
105
112
  threads = Array.new(@concurrency) do
106
113
  Thread.new do
@@ -109,17 +116,9 @@ module Archaeo
109
116
  end
110
117
  end
111
118
  threads.each(&:join)
112
- raise_on_errors(shared[:errors])
113
-
114
- [shared[:downloaded], shared[:skipped], shared[:bytes]]
115
- end
116
119
 
117
- def raise_on_errors(errors)
118
- return unless errors.any?
119
-
120
- raise Error,
121
- "#{errors.size} download(s) failed: " \
122
- "#{errors.map { |s, _| s.timestamp }.join(', ')}"
120
+ [shared[:downloaded], shared[:skipped],
121
+ shared[:bytes], shared[:failed]]
123
122
  end
124
123
 
125
124
  def process_queue(queue, total, state, resume, dry_run,
@@ -133,7 +132,7 @@ module Archaeo
133
132
  next
134
133
  end
135
134
 
136
- concurrent_fetch(snap, state, dry_run, shared)
135
+ concurrent_fetch(snap, dry_run, shared)
137
136
  progress&.call(index + 1, total, snap)
138
137
  end
139
138
  end
@@ -145,35 +144,59 @@ module Archaeo
145
144
  true
146
145
  end
147
146
 
148
- def concurrent_fetch(snap, state, dry_run, shared)
147
+ def concurrent_fetch(snap, dry_run, shared)
149
148
  unless dry_run
150
149
  content = fetch_and_save(snap)
151
- shared[:mutex].synchronize do
152
- state.mark_completed(snap.timestamp,
153
- url: snap.original_url,
154
- bytes: content.bytesize)
155
- shared[:bytes] += content.bytesize
156
- end
150
+ record_completed(snap, content, shared)
157
151
  end
158
152
  shared[:mutex].synchronize { shared[:downloaded] += 1 }
159
153
  rescue StandardError => e
160
- shared[:mutex].synchronize { shared[:errors] << [snap, e] }
154
+ shared[:mutex].synchronize do
155
+ shared[:failed] += 1
156
+ shared[:errors] << [snap, e]
157
+ end
158
+ @on_error&.call(snap, e)
159
+ end
160
+
161
+ def record_completed(snap, content, shared)
162
+ shared[:mutex].synchronize do
163
+ state.mark_completed(snap.timestamp,
164
+ url: snap.original_url,
165
+ bytes: content.bytesize)
166
+ shared[:bytes] += content.bytesize
167
+ end
161
168
  end
162
169
 
163
170
  def fetch_and_save(snapshot)
164
- fetcher = Fetcher.new(client: @client)
165
- page = fetcher.fetch(snapshot.original_url,
166
- timestamp: snapshot.timestamp)
171
+ page = fetch_page(snapshot)
172
+ validate_page_status(page, snapshot)
173
+ write_page_file(page, snapshot)
174
+ rescue StandardError
175
+ FileUtils.rm_f(tmp_path) if defined?(tmp_path)
176
+ raise
177
+ end
178
+
179
+ def fetch_page(snapshot)
180
+ Fetcher.new(client: @client).fetch(
181
+ snapshot.original_url, timestamp: snapshot.timestamp
182
+ )
183
+ end
184
+
185
+ def validate_page_status(page, snapshot)
186
+ return if page.status_code.between?(200, 299)
167
187
 
188
+ raise Error,
189
+ "HTTP #{page.status_code} for " \
190
+ "#{snapshot.original_url} at #{snapshot.timestamp}"
191
+ end
192
+
193
+ def write_page_file(page, snapshot)
168
194
  filename = build_filename(snapshot)
169
195
  FileUtils.mkdir_p(File.dirname(filename))
170
196
  tmp_path = "#{filename}.tmp"
171
197
  File.binwrite(tmp_path, page.content)
172
198
  File.rename(tmp_path, filename)
173
199
  page.content
174
- rescue StandardError
175
- FileUtils.rm_f(tmp_path) if defined?(tmp_path)
176
- raise
177
200
  end
178
201
 
179
202
  EXTENSION_MAP = {
@@ -110,6 +110,24 @@ module Archaeo
110
110
  snapshots(url, **options).count
111
111
  end
112
112
 
113
+ def unique_snapshots(url, resolve_revisits: true, **options)
114
+ snapshots(url,
115
+ collapse: ["digest"],
116
+ resolve_revisits: resolve_revisits,
117
+ **options)
118
+ end
119
+
120
+ def timeline(url, from: nil, to: nil,
121
+ bucket_size: :month, status: 200)
122
+ options = {}
123
+ options[:from] = Timestamp.coerce(from).to_s if from
124
+ options[:to] = Timestamp.coerce(to).to_s if to
125
+ options[:filters] = [CdxFilter.by_status(status)] if status
126
+
127
+ snaps = snapshots(url, **options).to_a
128
+ CdxTimeline.new(snaps, bucket_size: bucket_size)
129
+ end
130
+
113
131
  # Returns the number of pages for a paginated query.
114
132
  def num_pages(url, **options)
115
133
  url = UrlNormalizer.normalize(url)
@@ -31,6 +31,17 @@ module Archaeo
31
31
  stripped.split(":", 2).first.to_s
32
32
  end
33
33
 
34
+ def pattern
35
+ stripped = @expression.delete_prefix("!")
36
+ stripped.split(":", 2).last.to_s
37
+ end
38
+
39
+ def matches?(value)
40
+ regex = Regexp.new(pattern)
41
+ result = regex.match?(value.to_s)
42
+ negated? ? !result : result
43
+ end
44
+
34
45
  def self.by_status(code)
35
46
  new("statuscode:#{code}")
36
47
  end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
+ # Time-bucketed snapshot frequency analysis.
5
+ #
6
+ # Groups snapshots by configurable time buckets (day, week, month, year)
7
+ # for frequency analysis and coverage reporting.
8
+ class CdxTimeline
9
+ BUCKET_FORMATS = {
10
+ day: "%Y%m%d",
11
+ week: "%YW%V",
12
+ month: "%Y%m",
13
+ year: "%Y",
14
+ }.freeze
15
+
16
+ def initialize(snapshots, bucket_size: :month)
17
+ @bucket_size = bucket_size
18
+ @buckets = build_buckets(snapshots)
19
+ end
20
+
21
+ def to_a
22
+ @buckets.sort_by(&:first)
23
+ end
24
+
25
+ def to_h
26
+ @buckets.dup
27
+ end
28
+
29
+ def peak
30
+ @buckets.max_by(&:last)
31
+ end
32
+
33
+ def total
34
+ @buckets.values.sum
35
+ end
36
+
37
+ def span
38
+ keys = @buckets.keys
39
+ return nil if keys.empty?
40
+
41
+ [keys.first, keys.last]
42
+ end
43
+
44
+ def empty?
45
+ @buckets.empty?
46
+ end
47
+
48
+ def size
49
+ @buckets.size
50
+ end
51
+
52
+ def inspect
53
+ "#<#{self.class.name} #{total} snapshots in #{@buckets.size} buckets>"
54
+ end
55
+
56
+ private
57
+
58
+ def build_buckets(snapshots)
59
+ fmt = BUCKET_FORMATS[@bucket_size] || BUCKET_FORMATS[:month]
60
+ snapshots.each_with_object(Hash.new(0)) do |snap, counts|
61
+ key = snap.timestamp.to_time.strftime(fmt)
62
+ counts[key] += 1
63
+ end
64
+ end
65
+ end
66
+ end
data/lib/archaeo/cli.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "csv"
4
4
  require "json"
5
+ require "set"
5
6
  require "thor"
6
7
 
7
8
  module Archaeo
@@ -27,6 +28,10 @@ module Archaeo
27
28
  option :match_type,
28
29
  desc: "Match type (exact, prefix, host, domain)"
29
30
  option :filter, type: :array, desc: "CDX filter expressions"
31
+ option :filter_status, type: :array,
32
+ desc: "Only include these status codes"
33
+ option :filter_type, type: :array,
34
+ desc: "MIME type prefixes (e.g. image, text/html)"
30
35
  option :collapse, type: :array, desc: "CDX collapse fields"
31
36
  option :sort, desc: "Sort order (default, closest, reverse)"
32
37
  option :limit, type: :numeric, desc: "Max snapshots to return"
@@ -153,6 +158,53 @@ module Archaeo
153
158
  end
154
159
  end
155
160
 
161
+ desc "rewrite URL TIMESTAMP",
162
+ "Fetch a page and rewrite archive URLs to local paths"
163
+ option :prefix, desc: "Local path prefix", default: "local"
164
+ option :output, desc: "Write rewritten HTML to file"
165
+ def rewrite(url, timestamp)
166
+ handle_errors do
167
+ coerced = Timestamp.coerce(timestamp)
168
+ page = Fetcher.new.fetch(url, timestamp: coerced)
169
+ rewritten = build_rewriter(url, coerced).rewrite_html(page.content)
170
+ output_rewritten(rewritten)
171
+ end
172
+ end
173
+
174
+ desc "diff URL TIMESTAMP_A TIMESTAMP_B",
175
+ "Compare assets of two archived snapshots"
176
+ option :format, desc: "Output format (table, json)", default: "table"
177
+ def diff(url, timestamp_a, timestamp_b)
178
+ handle_errors do
179
+ bundle_a = Fetcher.new.fetch_page_with_assets(
180
+ url, timestamp: timestamp_a
181
+ )
182
+ bundle_b = Fetcher.new.fetch_page_with_assets(
183
+ url, timestamp: timestamp_b
184
+ )
185
+ output_diff(bundle_a.assets, bundle_b.assets,
186
+ timestamp_a, timestamp_b)
187
+ end
188
+ end
189
+
190
+ desc "asset-audit URL TIMESTAMP",
191
+ "Audit assets for an archived page"
192
+ option :format, desc: "Output format (table, json)", default: "table"
193
+ def asset_audit(url, timestamp)
194
+ handle_errors do
195
+ bundle = Fetcher.new.fetch_page_with_assets(
196
+ url, timestamp: timestamp
197
+ )
198
+ report = build_audit_report(bundle)
199
+ case options[:format]
200
+ when "json"
201
+ puts JSON.generate(report)
202
+ else
203
+ print_audit_report(report)
204
+ end
205
+ end
206
+ end
207
+
156
208
  desc "download URL", "Download all archived snapshots of a URL"
157
209
  option :output, desc: "Output directory", default: "archive"
158
210
  option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
@@ -276,6 +328,30 @@ module Archaeo
276
328
  end
277
329
  end
278
330
 
331
+ def build_rewriter(url, timestamp)
332
+ normalized = UrlNormalizer.normalize(url)
333
+ archive_prefix = ArchiveUrl.new(normalized, timestamp: timestamp).to_s
334
+ UrlRewriter.new(archive_prefix, options[:prefix])
335
+ end
336
+
337
+ def output_rewritten(content)
338
+ if options[:output]
339
+ write_output(options[:output], content)
340
+ else
341
+ $stdout.write(content)
342
+ end
343
+ end
344
+
345
+ def output_diff(assets_a, assets_b, ts_a, ts_b)
346
+ comparison = compare_asset_lists(assets_a, assets_b)
347
+ case options[:format]
348
+ when "json"
349
+ puts JSON.generate(comparison)
350
+ else
351
+ print_diff_report(comparison, ts_a, ts_b)
352
+ end
353
+ end
354
+
279
355
  def output_assets(bundle)
280
356
  case options[:format]
281
357
  when "json"
@@ -307,16 +383,36 @@ module Archaeo
307
383
  def print_summary(summary)
308
384
  return if quiet?
309
385
 
310
- warn "Downloaded #{summary.downloaded}/#{summary.total} " \
311
- "(#{summary.bytes_written} bytes) in " \
312
- "#{summary.elapsed.round(1)}s"
386
+ parts = ["Downloaded #{summary.downloaded}/#{summary.total}"]
387
+ parts << "#{summary.failed} failed" if summary.failed.positive?
388
+ parts << "(#{summary.bytes_written} bytes)"
389
+ parts << "in #{summary.elapsed.round(1)}s"
390
+ warn parts.join(" ")
313
391
  end
314
392
 
315
393
  def build_cdx_options(opts)
316
- CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
394
+ result = {}
395
+ CDX_OPTION_MAP.each do |cli_key, api_key|
317
396
  value = opts[cli_key]
318
397
  result[api_key] = value if value
319
398
  end
399
+ append_convenience_filters!(result, opts)
400
+ result
401
+ end
402
+
403
+ def append_convenience_filters!(result, opts)
404
+ filters = Array(result[:filters])
405
+ filters += status_filters(opts[:filter_status])
406
+ filters += type_filters(opts[:filter_type])
407
+ result[:filters] = filters unless filters.empty?
408
+ end
409
+
410
+ def status_filters(codes)
411
+ Array(codes).map { |code| CdxFilter.by_status(code).to_s }
412
+ end
413
+
414
+ def type_filters(prefixes)
415
+ Array(prefixes).map { |p| CdxFilter.by_mimetype_prefix(p).to_s }
320
416
  end
321
417
 
322
418
  def output_table(snaps)
@@ -347,5 +443,86 @@ module Archaeo
347
443
  File.binwrite(path, content)
348
444
  warn "Written to #{path}" unless quiet?
349
445
  end
446
+
447
+ def compare_asset_lists(assets_a, assets_b)
448
+ all_a = assets_a.all.to_set
449
+ all_b = assets_b.all.to_set
450
+ build_diff(all_a, all_b, assets_a.counts, assets_b.counts)
451
+ end
452
+
453
+ def build_diff(set_a, set_b, counts_a, counts_b)
454
+ {
455
+ only_in_a: (set_a - set_b).to_a.sort,
456
+ only_in_b: (set_b - set_a).to_a.sort,
457
+ unchanged: (set_a & set_b).to_a.sort,
458
+ counts_a: counts_a,
459
+ counts_b: counts_b,
460
+ }
461
+ end
462
+
463
+ def print_diff_report(comparison, ts_a, ts_b)
464
+ puts "Comparing #{ts_a} vs #{ts_b}"
465
+ puts
466
+ print_url_list("Removed:", comparison[:only_in_a], " - ")
467
+ print_url_list("Added:", comparison[:only_in_b], " + ")
468
+ puts "Unchanged: #{comparison[:unchanged].size}"
469
+ end
470
+
471
+ def print_url_list(header, urls, prefix)
472
+ return unless urls.any?
473
+
474
+ puts header
475
+ urls.each { |url| puts "#{prefix}#{url}" }
476
+ puts
477
+ end
478
+
479
+ def build_audit_report(bundle)
480
+ assets = bundle.assets
481
+ downloadable = assets.downloadable
482
+ {
483
+ page_url: bundle.page.archive_url,
484
+ total_assets: assets.size,
485
+ downloadable: downloadable.size,
486
+ counts: assets.counts,
487
+ domains: assets.domain_counts,
488
+ duplicates: find_duplicate_urls(assets),
489
+ }
490
+ end
491
+
492
+ def print_audit_report(report)
493
+ puts "Page: #{report[:page_url]}"
494
+ puts "Total assets: #{report[:total_assets]}"
495
+ puts "Downloadable: #{report[:downloadable]}"
496
+ puts
497
+ print_type_counts(report[:counts])
498
+ print_domain_counts(report[:domains])
499
+ print_url_list("Duplicates:", report[:duplicates], " ")
500
+ end
501
+
502
+ def print_type_counts(counts)
503
+ puts "By type:"
504
+ counts.each { |type, count| puts " #{type}: #{count}" }
505
+ puts
506
+ end
507
+
508
+ def print_domain_counts(domains)
509
+ puts "By domain:"
510
+ domains.sort_by { |_, v| -v }.each do |domain, count|
511
+ puts " #{domain}: #{count}"
512
+ end
513
+ end
514
+
515
+ def find_duplicate_urls(assets)
516
+ seen = {}
517
+ dupes = []
518
+ assets.all.each do |url|
519
+ if seen[url]
520
+ dupes << url unless dupes.include?(url)
521
+ else
522
+ seen[url] = true
523
+ end
524
+ end
525
+ dupes
526
+ end
350
527
  end
351
528
  end
@@ -17,36 +17,49 @@ module Archaeo
17
17
  def initialize(output_dir)
18
18
  @output_dir = output_dir
19
19
  @path = File.join(output_dir, STATE_FILE)
20
+ @mutex = Mutex.new
20
21
  end
21
22
 
22
23
  def completed?(timestamp)
23
- entries_key.include?(timestamp.to_s)
24
+ @mutex.synchronize { entries_key.include?(timestamp.to_s) }
24
25
  end
25
26
 
26
27
  def mark_completed(timestamp, url: nil, bytes: nil)
27
- ts = timestamp.to_s
28
- return if entries_key.include?(ts)
29
-
30
- entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
31
- entry["url"] = url if url
32
- entry["bytes"] = bytes if bytes
33
- entries << entry
34
- @entries_key = nil
35
- save
28
+ @mutex.synchronize do
29
+ ts = timestamp.to_s
30
+ return if entries_key.include?(ts)
31
+
32
+ entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
33
+ entry["url"] = url if url
34
+ entry["bytes"] = bytes if bytes
35
+ entries << entry
36
+ @entries_key = nil
37
+ save
38
+ end
36
39
  end
37
40
 
38
41
  def entry_for(timestamp)
39
- entries.find { |e| e["ts"] == timestamp.to_s }
42
+ @mutex.synchronize { entries.find { |e| e["ts"] == timestamp.to_s } }
40
43
  end
41
44
 
42
45
  def total_bytes
43
- entries.sum { |e| e["bytes"].to_i }
46
+ @mutex.synchronize { entries.sum { |e| e["bytes"].to_i } }
47
+ end
48
+
49
+ def size
50
+ @mutex.synchronize { entries.size }
51
+ end
52
+
53
+ def timestamps
54
+ @mutex.synchronize { entries.map { |e| e["ts"] } }
44
55
  end
45
56
 
46
57
  def clear
47
- @entries = []
48
- @entries_key = nil
49
- FileUtils.rm_f(@path)
58
+ @mutex.synchronize do
59
+ @entries = []
60
+ @entries_key = nil
61
+ FileUtils.rm_f(@path)
62
+ end
50
63
  end
51
64
 
52
65
  private
@@ -25,6 +25,19 @@ module Archaeo
25
25
  build_page(response, archive_url.to_s, url, ts)
26
26
  end
27
27
 
28
+ def fetch!(url, timestamp:, identity: false, snapshot: nil)
29
+ page = fetch(url, timestamp: timestamp, identity: identity,
30
+ snapshot: snapshot)
31
+ return page if page.status_code.between?(200, 299)
32
+
33
+ raise FetchError.new(
34
+ "HTTP #{page.status_code} for #{page.original_url}",
35
+ status_code: page.status_code,
36
+ url: page.original_url,
37
+ page: page,
38
+ )
39
+ end
40
+
28
41
  def fetch_page_with_assets(url, timestamp:)
29
42
  page = fetch(url, timestamp: timestamp)
30
43
  assets = AssetExtractor.new(page.content,
@@ -63,12 +63,14 @@ module Archaeo
63
63
  max_retries: DEFAULT_MAX_RETRIES,
64
64
  retry_delay: DEFAULT_RETRY_DELAY,
65
65
  user_agent: nil,
66
- on_request: nil)
66
+ on_request: nil,
67
+ before_request: nil)
67
68
  @timeout = timeout
68
69
  @max_retries = max_retries
69
70
  @retry_delay = retry_delay
70
71
  @user_agent = user_agent
71
72
  @on_request = on_request
73
+ @before_request = before_request
72
74
  @connections = {}
73
75
  @last_used = {}
74
76
  @mutex = Mutex.new
@@ -203,7 +205,7 @@ module Archaeo
203
205
  def attempt_with_retries(uri, headers, request_class)
204
206
  retries = 0
205
207
  begin
206
- execute_and_check(uri, headers, request_class)
208
+ execute_and_check(uri, headers, request_class, retries)
207
209
  rescue RetriableStatusError => e
208
210
  retry_status(e, retries += 1) && retry
209
211
  rescue *TRANSIENT_ERRORS => e
@@ -223,8 +225,9 @@ module Archaeo
223
225
  sleep(@retry_delay * retries)
224
226
  end
225
227
 
226
- def execute_and_check(uri, headers, request_class)
227
- response = execute_with_connection(uri, headers, request_class)
228
+ def execute_and_check(uri, headers, request_class, retry_count)
229
+ response = execute_with_connection(uri, headers, request_class,
230
+ retry_count)
228
231
  if RETRIABLE_STATUSES.include?(response.status)
229
232
  raise RetriableStatusError, response
230
233
  end
@@ -255,9 +258,9 @@ module Archaeo
255
258
  "Failed after #{retries} retries: #{error.message}"
256
259
  end
257
260
 
258
- def execute_with_connection(uri, headers, request_class)
261
+ def execute_with_connection(uri, headers, request_class, retry_count)
259
262
  request = build_request(uri, headers, request_class)
260
- execute_tracked_request(uri, request)
263
+ execute_tracked_request(uri, request, retry_count)
261
264
  rescue *TRANSIENT_ERRORS
262
265
  raise
263
266
  rescue StandardError
@@ -268,16 +271,17 @@ module Archaeo
268
271
  def build_request(uri, headers, request_class)
269
272
  request = request_class.new(uri)
270
273
  headers.each { |k, v| request[k] = v }
274
+ @before_request&.call(uri, request)
271
275
  request
272
276
  end
273
277
 
274
- def execute_tracked_request(uri, request)
278
+ def execute_tracked_request(uri, request, retry_count)
275
279
  http = connection_for(uri)
276
280
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
277
281
  raw = http.request(request)
278
282
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
279
283
  response = build_response(raw)
280
- @on_request&.call(uri, elapsed, response.status, 0)
284
+ @on_request&.call(uri, elapsed, response.status, retry_count)
281
285
  response
282
286
  end
283
287
 
@@ -286,7 +290,7 @@ module Archaeo
286
290
  "User-Agent" => select_user_agent,
287
291
  "Accept" => "text/html,application/xhtml+xml," \
288
292
  "application/xml;q=0.9,*/*;q=0.8",
289
- "Accept-Encoding" => "gzip",
293
+ "Accept-Encoding" => "gzip, deflate",
290
294
  "Accept-Language" => "en-US,en;q=0.9",
291
295
  "Connection" => "keep-alive",
292
296
  }
@@ -303,10 +307,17 @@ module Archaeo
303
307
 
304
308
  def decompress_body(raw)
305
309
  body = raw.body.to_s
306
- return body unless raw["content-encoding"] == "gzip" && !body.empty?
307
-
308
- Zlib::GzipReader.new(StringIO.new(body)).read
309
- rescue Zlib::GzipFile::Error
310
+ return body if body.empty?
311
+
312
+ case raw["content-encoding"]
313
+ when "gzip"
314
+ Zlib::GzipReader.new(StringIO.new(body)).read
315
+ when "deflate"
316
+ Zlib::Inflate.inflate(body)
317
+ else
318
+ body
319
+ end
320
+ rescue Zlib::GzipFile::Error, Zlib::DataError
310
321
  body
311
322
  end
312
323
  end
data/lib/archaeo/page.rb CHANGED
@@ -67,6 +67,32 @@ module Archaeo
67
67
  end
68
68
  end
69
69
 
70
+ def links
71
+ return [] unless html?
72
+
73
+ @links ||= begin
74
+ doc = Nokogiri::HTML(@raw_content)
75
+ base = @archive_url || @original_url
76
+ doc.css("a[href]").map do |anchor|
77
+ href = resolve_page_url(anchor["href"], base)
78
+ { href: href, text: anchor.text.strip,
79
+ external: href && !href.include?(original_domain) }
80
+ end
81
+ end
82
+ end
83
+
84
+ def meta_tags
85
+ return {} unless html?
86
+
87
+ @meta_tags ||= begin
88
+ doc = Nokogiri::HTML(@raw_content)
89
+ result = extract_meta_entries(doc)
90
+ canonical = doc.at_css('link[rel="canonical"]')
91
+ result["canonical"] = canonical["href"].to_s if canonical
92
+ result
93
+ end
94
+ end
95
+
70
96
  def to_h
71
97
  {
72
98
  content_type: @content_type,
@@ -146,5 +172,35 @@ module Archaeo
146
172
  invalid: :replace, undef: :replace,
147
173
  replace: "?")
148
174
  end
175
+
176
+ def original_domain
177
+ @original_domain ||= begin
178
+ URI.parse(@original_url).host
179
+ rescue URI::InvalidURIError
180
+ nil
181
+ end
182
+ end
183
+
184
+ def extract_meta_entries(doc)
185
+ result = {}
186
+ doc.css("meta[name], meta[property], meta[http-equiv]").each do |meta|
187
+ key = meta["name"] || meta["property"] || meta["http-equiv"]
188
+ next unless key
189
+
190
+ result[key.downcase] = meta["content"].to_s
191
+ end
192
+ result
193
+ end
194
+
195
+ def resolve_page_url(href, base)
196
+ return href unless href
197
+ return href if href.start_with?("http", "//", "data:", "#",
198
+ "javascript:")
199
+ return nil unless base
200
+
201
+ URI.join(base, href).to_s
202
+ rescue URI::InvalidURIError
203
+ nil
204
+ end
149
205
  end
150
206
  end
@@ -23,6 +23,22 @@ module Archaeo
23
23
  attempt_save(save_url, start_time, url)
24
24
  end
25
25
 
26
+ def batch_save(urls, delay: 2, stop_on_error: false)
27
+ results = []
28
+ urls.each_with_index do |url, i|
29
+ sleep(delay) if i.positive?
30
+ result = save(url)
31
+ results << result
32
+ rescue RateLimitError, SaveFailed => e
33
+ raise e if stop_on_error
34
+
35
+ results << SaveResult.new(
36
+ url: url, archive_url: nil, timestamp: nil, cached: false,
37
+ )
38
+ end
39
+ results
40
+ end
41
+
26
42
  private
27
43
 
28
44
  def attempt_save(save_url, start_time, url)
@@ -11,7 +11,7 @@ module Archaeo
11
11
  def initialize(url:, archive_url:, timestamp:, cached:)
12
12
  @url = url
13
13
  @archive_url = archive_url
14
- @timestamp = Timestamp.coerce(timestamp)
14
+ @timestamp = timestamp ? Timestamp.coerce(timestamp) : nil
15
15
  @cached = cached
16
16
  end
17
17
 
@@ -19,6 +19,10 @@ module Archaeo
19
19
  @cached
20
20
  end
21
21
 
22
+ def success?
23
+ !@archive_url.nil?
24
+ end
25
+
22
26
  def to_h
23
27
  { url: @url, archive_url: @archive_url,
24
28
  timestamp: @timestamp, cached: @cached }
@@ -70,6 +70,18 @@ module Archaeo
70
70
  age <= seconds
71
71
  end
72
72
 
73
+ def same_content_as?(other)
74
+ return false unless other.is_a?(self.class)
75
+ return false if digest.nil? || digest.empty?
76
+ return false if other.digest.nil? || other.digest.empty?
77
+
78
+ digest == other.digest
79
+ end
80
+
81
+ def duplicate_of?(other)
82
+ same_content_as?(other) && timestamp != other.timestamp
83
+ end
84
+
73
85
  def fetch(client: HttpClient.new, identity: false)
74
86
  Fetcher.new(client: client).fetch(
75
87
  original_url, timestamp: @timestamp, identity: identity
@@ -140,8 +140,54 @@ module Archaeo
140
140
  [year, month, day, hour, minute, second]
141
141
  end
142
142
 
143
+ def quarter
144
+ ((month - 1) / 3) + 1
145
+ end
146
+
147
+ def wday
148
+ @to_time.wday
149
+ end
150
+
151
+ def human_readable
152
+ @to_time.strftime("%Y-%m-%d %H:%M:%S UTC")
153
+ end
154
+
155
+ def date_range(granularity = :day)
156
+ start_ts = range_start(granularity)
157
+ end_ts = range_end(start_ts, granularity)
158
+ start_ts..end_ts
159
+ end
160
+
143
161
  def inspect
144
162
  "#<#{self.class.name} #{self}>"
145
163
  end
164
+
165
+ private
166
+
167
+ def range_start(granularity)
168
+ case granularity
169
+ when :month then self.class.new(year: year, month: month)
170
+ when :year then self.class.new(year: year)
171
+ else self.class.new(year: year, month: month, day: day)
172
+ end
173
+ end
174
+
175
+ def range_end(start_ts, granularity)
176
+ case granularity
177
+ when :month then next_month_start - 1
178
+ when :year
179
+ self.class.new(year: year, month: 12, day: 31,
180
+ hour: 23, minute: 59, second: 59)
181
+ else start_ts + 86_399
182
+ end
183
+ end
184
+
185
+ def next_month_start
186
+ if month == 12
187
+ self.class.new(year: year + 1, month: 1)
188
+ else
189
+ self.class.new(year: year, month: month + 1)
190
+ end
191
+ end
146
192
  end
147
193
  end
@@ -10,6 +10,7 @@ module Archaeo
10
10
  # rooted at a configurable local directory.
11
11
  class UrlRewriter
12
12
  URL_ATTRS = %w[src href data-src poster].freeze
13
+ CSS_URL_RE = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
13
14
 
14
15
  def initialize(archive_prefix, local_prefix)
15
16
  @archive_prefix = archive_prefix.to_s
@@ -31,6 +32,8 @@ module Archaeo
31
32
  doc = Nokogiri::HTML(html_content)
32
33
  rewrite_url_attrs(doc)
33
34
  rewrite_srcset_attrs(doc)
35
+ rewrite_inline_style_attrs(doc)
36
+ rewrite_style_elements(doc)
34
37
  doc.to_html
35
38
  end
36
39
 
@@ -53,6 +56,28 @@ module Archaeo
53
56
 
54
57
  private
55
58
 
59
+ def rewrite_inline_style_attrs(doc)
60
+ doc.css("[style]").each do |el|
61
+ next unless el["style"]
62
+
63
+ el["style"] = rewrite_css_urls(el["style"])
64
+ end
65
+ end
66
+
67
+ def rewrite_style_elements(doc)
68
+ doc.css("style").each do |el|
69
+ el.content = rewrite_css_urls(el.text)
70
+ end
71
+ end
72
+
73
+ def rewrite_css_urls(css_text)
74
+ css_text.gsub(CSS_URL_RE) do
75
+ url = Regexp.last_match[1]
76
+ rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
77
+ "url('#{rewritten}')"
78
+ end
79
+ end
80
+
56
81
  def rewrite_srcset(srcset)
57
82
  return srcset unless srcset
58
83
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.6"
4
+ VERSION = "0.2.7"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -16,6 +16,17 @@ module Archaeo
16
16
  class SaveFailed < Error; end
17
17
  class IntegrityError < Error; end
18
18
 
19
+ class FetchError < Error
20
+ attr_reader :status_code, :url, :page
21
+
22
+ def initialize(message, status_code:, url:, page:)
23
+ super(message)
24
+ @status_code = status_code
25
+ @url = url
26
+ @page = page
27
+ end
28
+ end
29
+
19
30
  autoload :Timestamp, "archaeo/timestamp"
20
31
  autoload :ArchiveUrl, "archaeo/archive_url"
21
32
  autoload :Snapshot, "archaeo/snapshot"
@@ -25,6 +36,7 @@ module Archaeo
25
36
  autoload :AvailabilityResult, "archaeo/availability_result"
26
37
  autoload :UrlNormalizer, "archaeo/url_normalizer"
27
38
  autoload :CdxFilter, "archaeo/cdx_filter"
39
+ autoload :CdxTimeline, "archaeo/cdx_timeline"
28
40
  autoload :AssetList, "archaeo/asset_list"
29
41
  autoload :AssetExtractor, "archaeo/asset_extractor"
30
42
  autoload :UrlRewriter, "archaeo/url_rewriter"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-10 00:00:00.000000000 Z
11
+ date: 2026-05-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csv
@@ -79,6 +79,7 @@ files:
79
79
  - lib/archaeo/bulk_downloader.rb
80
80
  - lib/archaeo/cdx_api.rb
81
81
  - lib/archaeo/cdx_filter.rb
82
+ - lib/archaeo/cdx_timeline.rb
82
83
  - lib/archaeo/cli.rb
83
84
  - lib/archaeo/download_state.rb
84
85
  - lib/archaeo/fetcher.rb