RubyGems - archaeo - Versions diffs - 0.2.6 → 0.2.7 - Mend

archaeo 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

checksums.yaml +4 -4
data/README.adoc +95 -1
data/lib/archaeo/asset_extractor.rb +10 -0
data/lib/archaeo/asset_list.rb +23 -0
data/lib/archaeo/availability_api.rb +31 -0
data/lib/archaeo/availability_result.rb +11 -0
data/lib/archaeo/bulk_downloader.rb +56 -33
data/lib/archaeo/cdx_api.rb +18 -0
data/lib/archaeo/cdx_filter.rb +11 -0
data/lib/archaeo/cdx_timeline.rb +66 -0
data/lib/archaeo/cli.rb +181 -4
data/lib/archaeo/download_state.rb +28 -15
data/lib/archaeo/fetcher.rb +13 -0
data/lib/archaeo/http_client.rb +24 -13
data/lib/archaeo/page.rb +56 -0
data/lib/archaeo/save_api.rb +16 -0
data/lib/archaeo/save_result.rb +5 -1
data/lib/archaeo/snapshot.rb +12 -0
data/lib/archaeo/timestamp.rb +46 -0
data/lib/archaeo/url_rewriter.rb +25 -0
data/lib/archaeo/version.rb +1 -1
data/lib/archaeo.rb +12 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dff73d8ab14a3b75bf98281d20b5427b55757b330d57e8899a2ffb04d9046c6d
-  data.tar.gz: d92c2f8e77d6ba5c51283f0581bb51182ebe84aab74ffe4e4971e0d405eab2cc
+  metadata.gz: ecdcd994fa61efa836a5224a5e329b40b72694c27a79cbb6eb4f91bf57c0f2c9
+  data.tar.gz: 03ad557eb55ce9946a2936e3beec8cad13db2ecd4b2fc49b0996131d35e6ddba
 SHA512:
-  metadata.gz: ed1a823e5f358e53ec653e5eee902f95787b9bdedc5670a214f3c1784f4c1829f705d5a53ded5cfb6777f16ed33e32430f2aca13cb9c1fad628885daa55a60a1
-  data.tar.gz: 3b809e1aad60db5e04a356dff5ab450333ff88f56bf481630887f45b3b3f09035ee1ba959807d22129b827b30324b7730e7cf5c248656b62c38a0beab8ad2581
+  metadata.gz: a2859d1738f4f4a9fa0f0ed89d118dacfc24a2f75d3237ad3bdd31cf26c041e8aa5f47c998d8b5e61907c34d491ed48b6df22d2784702d60a15820ba8d8a2a27
+  data.tar.gz: e2df62b1077c90d8b04173f9aa713f590f9692cdfde3d3b656796308810341cd5b052321f97a28513ebe98143cdb8f28fa7e182720555db231983fa3d2a6d4be

data/README.adoc CHANGED Viewed

@@ -57,6 +57,21 @@ end
 # Count snapshots
 cdx.count("example.com")  # => Integer
+# Deduplicated snapshots (collapse by digest)
+cdx.unique_snapshots("example.com").each do |snap|
+  puts snap.timestamp
+end
+# Timeline analysis (time-bucketed frequency)
+timeline = cdx.timeline("example.com",
+                        from: "20220101", to: "20221231",
+                        bucket_size: :month)
+timeline.to_h     # => { "202201" => 5, "202202" => 3, ... }
+timeline.peak     # => ["202201", 5]
+timeline.total    # => 42
+timeline.span     # => ["202201", "202212"]
+timeline.size     # => 12 (number of buckets)
 # Filter by status code, mimetype, or URL pattern
 cdx.snapshots("example.com",
   filters: [Archaeo::CdxFilter.by_status(200)],
@@ -102,6 +117,10 @@ result.to_h         # => Hash representation
 result.as_json      # => JSON-serializable Hash
 api.available?("example.com")  # => true/false
+# Batch availability check
+results = api.batch_available?(%w[example.com other.com])
+# => { "example.com" => AvailabilityResult, ... }
 ----
 === Save a URL (SavePageNow)
@@ -114,8 +133,14 @@ result.url          # => "https://example.com/"
 result.archive_url  # => "https://web.archive.org/web/..."
 result.timestamp    # => Archaeo::Timestamp
 result.cached?      # => true if already archived
+result.success?     # => true if archive_url is present
 result.to_h         # => Hash representation
 result.as_json      # => JSON-serializable Hash
+# Batch save multiple URLs
+results = save.batch_save(%w[https://a.com https://b.com],
+                          delay: 2, stop_on_error: false)
+results.each { |r| puts "#{r.url}: #{r.success?}" }
 ----
 === Fetch Archived Content
@@ -148,6 +173,15 @@ page = fetcher.fetch("https://example.com/",
 page = fetcher.fetch("https://example.com/",
                      timestamp: "20220615000000",
                      snapshot: snap)
+# Raise on error status (raises FetchError with page attached)
+page = fetcher.fetch!("https://example.com/",
+                      timestamp: "20220615000000")
+# FetchError includes: .status_code, .url, .page
+# Page links and meta extraction
+page.links      # => [{ href: "...", text: "...", external: true/false }]
+page.meta_tags  # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
 ----
 === Fetch Page with Assets
@@ -186,6 +220,13 @@ restored = Archaeo::AssetList.from_json(json_string)
 # Safe type access
 bundle.assets.urls_by_type(:image)  # works for any type key
+# Domain analysis
+bundle.assets.domain_counts
+# => { "cdn.example.com" => 3, "fonts.googleapis.com" => 1 }
+# Filter downloadable assets (excludes data: and fragment URLs)
+downloadable = bundle.assets.downloadable
 ----
 === Bulk Download with Resume
@@ -200,6 +241,7 @@ end
 summary.total          # => total snapshots found
 summary.downloaded     # => successfully downloaded
 summary.skipped        # => skipped (already downloaded with resume)
+summary.failed         # => failed downloads
 summary.bytes_written  # => total bytes written
 summary.elapsed        # => seconds elapsed
@@ -237,6 +279,10 @@ entry = state.entry_for("20220615000000")
 # Total bytes downloaded
 state.total_bytes  # => Integer
+# Count and list completed timestamps
+state.size        # => number of completed entries
+state.timestamps  # => ["20220101000000", "20220102000000"]
 # Clear state for a fresh download
 state.clear
 ----
@@ -279,6 +325,14 @@ Archaeo::CdxFilter.by_mimetype_prefix("image")  # => matches image/*
 # Convenience factories
 Archaeo::CdxFilter.only_html            # => text/html only
 Archaeo::CdxFilter.excluding_redirects  # => excludes 3xx statuses
+# Introspection
+filter = Archaeo::CdxFilter.by_status(200)
+filter.field    # => "statuscode"
+filter.pattern  # => "200"
+filter.matches?("200")  # => true
+filter.matches?("404")  # => false
+filter.negated?         # => false
 ----
 === URL Rewriting
@@ -298,6 +352,7 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
 rewriter.rewrite_batch(["url1", "url2"])
 # Rewrite URLs within HTML (src, href, srcset, data-src, poster)
+# Also rewrites inline style url() and <style> element url()
 rewritten_html = rewriter.rewrite_html(html_content)
 ----
@@ -319,6 +374,10 @@ snap.age            # => seconds since capture
 snap.older_than?(3600)  # => true if older than 1 hour
 snap.newer_than?(3600)  # => true if newer than 1 hour
+# Content comparison (by digest)
+snap1.same_content_as?(snap2)  # => true if same digest
+snap1.duplicate_of?(snap2)     # => true if same digest AND different timestamp
 # Identity URL (raw content, no Wayback rewriting)
 snap.identity_url
@@ -370,6 +429,18 @@ ts1 < ts2   # => true/false
 # Immutable -- frozen on creation
 ts.frozen?  # => true
+# Date/time helpers
+ts.quarter         # => 1..4
+ts.wday            # => 0..6 (Sunday = 0)
+ts.human_readable  # => "2022-06-15 00:00:00 UTC"
+ts.to_date         # => Date object
+# Date ranges for coverage analysis
+range = ts.date_range(:month)
+# => Timestamp(Jun 1)..Timestamp(Jun 30 23:59:59)
+ts.date_range(:day)   # => single day range
+ts.date_range(:year)  # => full year range
 ----
 === HTTP Client Observability
@@ -383,6 +454,13 @@ client = Archaeo::HttpClient.new(
   },
 )
+# Intercept requests before they are sent
+client = Archaeo::HttpClient.new(
+  before_request: ->(uri, request) {
+    request["X-Custom-Header"] = "value"
+  },
+)
 # Inspect connection pool state
 client.pool_stats
 # => { active_connections: 2, max_pool_size: 8,
@@ -401,6 +479,7 @@ archaeo --version
 archaeo snapshots example.com
 archaeo snapshots --format json example.com
 archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
+archaeo snapshots --filter-status 200 --filter-type text/html example.com
 # Find closest snapshot
 archaeo near example.com 20220101
@@ -440,6 +519,18 @@ archaeo fetch --identity https://example.com/ 20220615120000
 archaeo fetch-assets https://example.com/ 20220615120000
 archaeo fetch-assets --format json https://example.com/ 20220615120000
+# Rewrite archive URLs to local paths
+archaeo rewrite https://example.com/ 20220615120000
+archaeo rewrite --output page.html --prefix local https://example.com/ 20220615120000
+# Compare assets between two snapshots
+archaeo diff https://example.com/ 20220101 20220615
+archaeo diff --format json https://example.com/ 20220101 20220615
+# Audit assets for an archived page
+archaeo asset-audit https://example.com/ 20220615120000
+archaeo asset-audit --format json https://example.com/ 20220615120000
 # Download all snapshots
 archaeo download example.com --output ./archive
@@ -480,6 +571,9 @@ Archaeo::SaveFailed
 # Content digest mismatch
 Archaeo::IntegrityError
+# HTTP error during fetch (includes .page, .url, .status_code)
+Archaeo::FetchError
 ----
 == Architecture
@@ -491,7 +585,7 @@ Archaeo follows a model-driven, OOP design:
 | Layer | Classes | Purpose
 | *Models*
-| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
+| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
 | Domain value objects with `to_h`, `as_json`, `inspect` support
 | *URL Processing*

data/lib/archaeo/asset_extractor.rb CHANGED Viewed

@@ -120,6 +120,7 @@ module Archaeo
       extract_media_sources(list)
       extract_video_posters(list)
       extract_embeds(list)
+      extract_tracks(list)
     end
     def extract_media_sources(list)
@@ -138,6 +139,15 @@ module Archaeo
       @doc.css("iframe[src], embed[src]").each do |el|
         list.add(resolve(el["src"]), type: :media)
       end
+      @doc.css("object[data]").each do |el|
+        list.add(resolve(el["data"]), type: :media)
+      end
+    end
+    # Adds the resolved src of every <track src=...> element to `list`
+    # as a :media asset.
+    def extract_tracks(list)
+      @doc.css("track[src]").each do |track|
+        source = resolve(track["src"])
+        list.add(source, type: :media)
+      end
     end
     def extract_inline_css(list)

data/lib/archaeo/asset_list.rb CHANGED Viewed

@@ -102,5 +102,28 @@ module Archaeo
       end
       list
     end
+    # Counts asset URLs per host. URLs that URI.parse rejects are counted
+    # under "(invalid)"; URLs that parse without a host are counted under
+    # "(relative)". Returns a Hash whose default value is 0.
+    def domain_counts
+      counts = Hash.new(0)
+      all.each do |asset_url|
+        domain =
+          begin
+            URI.parse(asset_url).host || "(relative)"
+          rescue URI::InvalidURIError
+            "(invalid)"
+          end
+        counts[domain] += 1
+      end
+      counts
+    end
+    # Builds a new list of the same class holding every URL, category by
+    # category, except those starting with "data:" or "#".
+    def downloadable
+      CATEGORIES.each_with_object(self.class.new) do |category, kept|
+        @urls_by_type[category].each do |candidate|
+          next if candidate.start_with?("data:", "#")
+          kept.add(candidate, type: category)
+        end
+      end
+    end
   end
 end

data/lib/archaeo/availability_api.rb CHANGED Viewed

@@ -38,6 +38,16 @@ module Archaeo
       near(url).available?
     end
+    # Looks up every URL with `near` and returns a Hash of url => result.
+    # With concurrency above 1 the lookups are delegated to
+    # batch_concurrent; otherwise they run one after another.
+    def batch_available?(urls, concurrency: 1)
+      return batch_concurrent(urls, concurrency) unless concurrency <= 1
+      urls.to_h { |candidate| [candidate, near(candidate)] }
+    end
     private
     def parse_response(response, url)
@@ -78,5 +88,26 @@ module Archaeo
         archived_status: archived_status,
       )
     end
+    # Checks `urls` using worker threads that share one work queue.
+    #
+    # Returns a Hash of url => result of `near`, keyed in the order the
+    # URLs were given so the shape matches the sequential path of
+    # batch_available? (workers finish in arbitrary order).
+    def batch_concurrent(urls, concurrency)
+      results = {}
+      mutex = Mutex.new
+      queue = urls.dup
+      # Never start more workers than there are URLs to check.
+      worker_count = [concurrency, queue.size].min
+      threads = Array.new(worker_count) do
+        Thread.new { drain_queue(queue, results, mutex) }
+      end
+      threads.each(&:join)
+      # Rebuild in input order; URLs no worker processed stay absent.
+      urls.each_with_object({}) do |url, ordered|
+        ordered[url] = results[url] if results.key?(url)
+      end
+    end
+    # Worker body: takes URLs off `queue` under `mutex` until none is
+    # left, storing each `near` result in `results` under the same mutex.
+    # The lookup itself runs outside the lock.
+    def drain_queue(queue, results, mutex)
+      while (pending = mutex.synchronize { queue.shift })
+        outcome = near(pending)
+        mutex.synchronize { results[pending] = outcome }
+      end
+    end
   end
 end

data/lib/archaeo/availability_result.rb CHANGED Viewed

@@ -56,5 +56,16 @@ module Archaeo
     def inspect
       "#<#{self.class.name} #{@url} available=#{@available}>"
     end
+    # Converts an available result into a Snapshot; returns nil when the
+    # URL is not available. The urlkey is the downcased normalized URL and
+    # the status code falls back to 200 when archived_status is missing.
+    def to_snapshot
+      return unless available?
+      key = UrlNormalizer.normalize(url).downcase
+      code = archived_status || 200
+      Snapshot.new(urlkey: key, timestamp: timestamp,
+                   original_url: url, status_code: code)
+    end
   end
 end

data/lib/archaeo/bulk_downloader.rb CHANGED Viewed

@@ -15,11 +15,12 @@ module Archaeo
   # for interrupted download recovery.
   class BulkDownloader
     def initialize(client: HttpClient.new, output_dir: "archive",
-                   cdx_api: nil, concurrency: 1)
+                   cdx_api: nil, concurrency: 1, on_error: nil)
       @client = client
       @output_dir = output_dir
       @cdx_api = cdx_api
       @concurrency = [1, concurrency.to_i].max
+      @on_error = on_error
     end
     def download(url, from: nil, to: nil, resume: false,
@@ -29,10 +30,11 @@ module Archaeo
       FileUtils.mkdir_p(@output_dir) unless dry_run
       snapshots = fetch_snapshots(url, from: from, to: to)
-      downloaded, skipped, bytes =
+      downloaded, skipped, bytes, failed =
         run_download(snapshots, resume, dry_run, block)
-      build_summary(start_time, snapshots.size, downloaded, skipped, bytes)
+      build_summary(start_time, snapshots.size, downloaded,
+                    skipped, bytes, failed: failed)
     end
     private
@@ -59,24 +61,26 @@ module Archaeo
       end
     end
-    def build_summary(start_time, total, downloaded, skipped, bytes)
+    def build_summary(start_time, total, downloaded, skipped,
+                      bytes, failed: 0)
       elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
       DownloadSummary.new(
         total: total, downloaded: downloaded, skipped: skipped,
-        failed: 0, bytes_written: bytes, elapsed: elapsed
+        failed: failed, bytes_written: bytes, elapsed: elapsed
       )
     end
     def download_sequential(snapshots, total, state, resume,
                             dry_run, progress)
-      counters = { downloaded: 0, skipped: 0, bytes: 0 }
+      counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
       snapshots.each_with_index do |snap, index|
         process_sequential(snap, state, resume, dry_run, counters)
         progress&.call(index + 1, total, snap)
       end
-      [counters[:downloaded], counters[:skipped], counters[:bytes]]
+      [counters[:downloaded], counters[:skipped],
+       counters[:bytes], counters[:failed]]
     end
     def process_sequential(snap, state, resume, dry_run, counters)
@@ -87,6 +91,9 @@ module Archaeo
       counters[:bytes] += download_snapshot(snap, state) unless dry_run
       counters[:downloaded] += 1
+    rescue StandardError => e
+      counters[:failed] += 1
+      @on_error&.call(snap, e)
     end
     def download_snapshot(snap, state)
@@ -100,7 +107,7 @@ module Archaeo
                             dry_run, progress)
       queue = snapshots.each_with_index.to_a
       shared = { mutex: Mutex.new, errors: [],
-                 downloaded: 0, skipped: 0, bytes: 0 }
+                 downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
       threads = Array.new(@concurrency) do
         Thread.new do
@@ -109,17 +116,9 @@ module Archaeo
         end
       end
       threads.each(&:join)
-      raise_on_errors(shared[:errors])
-      [shared[:downloaded], shared[:skipped], shared[:bytes]]
-    end
-    def raise_on_errors(errors)
-      return unless errors.any?
-      raise Error,
-            "#{errors.size} download(s) failed: " \
-            "#{errors.map { |s, _| s.timestamp }.join(', ')}"
+      [shared[:downloaded], shared[:skipped],
+       shared[:bytes], shared[:failed]]
     end
     def process_queue(queue, total, state, resume, dry_run,
@@ -133,7 +132,7 @@ module Archaeo
           next
         end
-        concurrent_fetch(snap, state, dry_run, shared)
+        concurrent_fetch(snap, dry_run, shared)
         progress&.call(index + 1, total, snap)
       end
     end
@@ -145,35 +144,59 @@ module Archaeo
       true
     end
-    def concurrent_fetch(snap, state, dry_run, shared)
+    def concurrent_fetch(snap, dry_run, shared)
       unless dry_run
         content = fetch_and_save(snap)
-        shared[:mutex].synchronize do
-          state.mark_completed(snap.timestamp,
-                               url: snap.original_url,
-                               bytes: content.bytesize)
-          shared[:bytes] += content.bytesize
-        end
+        record_completed(snap, content, shared)
       end
       shared[:mutex].synchronize { shared[:downloaded] += 1 }
     rescue StandardError => e
-      shared[:mutex].synchronize { shared[:errors] << [snap, e] }
+      shared[:mutex].synchronize do
+        shared[:failed] += 1
+        shared[:errors] << [snap, e]
+      end
+      @on_error&.call(snap, e)
+    end
+    # Records a finished download while holding shared[:mutex]: marks the
+    # snapshot completed (with its URL and byte size) and adds the content
+    # size to shared[:bytes].
+    #
+    # NOTE(review): `state` is neither a parameter nor a local of this
+    # method, and concurrent_fetch no longer receives it either. Unless the
+    # class defines a `state` method outside this diff, this raises
+    # NameError, which concurrent_fetch's `rescue StandardError` counts as
+    # a failed download -- confirm, and pass the DownloadState through.
+    def record_completed(snap, content, shared)
+      shared[:mutex].synchronize do
+        state.mark_completed(snap.timestamp,
+                             url: snap.original_url,
+                             bytes: content.bytesize)
+        shared[:bytes] += content.bytesize
+      end
     end
     def fetch_and_save(snapshot)
-      fetcher = Fetcher.new(client: @client)
-      page = fetcher.fetch(snapshot.original_url,
-                           timestamp: snapshot.timestamp)
+      page = fetch_page(snapshot)
+      validate_page_status(page, snapshot)
+      write_page_file(page, snapshot)
+    rescue StandardError
+      FileUtils.rm_f(tmp_path) if defined?(tmp_path)
+      raise
+    end
+    # Fetches the snapshot's original URL at the snapshot's timestamp,
+    # using a Fetcher built on this downloader's HTTP client.
+    def fetch_page(snapshot)
+      fetcher = Fetcher.new(client: @client)
+      fetcher.fetch(snapshot.original_url,
+                    timestamp: snapshot.timestamp)
+    end
+    # Raises Error unless the page's status code is in 200..299.
+    def validate_page_status(page, snapshot)
+      code = page.status_code
+      return if code.between?(200, 299)
+      target = "#{snapshot.original_url} at #{snapshot.timestamp}"
+      raise Error, "HTTP #{code} for #{target}"
+    end
+    # Writes the page content to the snapshot's file via a "<name>.tmp"
+    # file that is renamed into place, and returns the content.
+    #
+    # The temp-file cleanup stays in this method because `tmp_path` is a
+    # local here; in fetch_and_save it is never assigned, so the
+    # `defined?(tmp_path)` guard there never fires and a failed write or
+    # rename would leave the .tmp file behind.
+    def write_page_file(page, snapshot)
       filename = build_filename(snapshot)
       FileUtils.mkdir_p(File.dirname(filename))
       tmp_path = "#{filename}.tmp"
       File.binwrite(tmp_path, page.content)
       File.rename(tmp_path, filename)
       page.content
     rescue StandardError
-      FileUtils.rm_f(tmp_path) if defined?(tmp_path)
+      # tmp_path is nil when the failure happened before it was assigned.
+      FileUtils.rm_f(tmp_path) if tmp_path
       raise
     end
     EXTENSION_MAP = {

data/lib/archaeo/cdx_api.rb CHANGED Viewed

@@ -110,6 +110,24 @@ module Archaeo
       snapshots(url, **options).count
     end
+    # Queries snapshots collapsed on the "digest" field. Extra options are
+    # forwarded to `snapshots`; a caller-supplied :collapse replaces the
+    # digest collapse.
+    def unique_snapshots(url, resolve_revisits: true, **options)
+      query = { collapse: ["digest"],
+                resolve_revisits: resolve_revisits }.merge(options)
+      snapshots(url, **query)
+    end
+    # Builds a CdxTimeline from the snapshots of `url`. `from`/`to` are
+    # coerced through Timestamp, and `status` (default 200) becomes a
+    # status-code filter; any of the three is omitted when falsy.
+    def timeline(url, from: nil, to: nil,
+                 bucket_size: :month, status: 200)
+      query = {
+        from: from && Timestamp.coerce(from).to_s,
+        to: to && Timestamp.coerce(to).to_s,
+        filters: status && [CdxFilter.by_status(status)],
+      }.select { |_, value| value }
+      CdxTimeline.new(snapshots(url, **query).to_a,
+                      bucket_size: bucket_size)
+    end
     # Returns the number of pages for a paginated query.
     def num_pages(url, **options)
       url = UrlNormalizer.normalize(url)

data/lib/archaeo/cdx_filter.rb CHANGED Viewed

@@ -31,6 +31,17 @@ module Archaeo
       stripped.split(":", 2).first.to_s
     end
+    # Returns the pattern part of the expression: everything after the
+    # first ":" once a leading "!" is removed. Returns "" when the
+    # expression has no ":" separator, rather than echoing the field name
+    # back as if it were the pattern.
+    def pattern
+      @expression.delete_prefix("!").partition(":").last
+    end
+    # Tests `value` (converted with to_s) against the pattern compiled as
+    # a Regexp; the result is inverted for negated filters. The match is
+    # unanchored, so the pattern may match anywhere in the value.
+    def matches?(value)
+      hit = Regexp.new(pattern).match?(value.to_s)
+      return !hit if negated?
+      hit
+    end
     def self.by_status(code)
       new("statuscode:#{code}")
     end

data/lib/archaeo/cdx_timeline.rb ADDED Viewed

@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+module Archaeo
+  # Time-bucketed snapshot frequency analysis.
+  #
+  # Groups snapshots by configurable time buckets (day, week, month, year)
+  # for frequency analysis and coverage reporting. Each snapshot must
+  # respond to `timestamp`, and that value must respond to `to_time`.
+  class CdxTimeline
+    # strftime pattern for each bucket size. Weeks use the ISO week-based
+    # year (%G) because %V is the ISO week number: pairing %V with the
+    # calendar year (%Y) mislabels days around New Year (2021-01-01 falls
+    # in ISO week 53 of 2020, not in a "week 53" of 2021).
+    BUCKET_FORMATS = {
+      day: "%Y%m%d",
+      week: "%GW%V",
+      month: "%Y%m",
+      year: "%Y",
+    }.freeze
+    # snapshots   - enumerable of snapshot-like objects.
+    # bucket_size - one of the BUCKET_FORMATS keys; unknown values fall
+    #               back to :month.
+    def initialize(snapshots, bucket_size: :month)
+      @bucket_size = bucket_size
+      @buckets = build_buckets(snapshots)
+    end
+    # Returns [bucket_key, count] pairs sorted by bucket key.
+    def to_a
+      @buckets.sort_by(&:first)
+    end
+    # Returns a copy of the bucket_key => count Hash.
+    def to_h
+      @buckets.dup
+    end
+    # Returns the [bucket_key, count] pair with the highest count, or nil
+    # when there are no buckets.
+    def peak
+      @buckets.max_by(&:last)
+    end
+    # Returns the total number of snapshots across all buckets.
+    def total
+      @buckets.values.sum
+    end
+    # Returns [earliest_key, latest_key], or nil when there are no buckets.
+    # Uses min/max of the keys rather than insertion order so the result
+    # does not depend on the order the snapshots were supplied in.
+    def span
+      return nil if @buckets.empty?
+      @buckets.keys.minmax
+    end
+    def empty?
+      @buckets.empty?
+    end
+    # Returns the number of buckets.
+    def size
+      @buckets.size
+    end
+    def inspect
+      "#<#{self.class.name} #{total} snapshots in #{@buckets.size} buckets>"
+    end
+    private
+    # Counts snapshots per formatted bucket key.
+    def build_buckets(snapshots)
+      fmt = BUCKET_FORMATS[@bucket_size] || BUCKET_FORMATS[:month]
+      snapshots.each_with_object(Hash.new(0)) do |snap, counts|
+        key = snap.timestamp.to_time.strftime(fmt)
+        counts[key] += 1
+      end
+    end
+  end
+end

data/lib/archaeo/cli.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require "csv"
 require "json"
+require "set"
 require "thor"
 module Archaeo
@@ -27,6 +28,10 @@ module Archaeo
     option :match_type,
            desc: "Match type (exact, prefix, host, domain)"
     option :filter, type: :array, desc: "CDX filter expressions"
+    option :filter_status, type: :array,
+                           desc: "Only include these status codes"
+    option :filter_type, type: :array,
+                         desc: "MIME type prefixes (e.g. image, text/html)"
     option :collapse, type: :array, desc: "CDX collapse fields"
     option :sort, desc: "Sort order (default, closest, reverse)"
     option :limit, type: :numeric, desc: "Max snapshots to return"
@@ -153,6 +158,53 @@ module Archaeo
       end
     end
+    desc "rewrite URL TIMESTAMP",
+         "Fetch a page and rewrite archive URLs to local paths"
+    option :prefix, desc: "Local path prefix", default: "local"
+    option :output, desc: "Write rewritten HTML to file"
+    # Fetches the page at the coerced timestamp, rewrites its HTML with a
+    # rewriter built for that URL and timestamp, and emits the result.
+    def rewrite(url, timestamp)
+      handle_errors do
+        ts = Timestamp.coerce(timestamp)
+        html = Fetcher.new.fetch(url, timestamp: ts).content
+        rewriter = build_rewriter(url, ts)
+        output_rewritten(rewriter.rewrite_html(html))
+      end
+    end
+    desc "diff URL TIMESTAMP_A TIMESTAMP_B",
+         "Compare assets of two archived snapshots"
+    option :format, desc: "Output format (table, json)", default: "table"
+    # Fetches the page with its assets at both timestamps (A first, then
+    # B, each with a fresh Fetcher) and reports the asset differences.
+    def diff(url, timestamp_a, timestamp_b)
+      handle_errors do
+        assets_a, assets_b = [timestamp_a, timestamp_b].map do |ts|
+          Fetcher.new.fetch_page_with_assets(url, timestamp: ts).assets
+        end
+        output_diff(assets_a, assets_b, timestamp_a, timestamp_b)
+      end
+    end
+    desc "asset-audit URL TIMESTAMP",
+         "Audit assets for an archived page"
+    option :format, desc: "Output format (table, json)", default: "table"
+    # Fetches the page with its assets, builds the audit report and prints
+    # it as JSON when --format is "json", otherwise as a text report.
+    def asset_audit(url, timestamp)
+      handle_errors do
+        fetcher = Fetcher.new
+        bundle = fetcher.fetch_page_with_assets(url, timestamp: timestamp)
+        report = build_audit_report(bundle)
+        if options[:format] == "json"
+          puts JSON.generate(report)
+        else
+          print_audit_report(report)
+        end
+      end
+    end
     desc "download URL", "Download all archived snapshots of a URL"
     option :output, desc: "Output directory", default: "archive"
     option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
@@ -276,6 +328,30 @@ module Archaeo
       end
     end
+    # Builds a UrlRewriter from the archive URL of the normalized `url` at
+    # `timestamp` and the --prefix option.
+    def build_rewriter(url, timestamp)
+      archive_url = ArchiveUrl.new(UrlNormalizer.normalize(url),
+                                   timestamp: timestamp)
+      UrlRewriter.new(archive_url.to_s, options[:prefix])
+    end
+    # Writes `content` to the --output file when given, else to stdout.
+    def output_rewritten(content)
+      destination = options[:output]
+      return write_output(destination, content) if destination
+      $stdout.write(content)
+    end
+    # Compares the two asset lists and prints the comparison as JSON when
+    # --format is "json", otherwise as a text report.
+    def output_diff(assets_a, assets_b, ts_a, ts_b)
+      comparison = compare_asset_lists(assets_a, assets_b)
+      if options[:format] == "json"
+        puts JSON.generate(comparison)
+      else
+        print_diff_report(comparison, ts_a, ts_b)
+      end
+    end
     def output_assets(bundle)
       case options[:format]
       when "json"
@@ -307,16 +383,36 @@ module Archaeo
     def print_summary(summary)
       return if quiet?
-      warn "Downloaded #{summary.downloaded}/#{summary.total} " \
-           "(#{summary.bytes_written} bytes) in " \
-           "#{summary.elapsed.round(1)}s"
+      parts = ["Downloaded #{summary.downloaded}/#{summary.total}"]
+      parts << "#{summary.failed} failed" if summary.failed.positive?
+      parts << "(#{summary.bytes_written} bytes)"
+      parts << "in #{summary.elapsed.round(1)}s"
+      warn parts.join(" ")
     end
     def build_cdx_options(opts)
-      CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
+      result = {}
+      CDX_OPTION_MAP.each do |cli_key, api_key|
         value = opts[cli_key]
         result[api_key] = value if value
       end
+      append_convenience_filters!(result, opts)
+      result
+    end
+    # Appends filters derived from :filter_status and :filter_type to any
+    # filters already in `result`; leaves :filters untouched when the
+    # combined list is empty.
+    def append_convenience_filters!(result, opts)
+      combined = Array(result[:filters]) +
+                 status_filters(opts[:filter_status]) +
+                 type_filters(opts[:filter_type])
+      result[:filters] = combined unless combined.empty?
+    end
+    # Turns status codes (nil, one, or many) into filter strings.
+    def status_filters(codes)
+      Array(codes).map do |status|
+        CdxFilter.by_status(status).to_s
+      end
+    end
+    # Turns MIME type prefixes (nil, one, or many) into filter strings.
+    def type_filters(prefixes)
+      Array(prefixes).map do |mime_prefix|
+        CdxFilter.by_mimetype_prefix(mime_prefix).to_s
+      end
     end
     def output_table(snaps)
@@ -347,5 +443,86 @@ module Archaeo
       File.binwrite(path, content)
       warn "Written to #{path}" unless quiet?
     end
+    # Compares two asset lists by the sets of all their URLs and returns
+    # the Hash produced by build_diff.
+    def compare_asset_lists(assets_a, assets_b)
+      urls_a, urls_b = [assets_a, assets_b].map { |list| list.all.to_set }
+      build_diff(urls_a, urls_b, assets_a.counts, assets_b.counts)
+    end
+    # Returns the sorted URLs only in A, only in B and in both, together
+    # with the per-type counts of each side.
+    def build_diff(set_a, set_b, counts_a, counts_b)
+      removed = (set_a - set_b).to_a.sort
+      added = (set_b - set_a).to_a.sort
+      shared = (set_a & set_b).to_a.sort
+      { only_in_a: removed, only_in_b: added, unchanged: shared,
+        counts_a: counts_a, counts_b: counts_b }
+    end
+    # Prints the text diff report: a heading, the removed and added URL
+    # lists, then the number of unchanged URLs.
+    def print_diff_report(comparison, ts_a, ts_b)
+      removed, added, kept =
+        comparison.values_at(:only_in_a, :only_in_b, :unchanged)
+      puts "Comparing #{ts_a} vs #{ts_b}"
+      puts
+      print_url_list("Removed:", removed, "  - ")
+      print_url_list("Added:", added, "  + ")
+      puts "Unchanged: #{kept.size}"
+    end
+    # Prints `header`, each URL prefixed with `prefix`, then a blank line.
+    # Prints nothing when `urls` has no entries.
+    def print_url_list(header, urls, prefix)
+      return if urls.none?
+      puts header
+      urls.each do |entry|
+        puts "#{prefix}#{entry}"
+      end
+      puts
+    end
+    # Builds the audit report Hash for a page bundle: page URL, asset
+    # totals, per-type and per-domain counts, and duplicated URLs.
+    def build_audit_report(bundle)
+      assets = bundle.assets
+      fetchable = assets.downloadable
+      report = { page_url: bundle.page.archive_url }
+      report[:total_assets] = assets.size
+      report[:downloadable] = fetchable.size
+      report[:counts] = assets.counts
+      report[:domains] = assets.domain_counts
+      report[:duplicates] = find_duplicate_urls(assets)
+      report
+    end
+    # Prints the audit report as text: summary lines, a blank line, the
+    # per-type and per-domain counts, then any duplicated URLs.
+    def print_audit_report(report)
+      page_url, total, fetchable =
+        report.values_at(:page_url, :total_assets, :downloadable)
+      puts "Page: #{page_url}"
+      puts "Total assets: #{total}"
+      puts "Downloadable: #{fetchable}"
+      puts
+      print_type_counts(report[:counts])
+      print_domain_counts(report[:domains])
+      print_url_list("Duplicates:", report[:duplicates], "  ")
+    end
+    # Prints one "  type: count" line per entry, then a blank line.
+    def print_type_counts(counts)
+      puts "By type:"
+      counts.each do |kind, total|
+        puts "  #{kind}: #{total}"
+      end
+      puts
+    end
+    # Prints one "  domain: count" line per domain, highest count first.
+    def print_domain_counts(domains)
+      puts "By domain:"
+      ranked = domains.sort_by { |_, hits| -hits }
+      ranked.each { |host, hits| puts "  #{host}: #{hits}" }
+    end
+    # Returns the URLs that occur more than once in assets.all, each
+    # listed once, in the order their first repeat is encountered.
+    #
+    # Both lookups use a Set so the scan stays linear; the previous
+    # Array#include? check made it quadratic in the number of duplicates.
+    def find_duplicate_urls(assets)
+      seen = Set.new
+      dupes = Set.new
+      assets.all.each do |url|
+        # Set#add? returns nil when the URL was already seen.
+        dupes << url unless seen.add?(url)
+      end
+      dupes.to_a
+    end
   end
 end

data/lib/archaeo/download_state.rb CHANGED Viewed

@@ -17,36 +17,49 @@ module Archaeo
     def initialize(output_dir)
       @output_dir = output_dir
       @path = File.join(output_dir, STATE_FILE)
+      @mutex = Mutex.new
     end
     def completed?(timestamp)
-      entries_key.include?(timestamp.to_s)
+      @mutex.synchronize { entries_key.include?(timestamp.to_s) }
     end
     def mark_completed(timestamp, url: nil, bytes: nil)
-      ts = timestamp.to_s
-      return if entries_key.include?(ts)
-      entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
-      entry["url"] = url if url
-      entry["bytes"] = bytes if bytes
-      entries << entry
-      @entries_key = nil
-      save
+      @mutex.synchronize do
+        ts = timestamp.to_s
+        return if entries_key.include?(ts)
+        entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
+        entry["url"] = url if url
+        entry["bytes"] = bytes if bytes
+        entries << entry
+        @entries_key = nil
+        save
+      end
     end
     def entry_for(timestamp)
-      entries.find { |e| e["ts"] == timestamp.to_s }
+      @mutex.synchronize { entries.find { |e| e["ts"] == timestamp.to_s } }
     end
     def total_bytes
-      entries.sum { |e| e["bytes"].to_i }
+      @mutex.synchronize { entries.sum { |e| e["bytes"].to_i } }
+    end
+    # Number of recorded entries, read under the state mutex.
+    def size
+      @mutex.synchronize do
+        entries.size
+      end
+    end
+    # The "ts" value of every recorded entry, read under the state mutex.
+    def timestamps
+      @mutex.synchronize do
+        entries.map { |entry| entry["ts"] }
+      end
     end
     def clear
-      @entries = []
-      @entries_key = nil
-      FileUtils.rm_f(@path)
+      @mutex.synchronize do
+        @entries = []
+        @entries_key = nil
+        FileUtils.rm_f(@path)
+      end
     end
     private

data/lib/archaeo/fetcher.rb CHANGED Viewed

@@ -25,6 +25,19 @@ module Archaeo
       build_page(response, archive_url.to_s, url, ts)
     end
+    # Like `fetch`, but raises FetchError (carrying the status code, the
+    # original URL and the page) unless the status code is in 200..299.
+    def fetch!(url, timestamp:, identity: false, snapshot: nil)
+      page = fetch(url, timestamp: timestamp, identity: identity,
+                        snapshot: snapshot)
+      code = page.status_code
+      unless code.between?(200, 299)
+        raise FetchError.new(
+          "HTTP #{code} for #{page.original_url}",
+          status_code: code,
+          url: page.original_url,
+          page: page,
+        )
+      end
+      page
+    end
     def fetch_page_with_assets(url, timestamp:)
       page = fetch(url, timestamp: timestamp)
       assets = AssetExtractor.new(page.content,

data/lib/archaeo/http_client.rb CHANGED Viewed

@@ -63,12 +63,14 @@ module Archaeo
                    max_retries: DEFAULT_MAX_RETRIES,
                    retry_delay: DEFAULT_RETRY_DELAY,
                    user_agent: nil,
-                   on_request: nil)
+                   on_request: nil,
+                   before_request: nil)
       @timeout = timeout
       @max_retries = max_retries
       @retry_delay = retry_delay
       @user_agent = user_agent
       @on_request = on_request
+      @before_request = before_request
       @connections = {}
       @last_used = {}
       @mutex = Mutex.new
@@ -203,7 +205,7 @@ module Archaeo
     def attempt_with_retries(uri, headers, request_class)
       retries = 0
       begin
-        execute_and_check(uri, headers, request_class)
+        execute_and_check(uri, headers, request_class, retries)
       rescue RetriableStatusError => e
         retry_status(e, retries += 1) && retry
       rescue *TRANSIENT_ERRORS => e
@@ -223,8 +225,9 @@ module Archaeo
       sleep(@retry_delay * retries)
     end
-    def execute_and_check(uri, headers, request_class)
-      response = execute_with_connection(uri, headers, request_class)
+    def execute_and_check(uri, headers, request_class, retry_count)
+      response = execute_with_connection(uri, headers, request_class,
+                                         retry_count)
       if RETRIABLE_STATUSES.include?(response.status)
         raise RetriableStatusError, response
       end
@@ -255,9 +258,9 @@ module Archaeo
             "Failed after #{retries} retries: #{error.message}"
     end
-    def execute_with_connection(uri, headers, request_class)
+    def execute_with_connection(uri, headers, request_class, retry_count)
       request = build_request(uri, headers, request_class)
-      execute_tracked_request(uri, request)
+      execute_tracked_request(uri, request, retry_count)
     rescue *TRANSIENT_ERRORS
       raise
     rescue StandardError
@@ -268,16 +271,17 @@ module Archaeo
     def build_request(uri, headers, request_class)
       request = request_class.new(uri)
       headers.each { |k, v| request[k] = v }
+      @before_request&.call(uri, request)
       request
     end
-    def execute_tracked_request(uri, request)
+    def execute_tracked_request(uri, request, retry_count)
       http = connection_for(uri)
       start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       raw = http.request(request)
       elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
       response = build_response(raw)
-      @on_request&.call(uri, elapsed, response.status, 0)
+      @on_request&.call(uri, elapsed, response.status, retry_count)
       response
     end
@@ -286,7 +290,7 @@ module Archaeo
         "User-Agent" => select_user_agent,
         "Accept" => "text/html,application/xhtml+xml," \
                     "application/xml;q=0.9,*/*;q=0.8",
-        "Accept-Encoding" => "gzip",
+        "Accept-Encoding" => "gzip, deflate",
         "Accept-Language" => "en-US,en;q=0.9",
         "Connection" => "keep-alive",
       }
@@ -303,10 +307,17 @@ module Archaeo
     def decompress_body(raw)
       body = raw.body.to_s
-      return body unless raw["content-encoding"] == "gzip" && !body.empty?
-      Zlib::GzipReader.new(StringIO.new(body)).read
-    rescue Zlib::GzipFile::Error
+      return body if body.empty?
+      case raw["content-encoding"]
+      when "gzip"
+        Zlib::GzipReader.new(StringIO.new(body)).read
+      when "deflate"
+        Zlib::Inflate.inflate(body)
+      else
+        body
+      end
+    rescue Zlib::GzipFile::Error, Zlib::DataError
       body
     end
   end

data/lib/archaeo/page.rb CHANGED Viewed

@@ -67,6 +67,32 @@ module Archaeo
       end
     end
+    def links
+      return [] unless html?
+      @links ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        base = @archive_url || @original_url
+        doc.css("a[href]").map do |anchor|
+          href = resolve_page_url(anchor["href"], base)
+          { href: href, text: anchor.text.strip,
+            external: href && !href.include?(original_domain) }
+        end
+      end
+    end
+    def meta_tags
+      return {} unless html?
+      @meta_tags ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        result = extract_meta_entries(doc)
+        canonical = doc.at_css('link[rel="canonical"]')
+        result["canonical"] = canonical["href"].to_s if canonical
+        result
+      end
+    end
     def to_h
       {
         content_type: @content_type,
@@ -146,5 +172,35 @@ module Archaeo
                 invalid: :replace, undef: :replace,
                 replace: "?")
     end
+    def original_domain
+      @original_domain ||= begin
+        URI.parse(@original_url).host
+      rescue URI::InvalidURIError
+        nil
+      end
+    end
+    def extract_meta_entries(doc)
+      result = {}
+      doc.css("meta[name], meta[property], meta[http-equiv]").each do |meta|
+        key = meta["name"] || meta["property"] || meta["http-equiv"]
+        next unless key
+        result[key.downcase] = meta["content"].to_s
+      end
+      result
+    end
+    def resolve_page_url(href, base)
+      return href unless href
+      return href if href.start_with?("http", "//", "data:", "#",
+                                      "javascript:")
+      return nil unless base
+      URI.join(base, href).to_s
+    rescue URI::InvalidURIError
+      nil
+    end
   end
 end

data/lib/archaeo/save_api.rb CHANGED Viewed

@@ -23,6 +23,22 @@ module Archaeo
       attempt_save(save_url, start_time, url)
     end
+    def batch_save(urls, delay: 2, stop_on_error: false)
+      results = []
+      urls.each_with_index do |url, i|
+        sleep(delay) if i.positive?
+        result = save(url)
+        results << result
+      rescue RateLimitError, SaveFailed => e
+        raise e if stop_on_error
+        results << SaveResult.new(
+          url: url, archive_url: nil, timestamp: nil, cached: false,
+        )
+      end
+      results
+    end
     private
     def attempt_save(save_url, start_time, url)

data/lib/archaeo/save_result.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Archaeo
     def initialize(url:, archive_url:, timestamp:, cached:)
       @url = url
       @archive_url = archive_url
-      @timestamp = Timestamp.coerce(timestamp)
+      @timestamp = timestamp ? Timestamp.coerce(timestamp) : nil
       @cached = cached
     end
@@ -19,6 +19,10 @@ module Archaeo
       @cached
     end
+    def success?
+      !@archive_url.nil?
+    end
     def to_h
       { url: @url, archive_url: @archive_url,
         timestamp: @timestamp, cached: @cached }

data/lib/archaeo/snapshot.rb CHANGED Viewed

@@ -70,6 +70,18 @@ module Archaeo
       age <= seconds
     end
+    def same_content_as?(other)
+      return false unless other.is_a?(self.class)
+      return false if digest.nil? || digest.empty?
+      return false if other.digest.nil? || other.digest.empty?
+      digest == other.digest
+    end
+    def duplicate_of?(other)
+      same_content_as?(other) && timestamp != other.timestamp
+    end
     def fetch(client: HttpClient.new, identity: false)
       Fetcher.new(client: client).fetch(
         original_url, timestamp: @timestamp, identity: identity

data/lib/archaeo/timestamp.rb CHANGED Viewed

@@ -140,8 +140,54 @@ module Archaeo
       [year, month, day, hour, minute, second]
     end
+    def quarter
+      ((month - 1) / 3) + 1
+    end
+    def wday
+      @to_time.wday
+    end
+    def human_readable
+      @to_time.strftime("%Y-%m-%d %H:%M:%S UTC")
+    end
+    def date_range(granularity = :day)
+      start_ts = range_start(granularity)
+      end_ts = range_end(start_ts, granularity)
+      start_ts..end_ts
+    end
     def inspect
       "#<#{self.class.name} #{self}>"
     end
+    private
+    def range_start(granularity)
+      case granularity
+      when :month then self.class.new(year: year, month: month)
+      when :year then self.class.new(year: year)
+      else self.class.new(year: year, month: month, day: day)
+      end
+    end
+    def range_end(start_ts, granularity)
+      case granularity
+      when :month then next_month_start - 1
+      when :year
+        self.class.new(year: year, month: 12, day: 31,
+                       hour: 23, minute: 59, second: 59)
+      else start_ts + 86_399
+      end
+    end
+    def next_month_start
+      if month == 12
+        self.class.new(year: year + 1, month: 1)
+      else
+        self.class.new(year: year, month: month + 1)
+      end
+    end
   end
 end

data/lib/archaeo/url_rewriter.rb CHANGED Viewed

@@ -10,6 +10,7 @@ module Archaeo
   # rooted at a configurable local directory.
   class UrlRewriter
     URL_ATTRS = %w[src href data-src poster].freeze
+    CSS_URL_RE = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
     def initialize(archive_prefix, local_prefix)
       @archive_prefix = archive_prefix.to_s
@@ -31,6 +32,8 @@ module Archaeo
       doc = Nokogiri::HTML(html_content)
       rewrite_url_attrs(doc)
       rewrite_srcset_attrs(doc)
+      rewrite_inline_style_attrs(doc)
+      rewrite_style_elements(doc)
       doc.to_html
     end
@@ -53,6 +56,28 @@ module Archaeo
     private
+    def rewrite_inline_style_attrs(doc)
+      doc.css("[style]").each do |el|
+        next unless el["style"]
+        el["style"] = rewrite_css_urls(el["style"])
+      end
+    end
+    def rewrite_style_elements(doc)
+      doc.css("style").each do |el|
+        el.content = rewrite_css_urls(el.text)
+      end
+    end
+    def rewrite_css_urls(css_text)
+      css_text.gsub(CSS_URL_RE) do
+        url = Regexp.last_match[1]
+        rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
+        "url('#{rewritten}')"
+      end
+    end
     def rewrite_srcset(srcset)
       return srcset unless srcset

data/lib/archaeo/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Archaeo
-  VERSION = "0.2.6"
+  VERSION = "0.2.7"
 end

data/lib/archaeo.rb CHANGED Viewed

@@ -16,6 +16,17 @@ module Archaeo
   class SaveFailed < Error; end
   class IntegrityError < Error; end
+  class FetchError < Error
+    attr_reader :status_code, :url, :page
+    def initialize(message, status_code:, url:, page:)
+      super(message)
+      @status_code = status_code
+      @url = url
+      @page = page
+    end
+  end
   autoload :Timestamp, "archaeo/timestamp"
   autoload :ArchiveUrl, "archaeo/archive_url"
   autoload :Snapshot, "archaeo/snapshot"
@@ -25,6 +36,7 @@ module Archaeo
   autoload :AvailabilityResult, "archaeo/availability_result"
   autoload :UrlNormalizer, "archaeo/url_normalizer"
   autoload :CdxFilter, "archaeo/cdx_filter"
+  autoload :CdxTimeline, "archaeo/cdx_timeline"
   autoload :AssetList, "archaeo/asset_list"
   autoload :AssetExtractor, "archaeo/asset_extractor"
   autoload :UrlRewriter, "archaeo/url_rewriter"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: archaeo
 version: !ruby/object:Gem::Version
-  version: 0.2.6
+  version: 0.2.7
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-05-10 00:00:00.000000000 Z
+date: 2026-05-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -79,6 +79,7 @@ files:
 - lib/archaeo/bulk_downloader.rb
 - lib/archaeo/cdx_api.rb
 - lib/archaeo/cdx_filter.rb
+- lib/archaeo/cdx_timeline.rb
 - lib/archaeo/cli.rb
 - lib/archaeo/download_state.rb
 - lib/archaeo/fetcher.rb