archaeo 0.2.7 → 0.2.8
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_health_check.rb +77 -0
- data/lib/archaeo/bulk_downloader.rb +82 -24
- data/lib/archaeo/cdx_api.rb +39 -7
- data/lib/archaeo/cdx_cache.rb +105 -0
- data/lib/archaeo/cli.rb +109 -8
- data/lib/archaeo/download_state.rb +35 -0
- data/lib/archaeo/encoding_detector.rb +91 -0
- data/lib/archaeo/page.rb +1 -1
- data/lib/archaeo/path_sanitizer.rb +152 -0
- data/lib/archaeo/pattern_filter.rb +80 -0
- data/lib/archaeo/rate_limiter.rb +86 -0
- data/lib/archaeo/save_api.rb +7 -2
- data/lib/archaeo/save_result.rb +26 -8
- data/lib/archaeo/subdomain_discovery.rb +117 -0
- data/lib/archaeo/url_rewriter.rb +64 -7
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +7 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67239af7cc927c495c67a849ecefb1cdc886ce8d95ddd6e27a2decdde6a93cd3
+  data.tar.gz: 8ce4a0f786c2e7db3268b6660a1aa9e2f3b913ff99c22c85c3c2190457defc90
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ca0a9cc2bf0ad33a0d3dfd88e3228fd79fc3291a42fd3d13bbfbe4e37e744b0e3a5dadcec1cab48c0e13b6af872a8e3f4e80ce3e6593b18f024416b9cf7370fa
+  data.tar.gz: bb4b1d9e720dfdcc18c7c4ccb73cc55e29a3e31fb6ffb5bf3b8c0fce1548a63a06da4a710ab5fc5020f142ede69dbdfbef5451944183e280e86e018379a792eb

data/lib/archaeo/archive_health_check.rb
ADDED
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Verifies that archived snapshots are still accessible.
+  #
+  # Checks each snapshot by performing HEAD requests to the
+  # archive URL and reporting accessibility status.
+  HealthReport = Struct.new(
+    :total, :accessible, :missing, :errors, :details,
+    keyword_init: true
+  )
+
+  HealthDetail = Struct.new(
+    :snapshot, :status, :error,
+    keyword_init: true
+  )
+
+  class ArchiveHealthCheck
+    def initialize(client: HttpClient.new, cdx_api: nil)
+      @client = client
+      @cdx_api = cdx_api
+    end
+
+    def check(url, from: nil, to: nil, sample: nil)
+      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = sample_snapshots(snapshots, sample) if sample
+
+      details = check_snapshots(snapshots)
+      build_report(details)
+    end
+
+    private
+
+    def fetch_snapshots(url, from:, to:)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      opts = {}
+      opts[:from] = from if from
+      opts[:to] = to if to
+      cdx.snapshots(url, **opts)
+         .select(&:success?).to_a
+    end
+
+    def sample_snapshots(snapshots, count)
+      return snapshots if count.nil? || count >= snapshots.size
+
+      step = snapshots.size.to_f / count
+      (0...count).map { |i| snapshots[(i * step).to_i] }
+    end
+
+    def check_snapshots(snapshots)
+      snapshots.map do |snap|
+        check_single(snap)
+      end
+    end
+
+    def check_single(snapshot)
+      response = @client.head(snapshot.archive_url)
+      status = response.status.between?(200, 399) ? :accessible : :missing
+      HealthDetail.new(snapshot: snapshot, status: status, error: nil)
+    rescue StandardError => e
+      HealthDetail.new(snapshot: snapshot, status: :error, error: e.message)
+    end
+
+    def build_report(details)
+      total = details.size
+      accessible = details.count { |d| d.status == :accessible }
+      missing = details.count { |d| d.status == :missing }
+      errors = details.count { |d| d.status == :error }
+
+      HealthReport.new(
+        total: total, accessible: accessible,
+        missing: missing, errors: errors,
+        details: details
+      )
+    end
+  end
+end
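
A quick usage sketch of the new class (not from the gem's documentation), assuming the library is loaded with require "archaeo" and that from: accepts a partial CDX timestamp such as "2020":

    require "archaeo"

    # Spot-check 25 evenly spaced snapshots instead of every capture.
    checker = Archaeo::ArchiveHealthCheck.new
    report = checker.check("example.com/page", from: "2020", sample: 25)

    puts "#{report.accessible}/#{report.total} snapshots still resolve"
    report.details.each do |d|
      warn "#{d.snapshot.timestamp}: #{d.error}" if d.status == :error
    end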

data/lib/archaeo/bulk_downloader.rb
CHANGED
@@ -15,23 +15,31 @@ module Archaeo
   # for interrupted download recovery.
   class BulkDownloader
     def initialize(client: HttpClient.new, output_dir: "archive",
-                   cdx_api: nil, concurrency: 1, on_error: nil)
+                   cdx_api: nil, concurrency: 1, on_error: nil,
+                   rate_limiter: nil, path_sanitizer: nil)
       @client = client
       @output_dir = output_dir
       @cdx_api = cdx_api
       @concurrency = [1, concurrency.to_i].max
       @on_error = on_error
+      @rate_limiter = rate_limiter || RateLimiter.new
+      @path_sanitizer = path_sanitizer || PathSanitizer.new
     end

     def download(url, from: nil, to: nil, resume: false,
-                 dry_run: false, &block)
+                 dry_run: false, all_timestamps: false,
+                 filter: nil, page_requisites: false,
+                 snapshot_at: nil, &block)
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir) unless dry_run

-      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = fetch_snapshots(url, from: from, to: to,
+                                  all_timestamps: all_timestamps,
+                                  snapshot_at: snapshot_at)
+      snapshots = apply_filter(snapshots, filter)
       downloaded, skipped, bytes, failed =
-        run_download(snapshots, resume, dry_run, block)
+        run_download(snapshots, resume, dry_run, page_requisites, block)

       build_summary(start_time, snapshots.size, downloaded,
                     skipped, bytes, failed: failed)

@@ -39,25 +47,39 @@ module Archaeo

     private

-    def fetch_snapshots(url, from:, to:)
+    def fetch_snapshots(url, from:, to:, all_timestamps:, snapshot_at:)
       cdx = @cdx_api || CdxApi.new(client: @client)
+
+      if snapshot_at
+        ts = Timestamp.coerce(snapshot_at)
+        return cdx.composite_snapshot(url, timestamp: ts, collapse: ["digest"])
+      end
+
       options = {}
       options[:from] = from if from
       options[:to] = to if to
+      options[:collapse] = ["digest"] unless all_timestamps
+
       cdx.snapshots(url, **options)
          .select { |snap| !snap.blocked? && snap.status_code == 200 }
     end

-    def run_download(snapshots, resume, dry_run, progress)
+    def apply_filter(snapshots, filter)
+      return snapshots unless filter
+
+      snapshots.select { |snap| filter.match?(snap.original_url) }
+    end
+
+    def run_download(snapshots, resume, dry_run, page_requisites, progress)
       state = DownloadState.new(@output_dir)
       total = snapshots.size

       if @concurrency == 1
         download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       else
         download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       end
     end

@@ -71,11 +93,12 @@ module Archaeo
     end

     def download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }

       snapshots.each_with_index do |snap, index|
         process_sequential(snap, state, resume, dry_run, counters)
+        fetch_requisites(snap, dry_run, counters) if page_requisites
         progress&.call(index + 1, total, snap)
       end

@@ -96,6 +119,47 @@ module Archaeo
       @on_error&.call(snap, e)
     end

+    def fetch_requisites(snap, dry_run, counters)
+      return if dry_run
+
+      begin
+        bundle = snap.fetch_with_assets(client: @client)
+        bundle.assets.downloadable.all.each do |asset_url|
+          asset_snap = find_asset_snapshot(asset_url)
+          next unless asset_snap
+
+          counters[:bytes] += write_asset(asset_snap)
+          counters[:downloaded] += 1
+        end
+      rescue StandardError
+        nil
+      end
+    end
+
+    def find_asset_snapshot(asset_url)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      cdx.near(asset_url, timestamp: Timestamp.now)
+    rescue NoSnapshotFound, StandardError
+      nil
+    end
+
+    def write_asset(snapshot)
+      content = fetch_content(snapshot)
+      filename = build_filename(snapshot)
+      FileUtils.mkdir_p(File.dirname(filename))
+      tmp_path = "#{filename}.tmp"
+      File.binwrite(tmp_path, content)
+      File.rename(tmp_path, filename)
+      content.bytesize
+    end
+
+    def fetch_content(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
+      Fetcher.new(client: @client).fetch(
+        snapshot.original_url, timestamp: snapshot.timestamp
+      ).content
+    end
+
     def download_snapshot(snap, state)
       content = fetch_and_save(snap)
       state.mark_completed(snap.timestamp, url: snap.original_url,

@@ -104,7 +168,7 @@ module Archaeo
     end

     def download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       queue = snapshots.each_with_index.to_a
       shared = { mutex: Mutex.new, errors: [],
                  downloaded: 0, skipped: 0, bytes: 0, failed: 0 }

@@ -112,7 +176,7 @@ module Archaeo
       threads = Array.new(@concurrency) do
         Thread.new do
           process_queue(queue, total, state, resume,
-                        dry_run, progress, shared)
+                        dry_run, page_requisites, progress, shared)
         end
       end
       threads.each(&:join)

@@ -122,7 +186,7 @@ module Archaeo
     end

     def process_queue(queue, total, state, resume, dry_run,
-                      progress, shared)
+                      _page_requisites, progress, shared)
       loop do
         snap, index = shared[:mutex].synchronize { queue.shift }
         break unless snap

@@ -177,6 +241,7 @@ module Archaeo
     end

     def fetch_page(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
       Fetcher.new(client: @client).fetch(
         snapshot.original_url, timestamp: snapshot.timestamp
       )

@@ -231,21 +296,14 @@ module Archaeo
     end

     def build_filename(snapshot)
+      safe_path = @path_sanitizer.sanitize(snapshot.original_url)
       ts = snapshot.timestamp.to_s
-      safe_path = snapshot.original_url
-                  .sub(%r{\Ahttps?://}, "")
-                  .gsub(%r{[<>:"|?*#]}, "_")
-                  .gsub(%r{[/\\]}, File::SEPARATOR)
-                  .gsub(%r{[?&=]}, "_")
-
-      safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
-      safe_path = "#{safe_path}index" if safe_path.empty?

-      segments = safe_path.split(File::SEPARATOR)
-
-      end
+      segments = safe_path.split(File::SEPARATOR)
+      last = segments.pop || "index"

-      File.join(@output_dir, *segments,
+      File.join(@output_dir, *segments,
+                "#{last}_#{ts}#{extension_for(snapshot)}")
     end
   end
 end
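
In code, the new knobs combine roughly as below. This is a sketch, not from the gem's docs: the glob-style patterns passed to PatternFilter are an assumption (pattern_filter.rb is not shown in this excerpt), and note that process_queue discards page_requisites (the _page_requisites parameter above), so asset fetching only takes effect at concurrency: 1.

    require "archaeo"

    # Assumed glob-style patterns; see data/lib/archaeo/pattern_filter.rb.
    filter = Archaeo::PatternFilter.new(only: "*/blog/*", exclude: "*print*")

    downloader = Archaeo::BulkDownloader.new(
      output_dir: "archive",
      concurrency: 1, # page requisites are ignored on the concurrent path
      rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0)
    )

    summary = downloader.download(
      "example.com",
      snapshot_at: "20200601", # newest capture of each URL at or before this time
      filter: filter,
      page_requisites: true
    ) { |done, total, snap| puts "#{done}/#{total} #{snap.original_url}" }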
data/lib/archaeo/cdx_api.rb
CHANGED

@@ -40,8 +40,9 @@ module Archaeo
       last_skip_timestamp: "lastSkipTimestamp",
     }.freeze

-    def initialize(client: HttpClient.new)
+    def initialize(client: HttpClient.new, cache_dir: nil)
       @client = client
+      @cache = cache_dir ? CdxCache.new(cache_dir) : nil
     end

     # Returns an Enumerator of Snapshot objects, auto-paginating

@@ -50,13 +51,26 @@ module Archaeo
       url = UrlNormalizer.normalize(url)
       validate_options!(options)

-      Enumerator.new do |yielder|
-        if options.key?(:page)
-          fetch_page(url, options, yielder)
-        else
-          fetch_with_resume_key(url, options, yielder)
-        end
+      if @cache && !options.key?(:page)
+        return cached_snapshots(url, options)
       end
+
+      build_enumerator(url, options)
+    end
+
+    # Returns one snapshot per unique URL, picking the newest at or before
+    # the given timestamp for point-in-time site reconstruction.
+    def composite_snapshot(url, timestamp:, collapse: [])
+      ts = Timestamp.coerce(timestamp)
+      options = { to: ts.to_s, sort: "reverse" }
+      options[:collapse] = collapse unless collapse.empty?
+
+      seen = {}
+      snapshots(url, **options).each do |snap|
+        key = snap.original_url
+        seen[key] = snap unless seen.key?(key)
+      end
+      seen.values
     end

     def near(url, timestamp:)

@@ -153,6 +167,24 @@ module Archaeo

     private

+    def cached_snapshots(url, options)
+      Enumerator.new do |yielder|
+        @cache.fetch(url, **options) do
+          build_enumerator(url, options).to_a
+        end.each { |s| yielder << s }
+      end
+    end
+
+    def build_enumerator(url, options)
+      Enumerator.new do |yielder|
+        if options.key?(:page)
+          fetch_page(url, options, yielder)
+        else
+          fetch_with_resume_key(url, options, yielder)
+        end
+      end
+    end
+
     def fetch_with_resume_key(url, options, yielder)
       params = build_params(url, options)
       loop do

data/lib/archaeo/cdx_cache.rb
ADDED
@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+
+require "json"
+require "digest"
+
+module Archaeo
+  # Persists CDX API query results to disk for resume support.
+  #
+  # Caches snapshot lists keyed by query parameters so that
+  # interrupted downloads can resume without re-querying CDX.
+  class CdxCache
+    CACHE_DIR = ".cache"
+
+    def initialize(base_dir)
+      @base_dir = base_dir
+      @cache_dir = File.join(base_dir, CACHE_DIR)
+    end
+
+    def fetch(url, **options)
+      key = cache_key(url, options)
+      path = cache_path(key)
+
+      if File.exist?(path)
+        load_cache(path)
+      else
+        snapshots = yield
+        save_cache(path, url, options, snapshots)
+        snapshots
+      end
+    end
+
+    def cached?(url, **options)
+      File.exist?(cache_path(cache_key(url, options)))
+    end
+
+    def cache_key(url, options = {})
+      parts = [url.to_s]
+      parts << options[:from].to_s if options[:from]
+      parts << options[:to].to_s if options[:to]
+      parts << options[:match_type].to_s if options[:match_type]
+      parts += Array(options[:filters]).map(&:to_s) if options[:filters]
+      parts += Array(options[:collapse]).map(&:to_s) if options[:collapse]
+      parts << options[:sort].to_s if options[:sort]
+      Digest::SHA256.hexdigest(parts.join("|"))[0, 16]
+    end
+
+    def clear(url = nil, **options)
+      if url
+        FileUtils.rm_f(cache_path(cache_key(url, options)))
+      else
+        FileUtils.rm_rf(@cache_dir)
+      end
+    end
+
+    private
+
+    def cache_path(key)
+      FileUtils.mkdir_p(@cache_dir)
+      File.join(@cache_dir, "#{key}.cdx.json")
+    end
+
+    def load_cache(path)
+      data = JSON.parse(File.read(path))
+      data["snapshots"].map { |row| build_snapshot(row) }
+    end
+
+    def save_cache(path, url, options, snapshots)
+      data = {
+        "url" => url.to_s,
+        "options" => serialize_options(options),
+        "cached_at" => Time.now.utc.iso8601,
+        "snapshots" => snapshots.map(&:as_json),
+      }
+      tmp_path = "#{path}.tmp"
+      File.write(tmp_path, JSON.generate(data))
+      File.rename(tmp_path, path)
+    end
+
+    def serialize_options(options)
+      h = {}
+      h["from"] = options[:from].to_s if options[:from]
+      h["to"] = options[:to].to_s if options[:to]
+      h["match_type"] = options[:match_type].to_s if options[:match_type]
+      h["filters"] = Array(options[:filters]).map(&:to_s) if options[:filters]
+      if options[:collapse]
+        h["collapse"] =
+          Array(options[:collapse]).map(&:to_s)
+      end
+      h["sort"] = options[:sort].to_s if options[:sort]
+      h
+    end
+
+    def build_snapshot(row)
+      Snapshot.new(
+        urlkey: row["urlkey"],
+        timestamp: row["timestamp"],
+        original_url: row["original_url"],
+        mimetype: row["mimetype"],
+        status_code: row["status_code"],
+        digest: row["digest"],
+        length: row["length"],
+      )
+    end
+  end
+end
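
The cache can also be driven directly; a sketch assuming Snapshot responds to as_json (it is referenced in save_cache above):

    require "archaeo"

    cache = Archaeo::CdxCache.new("archive")
    opts = { from: "2019", to: "2021" }

    # Computes the key from URL + options; runs the block only on a miss.
    snapshots = cache.fetch("example.com", **opts) do
      Archaeo::CdxApi.new.snapshots("example.com", **opts).to_a
    end

    cache.cached?("example.com", **opts) # => true
    cache.clear("example.com", **opts)   # drop this one entry
    cache.clear                          # or remove the whole .cache directory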
data/lib/archaeo/cli.rb
CHANGED

@@ -37,6 +37,8 @@ module Archaeo
     option :limit, type: :numeric, desc: "Max snapshots to return"
     option :format, desc: "Output format (table, json, csv)",
            default: "table"
+    option :fields, type: :array,
+           desc: "Specific fields to print (timestamp,original,etc)"
     def snapshots(url)
       fmt = validate_output_format
       handle_errors do

@@ -123,11 +125,21 @@ module Archaeo
     end

     desc "save URL", "Save a URL to the Wayback Machine"
+    option :headers, type: :boolean, default: false,
+           desc: "Show response headers"
     def save(url)
       handle_errors do
         result = SaveApi.new.save(url)
         label = result.cached? ? "Cached" : "Saved"
         puts "#{label}: #{result.archive_url}"
+        if options[:headers] && result.response_headers
+          puts "Status: #{result.status_code}"
+          puts "Response URL: #{result.response_url}" if result.response_url
+          puts "Headers:"
+          result.response_headers.each do |k, v|
+            puts "  #{k}: #{v}"
+          end
+        end
       end
     end

@@ -162,11 +174,16 @@ module Archaeo
          "Fetch a page and rewrite archive URLs to local paths"
     option :prefix, desc: "Local path prefix", default: "local"
     option :output, desc: "Write rewritten HTML to file"
+    option :rewrite_js, type: :boolean, default: false,
+           desc: "Rewrite URLs in JavaScript strings"
+    option :rewrite_absolute, type: :boolean, default: false,
+           desc: "Rewrite all absolute archive URLs"
     def rewrite(url, timestamp)
       handle_errors do
         coerced = Timestamp.coerce(timestamp)
         page = Fetcher.new.fetch(url, timestamp: coerced)
-        rewritten = build_rewriter(url, coerced).rewrite_html(page.content)
+        rewriter = build_rewriter(url, coerced)
+        rewritten = rewriter.rewrite_html(page.content)
         output_rewritten(rewritten)
       end
     end

@@ -215,22 +232,61 @@ module Archaeo
            desc: "Number of parallel downloads"
     option :dry_run, type: :boolean, default: false,
            desc: "Preview downloads without fetching"
+    option :all_timestamps, type: :boolean, default: false,
+           desc: "Download all timestamps, not just latest"
+    option :only, desc: "Only download URLs matching this pattern"
+    option :exclude, desc: "Exclude URLs matching this pattern"
+    option :page_requisites, type: :boolean, default: false,
+           desc: "Download linked assets (CSS/JS/images)"
+    option :snapshot_at, desc: "Download composite snapshot at timestamp"
+    option :rate_limit, type: :numeric, default: 0,
+           desc: "Min seconds between requests"
     def download(url)
       handle_errors do
+        rate_limiter = RateLimiter.new(
+          min_interval: options[:rate_limit].to_f,
+        )
+        filter = build_filter
         downloader = BulkDownloader.new(
           output_dir: options[:output],
           concurrency: options[:concurrency],
+          rate_limiter: rate_limiter,
         )
-        download_with_progress(downloader, url)
+        download_with_progress(downloader, url, filter)
+      end
+    end
+
+    desc "health URL", "Check health of archived snapshots"
+    option :from, desc: "Start timestamp"
+    option :to, desc: "End timestamp"
+    option :sample, type: :numeric, desc: "Check only N snapshots"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def health(url)
+      handle_errors do
+        checker = ArchiveHealthCheck.new
+        report = checker.check(
+          url,
+          from: options[:from],
+          to: options[:to],
+          sample: options[:sample],
+        )
+        output_health(report)
       end
     end

     desc "known_urls DOMAIN",
          "List all known URLs for a domain"
+    option :subdomain, type: :boolean, default: false,
+           desc: "Include subdomain URLs"
+    option :file, desc: "Save URLs to file"
     def known_urls(domain)
       handle_errors do
-        CdxApi.new.known_urls(domain).each do |u|
-          puts u
+        match_type = options[:subdomain] ? "domain" : "prefix"
+        urls = CdxApi.new.known_urls(domain, match_type: match_type)
+        if options[:file]
+          save_urls_to_file(urls, options[:file])
+        else
+          urls.each { |u| puts u }
         end
       end
     end

@@ -331,7 +387,11 @@ module Archaeo
     def build_rewriter(url, timestamp)
       normalized = UrlNormalizer.normalize(url)
       archive_prefix = ArchiveUrl.new(normalized, timestamp: timestamp).to_s
-      UrlRewriter.new(archive_prefix, options[:prefix])
+      UrlRewriter.new(
+        archive_prefix, options[:prefix],
+        rewrite_js: options[:rewrite_js],
+        rewrite_absolute: options[:rewrite_absolute]
+      )
     end

     def output_rewritten(content)

@@ -366,14 +426,55 @@ module Archaeo
       end
     end

-    def download_with_progress(downloader, url)
+    def build_filter
+      only = options[:only]
+      exclude = options[:exclude]
+      return nil unless only || exclude
+
+      PatternFilter.new(only: only, exclude: exclude)
+    end
+
+    def download_with_progress(downloader, url, filter)
       summary = downloader.download(
-        url,
-        from: options[:from], to: options[:to], resume: options[:resume], dry_run: options[:dry_run]
+        url,
+        from: options[:from], to: options[:to],
+        resume: options[:resume], dry_run: options[:dry_run],
+        all_timestamps: options[:all_timestamps],
+        filter: filter,
+        page_requisites: options[:page_requisites],
+        snapshot_at: options[:snapshot_at]
       ) { |c, t, s| print_progress(c, t, s) }
       print_summary(summary)
     end

+    def output_health(report)
+      case options[:format]
+      when "json"
+        data = {
+          total: report.total,
+          accessible: report.accessible,
+          missing: report.missing,
+          errors: report.errors,
+        }
+        puts JSON.generate(data)
+      else
+        puts "Total: #{report.total}"
+        puts "Accessible: #{report.accessible}"
+        puts "Missing: #{report.missing}"
+        puts "Errors: #{report.errors}"
+      end
+    end
+
+    def save_urls_to_file(urls, file_path)
+      FileUtils.mkdir_p(File.dirname(file_path)) unless File.dirname(file_path) == "."
+      File.open(file_path, "w") do |f|
+        urls.each do |url|
+          f.puts(url)
+        end
+      end
+      warn "Saved #{urls.size} URLs to #{file_path}" unless quiet?
+    end
+
     def print_progress(current, total, snap)
       return if quiet?

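Assuming the gem installs an archaeo executable (the exe is not part of this excerpt), the new commands and flags combine like so, with Thor mapping the underscored option names to hyphenated flags:

    archaeo health example.com --from 2020 --sample 25 --format json
    archaeo download example.com --snapshot-at 20200601 --only "*/blog/*" --page-requisites --rate-limit 1.5
    archaeo known_urls example.com --subdomain --file urls.txt
    archaeo save https://example.com --headers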

data/lib/archaeo/download_state.rb
CHANGED
@@ -62,6 +62,36 @@ module Archaeo
       end
     end

+    def file_exists?(timestamp, base_dir: @output_dir)
+      entry = entry_for(timestamp)
+      return false unless entry
+
+      file_path = find_file(base_dir, timestamp.to_s)
+      File.exist?(file_path)
+    end
+
+    def stale_entries(base_dir: @output_dir)
+      @mutex.synchronize do
+        entries.reject do |e|
+          find_file(base_dir,
+                    e["ts"]) && File.exist?(find_file(base_dir, e["ts"]))
+        end
+      end
+    end
+
+    def cleanup_stale(base_dir: @output_dir)
+      @mutex.synchronize do
+        stale = entries.reject do |e|
+          path = find_file(base_dir, e["ts"])
+          path && File.exist?(path)
+        end
+        @entries = entries - stale
+        @entries_key = nil
+        save
+        stale.size
+      end
+    end
+
     private

     def entries

@@ -103,5 +133,10 @@ module Archaeo
       File.write(tmp_path, content)
       File.rename(tmp_path, @path)
     end
+
+    def find_file(base_dir, timestamp)
+      pattern = File.join(base_dir, "**", "*#{timestamp}*")
+      Dir.glob(pattern).first
+    end
   end
 end
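
A sketch of the new stale-entry maintenance, assuming DownloadState.new takes the output directory as it does in BulkDownloader#run_download above:

    require "archaeo"

    state = Archaeo::DownloadState.new("archive")

    # Entries marked completed whose files have since been deleted.
    state.stale_entries.each { |e| warn "missing file for #{e['ts']}" }

    # Prune them so a later resume: true run re-fetches those snapshots.
    pruned = state.cleanup_stale
    puts "pruned #{pruned} stale entries"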