RubyGems - archaeo - Versions diffs - 0.2.9 → 0.2.10 - Mend

archaeo 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/lib/archaeo/archive_search.rb +120 -0
data/lib/archaeo/cli.rb +116 -4
data/lib/archaeo/color_output.rb +73 -0
data/lib/archaeo/content_tracker.rb +117 -0
data/lib/archaeo/parallel_cdx.rb +47 -0
data/lib/archaeo/version.rb +1 -1
data/lib/archaeo/warc_support.rb +249 -0
data/lib/archaeo.rb +9 -0
metadata +7 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fb2b99e313bf2a3ac807cebf0052d369d83c8514ad89a1b9ca18deed421a0c4d
-  data.tar.gz: fa1f9536f838d8246706d5eca3350ba4eae97546ae88c48e28a27e5df952d987
+  metadata.gz: 235d2cba1b1e071156a873d7a63cf0fdb6ba8079eb6083e21755e723727db6d9
+  data.tar.gz: 65c040c3a5984fdc1a68ca106d9ae10eab64b212ce6a72b37bec39ec57d383e2
 SHA512:
-  metadata.gz: 148875e2dae2319e4c96d892c2233bd889e3c193a9ddad8995faded2d637ce398a3e971d27ca05c62c93674c3c6db05322814d823d8493c8d0318f052e1278d4
-  data.tar.gz: ee1d6df5dc3623d6aee2e7803b82306c69099a72b52475e6969ef5a2a4bdff73ea4544faf3aa6fdcd54e4594a1020dc7b5dd05ffe5d437327eda18a5c23d35ec
+  metadata.gz: e6eb3cdb88abb87332bbba762bf566643da717ce17557e31ed90a012bd7c164939b5eb719420f74dfa908215e0f604e71b5fb2bb8bcc7de2940e36b80524e963
+  data.tar.gz: f52bc54fe3c425eeae28093810f1d90c4200391696ffe9af0e3f91366d619e4e57a425ec1e6a6a8b9aa2465d337bb2b960782fac1c3ce068d7d4b673b8306641

data/lib/archaeo/archive_search.rb ADDED Viewed

@@ -0,0 +1,120 @@
+# frozen_string_literal: true
+module Archaeo
+  # Value object for a single search match within an archived snapshot.
+  SearchResult = Struct.new(
+    :url, :snapshot, :context, :match_offset,
+    keyword_init: true
+  ) do
+    def to_h
+      {
+        url: url,
+        snapshot: snapshot.as_json,
+        context: context,
+        match_offset: match_offset,
+      }
+    end
+    def as_json(*)
+      to_h
+    end
+  end
+  # Full-text search across archived snapshots.
+  #
+  # Fetches snapshots from CDX, downloads their content, and
+  # searches for the given query string. Returns matches with
+  # surrounding context for each hit.
+  class ArchiveSearch
+    CONTEXT_RADIUS = 80
+    def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
+      @cdx = cdx_api
+      @fetcher = fetcher
+    end
+    def search(url, query:, from: nil, to: nil,
+               max_results: nil, case_sensitive: false)
+      if query.nil? || query.empty?
+        raise ArgumentError,
+              "query must not be empty"
+      end
+      url = UrlNormalizer.normalize(url)
+      opts = build_options(from, to)
+      snapshots = @cdx.snapshots(url, **opts)
+        .select { |s| s.success? && s.mimetype.to_s.include?("text") }
+        .to_a
+      find_matches(snapshots, query, case_sensitive, max_results)
+    end
+    private
+    def build_options(from, to)
+      opts = { collapse: ["digest"] }
+      opts[:from] = Timestamp.coerce(from).to_s if from
+      opts[:to] = Timestamp.coerce(to).to_s if to
+      opts
+    end
+    def find_matches(snapshots, query, case_sensitive, max_results)
+      results = []
+      pattern = build_pattern(query, case_sensitive)
+      snapshots.each do |snap|
+        break if max_results && results.size >= max_results
+        content = fetch_content(snap)
+        next unless content
+        scan_content(content, pattern).each do |match_offset|
+          results << SearchResult.new(
+            url: snap.original_url,
+            snapshot: snap,
+            context: extract_context(content, match_offset, query.length),
+            match_offset: match_offset,
+          )
+          break if max_results && results.size >= max_results
+        end
+      end
+      results
+    end
+    def build_pattern(query, case_sensitive)
+      escaped = Regexp.escape(query)
+      return /#{escaped}/im unless case_sensitive
+      /#{escaped}/m
+    end
+    def fetch_content(snapshot)
+      page = @fetcher.fetch(
+        snapshot.original_url, timestamp: snapshot.timestamp
+      )
+      page.content if page.text?
+    rescue Error
+      nil
+    end
+    def scan_content(content, pattern)
+      offsets = []
+      content.scan(pattern) do
+        offsets << Regexp.last_match.offset(0).first
+      end
+      offsets
+    end
+    def extract_context(content, offset, length)
+      start_pos = [0, offset - CONTEXT_RADIUS].max
+      end_pos = [content.length, offset + length + CONTEXT_RADIUS].min
+      ctx = content[start_pos...end_pos]
+      ctx = "...#{ctx}" if start_pos.positive?
+      ctx = "#{ctx}..." if end_pos < content.length
+      ctx.tr("\n\r", " ").strip
+    end
+  end
+end

data/lib/archaeo/cli.rb CHANGED Viewed

@@ -12,6 +12,8 @@ module Archaeo
     class_option :quiet, type: :boolean, default: false,
                          desc: "Suppress progress messages"
+    class_option :no_color, type: :boolean, default: false,
+                            desc: "Disable colored output"
     def self.exit_on_failure?
       true
@@ -351,6 +353,68 @@ module Archaeo
       end
     end
+    desc "search URL QUERY", "Search archived snapshots for text"
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    option :max_results, type: :numeric, desc: "Maximum results to return"
+    option :case_sensitive, type: :boolean, default: false,
+                            desc: "Case-sensitive search"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def search(url, query)
+      handle_errors do
+        searcher = ArchiveSearch.new
+        results = searcher.search(
+          url, query: query,
+               from: options[:from], to: options[:to],
+               max_results: options[:max_results],
+               case_sensitive: options[:case_sensitive]
+        )
+        output_search_results(results)
+      end
+    end
+    desc "track-changes URL",
+         "Track content changes over time"
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def track_changes(url)
+      handle_errors do
+        tracker = ContentTracker.new
+        report = tracker.track(url, from: options[:from], to: options[:to])
+        output_content_changes(report)
+      end
+    end
+    # Thor command: fetches every successful snapshot of URL and writes the
+    # pages that could be downloaded into a single WARC file.
+    desc "warc-export URL", "Export snapshots to WARC format"
+    option :output, desc: "Output WARC file path", required: true
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    # Deliberately no default: when the flag is absent options[:gzip] is nil,
+    # which lets WarcWriter#write pick compression from a ".gz" suffix on the
+    # output path. --gzip / --no-gzip still force the choice either way.
+    option :gzip, type: :boolean,
+                  desc: "Write gzip-compressed WARC (.warc.gz)"
+    def warc_export(url)
+      handle_errors do
+        fetcher = Fetcher.new
+        range = { from: options[:from], to: options[:to] }.compact
+        snapshots = CdxApi.new.snapshots(url, **range).select(&:success?).to_a
+        # Snapshots whose download raises a library Error are skipped rather
+        # than aborting the export.
+        pages = snapshots.filter_map do |snap|
+          fetcher.fetch(snap.original_url, timestamp: snap.timestamp)
+        rescue Error
+          nil
+        end
+        WarcWriter.new.write(options[:output], pages,
+                             compress: options[:gzip])
+        warn build_color.success("Exported #{pages.size} snapshots to #{options[:output]}")
+      end
+    end
     CDX_OPTION_MAP = {
       from: :from,
       to: :to,
@@ -370,16 +434,16 @@ module Archaeo
     def handle_errors
       yield
     rescue RateLimitError => e
-      warn "Rate limited: #{e.message}"
+      warn build_color.warning("Rate limited: #{e.message}")
       exit 1
     rescue NoSnapshotFound => e
-      warn "Not found: #{e.message}"
+      warn build_color.error("Not found: #{e.message}")
       exit 1
     rescue BlockedSiteError => e
-      warn "Blocked: #{e.message}"
+      warn build_color.error("Blocked: #{e.message}")
       exit 1
     rescue Error => e
-      warn "Error: #{e.message}"
+      warn build_color.error("Error: #{e.message}")
       exit 1
     end
@@ -746,5 +810,53 @@ module Archaeo
         end
       end
     end
+    def output_search_results(results)
+      case options[:format]
+      when "json"
+        puts JSON.generate(results.map(&:as_json))
+      else
+        if results.empty?
+          warn "No results found."
+          return
+        end
+        results.each do |result|
+          puts "#{result.snapshot.timestamp} #{result.url}"
+          puts "  #{result.context}"
+          puts
+        end
+        warn "#{results.size} result(s) found."
+      end
+    end
+    def output_content_changes(report)
+      case options[:format]
+      when "json"
+        puts JSON.generate(report.as_json)
+      else
+        puts "URL: #{report.url}"
+        puts "Total snapshots: #{report.total_snapshots}"
+        puts "Unique digests: #{report.unique_digests}"
+        puts "URLs changed: #{report.changed_urls.size}"
+        puts "URLs added: #{report.new_urls.size}"
+        puts "URLs removed: #{report.removed_urls.size}"
+        unless report.changed_urls.empty?
+          puts "Changed URLs:"
+          report.changed_urls.each { |u| puts "  #{u}" }
+        end
+        unless report.new_urls.empty?
+          puts "New URLs:"
+          report.new_urls.each { |u| puts "  + #{u}" }
+        end
+        unless report.removed_urls.empty?
+          puts "Removed URLs:"
+          report.removed_urls.each { |u| puts "  - #{u}" }
+        end
+      end
+    end
+    def build_color
+      ColorOutput.new(enabled: !options[:no_color])
+    end
   end
 end

data/lib/archaeo/color_output.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+module Archaeo
+  # Minimal ANSI color helper for CLI output.
+  #
+  # Detects whether the output stream supports color and wraps
+  # strings with escape codes accordingly. Respects --no-color
+  # and TERM=dumb.
+  class ColorOutput
+    # ANSI SGR foreground color codes, keyed by the generated method name.
+    COLORS = {
+      red: 31,
+      green: 32,
+      yellow: 33,
+      blue: 34,
+      magenta: 35,
+      cyan: 36,
+      white: 37,
+    }.freeze
+    # ANSI SGR text attribute codes, keyed by the generated method name.
+    STYLES = {
+      bold: 1,
+      dim: 2,
+    }.freeze
+    # @param enabled [Boolean, nil] force colors on or off; nil auto-detects
+    # @param stream [IO, nil] stream inspected during auto-detection
+    def initialize(enabled: nil, stream: $stderr)
+      @enabled = enabled.nil? ? detect_color_support(stream) : enabled
+    end
+    # One public method per color and style name (+red+, +bold+, ...). Each
+    # takes the text and returns it wrapped in the matching escape code.
+    COLORS.merge(STYLES).each do |name, code|
+      define_method(name) { |text| colorize(text, code) }
+    end
+    # Bold green, for success messages.
+    def success(text)
+      green(bold(text))
+    end
+    # Bold yellow, for warnings.
+    def warning(text)
+      yellow(bold(text))
+    end
+    # Bold red, for errors.
+    def error(text)
+      red(bold(text))
+    end
+    # Cyan, for informational messages.
+    def info(text)
+      cyan(text)
+    end
+    private
+    # Wraps +text+ in the SGR sequence for +code+ followed by a reset; returns
+    # +text+ untouched when color is disabled.
+    def colorize(text, code)
+      return text unless @enabled
+      "\e[#{code}m#{text}\e[0m"
+    end
+    # Color is used only when a stream is given, NO_COLOR is unset or empty,
+    # TERM is not "dumb", and the stream reports itself as a TTY.
+    def detect_color_support(stream)
+      return false if stream.nil?
+      # By convention NO_COLOR only counts when set to a non-empty value.
+      return false unless ENV["NO_COLOR"].to_s.empty?
+      return false if ENV["TERM"] == "dumb"
+      # Streams without #tty? are treated as non-terminals instead of raising.
+      stream.respond_to?(:tty?) && stream.tty?
+    end
+  end
+end

data/lib/archaeo/content_tracker.rb ADDED Viewed

@@ -0,0 +1,117 @@
+# frozen_string_literal: true
+require "digest"
+require "set"
+module Archaeo
+  # Value object summarizing content changes for a URL over a time range.
+  ContentChangeReport = Struct.new(
+    :url, :from, :to,
+    :changed_urls, :new_urls, :removed_urls,
+    :content_frequency, :total_snapshots, :unique_digests,
+    keyword_init: true
+  ) do
+    def any_changes?
+      !changed_urls.empty? || !new_urls.empty? || !removed_urls.empty?
+    end
+    def to_h
+      {
+        url: url,
+        from: from.to_s,
+        to: to.to_s,
+        changed_urls: changed_urls,
+        new_urls: new_urls,
+        removed_urls: removed_urls,
+        content_frequency: content_frequency,
+        total_snapshots: total_snapshots,
+        unique_digests: unique_digests,
+      }
+    end
+    def as_json(*)
+      to_h
+    end
+  end
+  # Tracks content changes for a URL across archived snapshots.
+  #
+  # Groups snapshots by original URL, then analyzes how content
+  # (identified by CDX digest) changed over the given time range.
+  class ContentTracker
+    def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
+      @cdx = cdx_api
+      @fetcher = fetcher
+    end
+    def track(url, from: nil, to: nil)
+      url = UrlNormalizer.normalize(url)
+      ts_from = from ? Timestamp.coerce(from) : nil
+      ts_to = to ? Timestamp.coerce(to) : nil
+      opts = {}
+      opts[:from] = ts_from.to_s if ts_from
+      opts[:to] = ts_to.to_s if ts_to
+      snapshots = @cdx.snapshots(url, **opts)
+        .select(&:success?).to_a
+      grouped = group_by_url(snapshots)
+      analyze(url, ts_from, ts_to, snapshots, grouped)
+    end
+    private
+    def group_by_url(snapshots)
+      snapshots.group_by(&:original_url)
+    end
+    def analyze(url, ts_from, ts_to, all_snapshots, grouped)
+      changed = []
+      new_urls = []
+      removed = []
+      frequency = {}
+      sorted = all_snapshots.sort_by(&:timestamp)
+      timestamps = sorted.map(&:timestamp).uniq
+      grouped.each do |original_url, snaps|
+        url_snaps = snaps.sort_by(&:timestamp)
+        digests = url_snaps.map(&:digest).reject(&:empty?)
+        if digests.uniq.size > 1
+          changed << original_url
+        end
+        frequency[original_url] = digests.uniq.size
+      end
+      if timestamps.size >= 2
+        first_half, second_half = split_by_time(sorted, timestamps)
+        first_urls = Set.new(first_half.map(&:original_url))
+        second_urls = Set.new(second_half.map(&:original_url))
+        new_urls = (second_urls - first_urls).to_a.sort
+        removed = (first_urls - second_urls).to_a.sort
+      end
+      ContentChangeReport.new(
+        url: url,
+        from: ts_from,
+        to: ts_to,
+        changed_urls: changed.sort,
+        new_urls: new_urls,
+        removed_urls: removed,
+        content_frequency: frequency,
+        total_snapshots: all_snapshots.size,
+        unique_digests: all_snapshots.map(&:digest).reject(&:empty?).uniq.size,
+      )
+    end
+    def split_by_time(snapshots, timestamps)
+      mid = timestamps[timestamps.size / 2]
+      first, second = snapshots.partition { |s| s.timestamp < mid }
+      [first, second]
+    end
+  end
+end

data/lib/archaeo/parallel_cdx.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+module Archaeo
+  # Fetches CDX pages in parallel for faster bulk queries.
+  #
+  # Wraps CdxApi and uses a thread pool to fetch multiple CDX
+  # result pages simultaneously, then merges results in order.
+  class ParallelCdx
+    DEFAULT_CONCURRENCY = 4
+    def initialize(cdx_api: CdxApi.new, concurrency: DEFAULT_CONCURRENCY)
+      @cdx = cdx_api
+      @concurrency = [concurrency.to_i, 1].max
+    end
+    def snapshots(url, **options)
+      pages = @cdx.num_pages(url, **options)
+      return @cdx.snapshots(url, **options) if pages <= 1
+      fetch_parallel(url, options, pages)
+    end
+    private
+    def fetch_parallel(url, options, total_pages)
+      queue = (0...total_pages).to_a
+      results = Array.new(total_pages)
+      mutex = Mutex.new
+      threads = Array.new(@concurrency) do
+        Thread.new do
+          loop do
+            page_num = mutex.synchronize { queue.shift }
+            break unless page_num
+            opts = options.merge(page: page_num)
+            page_results = @cdx.snapshots(url, **opts).to_a
+            mutex.synchronize { results[page_num] = page_results }
+          end
+        end
+      end
+      threads.each(&:join)
+      results.compact.flatten
+    end
+  end
+end

data/lib/archaeo/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Archaeo
-  VERSION = "0.2.9"
+  VERSION = "0.2.10"
 end

data/lib/archaeo/warc_support.rb ADDED Viewed

@@ -0,0 +1,249 @@
+# frozen_string_literal: true
+require "digest"
+require "time"
+require "zlib"
+module Archaeo
+  # Reads WARC (Web ARChive) format files (.warc, .warc.gz).
+  #
+  # Parses WARC 1.0 records and yields WarcRecord value objects
+  # containing headers and body content.
+  class WarcReader
+    WARC_VERSION = "WARC/1.0"
+    CRLF = "\r\n"
+    # Blank line that terminates a record's header block.
+    HEADER_END = "\r\n\r\n"
+    # The counter set here is not updated by any method of this class.
+    def initialize
+      @record_count = 0
+    end
+    # Opens the file at +path+ and yields each parsed WarcRecord to the block.
+    # The IO is closed afterwards, including when parsing or the block raises.
+    def read(path, &block)
+      io = open_warc(path)
+      read_records_from_io(io, &block)
+    ensure
+      io&.close
+    end
+    # Convenience wrapper around #read that collects all records in an array.
+    def read_records(path)
+      records = []
+      read(path) { |record| records << record }
+      records
+    end
+    private
+    # Paths ending in ".gz" are read through Zlib::GzipReader, everything
+    # else as a plain file in binary mode.
+    def open_warc(path)
+      if path.end_with?(".gz")
+        Zlib::GzipReader.open(path)
+      else
+        File.open(path, "rb")
+      end
+    end
+    # Reads the IO in 8192-byte chunks into a growing buffer and yields every
+    # record that is complete so far. After the last chunk, leftover
+    # non-whitespace data gets one more parse attempt with final: true.
+    def read_records_from_io(io)
+      buffer = +""
+      loop do
+        chunk = io.read(8192)
+        buffer << chunk if chunk
+        while (record = try_parse_record(buffer))
+          yield record
+        end
+        # io.read returned nil: end of input.
+        break unless chunk
+      end
+      return if buffer.strip.empty?
+      record = try_parse_record(buffer, final: true)
+      yield record if record
+    end
+    # Tries to cut one record off the front of +buffer+.
+    #
+    # On success the record is returned and its bytes (plus any CRLF pairs
+    # that follow the body) are removed from +buffer+ in place. Returns nil,
+    # leaving +buffer+ untouched, when the header block is not terminated
+    # yet, when no WARC-Type header was parsed, or - unless +final+ is true -
+    # when fewer bytes than Content-Length have arrived for the body.
+    #
+    # NOTE(review): String#index yields a character index while byteslice
+    # takes byte offsets; the two agree only while the buffer holds no
+    # multibyte characters (e.g. binary-encoded data) - confirm.
+    def try_parse_record(buffer, final: false)
+      header_end = buffer.index(HEADER_END)
+      return nil unless header_end
+      header_block = buffer.byteslice(0, header_end)
+      headers = parse_warc_headers(header_block.split(CRLF))
+      return nil unless headers[:warc_type]
+      # A missing or non-numeric Content-Length becomes 0 via to_i.
+      content_length = headers[:content_length].to_i
+      body_start = header_end + HEADER_END.length
+      body_end = body_start + content_length
+      return nil unless final || buffer.bytesize >= body_end
+      # With final: true the body may be shorter than Content-Length.
+      body = buffer.byteslice(body_start, content_length).to_s
+      record = WarcRecord.new(
+        version: headers.delete(:version),
+        headers: headers,
+        body: body,
+      )
+      total_consumed = body_end
+      # Skip the CRLF pairs separating this record from the next one.
+      total_consumed += 2 while buffer.byteslice(total_consumed, 2) == CRLF
+      remaining = buffer.byteslice(total_consumed,
+                                   buffer.bytesize - total_consumed)
+      buffer.replace(remaining.to_s)
+      record
+    end
+    # Converts header lines into a symbol-keyed hash. A "WARC/x.y" line is
+    # stored as :version ("x.y"); "Name: value" lines are stored under the
+    # downcased name with "-" replaced by "_" (e.g. :warc_type). Parsing
+    # stops at the first blank line; other unmatched lines are ignored.
+    def parse_warc_headers(lines)
+      headers = {}
+      lines.each do |line|
+        case line
+        when /\AWARC\/(\d+\.\d+)\z/
+          headers[:version] = $1
+        when /\A([^:]+):\s*(.*)\z/
+          key = $1.downcase.tr("-", "_").to_sym
+          headers[key] = $2
+        else
+          break if line.strip.empty?
+        end
+      end
+      headers
+    end
+  end
+  # Writes snapshots to WARC format files (.warc, .warc.gz).
+  #
+  # Produces valid WARC 1.0 files with response and metadata records.
+  class WarcWriter
+    WARC_VERSION = "WARC/1.0"
+    RECORD_SEP = "\r\n\r\n"
+    CRLF = "\r\n"
+    def initialize(software: "archaeo/#{VERSION}")
+      @software = software
+      @record_count = 0
+    end
+    def write(path, pages, compress: nil)
+      compress = path.end_with?(".gz") if compress.nil?
+      io = open_warc(path, compress)
+      write_warcinfo(io, path)
+      pages.each { |page| write_page(io, page) }
+    ensure
+      io&.close
+    end
+    private
+    def open_warc(path, compress)
+      if compress
+        Zlib::GzipWriter.open(path)
+      else
+        File.open(path, "wb")
+      end
+    end
+    def write_warcinfo(io, filename)
+      fields = {
+        software: @software,
+        format: "WARC File Format 1.0",
+        filename: File.basename(filename),
+      }
+      body = fields.map { |k, v| "#{k}: #{v}" }.join(CRLF) + CRLF
+      record_id = generate_record_id
+      headers = warc_headers(
+        type: "warcinfo",
+        record_id: record_id,
+        date: Time.now.utc.iso8601,
+        content_type: "application/warc-fields",
+        content_length: body.bytesize,
+      )
+      io.write(headers + body + RECORD_SEP)
+    end
+    def write_page(io, page)
+      record_id = generate_record_id
+      date = page.timestamp.to_time.utc.iso8601
+      http_headers = build_http_headers(page)
+      body = page.content.to_s
+      full_body = http_headers + body
+      headers = warc_headers(
+        type: "response",
+        record_id: record_id,
+        date: date,
+        target_uri: page.original_url.to_s,
+        content_type: "application/http;msgtype=response",
+        content_length: full_body.bytesize,
+      )
+      io.write(headers + full_body + RECORD_SEP)
+      @record_count += 1
+    end
+    def build_http_headers(page)
+      parts = ["HTTP/1.1 #{page.status_code}"]
+      parts << "Content-Type: #{page.content_type}"
+      parts << "Content-Length: #{page.size}"
+      parts.join(CRLF) + CRLF
+    end
+    def warc_headers(type:, record_id:, date:, target_uri: nil,
+                     content_type: nil, content_length: 0)
+      lines = [
+        WARC_VERSION.to_s,
+        "WARC-Type: #{type}",
+        "WARC-Record-ID: #{record_id}",
+        "WARC-Date: #{date}",
+      ]
+      lines << "WARC-Target-URI: #{target_uri}" if target_uri
+      lines << "Content-Type: #{content_type}" if content_type
+      lines << "Content-Length: #{content_length}"
+      lines.join(CRLF) + RECORD_SEP
+    end
+    def generate_record_id
+      @record_count += 1
+      uuid = Digest::SHA256.hexdigest(
+        "#{Time.now.utc.to_f}-#{@record_count}-#{rand(1 << 32)}",
+      )
+      "<urn:uuid:#{uuid[0, 8]}-#{uuid[8, 4]}-#{uuid[12, 4]}-" \
+        "#{uuid[16, 4]}-#{uuid[20, 12]}>"
+    end
+  end
+  # Value object representing a single WARC record.
+  WarcRecord = Struct.new(
+    :version, :headers, :body,
+    keyword_init: true
+  ) do
+    def warc_type
+      headers[:warc_type]
+    end
+    def target_uri
+      headers[:warc_target_uri]
+    end
+    def date
+      headers[:warc_date]
+    end
+    def content_type
+      headers[:content_type]
+    end
+    def content_length
+      headers[:content_length].to_i
+    end
+    def response?
+      warc_type == "response"
+    end
+    def warcinfo?
+      warc_type == "warcinfo"
+    end
+    def to_h
+      { version: version, headers: headers, body_length: body.to_s.bytesize }
+    end
+  end
+end

data/lib/archaeo.rb CHANGED Viewed

@@ -60,4 +60,13 @@ module Archaeo
   autoload :Configuration, "archaeo/configuration"
   autoload :CoverageReport, "archaeo/coverage_report"
   autoload :ProgressReport, "archaeo/progress_report"
+  autoload :ColorOutput, "archaeo/color_output"
+  autoload :WarcReader, "archaeo/warc_support"
+  autoload :WarcWriter, "archaeo/warc_support"
+  autoload :WarcRecord, "archaeo/warc_support"
+  autoload :ParallelCdx, "archaeo/parallel_cdx"
+  autoload :ContentTracker, "archaeo/content_tracker"
+  autoload :ContentChangeReport, "archaeo/content_tracker"
+  autoload :ArchiveSearch, "archaeo/archive_search"
+  autoload :SearchResult, "archaeo/archive_search"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: archaeo
 version: !ruby/object:Gem::Version
-  version: 0.2.9
+  version: 0.2.10
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-05-12 00:00:00.000000000 Z
+date: 2026-05-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -72,6 +72,7 @@ files:
 - exe/archaeo
 - lib/archaeo.rb
 - lib/archaeo/archive_health_check.rb
+- lib/archaeo/archive_search.rb
 - lib/archaeo/archive_url.rb
 - lib/archaeo/asset_extractor.rb
 - lib/archaeo/asset_list.rb
@@ -83,7 +84,9 @@ files:
 - lib/archaeo/cdx_filter.rb
 - lib/archaeo/cdx_timeline.rb
 - lib/archaeo/cli.rb
+- lib/archaeo/color_output.rb
 - lib/archaeo/configuration.rb
+- lib/archaeo/content_tracker.rb
 - lib/archaeo/coverage_report.rb
 - lib/archaeo/download_scheduler.rb
 - lib/archaeo/download_state.rb
@@ -92,6 +95,7 @@ files:
 - lib/archaeo/http_client.rb
 - lib/archaeo/page.rb
 - lib/archaeo/page_bundle.rb
+- lib/archaeo/parallel_cdx.rb
 - lib/archaeo/path_sanitizer.rb
 - lib/archaeo/pattern_filter.rb
 - lib/archaeo/progress_report.rb
@@ -105,6 +109,7 @@ files:
 - lib/archaeo/url_normalizer.rb
 - lib/archaeo/url_rewriter.rb
 - lib/archaeo/version.rb
+- lib/archaeo/warc_support.rb
 - sig/archaeo.rbs
 homepage: https://github.com/riboseinc/archaeo
 licenses: