RubyGems - archaeo - Versions diffs - 0.2.8 → 0.2.9 - Mend

archaeo 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/lib/archaeo/bulk_downloader.rb +11 -1
data/lib/archaeo/cli.rb +122 -1
data/lib/archaeo/configuration.rb +94 -0
data/lib/archaeo/coverage_report.rb +101 -0
data/lib/archaeo/download_scheduler.rb +102 -0
data/lib/archaeo/http_client.rb +4 -1
data/lib/archaeo/page.rb +53 -0
data/lib/archaeo/progress_report.rb +50 -0
data/lib/archaeo/save_api.rb +4 -1
data/lib/archaeo/snapshot_diff.rb +135 -0
data/lib/archaeo/version.rb +1 -1
data/lib/archaeo.rb +5 -0
metadata +6 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67239af7cc927c495c67a849ecefb1cdc886ce8d95ddd6e27a2decdde6a93cd3
-  data.tar.gz: 8ce4a0f786c2e7db3268b6660a1aa9e2f3b913ff99c22c85c3c2190457defc90
+  metadata.gz: fb2b99e313bf2a3ac807cebf0052d369d83c8514ad89a1b9ca18deed421a0c4d
+  data.tar.gz: fa1f9536f838d8246706d5eca3350ba4eae97546ae88c48e28a27e5df952d987
 SHA512:
-  metadata.gz: ca0a9cc2bf0ad33a0d3dfd88e3228fd79fc3291a42fd3d13bbfbe4e37e744b0e3a5dadcec1cab48c0e13b6af872a8e3f4e80ce3e6593b18f024416b9cf7370fa
-  data.tar.gz: bb4b1d9e720dfdcc18c7c4ccb73cc55e29a3e31fb6ffb5bf3b8c0fce1548a63a06da4a710ab5fc5020f142ede69dbdfbef5451944183e280e86e018379a792eb
+  metadata.gz: 148875e2dae2319e4c96d892c2233bd889e3c193a9ddad8995faded2d637ce398a3e971d27ca05c62c93674c3c6db05322814d823d8493c8d0318f052e1278d4
+  data.tar.gz: ee1d6df5dc3623d6aee2e7803b82306c69099a72b52475e6969ef5a2a4bdff73ea4544faf3aa6fdcd54e4594a1020dc7b5dd05ffe5d437327eda18a5c23d35ec

data/lib/archaeo/bulk_downloader.rb CHANGED Viewed

@@ -29,7 +29,8 @@ module Archaeo
     def download(url, from: nil, to: nil, resume: false,
                  dry_run: false, all_timestamps: false,
                  filter: nil, page_requisites: false,
-                 snapshot_at: nil, &block)
+                 snapshot_at: nil, max_snapshots: nil,
+                 strategy: nil, &block)
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir) unless dry_run
@@ -38,6 +39,8 @@ module Archaeo
                                        all_timestamps: all_timestamps,
                                        snapshot_at: snapshot_at)
       snapshots = apply_filter(snapshots, filter)
+      snapshots = schedule_snapshots(snapshots, strategy)
+      snapshots = snapshots.first(max_snapshots) if max_snapshots
       downloaded, skipped, bytes, failed =
         run_download(snapshots, resume, dry_run, page_requisites, block)
@@ -70,6 +73,13 @@ module Archaeo
       snapshots.select { |snap| filter.match?(snap.original_url) }
     end
+    def schedule_snapshots(snapshots, strategy)
+      return snapshots unless strategy
+      scheduler = DownloadScheduler.new(strategy: strategy)
+      scheduler.schedule(snapshots)
+    end
     def run_download(snapshots, resume, dry_run, page_requisites, progress)
       state = DownloadState.new(@output_dir)
       total = snapshots.size

data/lib/archaeo/cli.rb CHANGED Viewed

@@ -27,6 +27,8 @@ module Archaeo
     option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
     option :match_type,
            desc: "Match type (exact, prefix, host, domain)"
+    option :exact_url, type: :boolean, default: false,
+                       desc: "Match exact URL only"
     option :filter, type: :array, desc: "CDX filter expressions"
     option :filter_status, type: :array,
                            desc: "Only include these status codes"
@@ -39,6 +41,8 @@ module Archaeo
                     default: "table"
     option :fields, type: :array,
                     desc: "Specific fields to print (timestamp,original,etc)"
+    option :list_only, type: :boolean, default: false,
+                       desc: "List files that would be downloaded"
     def snapshots(url)
       fmt = validate_output_format
       handle_errors do
@@ -228,6 +232,8 @@ module Archaeo
     option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
     option :resume, type: :boolean, default: false,
                     desc: "Resume interrupted download"
+    option :reset, type: :boolean, default: false,
+                   desc: "Clear download state and cache for fresh start"
     option :concurrency, type: :numeric, default: 1,
                          desc: "Number of parallel downloads"
     option :dry_run, type: :boolean, default: false,
@@ -241,6 +247,15 @@ module Archaeo
     option :snapshot_at, desc: "Download composite snapshot at timestamp"
     option :rate_limit, type: :numeric, default: 0,
                         desc: "Min seconds between requests"
+    option :max_snapshots, type: :numeric,
+                           desc: "Limit to N most recent snapshots"
+    option :recursive_subdomains, type: :boolean, default: false,
+                                  desc: "Discover and download subdomains"
+    option :subdomain_depth, type: :numeric, default: 1,
+                             desc: "Max subdomain recursion depth"
+    option :strategy, desc: "Download strategy (newest_first, oldest_first, " \
+                            "breadth_first, depth_first)",
+                      default: "newest_first"
     def download(url)
       handle_errors do
         rate_limiter = RateLimiter.new(
@@ -307,6 +322,35 @@ module Archaeo
       end
     end
+    desc "coverage URL",
+         "Analyze archive coverage for a URL"
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def coverage(url)
+      handle_errors do
+        analyzer = CoverageAnalyzer.new
+        report = analyzer.analyze(url, from: options[:from], to: options[:to])
+        output_coverage(report)
+      end
+    end
+    desc "snapshot-diff URL TIMESTAMP_A TIMESTAMP_B",
+         "Compare two snapshots of a URL"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def snapshot_diff(url, timestamp_a, timestamp_b)
+      handle_errors do
+        fetcher = Fetcher.new
+        page_a = fetcher.fetch(url, timestamp: timestamp_a)
+        page_b = fetcher.fetch(url, timestamp: timestamp_b)
+        diff = SnapshotDiff.new(
+          url: url, page_a: page_a, page_b: page_b,
+          timestamp_a: timestamp_a, timestamp_b: timestamp_b
+        )
+        output_snapshot_diff(diff)
+      end
+    end
     CDX_OPTION_MAP = {
       from: :from,
       to: :to,
@@ -352,6 +396,9 @@ module Archaeo
     def fetch_snapshots(url)
       cdx = CdxApi.new
       opts = build_cdx_options(options)
+      # --exact-url forces the "exact" match type, replacing whatever
+      # build_cdx_options put under :match_type.
+      opts[:match_type] = "exact" if options[:exact_url]
       cdx.snapshots(url, **opts).to_a
     end
@@ -435,6 +482,11 @@ module Archaeo
     end
     def download_with_progress(downloader, url, filter)
+      if options[:reset]
+        state = DownloadState.new(options[:output])
+        state.clear
+      end
       summary = downloader.download(
         url,
         from: options[:from], to: options[:to],
@@ -442,9 +494,32 @@ module Archaeo
         all_timestamps: options[:all_timestamps],
         filter: filter,
         page_requisites: options[:page_requisites],
-        snapshot_at: options[:snapshot_at]
+        snapshot_at: options[:snapshot_at],
+        max_snapshots: options[:max_snapshots],
+        strategy: options[:strategy]&.to_sym
       ) { |c, t, s| print_progress(c, t, s) }
       print_summary(summary)
+      return unless options[:recursive_subdomains]
+      discover_and_download_subdomains(url, downloader, filter)
+    end
+    def discover_and_download_subdomains(url, downloader, filter)
+      discovery = SubdomainDiscovery.new(
+        URI.parse(UrlNormalizer.normalize(url)).host,
+        max_depth: options[:subdomain_depth],
+      )
+      subdomains = discovery.scan_files(options[:output])
+      subdomains.each do |subdomain|
+        warn "Downloading subdomain: #{subdomain}" unless quiet?
+        downloader.download(
+          subdomain,
+          from: options[:from], to: options[:to],
+          resume: options[:resume],
+          filter: filter
+        ) { |c, t, s| print_progress(c, t, s) }
+      end
     end
     def output_health(report)
@@ -625,5 +700,51 @@ module Archaeo
       end
       dupes
     end
+    def output_coverage(report)
+      case options[:format]
+      when "json"
+        puts JSON.generate(report.as_json)
+      else
+        puts "URL: #{report.url}"
+        puts "Total URLs: #{report.total_urls}"
+        puts "Archived URLs: #{report.archived_urls}"
+        puts "Coverage: #{report.coverage_percent}%"
+        puts "Missing: #{report.missing_count}"
+        if report.has_gaps?
+          puts "Temporal gaps:"
+          report.temporal_gaps.each do |gap|
+            puts "  #{gap[:from]} → #{gap[:to]} (#{gap[:gap_days]} days)"
+          end
+        end
+        puts "Status distribution:"
+        report.status_distribution.sort_by { |_, v| -v }.each do |code, count|
+          puts "  #{code}: #{count}"
+        end
+      end
+    end
+    def output_snapshot_diff(diff)
+      case options[:format]
+      when "json"
+        puts JSON.generate(diff.as_json)
+      else
+        puts "Comparing #{diff.to_h[:timestamp_a]} vs #{diff.to_h[:timestamp_b]}"
+        puts "Content changed: #{diff.content_changed? ? 'Yes' : 'No'}"
+        link_changes = diff.link_changes
+        puts "Links added: #{link_changes[:added].size}"
+        puts "Links removed: #{link_changes[:removed].size}"
+        asset_changes = diff.asset_changes
+        puts "Assets added: #{asset_changes[:added].size}"
+        puts "Assets removed: #{asset_changes[:removed].size}"
+        structural = diff.structural_changes
+        unless structural.empty?
+          puts "Structural changes:"
+          structural.each do |tag, change|
+            puts "  <#{tag}>: #{change[:from]} → #{change[:to]}"
+          end
+        end
+      end
+    end
   end
 end

data/lib/archaeo/configuration.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require "fileutils"
+require "yaml"
+module Archaeo
+  # Manages persistent configuration across sessions.
+  #
+  # Loads settings from .archaeo.yml files, supports named profiles,
+  # and falls back to sensible defaults. Settings cascade: defaults
+  # < global config < profile overrides.
+  class Configuration
+    DEFAULTS = {
+      "output_dir" => "archive",
+      "format" => "table",
+      "rate_limit" => 0,
+      "concurrency" => 1,
+      "max_retries" => 3,
+    }.freeze
+    def initialize(path: ".archaeo.yml")
+      @path = path
+      @data = load_config
+    end
+    def get(key, profile: nil)
+      keys = key.to_s.split(".")
+      value = dig_nested(@data, keys, profile)
+      value.nil? ? DEFAULTS[keys.last] : value
+    end
+    def profile(name)
+      profiles = @data["profiles"] || {}
+      profiles[name.to_s] || {}
+    end
+    def profiles
+      (@data["profiles"] || {}).keys
+    end
+    def set(key, value, profile: nil)
+      if profile
+        @data["profiles"] ||= {}
+        @data["profiles"][profile.to_s] ||= {}
+        @data["profiles"][profile.to_s][key.to_s] = value
+      else
+        @data["defaults"] ||= {}
+        @data["defaults"][key.to_s] = value
+      end
+      save_config
+    end
+    def to_h
+      {
+        defaults: @data.fetch("defaults", {}),
+        profiles: @data.fetch("profiles", {}),
+      }
+    end
+    def save(path: nil)
+      target = path || @path
+      File.write(target, YAML.dump(@data))
+    end
+    private
+    def load_config
+      return {} unless File.exist?(@path)
+      content = File.read(@path)
+      YAML.safe_load(content, permitted_classes: [Symbol]) || {}
+    rescue StandardError
+      {}
+    end
+    def save_config
+      FileUtils.mkdir_p(File.dirname(@path)) unless File.dirname(@path) == "."
+      File.write(@path, YAML.dump(@data))
+    end
+    def dig_nested(data, keys, profile_name)
+      if profile_name
+        profile_data = data.dig("profiles", profile_name.to_s) || {}
+        return dig_value(profile_data, keys)
+      end
+      defaults = data["defaults"] || {}
+      dig_value(defaults, keys)
+    end
+    def dig_value(hash, keys)
+      keys.reduce(hash) { |h, k| h.is_a?(Hash) ? h[k] : nil }
+    end
+  end
+end

data/lib/archaeo/coverage_report.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+module Archaeo
+  # Analyzes how thoroughly a site was archived by the Wayback Machine.
+  #
+  # Produces coverage statistics including total URLs, archived URLs,
+  # coverage percentage, temporal gaps, and status distribution.
+  class CoverageReport
+    attr_reader :url, :total_urls, :archived_urls, :status_distribution,
+                :temporal_gaps, :missing_assets
+    def initialize(url:, total_urls:, archived_urls:,
+                   status_distribution: {}, temporal_gaps: [],
+                   missing_assets: [])
+      @url = url
+      @total_urls = total_urls
+      @archived_urls = archived_urls
+      @status_distribution = status_distribution
+      @temporal_gaps = temporal_gaps
+      @missing_assets = missing_assets
+    end
+    def coverage_percent
+      return 0.0 if total_urls.zero?
+      (archived_urls.to_f / total_urls * 100).round(1)
+    end
+    def missing_count
+      total_urls - archived_urls
+    end
+    def has_gaps?
+      !temporal_gaps.empty?
+    end
+    def to_h
+      {
+        url: @url,
+        total_urls: @total_urls,
+        archived_urls: @archived_urls,
+        coverage_percent: coverage_percent,
+        missing_count: missing_count,
+        status_distribution: @status_distribution,
+        temporal_gaps: @temporal_gaps,
+        missing_assets: @missing_assets,
+      }
+    end
+    def as_json(*)
+      to_h
+    end
+  end
+  # Builds a CoverageReport from CDX snapshot data.
+  class CoverageAnalyzer
+    def initialize(cdx_api: nil)
+      @cdx_api = cdx_api
+    end
+    def analyze(url, from: nil, to: nil)
+      cdx = @cdx_api || CdxApi.new
+      snapshots = cdx.snapshots(url, from: from, to: to).to_a
+      unique_urls = snapshots.map(&:original_url).uniq
+      status_dist = compute_status_distribution(snapshots)
+      gaps = compute_temporal_gaps(snapshots)
+      CoverageReport.new(
+        url: url,
+        total_urls: unique_urls.size,
+        archived_urls: snapshots.count(&:success?),
+        status_distribution: status_dist,
+        temporal_gaps: gaps,
+      )
+    end
+    private
+    def compute_status_distribution(snapshots)
+      snapshots.each_with_object(Hash.new(0)) do |snap, counts|
+        counts[snap.status_code] += 1
+      end
+    end
+    def compute_temporal_gaps(snapshots)
+      return [] if snapshots.size < 2
+      sorted = snapshots.sort_by(&:timestamp)
+      gaps = []
+      sorted.each_cons(2) do |a, b|
+        diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
+        next unless diff_days > 30
+        gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
+                  gap_days: diff_days.round }
+      end
+      gaps
+    end
+  end
+end

data/lib/archaeo/download_scheduler.rb ADDED Viewed

@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+module Archaeo
+  # Schedules and orders snapshot downloads by strategy and priority.
+  #
+  # Supports multiple ordering strategies (newest_first, oldest_first,
+  # breadth_first, depth_first) and priority rules (html_first,
+  # smallest_first, largest_first) for intelligent download ordering.
+  class DownloadScheduler
+    STRATEGIES = %i[newest_first oldest_first breadth_first depth_first].freeze
+    PRIORITIES = %i[html_first smallest_first largest_first].freeze
+    def initialize(strategy: :newest_first, priority: nil,
+                   max_file_size: nil, min_file_size: nil)
+      validate_strategy(strategy)
+      validate_priority(priority) if priority
+      @strategy = strategy
+      @priority = priority
+      @max_file_size = max_file_size
+      @min_file_size = min_file_size
+    end
+    def schedule(snapshots)
+      filtered = apply_size_filters(snapshots)
+      ordered = apply_strategy(filtered)
+      apply_priority(ordered)
+    end
+    private
+    def validate_strategy(strategy)
+      return if STRATEGIES.include?(strategy.to_sym)
+      raise ArgumentError,
+            "Invalid strategy: #{strategy}. Use: #{STRATEGIES.join(', ')}"
+    end
+    def validate_priority(priority)
+      return if PRIORITIES.include?(priority.to_sym)
+      raise ArgumentError,
+            "Invalid priority: #{priority}. Use: #{PRIORITIES.join(', ')}"
+    end
+    def apply_size_filters(snapshots)
+      result = snapshots
+      if @max_file_size
+        result = result.reject { |s| s.length && s.length > @max_file_size }
+      end
+      if @min_file_size
+        result = result.reject { |s| s.length && s.length < @min_file_size }
+      end
+      result
+    end
+    def apply_strategy(snapshots)
+      case @strategy.to_sym
+      when :newest_first
+        snapshots.sort_by { |s| -s.timestamp.to_i }
+      when :oldest_first
+        snapshots.sort_by(&:timestamp)
+      when :breadth_first
+        sort_by_depth(snapshots, depth: :shallow)
+      when :depth_first
+        sort_by_depth(snapshots, depth: :deep)
+      end
+    end
+    def apply_priority(snapshots)
+      return snapshots unless @priority
+      case @priority.to_sym
+      when :html_first
+        html, rest = snapshots.partition { |s| html?(s) }
+        html + rest
+      when :smallest_first
+        snapshots.sort_by { |s| s.length || 0 }
+      when :largest_first
+        snapshots.sort_by { |s| -(s.length || 0) }
+      end
+    end
+    def sort_by_depth(snapshots, depth:)
+      segments = snapshots.map do |snap|
+        path = snap.original_url.to_s
+        depth_count = path.count("/")
+        [snap, depth_count]
+      end
+      if depth == :shallow
+        segments.sort_by { |_, d| d }.map(&:first)
+      else
+        segments.sort_by { |_, d| -d }.map(&:first)
+      end
+    end
+    def html?(snapshot)
+      snapshot.mimetype.to_s.include?("text/html")
+    end
+  end
+end

data/lib/archaeo/http_client.rb CHANGED Viewed

@@ -64,13 +64,15 @@ module Archaeo
                    retry_delay: DEFAULT_RETRY_DELAY,
                    user_agent: nil,
                    on_request: nil,
-                   before_request: nil)
+                   before_request: nil,
+                   rate_limiter: nil)
       @timeout = timeout
       @max_retries = max_retries
       @retry_delay = retry_delay
       @user_agent = user_agent
       @on_request = on_request
       @before_request = before_request
+      @rate_limiter = rate_limiter
       @connections = {}
       @last_used = {}
       @mutex = Mutex.new
@@ -276,6 +278,7 @@ module Archaeo
     end
     def execute_tracked_request(uri, request, retry_count)
+      @rate_limiter&.wait(host: uri.host)
       http = connection_for(uri)
       start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       raw = http.request(request)

data/lib/archaeo/page.rb CHANGED Viewed

@@ -93,6 +93,52 @@ module Archaeo
       end
     end
+    def headings
+      return [] unless html?
+      @headings ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("h1, h2, h3, h4, h5, h6").map do |el|
+          { level: el.name[1].to_i, text: el.text.strip }
+        end
+      end
+    end
+    def images
+      return [] unless html?
+      @images ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("img[src]").map do |el|
+          { src: el["src"], alt: el["alt"].to_s,
+            width: el["width"]&.to_i, height: el["height"]&.to_i }
+        end
+      end
+    end
+    def forms
+      return [] unless html?
+      @forms ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("form").map do |form|
+          { action: form["action"].to_s, method: (form["method"] || "GET").upcase,
+            fields: extract_form_fields(form) }
+        end
+      end
+    end
+    def scripts
+      return [] unless html?
+      @scripts ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("script").map do |el|
+          { src: el["src"].to_s, type: el["type"].to_s }
+        end
+      end
+    end
     def to_h
       {
         content_type: @content_type,
@@ -202,5 +248,12 @@ module Archaeo
     rescue URI::InvalidURIError
       nil
     end
+    def extract_form_fields(form)
+      inputs = form.css("input, select, textarea").map do |el|
+        { name: el["name"].to_s, type: (el["type"] || el.name).to_s }
+      end
+      inputs.reject { |f| f[:name].empty? }
+    end
   end
 end

data/lib/archaeo/progress_report.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Archaeo
+  # Value object representing download progress at a point in time.
+  #
+  # Provides computed metrics like percentage, speed, and ETA
+  # based on current counters and elapsed time.
+  ProgressReport = Struct.new(
+    :current, :total, :downloaded_bytes, :elapsed, :current_url,
+    keyword_init: true
+  ) do
+    def percent_complete
+      return 0.0 if total.nil? || total.zero?
+      (current.to_f / total * 100).round(1)
+    end
+    def speed
+      return 0.0 if elapsed.nil? || elapsed.zero?
+      downloaded_bytes.to_f / elapsed
+    end
+    def eta
+      return nil if elapsed.nil? || elapsed.zero?
+      return nil if total.nil? || current.nil? || current.zero?
+      rate = current.to_f / elapsed
+      remaining = total - current
+      remaining / rate
+    end
+    def to_h
+      {
+        current: current,
+        total: total,
+        percent_complete: percent_complete,
+        downloaded_bytes: downloaded_bytes,
+        speed: speed,
+        eta: eta,
+        current_url: current_url,
+        elapsed: elapsed,
+      }
+    end
+    def as_json(*)
+      to_h.transform_values { |v| v.is_a?(Float) ? v.round(2) : v }
+    end
+  end
+end

data/lib/archaeo/save_api.rb CHANGED Viewed

@@ -11,9 +11,11 @@ module Archaeo
     TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}
     def initialize(client: HttpClient.new,
-                   max_tries: DEFAULT_MAX_TRIES)
+                   max_tries: DEFAULT_MAX_TRIES,
+                   rate_limiter: nil)
       @client = client
       @max_tries = max_tries
+      @rate_limiter = rate_limiter
     end
     def save(url)
@@ -44,6 +46,7 @@ module Archaeo
     def attempt_save(save_url, start_time, url)
       @max_tries.times do |attempt|
         sleep(retry_delay(attempt)) if attempt.positive?
+        @rate_limiter&.wait(host: "web.archive.org")
         response = @client.get(save_url)
         check_response_errors!(response, url)

data/lib/archaeo/snapshot_diff.rb ADDED Viewed

@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+require "digest"
+module Archaeo
+  # Compares two archived snapshots of the same URL.
+  #
+  # Produces text diffs, structural change analysis, link and
+  # asset change tracking between snapshots at different timestamps.
+  class SnapshotDiff
+    attr_reader :url, :snapshot_a, :snapshot_b
+    def initialize(url:, page_a:, page_b:, timestamp_a:, timestamp_b:)
+      @url = url
+      @page_a = page_a
+      @page_b = page_b
+      @timestamp_a = Timestamp.coerce(timestamp_a)
+      @timestamp_b = Timestamp.coerce(timestamp_b)
+    end
+    def content_changed?
+      content_digest(@page_a.content) != content_digest(@page_b.content)
+    end
+    def text_diff
+      lines_a = @page_a.content.to_s.lines
+      lines_b = @page_b.content.to_s.lines
+      build_unified_diff(lines_a, lines_b)
+    end
+    def link_changes
+      links_a = extract_links(@page_a)
+      links_b = extract_links(@page_b)
+      compute_set_diff(links_a, links_b)
+    end
+    def asset_changes
+      assets_a = extract_assets(@page_a)
+      assets_b = extract_assets(@page_b)
+      compute_set_diff(assets_a, assets_b)
+    end
+    def structural_changes
+      return {} unless @page_a.html? && @page_b.html?
+      elements_a = count_elements(@page_a)
+      elements_b = count_elements(@page_b)
+      build_element_diff(elements_a, elements_b)
+    end
+    def to_h
+      {
+        url: @url,
+        timestamp_a: @timestamp_a.to_s,
+        timestamp_b: @timestamp_b.to_s,
+        content_changed: content_changed?,
+        links_added: link_changes[:added],
+        links_removed: link_changes[:removed],
+        assets_added: asset_changes[:added],
+        assets_removed: asset_changes[:removed],
+        structural_changes: structural_changes,
+      }
+    end
+    def as_json(*)
+      to_h
+    end
+    private
+    def content_digest(content)
+      Digest::SHA256.hexdigest(content.to_s)
+    end
+    def build_unified_diff(lines_a, lines_b)
+      diff = []
+      max_len = [lines_a.size, lines_b.size].max
+      max_len.times do |i|
+        la = lines_a[i]
+        lb = lines_b[i]
+        if la == lb
+          diff << " #{la}"
+        else
+          diff << "- #{la}" if la
+          diff << "+ #{lb}" if lb
+        end
+      end
+      diff.join
+    end
+    def extract_links(page)
+      return Set.new unless page.html?
+      page.links.filter_map { |l| l[:href] }.to_set
+    end
+    def extract_assets(page)
+      return Set.new unless page.html?
+      extractor = AssetExtractor.new(page.content, base_url: page.archive_url)
+      extractor.extract.all.to_set
+    rescue StandardError
+      Set.new
+    end
+    def count_elements(page)
+      require "nokogiri"
+      doc = Nokogiri::HTML(page.content)
+      counts = Hash.new(0)
+      doc.css("*").each { |el| counts[el.name] += 1 }
+      counts
+    end
+    def compute_set_diff(set_a, set_b)
+      {
+        added: (set_b - set_a).to_a.sort,
+        removed: (set_a - set_b).to_a.sort,
+        unchanged: (set_a & set_b).size,
+      }
+    end
+    def build_element_diff(counts_a, counts_b)
+      all_tags = (counts_a.keys + counts_b.keys).uniq.sort
+      changes = {}
+      all_tags.each do |tag|
+        ca = counts_a[tag]
+        cb = counts_b[tag]
+        next if ca == cb
+        changes[tag] = { from: ca, to: cb }
+      end
+      changes
+    end
+  end
+end

data/lib/archaeo/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Archaeo
-  VERSION = "0.2.8"
+  VERSION = "0.2.9"
 end

data/lib/archaeo.rb CHANGED Viewed

@@ -55,4 +55,9 @@ module Archaeo
   autoload :CdxCache, "archaeo/cdx_cache"
   autoload :SubdomainDiscovery, "archaeo/subdomain_discovery"
   autoload :ArchiveHealthCheck, "archaeo/archive_health_check"
+  autoload :DownloadScheduler, "archaeo/download_scheduler"
+  autoload :SnapshotDiff, "archaeo/snapshot_diff"
+  autoload :Configuration, "archaeo/configuration"
+  autoload :CoverageReport, "archaeo/coverage_report"
+  autoload :ProgressReport, "archaeo/progress_report"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: archaeo
 version: !ruby/object:Gem::Version
-  version: 0.2.8
+  version: 0.2.9
 platform: ruby
 authors:
 - Ribose Inc.
@@ -83,6 +83,9 @@ files:
 - lib/archaeo/cdx_filter.rb
 - lib/archaeo/cdx_timeline.rb
 - lib/archaeo/cli.rb
+- lib/archaeo/configuration.rb
+- lib/archaeo/coverage_report.rb
+- lib/archaeo/download_scheduler.rb
 - lib/archaeo/download_state.rb
 - lib/archaeo/encoding_detector.rb
 - lib/archaeo/fetcher.rb
@@ -91,10 +94,12 @@ files:
 - lib/archaeo/page_bundle.rb
 - lib/archaeo/path_sanitizer.rb
 - lib/archaeo/pattern_filter.rb
+- lib/archaeo/progress_report.rb
 - lib/archaeo/rate_limiter.rb
 - lib/archaeo/save_api.rb
 - lib/archaeo/save_result.rb
 - lib/archaeo/snapshot.rb
+- lib/archaeo/snapshot_diff.rb
 - lib/archaeo/subdomain_discovery.rb
 - lib/archaeo/timestamp.rb
 - lib/archaeo/url_normalizer.rb