archaeo 0.2.10 → 0.2.11

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 235d2cba1b1e071156a873d7a63cf0fdb6ba8079eb6083e21755e723727db6d9
- data.tar.gz: 65c040c3a5984fdc1a68ca106d9ae10eab64b212ce6a72b37bec39ec57d383e2
+ metadata.gz: 76a36571f0747712c2abda1a4aef93c7ade9a83b42590e23f0148b89138451b0
+ data.tar.gz: a9eed4768d084756fbb10eda17b1f2098246fd56a93cbe91b55f693850e5008a
  SHA512:
- metadata.gz: e6eb3cdb88abb87332bbba762bf566643da717ce17557e31ed90a012bd7c164939b5eb719420f74dfa908215e0f604e71b5fb2bb8bcc7de2940e36b80524e963
- data.tar.gz: f52bc54fe3c425eeae28093810f1d90c4200391696ffe9af0e3f91366d619e4e57a425ec1e6a6a8b9aa2465d337bb2b960782fac1c3ce068d7d4b673b8306641
+ metadata.gz: fa8e01a6aa31aa678a17ce2fc4f59e324c4e8779716b7c41d876dbd366af06dda30296af446919eedc3136efe5bc2527abef60d5aa4274745e94ef7415a775fa
+ data.tar.gz: b3fd25ec4d3b10c759992226dd2d699276dbd9def9318ef343f632b69faa5c4fb0017f78ae7aa87b1b85c4fb48a642d4d667c3f43e0e768162486d63f1bf7be1
data/README.adoc CHANGED
@@ -4,7 +4,7 @@
 
  Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
 
- It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
+ It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, bulk downloading with resume support, snapshot comparison, coverage analysis, content tracking, full-text search, WARC format I/O, and more.
 
  == Installation
 
@@ -100,6 +100,18 @@ cdx.num_pages("example.com")
 
  # Discover all known URLs for a domain
  cdx.known_urls("example.com")
+
+ # Composite snapshot (point-in-time site reconstruction)
+ cdx.composite_snapshot("example.com", timestamp: "20220615",
+                        collapse: ["digest"])
+ # => picks newest snapshot per URL at or before the given timestamp
+
+ # CDX caching (speeds up repeated queries)
+ cdx = Archaeo::CdxApi.new(cache_dir: ".cache")
+
+ # Parallel CDX fetching (thread pool for multi-page queries)
+ parallel = Archaeo::ParallelCdx.new(concurrency: 4)
+ snapshots = parallel.snapshots("example.com")
  ----
 
  === Check Availability
@@ -141,6 +153,14 @@ result.as_json # => JSON-serializable Hash
  results = save.batch_save(%w[https://a.com https://b.com],
                            delay: 2, stop_on_error: false)
  results.each { |r| puts "#{r.url}: #{r.success?}" }
+
+ # Inspect response details
+ result.status_code      # => HTTP status from Save API
+ result.response_url     # => redirect URL if any
+ result.response_headers # => Hash of response headers
+
+ # With rate limiter
+ save = Archaeo::SaveApi.new(rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0))
  ----
 
  === Fetch Archived Content
@@ -182,6 +202,12 @@ page = fetcher.fetch!("https://example.com/",
  # Page links and meta extraction
  page.links     # => [{ href: "...", text: "...", external: true/false }]
  page.meta_tags # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
+
+ # Structured content extraction (HTML pages only)
+ page.headings # => [{ level: 1, text: "Title" }, { level: 2, text: "Subtitle" }]
+ page.images   # => [{ src: "photo.jpg", alt: "...", width: 800, height: 600 }]
+ page.forms    # => [{ action: "/submit", method: "POST", fields: [{ name: "q", type: "text" }] }]
+ page.scripts  # => [{ src: "app.js", type: "text/javascript" }]
  ----
 
  === Fetch Page with Assets
@@ -260,6 +286,46 @@ downloader = Archaeo::BulkDownloader.new(
    output_dir: "archive", concurrency: 4,
  )
  downloader.download("example.com")
+
+ # Download with page requisites (CSS/JS/images)
+ downloader.download("example.com", page_requisites: true)
+
+ # Point-in-time composite snapshot
+ downloader.download("example.com", snapshot_at: "20220615")
+
+ # All timestamps (not just latest per URL)
+ downloader.download("example.com", all_timestamps: true)
+
+ # URL pattern filtering
+ filter = Archaeo::PatternFilter.new(only: ".*\\.html$", exclude: nil)
+ downloader.download("example.com", filter: filter)
+
+ # Download scheduling strategies
+ scheduler = Archaeo::DownloadScheduler.new(
+   strategy: :breadth_first, # or :depth_first, :newest_first, :oldest_first
+   priority: :html_first,
+   max_file_size: 50 * 1024 * 1024,
+ )
+ # Integrates with BulkDownloader via strategy: option
+
+ # Rate limiting
+ limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
+ downloader = Archaeo::BulkDownloader.new(
+   output_dir: "archive", rate_limiter: limiter,
+ )
+
+ # Limit snapshots
+ downloader.download("example.com", max_snapshots: 10, strategy: :newest_first)
+
+ # Progress reporting
+ downloader.download("example.com") do |current, total, snap|
+   report = Archaeo::ProgressReport.new(
+     current: current, total: total,
+     downloaded_bytes: current * 1024, elapsed: 10.0,
+     current_url: snap.original_url,
+   )
+   puts "#{report.percent_complete}% — ETA #{report.eta}s"
+ end
  ----
 
  === Download State (Resume Tracking)
@@ -351,9 +417,21 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
  # Rewrite batch
  rewriter.rewrite_batch(["url1", "url2"])
 
- # Rewrite URLs within HTML (src, href, srcset, data-src, poster)
+ # Rewrite URLs within HTML (src, href, srcset, data-src, poster, action, data-url)
  # Also rewrites inline style url() and <style> element url()
  rewritten_html = rewriter.rewrite_html(html_content)
+
+ # Enhanced rewriting with JS strings, absolute URLs, and server extensions
+ rewriter = Archaeo::UrlRewriter.new(
+   "https://web.archive.org/web/20220615000000/",
+   "local",
+   rewrite_js: true,        # rewrite URLs inside JS string literals
+   rewrite_absolute: true,  # rewrite all absolute archive URLs (not just prefix match)
+   server_extensions: true, # handle .php/.asp/.jsp URLs specially
+ )
+
+ # Standalone CSS file rewriting
+ rewritten_css = rewriter.rewrite_css(css_content)
  ----
 
  === Snapshot Convenience
@@ -468,6 +546,218 @@ client.pool_stats
  # idle_times: { "web.archive.org": 12 } }
  ----
 
+ === Snapshot Comparison (Diff)
+
+ [source,ruby]
+ ----
+ diff = Archaeo::SnapshotDiff.new(
+   url: "https://example.com/",
+   page_a: page_a, page_b: page_b,
+   timestamp_a: "20220101", timestamp_b: "20220615",
+ )
+
+ diff.content_changed?   # => true/false (SHA256 digest comparison)
+ diff.text_diff          # => unified diff of content lines
+ diff.link_changes       # => { added: [...], removed: [...], unchanged: N }
+ diff.asset_changes      # => { added: [...], removed: [...], unchanged: N }
+ diff.structural_changes # => { "a" => { from: 1, to: 2 }, ... }
+ diff.to_h               # => Hash with all fields
+ ----
+
+ === Coverage Analysis
+
+ [source,ruby]
+ ----
+ analyzer = Archaeo::CoverageAnalyzer.new
+ report = analyzer.analyze("example.com", from: "20220101", to: "20221231")
+
+ report.url                 # => "example.com"
+ report.total_urls          # => unique URLs found
+ report.archived_urls       # => URLs with at least one capture
+ report.coverage_percent    # => 87.3
+ report.temporal_gaps       # => [{ from: ts, to: ts, gap_days: 45 }, ...]
+ report.has_gaps?           # => true/false
+ report.status_distribution # => { 200 => 150, 404 => 10 }
+ report.missing_assets      # => resources referenced but not archived
+ ----
+
+ === Archive Health Check
+
+ [source,ruby]
+ ----
+ checker = Archaeo::ArchiveHealthCheck.new
+ report = checker.check("example.com", from: "20220101", to: "20221231")
+
+ report.total      # => 150
+ report.accessible # => 148
+ report.missing    # => 2
+ report.errors     # => 0
+ report.details    # => [HealthDetail, ...]
+
+ # Sample a subset (for large collections)
+ report = checker.check("example.com", sample: 50)
+ ----
+
+ === Content Tracking
+
+ [source,ruby]
+ ----
+ tracker = Archaeo::ContentTracker.new
+ report = tracker.track("example.com", from: "20220101", to: "20221231")
+
+ report.changed_urls      # => URLs whose digest changed over time
+ report.new_urls          # => URLs that appeared in the second half
+ report.removed_urls      # => URLs that disappeared in the second half
+ report.content_frequency # => { "url" => unique_digest_count }
+ report.any_changes?      # => true if any changes detected
+ ----
+
+ === Archive Search
+
+ [source,ruby]
+ ----
+ searcher = Archaeo::ArchiveSearch.new
+ results = searcher.search("example.com",
+                           query: "contact us",
+                           from: "20220101",
+                           to: "20221231",
+                           case_sensitive: false,
+                           max_results: 10)
+
+ results.each do |match|
+   puts match.snapshot.timestamp # => when it was archived
+   puts match.url                # => the page URL
+   puts match.context            # => "...contact us..." with surrounding text
+ end
+ ----
+
+ === WARC Support
+
+ [source,ruby]
+ ----
+ # Export snapshots to WARC format
+ writer = Archaeo::WarcWriter.new
+ writer.write("archive/output.warc", pages)
+
+ # Gzip-compressed output
+ writer.write("archive/output.warc.gz", pages, compress: true)
+
+ # Read WARC files
+ reader = Archaeo::WarcReader.new
+ records = reader.read_records("archive/output.warc")
+
+ records.each do |record|
+   record.warc_type  # => "response" or "warcinfo"
+   record.target_uri # => original URL
+   record.body       # => archived content
+   record.response?  # => true for response records
+ end
+ ----
+
+ === Configuration
+
+ [source,ruby]
+ ----
+ # Load .archaeo.yml config
+ config = Archaeo::Configuration.new
+
+ config.get("output_dir")                   # => "archive" (default)
+ config.get("rate_limit")                   # => 0.5
+ config.get("concurrency", profile: "fast") # => 8
+
+ # Persist settings
+ config.set("rate_limit", 1.0)
+ config.set("concurrency", 4, profile: "fast")
+
+ # List profiles
+ config.profiles # => ["fast", "careful"]
+ ----
+
+ === Encoding Detection
+
+ [source,ruby]
+ ----
+ detector = Archaeo::EncodingDetector.new
+
+ # Detect encoding from content + content-type charset
+ encoding = detector.detect(binary_content, content_type: "text/html; charset=iso-8859-1")
+ # => Encoding::ISO_8859_1
+
+ # Detect from HTML meta tag
+ encoding = detector.detect("<html><head><meta charset='utf-8'>...")
+ # => Encoding::UTF_8
+
+ # Multi-encoding fallback chain
+ detector.detect(content) # tries UTF-8, ISO-8859-1, Windows-1252
+ ----
+
+ === Path Sanitization
+
+ [source,ruby]
+ ----
+ sanitizer = Archaeo::PathSanitizer.new
+ safe_path = sanitizer.sanitize("https://example.com/path?q=1&r=2")
+ # => "path_q_1_r_2"
+
+ # Handles query string hashing, recursive percent-decoding,
+ # and file/directory conflict resolution
+ ----
+
+ === Pattern Filtering
+
+ [source,ruby]
+ ----
+ # Include/exclude URL patterns
+ filter = Archaeo::PatternFilter.new(
+   only: ".*\\.html$", # regex string or %r{} Regexp
+   exclude: /\/api\//,
+ )
+
+ filter.match?("https://example.com/page.html") # => true
+ filter.match?("https://example.com/style.css") # => false
+ filter.match?("https://example.com/api/data")  # => false (excluded)
+ ----
+
+ === Subdomain Discovery
+
+ [source,ruby]
+ ----
+ discovery = Archaeo::SubdomainDiscovery.new("example.com", max_depth: 2)
+
+ # Scan downloaded files to discover subdomains
+ subdomains = discovery.scan_files("archive/")
+ # => ["cdn.example.com", "blog.example.com"]
+
+ # Scan raw content (HTML, CSS, JS)
+ subdomains = discovery.scan_content("<a href='https://blog.example.com/post'>")
+ # => ["blog.example.com"]
+ ----
+
+ === Rate Limiting
+
+ [source,ruby]
+ ----
+ # Per-host rate limiter with adaptive backoff
+ limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
+ limiter.wait(host: "web.archive.org") # sleeps if needed
+ limiter.wait(host: "api.example.com") # independent per-host tracking
+ ----
+
+ === Color Output
+
+ [source,ruby]
+ ----
+ color = Archaeo::ColorOutput.new(enabled: true)
+
+ color.success("Done!")   # green + bold
+ color.warning("Careful") # yellow + bold
+ color.error("Failed!")   # red + bold
+ color.info("Info")       # cyan
+
+ # Auto-detects from TTY, NO_COLOR env, TERM=dumb
+ color = Archaeo::ColorOutput.new # enabled: auto-detected
+ ----
+
  === Command-Line Interface
 
  [source,bash]
@@ -543,11 +833,67 @@ archaeo download --concurrency 4 example.com --output ./archive
  # Resume interrupted download
  archaeo download example.com --resume
 
+ # Download with page requisites (linked assets)
+ archaeo download --page-requisites example.com
+
+ # Point-in-time composite snapshot
+ archaeo download --snapshot-at 20220615 example.com
+
+ # All timestamps (not just latest)
+ archaeo download --all-timestamps example.com
+
+ # URL pattern filtering
+ archaeo download --only '.*\.html$' --exclude '/api/' example.com
+
+ # Download scheduling
+ archaeo download --strategy newest_first --max-snapshots 10 example.com
+
+ # Reset download state
+ archaeo download --reset example.com
+
+ # Rate limiting
+ archaeo download --rate-limit 0.5 example.com
+
+ # Recursive subdomain discovery
+ archaeo download --recursive-subdomains --subdomain-depth 2 example.com
+
  # Suppress progress messages
  archaeo --quiet download example.com
 
+ # Disable colored output
+ archaeo --no-color download example.com
+
  # Discover all known URLs for a domain
  archaeo known_urls example.com
+ archaeo known_urls --file urls.txt example.com
+ archaeo known_urls --subdomain example.com
+
+ # Check archive health
+ archaeo health example.com
+ archaeo health --from 20220101 --to 20221231 --sample 50 example.com
+
+ # Analyze archive coverage
+ archaeo coverage example.com
+ archaeo coverage --from 20220101 --to 20221231 --format json example.com
+
+ # Compare two snapshots
+ archaeo snapshot-diff example.com 20220101 20220615
+ archaeo snapshot-diff --format json example.com 20220101 20220615
+
+ # Search archived content
+ archaeo search example.com "contact us"
+ archaeo search --from 20220101 --to 20221231 --max-results 10 example.com "about"
+
+ # Track content changes over time
+ archaeo track-changes example.com
+ archaeo track-changes --from 20220101 --to 20221231 --format json example.com
+
+ # Export to WARC format
+ archaeo warc-export --output archive.warc example.com
+ archaeo warc-export --output archive.warc.gz --gzip example.com
+
+ # Save API with headers
+ archaeo save --headers https://example.com/
  ----
 
  === Error Handling
@@ -585,28 +931,32 @@ Archaeo follows a model-driven, OOP design:
  | Layer | Classes | Purpose
 
  | *Models*
- | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
+ | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`, `ProgressReport`, `CoverageReport`, `ContentChangeReport`, `SearchResult`, `WarcRecord`, `HealthReport`
  | Domain value objects with `to_h`, `as_json`, `inspect` support
 
  | *URL Processing*
- | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
- | URL sanitization, validated filtering with composition, and HTML URL rewriting
+ | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`, `PatternFilter`, `PathSanitizer`
+ | URL sanitization, validated filtering, regex include/exclude, path conflict resolution, and HTML/JS/CSS URL rewriting
 
  | *Asset Extraction*
  | `AssetExtractor`, `AssetList`
  | Parse HTML for resource URLs including preloads and modulepreload
 
  | *APIs*
- | `CdxApi`, `AvailabilityApi`, `SaveApi`
- | Query and mutate the archive
+ | `CdxApi`, `ParallelCdx`, `AvailabilityApi`, `SaveApi`, `ArchiveSearch`
+ | Query and mutate the archive, with parallel CDX fetching and full-text search
 
  | *Operations*
- | `Fetcher`, `BulkDownloader`, `DownloadState`
- | Download content with resume, dry-run, digest verification, and download summaries
+ | `Fetcher`, `BulkDownloader`, `DownloadState`, `DownloadScheduler`, `SubdomainDiscovery`
+ | Download content with resume, scheduling strategies, subdomain discovery, and digest verification
+
+ | *Analysis*
+ | `SnapshotDiff`, `CoverageAnalyzer`, `ContentTracker`, `ArchiveHealthCheck`
+ | Compare snapshots, analyze coverage, track changes over time, verify accessibility
 
  | *Infrastructure*
- | `HttpClient`
- | HTTP transport with retries, gzip, 429/503 handling, connection pooling, and per-request observability
+ | `HttpClient`, `RateLimiter`, `EncodingDetector`, `CdxCache`, `Configuration`, `ColorOutput`, `WarcWriter`, `WarcReader`
+ | HTTP transport, rate limiting, encoding detection, caching, config management, WARC I/O, and color output
  |===
 
  All API classes accept an `HttpClient` via dependency injection for testability.
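
That injection point is the natural testing seam. A minimal sketch of injecting a stub transport follows; the `http_client:` keyword and the `get(url)` response shape are assumptions for illustration, since this diff does not show the actual constructor signature:

[source,ruby]
----
# Hypothetical stub standing in for Archaeo::HttpClient in tests.
# The http_client: keyword and the response shape are assumed, not
# confirmed by this diff.
FakeResponse = Struct.new(:status, :body, keyword_init: true)

class FakeClient
  def get(url)
    FakeResponse.new(status: 200, body: "[]") # canned empty CDX reply
  end
end

cdx = Archaeo::CdxApi.new(http_client: FakeClient.new)
----
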
data/lib/archaeo/cli.rb CHANGED
@@ -194,6 +194,32 @@ module Archaeo
    end
  end
 
+ desc "rewrite-local INPUT_DIR",
+      "Rewrite previously downloaded files to use local paths"
+ option :output, desc: "Output directory (default: rewrite in-place)",
+                 required: false
+ option :prefix, desc: "Local path prefix", default: "local"
+ option :rewrite_js, type: :boolean, default: false,
+                     desc: "Rewrite URLs in JavaScript strings"
+ option :rewrite_absolute, type: :boolean, default: false,
+                           desc: "Rewrite all absolute archive URLs"
+ def rewrite_local(input_dir)
+   handle_errors do
+     output_dir = options[:output] || input_dir
+     local_rewriter = LocalRewriter.new(
+       prefix: options[:prefix],
+       rewrite_js: options[:rewrite_js],
+       rewrite_absolute: options[:rewrite_absolute],
+     )
+     summary = local_rewriter.rewrite_directory(input_dir, output_dir)
+     color = build_color
+     warn color.success(
+       "Rewrote #{summary.rewritten}/#{summary.total} files " \
+       "in #{summary.elapsed.round(1)}s",
+     )
+   end
+ end
+
  desc "diff URL TIMESTAMP_A TIMESTAMP_B",
       "Compare assets of two archived snapshots"
  option :format, desc: "Output format (table, json)", default: "table"
data/lib/archaeo/local_rewriter.rb ADDED
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require "fileutils"
+
+ module Archaeo
+   LocalRewriteSummary = Struct.new(
+     :total, :rewritten, :skipped, :elapsed,
+     keyword_init: true
+   )
+
+   # Rewrites previously downloaded files by converting archive URLs
+   # to local paths. Operates on files already on disk without fetching.
+   class LocalRewriter
+     def initialize(prefix: "local", rewrite_js: false,
+                    rewrite_absolute: false)
+       @prefix = prefix
+       @rewrite_js = rewrite_js
+       @rewrite_absolute = rewrite_absolute
+     end
+
+     def rewrite_directory(input_dir, output_dir)
+       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+       files = gather_files(input_dir)
+       rewritten = 0
+       skipped = 0
+
+       files.each do |path|
+         rel = path.sub(%r{\A#{Regexp.escape(input_dir)}/?}, "")
+         out_path = File.join(output_dir, rel)
+
+         result = rewrite_file(path, out_path)
+         result ? rewritten += 1 : skipped += 1
+       end
+
+       elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
+       LocalRewriteSummary.new(
+         total: files.size, rewritten: rewritten,
+         skipped: skipped, elapsed: elapsed
+       )
+     end
+
+     def rewrite_file(input_path, output_path)
+       content = File.read(input_path)
+       return nil unless rewrite_candidate?(content)
+
+       FileUtils.mkdir_p(File.dirname(output_path))
+       rewriter = build_rewriter
+       rewritten = apply_rewriting(rewriter, content, input_path)
+       File.write(output_path, rewritten)
+       true
+     end
+
+     private
+
+     def gather_files(dir)
+       Dir.glob(File.join(dir, "**", "*"))
+          .select { |f| File.file?(f) && text_file?(f) }
+     end
+
+     def text_file?(path)
+       ext = File.extname(path).downcase
+       TEXT_EXTENSIONS.include?(ext)
+     end
+
+     TEXT_EXTENSIONS = %w[
+       .html .htm .xhtml .css .js .json .xml .txt
+       .svg .md .yaml .yml .rss .atom
+     ].freeze
+
+     def rewrite_candidate?(content)
+       content.include?("web.archive.org")
+     end
+
+     def build_rewriter
+       UrlRewriter.new(
+         "https://web.archive.org", @prefix,
+         rewrite_js: @rewrite_js,
+         rewrite_absolute: @rewrite_absolute
+       )
+     end
+
+     def apply_rewriting(rewriter, content, path)
+       ext = File.extname(path).downcase
+       case ext
+       when ".html", ".htm", ".xhtml"
+         rewriter.rewrite_html(content)
+       when ".css"
+         rewriter.rewrite_css(content)
+       when ".js"
+         rewriter.rewrite_js(content)
+       else
+         rewrite_mixed(rewriter, content)
+       end
+     end
+
+     def rewrite_mixed(rewriter, content)
+       if content.include?("<") && content.include?(">")
+         rewriter.rewrite_html(content)
+       elsif content.include?("url(")
+         rewriter.rewrite_css(content)
+       else
+         rewriter.rewrite_js(content)
+       end
+     end
+   end
+ end
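
Beyond the new `rewrite-local` CLI command defined in `cli.rb` above, the class can be driven directly from Ruby. A minimal sketch based on the diffed source, assuming `archive/` holds files from a prior `archaeo download` run:

[source,ruby]
----
require "archaeo"

# Rewrite a downloaded tree so pages reference local copies instead of
# web.archive.org URLs; mirrors the defaults of the rewrite-local command.
rewriter = Archaeo::LocalRewriter.new(
  prefix: "local",        # local path prefix substituted for the archive host
  rewrite_js: true,       # also rewrite URLs inside JS string literals
  rewrite_absolute: false # leave non-prefix absolute URLs untouched
)

summary = rewriter.rewrite_directory("archive", "archive_local")
puts "rewrote #{summary.rewritten}/#{summary.total} files " \
     "(#{summary.skipped} skipped) in #{summary.elapsed.round(1)}s"
----
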
data/lib/archaeo/page.rb CHANGED
@@ -139,6 +139,16 @@ module Archaeo
    end
  end
 
+ def microposts
+   return [] unless html?
+
+   @microposts ||= begin
+     doc = Nokogiri::HTML(@raw_content)
+     containers = find_article_containers(doc)
+     containers.filter_map { |el| extract_micropost(el) }
+   end
+ end
+
  def to_h
    {
      content_type: @content_type,
@@ -255,5 +265,58 @@ module Archaeo
    end
    inputs.reject { |f| f[:name].empty? }
  end
+
+ ARTICLE_SELECTORS = %w[
+   article [role=article] .post .entry .blog-post
+   .hentry .post-content .entry-content .article-content
+   .story .story-body .news-article
+ ].freeze
+
+ def find_article_containers(doc)
+   found = ARTICLE_SELECTORS
+           .filter_map { |sel| doc.css(sel) }
+           .flat_map(&:to_a)
+   found.any? ? found.uniq : [doc.at_css("body") || doc]
+ end
+
+ def extract_micropost(element)
+   title = extract_micropost_title(element)
+   body = extract_micropost_body(element)
+   return nil if body.nil? || body.strip.empty?
+
+   { title: title, body: body.strip,
+     date: extract_micropost_date(element),
+     author: extract_micropost_author(element) }
+ end
+
+ def extract_micropost_title(el)
+   heading = el.at_css("h1, h2, h3, [class*=title], [class*=heading]")
+   heading&.text&.strip
+ end
+
+ def extract_micropost_body(el)
+   paragraphs = el.css("p").map(&:text).join("\n")
+   return nil if paragraphs.strip.empty?
+
+   paragraphs
+ end
+
+ def extract_micropost_date(el)
+   time = el.at_css("time[datetime]")
+   return time["datetime"] if time
+
+   date_el = el.at_css(
+     "[class*=date], [class*=time], [class*=published], " \
+     "[property='datePublished']",
+   )
+   date_el&.text&.strip
+ end
+
+ def extract_micropost_author(el)
+   author_el = el.at_css(
+     "[class*=author], [rel=author], [property='author']",
+   )
+   author_el&.text&.strip
+ end
  end
  end
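
The new `Page#microposts` reader returns an array of hashes keyed by `:title`, `:body`, `:date`, and `:author`. A short usage sketch, assuming `page` was obtained via `Fetcher#fetch!` as in the README examples:

[source,ruby]
----
# Extract article-like entries from an archived HTML page.
page.microposts.each do |post|
  puts post[:title]  # heading text, or nil if none was found
  puts post[:date]   # <time datetime> value or date-like element text
  puts post[:author] # author element text, if present
  puts post[:body]   # concatenated paragraph text
end
----
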
data/lib/archaeo/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Archaeo
-   VERSION = "0.2.10"
+   VERSION = "0.2.11"
  end
data/lib/archaeo.rb CHANGED
@@ -69,4 +69,6 @@ module Archaeo
  autoload :ContentChangeReport, "archaeo/content_tracker"
  autoload :ArchiveSearch, "archaeo/archive_search"
  autoload :SearchResult, "archaeo/archive_search"
+ autoload :LocalRewriter, "archaeo/local_rewriter"
+ autoload :LocalRewriteSummary, "archaeo/local_rewriter"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: archaeo
  version: !ruby/object:Gem::Version
-   version: 0.2.10
+   version: 0.2.11
  platform: ruby
  authors:
  - Ribose Inc.
@@ -93,6 +93,7 @@ files:
  - lib/archaeo/encoding_detector.rb
  - lib/archaeo/fetcher.rb
  - lib/archaeo/http_client.rb
+ - lib/archaeo/local_rewriter.rb
  - lib/archaeo/page.rb
  - lib/archaeo/page_bundle.rb
  - lib/archaeo/parallel_cdx.rb