archaeo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
# frozen_string_literal: true

module Archaeo
  # Downloads archived content from the Wayback Machine.
  #
  # Builds the replay URL for a capture, walks any redirect chain the
  # archive returns, and wraps the final response in a Page model.
  class Fetcher
    MAX_REDIRECTS = 5
    BASE = "https://web.archive.org"

    # @param client [HttpClient] injectable HTTP transport (for tests)
    def initialize(client: HttpClient.new)
      @client = client
    end

    # Fetches the capture of +url+ closest to +timestamp+.
    #
    # @param url [String] original (live-web) URL
    # @param timestamp [Timestamp, String, Time] coerced via Timestamp.coerce
    # @param identity [Boolean] request the raw, unrewritten capture
    # @return [Page]
    def fetch(url, timestamp:, identity: false)
      ts = Timestamp.coerce(timestamp)
      target = ArchiveUrl.new(url, timestamp: ts, identity: identity).to_s
      build_page(follow_redirects(target), target, url, ts)
    end

    private

    # Assembles a Page from the final HTTP response plus provenance data.
    def build_page(response, archive_url, url, timestamp)
      Page.new(content: response.body,
               content_type: response.headers["content-type"],
               status_code: response.status,
               archive_url: archive_url,
               original_url: url,
               timestamp: timestamp)
    end

    # Follows up to +remaining+ redirects iteratively, raising Error once
    # the budget is exhausted (i.e. one request past the limit).
    def follow_redirects(url, remaining = MAX_REDIRECTS)
      current = url
      budget = remaining
      loop do
        raise Error, "Too many redirects for #{current}" if budget.negative?

        response = @client.get(current)
        return response unless redirect?(response)

        current = resolve_redirect(current, response.headers["location"])
        budget -= 1
      end
    end

    # Truthy when the response is a 3xx carrying a Location header.
    def redirect?(response)
      (300..399).cover?(response.status) && response.headers["location"]
    end

    # Resolves a Location header against the current URL:
    # absolute URLs pass through, archive-relative /web/ paths are
    # prefixed with BASE, anything else is joined per RFC 3986.
    # An unparseable location is returned as-is (best effort).
    def resolve_redirect(current_url, location)
      if location.start_with?("http")
        location
      elsif location.start_with?("/web/")
        "#{BASE}#{location}"
      else
        begin
          URI.join(current_url, location).to_s
        rescue URI::InvalidURIError
          location
        end
      end
    end
  end
end
@@ -0,0 +1,137 @@
# frozen_string_literal: true

require "net/http"
require "uri"
require "zlib"
require "stringio"

module Archaeo
  # HTTP client with retry logic, gzip decompression, and
  # rotating realistic User-Agent profiles.
  #
  # Injected via constructor for testability.
  class HttpClient
    DEFAULT_TIMEOUT = 30
    DEFAULT_MAX_RETRIES = 3
    DEFAULT_RETRY_DELAY = 2

    # Network failures considered transient: the request is retried with
    # linear backoff instead of surfacing immediately.
    TRANSIENT_ERRORS = [
      Net::ReadTimeout,
      Net::OpenTimeout,
      IOError,
      Errno::ECONNRESET,
      Errno::ECONNREFUSED,
    ].freeze

    # Realistic desktop-Chrome UA strings; one is picked at random per
    # request unless a fixed :user_agent was supplied.
    USER_AGENT_PROFILES = [
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/130.0.0.0 Safari/537.36",
      "Mozilla/5.0 (X11; Linux x86_64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/129.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
    ].freeze

    # HTTP response with status code, headers (lowercase keys), and body.
    class Response
      attr_reader :status, :headers, :body

      def initialize(status:, headers:, body:)
        @status = status
        @headers = headers
        @body = body
      end
    end

    # @param timeout [Integer] read/open timeout in seconds
    # @param max_retries [Integer] extra attempts after the first failure
    # @param retry_delay [Integer] base backoff in seconds (scaled linearly)
    # @param user_agent [String, nil] fixed UA; nil rotates USER_AGENT_PROFILES
    def initialize(timeout: DEFAULT_TIMEOUT,
                   max_retries: DEFAULT_MAX_RETRIES,
                   retry_delay: DEFAULT_RETRY_DELAY,
                   user_agent: nil)
      @timeout = timeout
      @max_retries = max_retries
      @retry_delay = retry_delay
      @user_agent = user_agent
    end

    # Performs a GET, merging +headers+ over the defaults.
    #
    # @return [Response]
    # @raise [MaximumRetriesExceeded] when transient errors persist
    def get(url, headers: {})
      merged = default_headers.merge(headers)
      attempt_with_retries(url, merged)
    end

    private

    # Fixed UA if configured, otherwise a random realistic profile.
    def select_user_agent
      @user_agent || USER_AGENT_PROFILES.sample
    end

    # Runs the request, retrying transient failures with linear backoff
    # (delay grows with each failed attempt).
    def attempt_with_retries(url, headers)
      retries = 0
      begin
        execute_get(url, headers)
      rescue *TRANSIENT_ERRORS => e
        retries += 1
        raise_if_exhausted(retries, e)
        sleep(@retry_delay * retries)
        retry
      end
    end

    # Raises once the retry budget is spent. `retries` counts every failed
    # attempt including the initial one, so the message reports "attempts"
    # (the previous "N retries" wording overstated the count by one).
    def raise_if_exhausted(retries, error)
      return unless retries > @max_retries

      raise MaximumRetriesExceeded,
            "Failed after #{retries} attempts: #{error.message}"
    end

    # Browser-like default request headers; gzip is requested and
    # transparently decompressed in decompress_body.
    def default_headers
      {
        "User-Agent" => select_user_agent,
        "Accept" => "text/html,application/xhtml+xml," \
                    "application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding" => "gzip",
        "Accept-Language" => "en-US,en;q=0.9",
        "Connection" => "keep-alive",
      }
    end

    # Opens a (possibly TLS) connection and performs one GET request.
    def execute_get(url, headers)
      uri = URI(url)
      Net::HTTP.start(uri.host, uri.port,
                      use_ssl: uri.scheme == "https",
                      read_timeout: @timeout,
                      open_timeout: @timeout) do |http|
        request = Net::HTTP::Get.new(uri)
        headers.each { |k, v| request[k] = v }
        raw = http.request(request)
        build_response(raw)
      end
    end

    # Normalizes a Net::HTTPResponse into our Response value object,
    # lowercasing header names for predictable lookups.
    def build_response(raw)
      headers = raw.each_header.to_h { |k, v| [k.downcase, v] }
      Response.new(
        status: raw.code.to_i,
        headers: headers,
        body: decompress_body(raw),
      )
    end

    # Gunzips the body when the server honored Accept-Encoding: gzip.
    # A corrupt gzip stream falls back to the raw bytes (best effort).
    def decompress_body(raw)
      body = raw.body.to_s
      return body unless raw["content-encoding"] == "gzip" && !body.empty?

      Zlib::GzipReader.new(StringIO.new(body)).read
    rescue Zlib::GzipFile::Error
      body
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Archaeo
  # Model representing a fetched archived page from the Wayback Machine.
  #
  # Carries the page body together with its metadata and provenance:
  # where it was archived, where it originally lived, and when it was
  # captured.
  class Page
    attr_reader :content, :content_type, :status_code,
                :archive_url, :original_url, :timestamp

    # @param content [String] response body
    # @param content_type [String, nil] content-type header value
    # @param status_code [Integer] HTTP status of the archive response
    # @param archive_url [String] Wayback replay URL that was fetched
    # @param original_url [String] live-web URL of the resource
    # @param timestamp [Timestamp, String, Time] capture time; coerced
    def initialize(content:, content_type:, status_code:,
                   archive_url:, original_url:, timestamp:)
      @timestamp = Timestamp.coerce(timestamp)
      @content = content
      @content_type = content_type
      @status_code = status_code
      @archive_url = archive_url
      @original_url = original_url
    end
  end
end
@@ -0,0 +1,112 @@
# frozen_string_literal: true

module Archaeo
  # Client for the Wayback Machine SavePageNow (SPN) API.
  #
  # Request the Wayback Machine to archive a URL and retrieve the
  # resulting archive URL and timestamp as a SaveResult.
  class SaveApi
    ENDPOINT = "https://web.archive.org/save"
    DEFAULT_MAX_TRIES = 8
    # Captures the 14-digit capture timestamp out of a replay URL.
    TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}

    # @param client [HttpClient] injectable HTTP transport
    # @param max_tries [Integer] total save attempts before giving up
    def initialize(client: HttpClient.new,
                   max_tries: DEFAULT_MAX_TRIES)
      @client = client
      @max_tries = max_tries
    end

    # Asks SPN to archive +url+.
    #
    # @return [SaveResult] archive URL, capture timestamp, cached? flag
    # @raise [RateLimitError, SaveFailed, MaximumRetriesExceeded]
    def save(url)
      save_url = "#{ENDPOINT}/#{url}"
      # Recorded before the first request so process_save_response can
      # judge whether the returned capture predates this save call.
      start_time = Time.now.utc
      attempt_save(save_url, start_time, url)
    end

    private

    # Polls the save endpoint up to @max_tries times; SPN often needs a
    # few requests before the capture shows up in the response headers.
    def attempt_save(save_url, start_time, url)
      @max_tries.times do |attempt|
        # No delay before the first request; see retry_delay for pacing.
        sleep(retry_delay(attempt)) if attempt.positive?

        response = @client.get(save_url)
        check_response_errors!(response, url)

        result = process_save_response(response, start_time)
        return result if result
      end

      raise MaximumRetriesExceeded,
            "Failed to save #{url} after #{@max_tries} attempts"
    end

    # Returns a SaveResult when the response headers reveal an archive
    # URL; nil signals the caller to retry.
    def process_save_response(response, start_time)
      archive_url = extract_archive_url(response)
      return nil unless archive_url

      ts = Timestamp.parse(extract_timestamp(archive_url))
      # Heuristic: a capture timestamped more than 2700 s (45 min) before
      # this save started is treated as a pre-existing (cached) capture
      # rather than a fresh one.
      cached = ts.to_time < start_time - 2700
      SaveResult.new(archive_url: archive_url,
                     timestamp: ts, cached: cached)
    end

    # Maps SPN's known failure statuses to typed errors; any other
    # status falls through and is handled by the header heuristics.
    def check_response_errors!(response, url)
      case response.status
      when 429
        raise RateLimitError, "Rate limited while saving #{url}"
      when 509
        raise SaveFailed, "Session limit reached while saving #{url}"
      end
    end

    # Pacing between polls: 5 s normally, 10 s on every third attempt.
    def retry_delay(attempt)
      ((attempt + 1) % 3).zero? ? 10 : 5
    end

    # SPN exposes the resulting capture in different headers depending on
    # response flavor; try each known location in order of reliability.
    def extract_archive_url(response)
      headers = response.headers
      from_content_location(headers) ||
        from_memento_link(headers) ||
        from_cache_key(headers) ||
        from_location(headers)
    end

    # content-location carries an archive-relative path like
    # "/web/20240101000000/..."; prefix the host to make it absolute.
    def from_content_location(headers)
      location = headers["content-location"]
      return unless location&.match?(%r{^/web/\d{14}/})

      "https://web.archive.org#{location}"
    end

    # Pulls the memento target out of the Link header.
    # NOTE(review): the pattern expects href="web.archive.org/..." with no
    # scheme, and assumes href= form at all — real SPN Link headers
    # typically use <url>; rel="memento" with an https:// scheme, so this
    # branch may never match. Verify against live responses.
    def from_memento_link(headers)
      link = headers["link"].to_s
      match = link.match(
        %r{rel="memento".*?href="(web\.archive\.org/web/\d{14}/.*?)"},
      )
      return unless match

      "https://#{match[1]}"
    end

    # x-cache-key appears to embed the archive URL followed by a
    # two-letter datacenter/region suffix; strip that suffix.
    # NOTE(review): the greedy (https.*) relies on the suffix being the
    # final two uppercase letters — confirm against real header values.
    def from_cache_key(headers)
      cache_key = headers["x-cache-key"].to_s
      match = cache_key.match(/(https.*)[A-Z]{2}/)
      match ? match[1] : nil
    end

    # Last resort: a redirect Location pointing into the archive.
    def from_location(headers)
      location = headers["location"].to_s
      match = location.match(%r{(web\.archive\.org/web/\d+/.*)$})
      return unless match

      "https://#{match[1]}"
    end

    # @return [String] the 14-digit timestamp embedded in +archive_url+
    # @raise [InvalidResponse] when no timestamp can be found
    def extract_timestamp(archive_url)
      match = archive_url.match(TIMESTAMP_RE)
      return match[1] if match

      raise InvalidResponse,
            "Cannot parse timestamp from: #{archive_url}"
    end
  end
end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Archaeo
  # Model representing the outcome of a SavePageNow request.
  #
  # Holds the resulting archive URL and capture timestamp, plus a flag
  # indicating whether the archive served a pre-existing capture.
  class SaveResult
    attr_reader :archive_url, :timestamp

    # @param archive_url [String] replay URL of the saved capture
    # @param timestamp [Timestamp, String, Time] capture time; coerced
    # @param cached [Boolean] true when an existing capture was returned
    def initialize(archive_url:, timestamp:, cached:)
      @cached = cached
      @archive_url = archive_url
      @timestamp = Timestamp.coerce(timestamp)
    end

    # True when the page was already present in the archive rather than
    # freshly captured by this request.
    def cached?
      @cached
    end
  end
end
@@ -0,0 +1,40 @@
# frozen_string_literal: true

module Archaeo
  # A single CDX Server API record representing an archived document.
  #
  # Maps the seven standard CDX fields and provides the computed
  # archive URL via the ArchiveUrl model. Instances are value objects:
  # two snapshots with the same field values are equal and hash alike.
  class Snapshot
    FIELDS = %i[urlkey timestamp original_url
                mimetype status_code digest length].freeze

    attr_reader(*FIELDS)

    # Field values are normalized on construction: strings via to_s,
    # counters via to_i, and the timestamp via Timestamp.coerce.
    def initialize(urlkey:, timestamp:, original_url:,
                   mimetype: nil, status_code: nil,
                   digest: nil, length: nil)
      @urlkey = urlkey.to_s
      @timestamp = Timestamp.coerce(timestamp)
      @original_url = original_url.to_s
      @mimetype = mimetype.to_s
      @status_code = status_code.to_i
      @digest = digest.to_s
      @length = length.to_i
    end

    # Wayback replay URL for this capture.
    def archive_url
      ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
    end

    # Field-wise equality against another Snapshot.
    def ==(other)
      other.is_a?(self.class) && state == other.state
    end
    alias_method :eql?, :==

    # Hash derived from the same field tuple used by ==, so snapshots
    # behave correctly as Hash keys and in Sets.
    def hash
      state.hash
    end

    protected

    # Tuple of all CDX field values, in FIELDS order.
    def state
      FIELDS.map { |field| public_send(field) }
    end
  end
end
@@ -0,0 +1,103 @@
# frozen_string_literal: true

module Archaeo
  # Value object representing a Wayback Machine timestamp (YYYYMMDDHHmmss).
  #
  # Supports parsing, formatting, comparison, and coercion from various
  # time representations. Truncated timestamps (e.g. "2020", "202006"),
  # which the Wayback APIs accept, default missing components to the
  # earliest value (January 1st, midnight).
  class Timestamp
    include Comparable

    FORMAT = "%Y%m%d%H%M%S"

    # Underlying UTC Time for this timestamp.
    attr_reader :to_time

    def initialize(year:, month: 1, day: 1,
                   hour: 0, minute: 0, second: 0)
      @to_time = Time.utc(year, month, day, hour, minute, second)
    end

    # Parses a (possibly truncated) Wayback timestamp string such as
    # "2020", "202006", "20200615", or "20200615123045".
    #
    # @param string [String] digits in YYYY[MM[DD[HHMMSS]]] form
    # @return [Timestamp]
    def self.parse(string)
      parts = { year: string[0, 4].to_i }
      # Only pass components that are actually present. The previous
      # version always passed month:/day:, so a truncated string (e.g.
      # "2020") passed nil explicitly, overriding the keyword defaults
      # and making Time.utc raise TypeError.
      parts[:month] = string[4, 2].to_i if string.length >= 6
      parts[:day] = string[6, 2].to_i if string.length >= 8

      new(**parts, **parse_time_parts(string))
    end

    # Hour/minute/second components when the string is long enough;
    # empty hash otherwise (constructor defaults then apply).
    def self.parse_time_parts(string)
      return {} if string.length < 10

      {
        hour: string[8, 2].to_i,
        minute: string[10, 2].to_i,
        second: string[12, 2].to_i,
      }
    end
    private_class_method :parse_time_parts

    # Builds a Timestamp from a Time (converted to UTC first).
    def self.from_time(time)
      utc = time.getutc
      new(year: utc.year, month: utc.month, day: utc.day,
          hour: utc.hour, minute: utc.min, second: utc.sec)
    end

    # Current moment as a Timestamp.
    def self.now
      from_time(Time.now)
    end

    # Coerces a Timestamp, String, or Time into a Timestamp.
    #
    # @raise [ArgumentError] for any other type
    def self.coerce(value)
      case value
      when Timestamp then value
      when String then parse(value)
      when Time then from_time(value)
      else
        raise ArgumentError,
              "Cannot coerce #{value.class} to Archaeo::Timestamp"
      end
    end

    # Canonical 14-digit Wayback form.
    def to_s
      @to_time.strftime(FORMAT)
    end

    # Ordering by the canonical string form (lexicographic order of
    # YYYYMMDDHHmmss matches chronological order). Non-Timestamps
    # compare as nil, so Comparable#== returns false for them.
    def <=>(other)
      return nil unless other.is_a?(self.class)

      to_s <=> other.to_s
    end

    # hash/eql? are aligned with <=> so equal timestamps collide as
    # Hash keys.
    def hash
      to_s.hash
    end

    def eql?(other)
      self == other
    end

    def year
      @to_time.year
    end

    def month
      @to_time.month
    end

    def day
      @to_time.day
    end

    def hour
      @to_time.hour
    end

    def minute
      @to_time.min
    end

    def second
      @to_time.sec
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Archaeo
  # Gem version string (semantic versioning: MAJOR.MINOR.PATCH).
  VERSION = "0.1.0"
end
data/lib/archaeo.rb ADDED
@@ -0,0 +1,30 @@
# frozen_string_literal: true

require_relative "archaeo/version"

# Archaeo provides a Ruby interface to the Internet Archive's Wayback Machine
# APIs, including the CDX Server API, Availability API, SavePageNow API,
# and content fetching.
module Archaeo
  # Base error for the gem; rescue this to catch any Archaeo failure.
  class Error < StandardError; end
  # Raised when a lookup finds no archived snapshot for the URL.
  class NoSnapshotFound < Error; end
  # Raised for sites the archive refuses to serve (blocked/excluded).
  class BlockedSiteError < Error; end
  # Raised on HTTP 429 responses (e.g. while saving via SaveApi).
  class RateLimitError < Error; end
  # Raised when retry budgets are exhausted (HttpClient, SaveApi).
  class MaximumRetriesExceeded < Error; end
  # Raised when a requested archive resource is unavailable.
  class ArchiveNotAvailable < Error; end
  # Raised when an API response cannot be parsed (e.g. no timestamp).
  class InvalidResponse < Error; end
  # Raised when a SavePageNow request fails (e.g. HTTP 509 session limit).
  class SaveFailed < Error; end

  # Constants are loaded lazily on first reference via autoload, keeping
  # `require "archaeo"` cheap.
  autoload :Timestamp, "archaeo/timestamp"
  autoload :ArchiveUrl, "archaeo/archive_url"
  autoload :Snapshot, "archaeo/snapshot"
  autoload :Page, "archaeo/page"
  autoload :SaveResult, "archaeo/save_result"
  autoload :AvailabilityResult, "archaeo/availability_result"
  autoload :HttpClient, "archaeo/http_client"
  autoload :CdxApi, "archaeo/cdx_api"
  autoload :AvailabilityApi, "archaeo/availability_api"
  autoload :SaveApi, "archaeo/save_api"
  autoload :Fetcher, "archaeo/fetcher"
  autoload :Cli, "archaeo/cli"
end