RubyGems - archaeo - Versions diffs - 0.1.0 → 0.2.0 - Mend

archaeo 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/archaeo.gemspec +5 -2
data/lib/archaeo/asset_extractor.rb +94 -0
data/lib/archaeo/asset_list.rb +52 -0
data/lib/archaeo/bulk_downloader.rb +67 -0
data/lib/archaeo/cdx_api.rb +111 -13
data/lib/archaeo/cdx_filter.rb +71 -0
data/lib/archaeo/cli.rb +86 -4
data/lib/archaeo/download_state.rb +49 -0
data/lib/archaeo/http_client.rb +75 -19
data/lib/archaeo/page.rb +66 -3
data/lib/archaeo/snapshot.rb +13 -3
data/lib/archaeo/url_normalizer.rb +56 -0
data/lib/archaeo/url_rewriter.rb +22 -0
data/lib/archaeo/version.rb +1 -1
data/lib/archaeo.rb +7 -0
metadata +42 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 65cb8ec1434b72774ed3a1d49ac87920bebb549cc5a4aebb0966b8d110d740ba
-  data.tar.gz: a10b0bf2b8555d3a259c8ec02364e3031697189a784411ab54c4a1bfd17ab402
+  metadata.gz: 24c6b37575e8f673a8e6acb7aba38264ac811236cb91e905663914d08a283289
+  data.tar.gz: 632f36d31ee83b23f727dd8eb8217929d3dad26c94c6948fb0621aff2b937701
 SHA512:
-  metadata.gz: 74eac73369d611a491152f7018d63b8fe3b46f8154a374d43d62f3ca023e1837dae666b98c795c211add5c6282fffe70576e581165aecd6644ecbccd15efe623
-  data.tar.gz: 03dd2e1ea518ef34a2b2c427c91a9a562aa328aced3b8cad4910fb634ba8e723f1cfb50bd57b247403a060c67f2f2f4638ad9e4293abb76e6225a731bb7493fd
+  metadata.gz: 33cf0ea6c5317be5aafba988e8652555a8e7a77620e93571a51c1537ae4cdd455e76d170fe78549a5e3068ec82db12138159cb87a31c1aafda5c036a6ca2511e
+  data.tar.gz: 2c0ae7a6a461913ed475dd3084e495f057db995ebf9ea8a1e6c9595aae74cd6bc65a0cdc6f2b2a3a831d9cc0e45e92e06a13a4f7653e89c0a8bbcd7226207677

data/archaeo.gemspec CHANGED Viewed

@@ -20,19 +20,22 @@ Gem::Specification.new do |spec|
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = spec.homepage
   spec.metadata["changelog_uri"] =
-    "#{spec.homepage}/blob/main/CHANGELOG.md"
+    "#{spec.homepage}/blob/main/CHANGELOG.adoc"
   spec.metadata["rubygems_mfa_required"] = "true"
   spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__,
                                              err: IO::NULL) do |ls|
     ls.readlines("\x0", chomp: true).reject do |f|
       f == __FILE__ ||
-        f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/ .rubocop])
+        f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/
+                          .rubocop TODO])
     end
   end
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+  spec.add_dependency "csv", "~> 3.3"
+  spec.add_dependency "nokogiri", "~> 1.14"
   spec.add_dependency "thor", "~> 1.3"
 end

data/lib/archaeo/asset_extractor.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require "nokogiri"
+require "uri"
+module Archaeo
+  # Extracts resource URLs from archived HTML content using Nokogiri.
+  #
+  # Parses the HTML DOM to find CSS, JavaScript, images, fonts,
+  # and media resources referenced by the page. Optionally resolves
+  # relative URLs against a base URL.
+  class AssetExtractor
+    def initialize(html, base_url: nil)
+      @doc = Nokogiri::HTML(html.to_s)
+      @base_url = base_url
+    end
+    def extract
+      list = AssetList.new
+      extract_css(list)
+      extract_js(list)
+      extract_images(list)
+      extract_fonts(list)
+      extract_media(list)
+      extract_inline_css(list)
+      list
+    end
+    private
+    def extract_css(list)
+      @doc.css('link[rel="stylesheet"]').each do |el|
+        list.add(resolve(el["href"]), type: :css)
+      end
+    end
+    def extract_js(list)
+      @doc.css("script[src]").each do |el|
+        list.add(resolve(el["src"]), type: :js)
+      end
+    end
+    def extract_images(list)
+      @doc.css("img[src]").each do |el|
+        list.add(resolve(el["src"]), type: :image)
+      end
+    end
+    def extract_fonts(list)
+      @doc.css('link[rel="preload"][as="font"]').each do |el|
+        list.add(resolve(el["href"]), type: :font)
+      end
+      @doc.css('link[rel="stylesheet"]').each do |el|
+        if font_stylesheet?(el["href"])
+          list.add(resolve(el["href"]),
+                   type: :font)
+        end
+      end
+    end
+    def extract_media(list)
+      @doc.css("source[src], video[src], audio[src]").each do |el|
+        list.add(resolve(el["src"]), type: :media)
+      end
+    end
+    def extract_inline_css(list)
+      @doc.css("style").each do |el|
+        extract_css_urls(el.text).each do |url|
+          list.add(resolve(url), type: :font)
+        end
+      end
+    end
+    def font_stylesheet?(href)
+      href.to_s.include?("fonts.googleapis.com") ||
+        href.to_s.include?("font")
+    end
+    def extract_css_urls(css_text)
+      css_text.scan(/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/).flatten
+    end
+    def resolve(url)
+      return url if url.nil? || url.empty?
+      return url if url.start_with?("http", "//", "data:", "#")
+      return url unless @base_url
+      URI.join(@base_url, url).to_s
+    rescue URI::InvalidURIError
+      url
+    end
+  end
+end

data/lib/archaeo/asset_list.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+module Archaeo
+  # Categorized collection of asset URLs extracted from an archived page.
+  #
+  # Assets are grouped by type (css, js, image, font, media) for
+  # convenient access during bulk download or local archiving.
+  class AssetList
+    CATEGORIES = %i[css js image font media].freeze
+    def initialize
+      @urls_by_type = {}
+      CATEGORIES.each { |c| @urls_by_type[c] = [] }
+    end
+    def add(url, type:)
+      @urls_by_type[type] << url unless url.nil? || url.empty?
+    end
+    def css
+      @urls_by_type[:css]
+    end
+    def js
+      @urls_by_type[:js]
+    end
+    def images
+      @urls_by_type[:image]
+    end
+    def fonts
+      @urls_by_type[:font]
+    end
+    def media
+      @urls_by_type[:media]
+    end
+    def all
+      @urls_by_type.values.flatten.uniq
+    end
+    def size
+      all.size
+    end
+    def empty?
+      all.empty?
+    end
+  end
+end

data/lib/archaeo/bulk_downloader.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+require "fileutils"
+module Archaeo
+  # Downloads all archived snapshots of a URL with resume support.
+  #
+  # Queries the CDX API for matching snapshots, fetches each page,
+  # and saves content to disk. Progress is tracked in a state file
+  # for interrupted download recovery.
+  class BulkDownloader
+    def initialize(client: HttpClient.new, output_dir: "archive")
+      @client = client
+      @output_dir = output_dir
+    end
+    def download(url, from: nil, to: nil, resume: false)
+      FileUtils.mkdir_p(@output_dir)
+      state = DownloadState.new(@output_dir)
+      snapshots = fetch_snapshots(url, from: from, to: to)
+      total = snapshots.size
+      snapshots.each_with_index do |snap, index|
+        next if resume && state.completed?(snap.timestamp)
+        fetch_and_save(snap)
+        state.mark_completed(snap.timestamp)
+        yield index + 1, total, snap if block_given?
+      end
+    end
+    private
+    def fetch_snapshots(url, from:, to:)
+      cdx = CdxApi.new(client: @client)
+      options = {}
+      options[:from] = from if from
+      options[:to] = to if to
+      cdx.snapshots(url, **options)
+        .select { |snap| !snap.blocked? && snap.status_code == 200 }
+    end
+    def fetch_and_save(snapshot)
+      fetcher = Fetcher.new(client: @client)
+      page = fetcher.fetch(snapshot.original_url,
+                           timestamp: snapshot.timestamp)
+      filename = build_filename(snapshot)
+      FileUtils.mkdir_p(File.dirname(filename))
+      File.binwrite(filename, page.content)
+    end
+    def build_filename(snapshot)
+      ts = snapshot.timestamp.to_s
+      safe_path = snapshot.original_url
+        .sub(%r{\Ahttps?://}, "")
+        .gsub(%r{/}, File::SEPARATOR)
+        .gsub(%r{[?&=]}, "_")
+      safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
+      safe_path = "#{safe_path}index" if safe_path.empty?
+      File.join(@output_dir, safe_path, "#{ts}.html")
+    end
+  end
+end

data/lib/archaeo/cdx_api.rb CHANGED Viewed

@@ -6,8 +6,11 @@ require "uri"
 module Archaeo
   # Client for the Wayback Machine CDX Server API.
   #
-  # Query archived snapshots by URL, timestamp range, filters,
-  # and more. Returns Snapshot objects for each matching CDX record.
+  # Supports all CDX features: field selection, filtering with regex,
+  # collapsing, resume-key pagination, page-based pagination,
+  # closest timestamp match, resolve revisits, and counters.
+  #
+  # @see https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
   class CdxApi
     ENDPOINT = "https://web.archive.org/cdx/search/cdx"
@@ -27,17 +30,31 @@ module Archaeo
       sort: "sort",
       limit: "limit",
       closest: "closest",
+      offset: "offset",
+      page: "page",
+      page_size: "pageSize",
+      fast_latest: "fastLatest",
+      resolve_revisits: "resolveRevisits",
+      show_dupe_count: "showDupeCount",
+      show_skip_count: "showSkipCount",
+      last_skip_timestamp: "lastSkipTimestamp",
     }.freeze
     def initialize(client: HttpClient.new)
       @client = client
     end
+    # Returns an Enumerator of Snapshot objects, auto-paginating
+    # via resume key unless an explicit page is requested.
     def snapshots(url, **options)
       validate_options!(options)
       Enumerator.new do |yielder|
-        fetch_snapshots(url, options, yielder)
+        if options.key?(:page)
+          fetch_page(url, options, yielder)
+        else
+          fetch_with_resume_key(url, options, yielder)
+        end
       end
     end
@@ -75,24 +92,64 @@ module Archaeo
             "No snapshot found after #{ts} for #{url}"
     end
-    private
-    def fetch_snapshots(url, options, yielder)
-      params = build_params(url, options)
+    # Returns the number of pages for a paginated query.
+    def num_pages(url, **options)
+      params = { "url" => url, "showNumPages" => "true" }
+      merge_scalar_params!(params, options)
       response = @client.get(
         "#{ENDPOINT}?#{URI.encode_www_form(params)}",
       )
       unless response.status == 200
-        raise Error, "CDX API returned HTTP #{response.status}"
+        raise Error,
+              "CDX API returned HTTP #{response.status}"
+      end
+      response.body.strip.to_i
+    end
+    # Returns all unique original URLs under a domain.
+    def known_urls(domain, match_type: "domain")
+      snapshots(domain, match_type: match_type,
+                        collapse: ["urlkey"]).map(&:original_url).uniq
+    end
+    private
+    def fetch_with_resume_key(url, options, yielder)
+      params = build_params(url, options)
+      loop do
+        response = cdx_get(params)
+        return if response.body.nil? || response.body.strip.empty?
+        resume_key = parse_cdx_json(response.body, yielder)
+        break if resume_key.nil? || resume_key.empty?
+        params = params.merge("resumeKey" => resume_key)
       end
+    end
+    def fetch_page(url, options, yielder)
+      params = build_params(url, options)
+      response = cdx_get(params)
       return if response.body.nil? || response.body.strip.empty?
       parse_cdx_json(response.body, yielder)
     end
+    def cdx_get(params)
+      response = @client.get(
+        "#{ENDPOINT}?#{URI.encode_www_form(params)}",
+      )
+      return response if response.status == 200
+      raise Error, "CDX API returned HTTP #{response.status}"
+    end
     def validate_options!(options)
       validate_match_type!(options[:match_type])
       validate_sort!(options[:sort])
+      validate_filters!(options[:filters])
+      validate_collapses!(options[:collapse])
     end
     def validate_match_type!(type)
@@ -110,11 +167,27 @@ module Archaeo
             "Invalid sort: #{sort}. Use: #{SORT_ORDERS.join(', ')}"
     end
+    def validate_filters!(filters)
+      Array(filters).each { |f| CdxFilter.new(f) }
+    end
+    def validate_collapses!(collapses)
+      Array(collapses).each do |c|
+        field = c.to_s.split(":").first
+        next if CdxFilter::VALID_FIELDS.include?(field)
+        raise ArgumentError,
+              "Invalid collapse field: #{field}. " \
+              "Valid fields: #{CdxFilter::VALID_FIELDS.join(', ')}"
+      end
+    end
     def build_params(url, options)
       {
         "url" => url,
         "output" => "json",
         "fl" => ALL_FIELDS.join(","),
+        "showResumeKey" => "true",
         "gzip" => options.fetch(:gzip, true) ? "true" : "false",
       }.tap do |params|
         merge_scalar_params!(params, options)
@@ -126,23 +199,48 @@ module Archaeo
     def merge_scalar_params!(params, options)
       SCALAR_PARAMS.each do |key, api_key|
         value = options[key]
-        params[api_key] = value.to_s if value
+        next if value.nil?
+        params[api_key] = value.to_s
       end
     end
     def merge_array_params!(params, values, prefix)
       Array(values).each_with_index do |v, i|
-        params["#{prefix}#{i}"] = v
+        params["#{prefix}#{i}"] = v.to_s
       end
     end
+    # Parses CDX JSON response, handling the resume key trailer.
+    #
+    # JSON resume key format:
+    #   [header, row1, row2, ..., [], ["resume_key_value"]]
     def parse_cdx_json(body, yielder)
       json = JSON.parse(body)
-      return unless json.is_a?(Array) && json.length > 1
+      return nil unless json.is_a?(Array) && json.length > 1
+      json, resume_key = extract_resume_key(json)
-      header, *rows = json
+      header = json[0]
       field_map = header.each_with_index.to_h
-      rows.each { |row| yielder << build_snapshot(field_map, row) }
+      json[1..].each do |row|
+        next unless row.is_a?(Array) && !row.empty?
+        yielder << build_snapshot(field_map, row)
+      end
+      resume_key
+    end
+    def extract_resume_key(json)
+      last = json.last
+      return [json, nil] unless last.is_a?(Array) && last.length == 1
+      remaining = json[0..-2]
+      if remaining.last.is_a?(Array) && remaining.last.empty?
+        remaining = remaining[0..-2]
+      end
+      [remaining, last[0].to_s]
     end
     def build_snapshot(field_map, row)

data/lib/archaeo/cdx_filter.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+module Archaeo
+  # Builds and validates CDX Server API filter expressions.
+  #
+  # CDX filter format: [!]field:regex
+  # The optional ! prefix inverts the match. The field must be a
+  # recognized CDX field name. The regex is a Java-compatible
+  # regex pattern matched against the field value.
+  class CdxFilter
+    VALID_FIELDS = %w[
+      urlkey timestamp original mimetype statuscode
+      digest length
+    ].freeze
+    def initialize(expression)
+      @expression = expression.to_s
+      validate!
+    end
+    def to_s
+      @expression
+    end
+    def negated?
+      @expression.start_with?("!")
+    end
+    def field
+      stripped = @expression.delete_prefix("!")
+      stripped.split(":", 2).first.to_s
+    end
+    def self.by_status(code)
+      new("statuscode:#{code}")
+    end
+    def self.excluding_status(code)
+      new("!statuscode:#{code}")
+    end
+    def self.by_mimetype(type)
+      new("mimetype:#{type}")
+    end
+    def self.excluding_mimetype(type)
+      new("!mimetype:#{type}")
+    end
+    def self.by_digest(digest)
+      new("digest:#{digest}")
+    end
+    def self.by_url(pattern)
+      new("original:#{pattern}")
+    end
+    private
+    def validate!
+      return if @expression.empty?
+      field_name = field
+      return if VALID_FIELDS.include?(field_name)
+      raise ArgumentError,
+            "Invalid CDX filter field: #{field_name}. " \
+            "Valid fields: #{VALID_FIELDS.join(', ')}"
+    end
+  end
+end

data/lib/archaeo/cli.rb CHANGED Viewed

@@ -1,10 +1,19 @@
 # frozen_string_literal: true
+require "csv"
+require "json"
 require "thor"
 module Archaeo
   # Command-line interface powered by Thor.
   class Cli < Thor
+    map %w[--version -v] => :version
+    desc "version", "Show archaeo version"
+    def version
+      puts "archaeo #{VERSION}"
+    end
     desc "snapshots URL", "List archived snapshots for a URL"
     option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
     option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
@@ -14,12 +23,16 @@ module Archaeo
     option :collapse, type: :array, desc: "CDX collapse fields"
     option :sort, desc: "Sort order (default, closest, reverse)"
     option :limit, type: :numeric, desc: "Max snapshots to return"
+    option :format, desc: "Output format (table, json, csv)",
+                    default: "table"
     def snapshots(url)
       cdx = CdxApi.new
       opts = build_cdx_options(options)
-      cdx.snapshots(url, **opts).each do |snap|
-        puts "#{snap.timestamp}  #{snap.status_code}  " \
-             "#{snap.original_url}"
+      snaps = cdx.snapshots(url, **opts).to_a
+      case options[:format]
+      when "json" then output_json(snaps)
+      when "csv" then output_csv(snaps)
+      else output_table(snaps)
       end
     end
@@ -64,12 +77,46 @@ module Archaeo
          "Fetch archived content for a URL at a timestamp"
     option :identity, type: :boolean, default: false,
                       desc: "Fetch raw (identity) content"
+    option :output, desc: "Write content to file"
     def fetch(url, timestamp)
       page = Fetcher.new.fetch(
         url, timestamp: timestamp,
              identity: options[:identity]
       )
-      $stdout.write(page.content)
+      if options[:output]
+        write_output(options[:output], page.content)
+      else
+        $stdout.write(page.content)
+      end
+    end
+    desc "download URL", "Download all archived snapshots of a URL"
+    option :output, desc: "Output directory", default: "archive"
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    option :resume, type: :boolean, default: false,
+                    desc: "Resume interrupted download"
+    def download(url)
+      downloader = BulkDownloader.new(output_dir: options[:output])
+      downloader.download(
+        url,
+        from: options[:from],
+        to: options[:to],
+        resume: options[:resume],
+      ) do |current, total, snap|
+        warn "[#{current}/#{total}] " \
+             "#{snap.timestamp} #{snap.original_url}"
+      end
+    end
+    desc "known_urls DOMAIN",
+         "List all known URLs for a domain"
+    def known_urls(domain)
+      CdxApi.new.known_urls(domain).each do |u|
+        puts u
+      end
     end
     CDX_OPTION_MAP = {
@@ -90,5 +137,40 @@ module Archaeo
         result[api_key] = value if value
       end
     end
+    def output_table(snaps)
+      snaps.each do |snap|
+        puts "#{snap.timestamp}  #{snap.status_code}  " \
+             "#{snap.original_url}"
+      end
+    end
+    def output_json(snaps)
+      data = snaps.map do |snap|
+        {
+          timestamp: snap.timestamp.to_s,
+          status_code: snap.status_code,
+          url: snap.original_url,
+          archive_url: snap.archive_url,
+        }
+      end
+      puts JSON.generate(data)
+    end
+    def output_csv(snaps)
+      puts CSV.generate do |csv|
+        csv << %w[timestamp status_code url archive_url]
+        snaps.each do |snap|
+          csv << [snap.timestamp.to_s, snap.status_code,
+                  snap.original_url, snap.archive_url]
+        end
+      end
+    end
+    def write_output(path, content)
+      FileUtils.mkdir_p(File.dirname(path))
+      File.binwrite(path, content)
+      warn "Written to #{path}"
+    end
   end
 end

data/lib/archaeo/download_state.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+module Archaeo
+  # Tracks download progress for resume support.
+  #
+  # Persists completed snapshot timestamps to a state file within
+  # the output directory, allowing interrupted downloads to resume
+  # without re-fetching already downloaded snapshots.
+  class DownloadState
+    STATE_FILE = ".archaeo-state"
+    attr_reader :output_dir
+    def initialize(output_dir)
+      @output_dir = output_dir
+      @path = File.join(output_dir, STATE_FILE)
+    end
+    def completed?(timestamp)
+      timestamps.include?(timestamp.to_s)
+    end
+    def mark_completed(timestamp)
+      timestamps << timestamp.to_s
+      save
+    end
+    def clear
+      @timestamps = []
+      FileUtils.rm_f(@path)
+    end
+    private
+    def timestamps
+      @timestamps ||= load_timestamps
+    end
+    def load_timestamps
+      return [] unless File.exist?(@path)
+      File.readlines(@path, chomp: true).reject(&:empty?)
+    end
+    def save
+      File.write(@path, "#{timestamps.uniq.sort.join("\n")}\n")
+    end
+  end
+end

data/lib/archaeo/http_client.rb CHANGED Viewed

@@ -6,10 +6,11 @@ require "zlib"
 require "stringio"
 module Archaeo
-  # HTTP client with retry logic, gzip decompression, and
-  # rotating realistic User-Agent profiles.
+  # HTTP client with retry logic, gzip decompression,
+  # rotating realistic User-Agent profiles, and connection pooling.
   #
-  # Injected via constructor for testability.
+  # Injected via constructor for testability. Connections are reused
+  # across requests to the same host for improved performance.
   class HttpClient
     DEFAULT_TIMEOUT = 30
     DEFAULT_MAX_RETRIES = 3
@@ -21,6 +22,8 @@ module Archaeo
       IOError,
       Errno::ECONNRESET,
       Errno::ECONNREFUSED,
+      EOFError,
+      Errno::EPIPE,
     ].freeze
     USER_AGENT_PROFILES = [
@@ -60,11 +63,25 @@ module Archaeo
       @max_retries = max_retries
       @retry_delay = retry_delay
       @user_agent = user_agent
+      @connections = {}
+      @mutex = Mutex.new
     end
     def get(url, headers: {})
       merged = default_headers.merge(headers)
-      attempt_with_retries(url, merged)
+      uri = URI(url)
+      attempt_with_retries(uri, merged)
+    end
+    def shutdown
+      @mutex.synchronize do
+        @connections.each_value do |http|
+          http.finish
+        rescue StandardError
+          nil
+        end
+        @connections.clear
+      end
     end
     private
@@ -73,13 +90,52 @@ module Archaeo
       @user_agent || USER_AGENT_PROFILES.sample
     end
-    def attempt_with_retries(url, headers)
+    def connection_key(uri)
+      "#{uri.scheme}://#{uri.host}:#{uri.port}"
+    end
+    def connection_for(uri)
+      key = connection_key(uri)
+      @mutex.synchronize do
+        http = @connections[key]
+        if http && !http.active?
+          @connections.delete(key)
+          http = nil
+        end
+        @connections[key] = build_connection(uri) unless http
+        @connections[key]
+      end
+    end
+    def build_connection(uri)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = uri.scheme == "https"
+      http.read_timeout = @timeout
+      http.open_timeout = @timeout
+      http.start
+      http
+    end
+    def invalidate_connection(uri)
+      key = connection_key(uri)
+      @mutex.synchronize do
+        http = @connections.delete(key)
+        begin
+          http&.finish
+        rescue StandardError
+          nil
+        end
+      end
+    end
+    def attempt_with_retries(uri, headers)
       retries = 0
       begin
-        execute_get(url, headers)
+        execute_with_connection(uri, headers)
       rescue *TRANSIENT_ERRORS => e
         retries += 1
         raise_if_exhausted(retries, e)
+        invalidate_connection(uri)
         sleep(@retry_delay * retries)
         retry
       end
@@ -92,6 +148,19 @@ module Archaeo
             "Failed after #{retries} retries: #{error.message}"
     end
+    def execute_with_connection(uri, headers)
+      http = connection_for(uri)
+      request = Net::HTTP::Get.new(uri)
+      headers.each { |k, v| request[k] = v }
+      raw = http.request(request)
+      build_response(raw)
+    rescue *TRANSIENT_ERRORS
+      raise
+    rescue StandardError
+      invalidate_connection(uri)
+      raise
+    end
     def default_headers
       {
         "User-Agent" => select_user_agent,
@@ -103,19 +172,6 @@ module Archaeo
       }
     end
-    def execute_get(url, headers)
-      uri = URI(url)
-      Net::HTTP.start(uri.host, uri.port,
-                      use_ssl: uri.scheme == "https",
-                      read_timeout: @timeout,
-                      open_timeout: @timeout) do |http|
-        request = Net::HTTP::Get.new(uri)
-        headers.each { |k, v| request[k] = v }
-        raw = http.request(request)
-        build_response(raw)
-      end
-    end
     def build_response(raw)
       headers = raw.each_header.to_h { |k, v| [k.downcase, v] }
       Response.new(

data/lib/archaeo/page.rb CHANGED Viewed

@@ -1,22 +1,85 @@
 # frozen_string_literal: true
+require "nokogiri"
 module Archaeo
   # Model representing a fetched archived page from the Wayback Machine.
   #
   # Contains the page content, metadata, and provenance information
-  # for a single archived resource.
+  # for a single archived resource. Content is automatically transcoded
+  # to UTF-8 from the detected source encoding.
   class Page
-    attr_reader :content, :content_type, :status_code,
+    attr_reader :content_type, :status_code,
                 :archive_url, :original_url, :timestamp
     def initialize(content:, content_type:, status_code:,
                    archive_url:, original_url:, timestamp:)
-      @content = content
+      @raw_content = content
       @content_type = content_type
       @status_code = status_code
       @archive_url = archive_url
       @original_url = original_url
       @timestamp = Timestamp.coerce(timestamp)
     end
+    def content
+      @content ||= transcode(@raw_content)
+    end
+    def encoding
+      @encoding ||= detect_encoding
+    end
+    private
+    def detect_encoding
+      charset = extract_charset(@content_type)
+      return Encoding.find(charset) if charset
+      html_charset = detect_html_charset
+      return Encoding.find(html_charset) if html_charset
+      Encoding::UTF_8
+    rescue ArgumentError
+      Encoding::UTF_8
+    end
+    def extract_charset(content_type)
+      return nil unless content_type
+      match = content_type.match(/charset=([^\s;]+)/i)
+      match ? match[1] : nil
+    end
+    def detect_html_charset
+      doc = Nokogiri::HTML(@raw_content)
+      node = doc.at_css("meta[charset]")
+      return node["charset"] if node
+      content = doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content")
+      return nil unless content
+      match = content.match(/charset=([^\s;]+)/i)
+      match ? match[1] : nil
+    rescue StandardError
+      nil
+    end
+    def transcode(raw)
+      return raw if raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
+      return raw if raw.empty?
+      encode_to_utf8(raw, encoding)
+    rescue Encoding::InvalidByteSequenceError,
+           Encoding::UndefinedConversionError
+      encode_to_utf8(raw, Encoding::UTF_8)
+    end
+    def encode_to_utf8(raw, source_encoding)
+      raw.force_encoding(source_encoding)
+        .encode("UTF-8",
+                invalid: :replace, undef: :replace,
+                replace: "?")
+    end
   end
 end

data/lib/archaeo/snapshot.rb CHANGED Viewed

@@ -9,6 +9,8 @@ module Archaeo
     FIELDS = %i[urlkey timestamp original_url
                 mimetype status_code digest length].freeze
+    BLOCKED_STATUS = -1
     attr_reader(*FIELDS)
     def initialize(urlkey:, timestamp:, original_url:,
@@ -27,14 +29,22 @@ module Archaeo
       ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
     end
+    def blocked?
+      @status_code == BLOCKED_STATUS
+    end
+    def to_a
+      [@urlkey, @timestamp, @original_url, @mimetype,
+       @status_code, @digest, @length]
+    end
     def ==(other)
-      other.is_a?(self.class) &&
-        FIELDS.all? { |f| send(f) == other.send(f) }
+      other.is_a?(self.class) && to_a == other.to_a
     end
     alias_method :eql?, :==
     def hash
-      FIELDS.map { |f| send(f) }.hash
+      to_a.hash
     end
   end
 end

data/lib/archaeo/url_normalizer.rb ADDED Viewed

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+module Archaeo
+  # Sanitizes and normalizes URLs for Wayback Machine API queries.
+  #
+  # Handles common URL issues: whitespace, surrounding quotes,
+  # double percent-encoding, and inconsistent percent-encoding case.
+  class UrlNormalizer
+    attr_reader :original, :normalized
+    def initialize(url)
+      @original = url.to_s
+      @normalized = normalize(@original)
+    end
+    def self.normalize(url)
+      new(url).normalized
+    end
+    def self.with_scheme(url)
+      normalized = normalize(url)
+      normalized.match?(%r{\A[a-z][a-z0-9+\-.]*://}) ? normalized : "https://#{normalized}"
+    end
+    def to_s
+      @normalized
+    end
+    private
+    def normalize(url)
+      url = strip_whitespace(url)
+      url = strip_surrounding_quotes(url)
+      url = fix_double_percent_encoding(url)
+      normalize_percent_encoding(url)
+    end
+    def strip_whitespace(url)
+      url.strip
+    end
+    def strip_surrounding_quotes(url)
+      url = url[1..-2] if url.start_with?('"') && url.end_with?('"')
+      url = url[1..-2] if url.start_with?("'") && url.end_with?("'")
+      url
+    end
+    def fix_double_percent_encoding(url)
+      url.gsub(/%25([0-9A-Fa-f]{2})/i, '%\1')
+    end
+    def normalize_percent_encoding(url)
+      url.gsub(/%[0-9a-f]{2}/i, &:upcase)
+    end
+  end
+end

data/lib/archaeo/url_rewriter.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+module Archaeo
+  # Rewrites Wayback Machine archive URLs to local file paths.
+  #
+  # Used for saving archived pages and their assets for offline
+  # browsing. Converts absolute archive URLs into relative paths
+  # rooted at a configurable local directory.
+  class UrlRewriter
+    def initialize(archive_prefix, local_prefix)
+      @archive_prefix = archive_prefix.to_s
+      @local_prefix = local_prefix.to_s
+    end
+    def rewrite(url)
+      return url unless url.start_with?(@archive_prefix)
+      relative = url.sub(@archive_prefix, "")
+      File.join(@local_prefix, relative)
+    end
+  end
+end

data/lib/archaeo/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Archaeo
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end

data/lib/archaeo.rb CHANGED Viewed

@@ -21,10 +21,17 @@ module Archaeo
   autoload :Page, "archaeo/page"
   autoload :SaveResult, "archaeo/save_result"
   autoload :AvailabilityResult, "archaeo/availability_result"
+  autoload :UrlNormalizer, "archaeo/url_normalizer"
+  autoload :CdxFilter, "archaeo/cdx_filter"
+  autoload :AssetList, "archaeo/asset_list"
+  autoload :AssetExtractor, "archaeo/asset_extractor"
+  autoload :UrlRewriter, "archaeo/url_rewriter"
+  autoload :DownloadState, "archaeo/download_state"
   autoload :HttpClient, "archaeo/http_client"
   autoload :CdxApi, "archaeo/cdx_api"
   autoload :AvailabilityApi, "archaeo/availability_api"
   autoload :SaveApi, "archaeo/save_api"
   autoload :Fetcher, "archaeo/fetcher"
+  autoload :BulkDownloader, "archaeo/bulk_downloader"
   autoload :Cli, "archaeo/cli"
 end

metadata CHANGED Viewed

@@ -1,14 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: archaeo
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ribose Inc.
+autorequire:
 bindir: exe
 cert_chain: []
-date: 1980-01-02 00:00:00.000000000 Z
+date: 2026-05-09 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: csv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.14'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.14'
 - !ruby/object:Gem::Dependency
   name: thor
   requirement: !ruby/object:Gem::Requirement
@@ -43,10 +72,15 @@ files:
 - exe/archaeo
 - lib/archaeo.rb
 - lib/archaeo/archive_url.rb
+- lib/archaeo/asset_extractor.rb
+- lib/archaeo/asset_list.rb
 - lib/archaeo/availability_api.rb
 - lib/archaeo/availability_result.rb
+- lib/archaeo/bulk_downloader.rb
 - lib/archaeo/cdx_api.rb
+- lib/archaeo/cdx_filter.rb
 - lib/archaeo/cli.rb
+- lib/archaeo/download_state.rb
 - lib/archaeo/fetcher.rb
 - lib/archaeo/http_client.rb
 - lib/archaeo/page.rb
@@ -54,6 +88,8 @@ files:
 - lib/archaeo/save_result.rb
 - lib/archaeo/snapshot.rb
 - lib/archaeo/timestamp.rb
+- lib/archaeo/url_normalizer.rb
+- lib/archaeo/url_rewriter.rb
 - lib/archaeo/version.rb
 - sig/archaeo.rbs
 homepage: https://github.com/riboseinc/archaeo
@@ -62,8 +98,9 @@ licenses:
 metadata:
   homepage_uri: https://github.com/riboseinc/archaeo
   source_code_uri: https://github.com/riboseinc/archaeo
-  changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.md
+  changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.adoc
   rubygems_mfa_required: 'true'
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -78,7 +115,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.9
+rubygems_version: 3.5.22
+signing_key:
 specification_version: 4
 summary: Ruby client for the Internet Archive Wayback Machine APIs
 test_files: []