RubyGems - archaeo - Versions diffs - 0.2.4 → 0.2.5 - Mend

archaeo 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/README.adoc +78 -3
data/lib/archaeo/archive_url.rb +12 -0
data/lib/archaeo/asset_extractor.rb +117 -8
data/lib/archaeo/asset_list.rb +24 -1
data/lib/archaeo/availability_api.rb +3 -1
data/lib/archaeo/availability_result.rb +16 -2
data/lib/archaeo/bulk_downloader.rb +81 -13
data/lib/archaeo/cdx_api.rb +7 -0
data/lib/archaeo/cdx_filter.rb +21 -1
data/lib/archaeo/cli.rb +134 -58
data/lib/archaeo/download_state.rb +17 -3
data/lib/archaeo/http_client.rb +96 -14
data/lib/archaeo/page.rb +29 -0
data/lib/archaeo/page_bundle.rb +14 -0
data/lib/archaeo/save_api.rb +3 -3
data/lib/archaeo/save_result.rb +3 -2
data/lib/archaeo/snapshot.rb +40 -0
data/lib/archaeo/timestamp.rb +22 -0
data/lib/archaeo/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 721131f1413aaacb26685abe006fdd243e3ef53e1d5f23764b2774717aae93ec
-  data.tar.gz: f3c90969cf684e06a6cdd1e0025a605141e1ee543430488fbf960e787ca1ba7d
+  metadata.gz: e318dfb4a6478af2e663418fda9952308323be35ef9fc6582a5fa3a327cdbb6d
+  data.tar.gz: 2f745ac2ea371e6b64d4f83ca39d0f247991882d3104bcff90e523b73e421f9b
 SHA512:
-  metadata.gz: 494ba22650c56df4a2ae119f0b6965679441bf988f013fe22f54c8c02e285d9df6ade6db4c2574ec23ba3e95f917e541e96dbd79a5b6deae178d7b6eaa5fd1a7
-  data.tar.gz: cbaf296d51ecae3ad77eee66100f6ca6aa40d0ddf0abd3b6c73b7c61b5cf92131b991437c28537a4db97de0182b7a9133df54ad07c7017d6f613760bcadf3cca
+  metadata.gz: dc20f6483c99aba0059a224dba1758cec00d3d5921e7f8296b9826554f8d45780981571df3bdd8c05d0704066e14163c7e1a192339da30b3b98a367b0860a669
+  data.tar.gz: f4ca21a9c5d5f68d29bfe24ff5caf598a9e8819c7bfc920e46cba3d3a9980f4a086433c0305897fc7506e8cc002d943b94b5c4ea15e12372d0b92389df30f3c3

data/README.adoc CHANGED Viewed

@@ -49,6 +49,11 @@ near   = cdx.near("example.com", timestamp: "20220101")
 before = cdx.before("example.com", timestamp: "20220101")
 after  = cdx.after("example.com", timestamp: "20220101")
+# Time range query
+cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
+  puts snap.timestamp
+end
 # Filter by status code, mimetype, or URL pattern
 cdx.snapshots("example.com",
   filters: [Archaeo::CdxFilter.by_status(200)],
@@ -57,6 +62,13 @@ cdx.snapshots("example.com",
   sort: "reverse",
 )
+# Compose multiple filters
+filters = Archaeo::CdxFilter.combine(
+  Archaeo::CdxFilter.only_successful,
+  Archaeo::CdxFilter.excluding_mimetype("text/css"),
+)
+cdx.snapshots("example.com", filters: filters)
 # Page-based pagination
 cdx.snapshots("example.com", page: 0)
@@ -77,6 +89,7 @@ result = api.near("example.com")
 result.available?   # => true/false
 result.archive_url  # => "https://web.archive.org/web/..."
 result.timestamp    # => Archaeo::Timestamp
+result.archived_status  # => HTTP status code of the archived page
 api.available?("example.com")  # => true/false
 ----
@@ -87,6 +100,7 @@ api.available?("example.com")  # => true/false
 ----
 save = Archaeo::SaveApi.new
 result = save.save("https://example.com/")
+result.url          # => "https://example.com/"
 result.archive_url  # => "https://web.archive.org/web/..."
 result.timestamp    # => Archaeo::Timestamp
 result.cached?      # => true if already archived
@@ -104,6 +118,10 @@ page.content        # => "<html>...</html>"
 page.content_type   # => "text/html"
 page.status_code    # => 200
 page.archive_url    # => full archive URL
+page.title          # => "Example Domain"
+page.html?          # => true
+page.json?          # => false
+page.size           # => content length in bytes
 # Raw (identity) mode -- no Wayback Machine rewriting
 page = fetcher.fetch("https://example.com/",
@@ -126,6 +144,12 @@ bundle.assets.js   # => ["https://example.com/app.js", ...]
 bundle.assets.images
 bundle.assets.fonts
 bundle.assets.media
+bundle.size        # => total count (page + assets)
+bundle.asset_count # => number of assets
+# Serialize asset list
+bundle.assets.to_json
+bundle.assets.counts  # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
 ----
 === Bulk Download with Resume
@@ -143,6 +167,12 @@ downloader.download("example.com", resume: true)
 # Filter by date range
 downloader.download("example.com",
                     from: "20220101", to: "20221231")
+# Parallel downloads
+downloader = Archaeo::BulkDownloader.new(
+  output_dir: "archive", concurrency: 4,
+)
+downloader.download("example.com")
 ----
 === URL Normalization
@@ -168,6 +198,33 @@ Archaeo::CdxFilter.by_status(200)           # => "statuscode:200"
 Archaeo::CdxFilter.excluding_status(404)    # => "!statuscode:404"
 Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
 Archaeo::CdxFilter.by_url("example.com")    # => "original:example.com"
+# Compose filters
+filters = Archaeo::CdxFilter.only_successful
+error_filters = Archaeo::CdxFilter.excluding_errors
+----
+=== Snapshot Convenience
+[source,ruby]
+----
+snap = cdx.near("example.com", timestamp: "20220101")
+# Status predicates
+snap.success?       # => true (200)
+snap.redirect?      # => true for 3xx
+snap.client_error?  # => true for 4xx
+snap.server_error?  # => true for 5xx
+snap.error?         # => true for 4xx/5xx
+# Fetch content directly from a snapshot
+page = snap.fetch
+# Fetch with assets
+bundle = snap.fetch_with_assets
+# JSON-serializable representation
+snap.as_json  # => Hash with primitive values only
 ----
 === Timestamps
@@ -189,6 +246,15 @@ ts = Archaeo::Timestamp.now
 # Format as 14-digit string
 ts.to_s  # => "20220615000000"
+# Standard time formats
+ts.to_iso8601  # => "2022-06-15T00:00:00Z"
+ts.to_rfc3339  # => "2022-06-15T00:00:00+00:00"
+# Arithmetic
+ts + 3600          # => Timestamp one hour later
+ts - 3600          # => Timestamp one hour earlier
+ts1 - ts2          # => seconds between timestamps
 # Comparison
 ts1 < ts2   # => true/false
 ----
@@ -207,9 +273,15 @@ archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
 # Find closest snapshot
 archaeo near example.com 20220101
+archaeo near --format json example.com 20220101
-# Check availability
+# Find oldest/newest
+archaeo oldest example.com
+archaeo newest --format json example.com
+# Check availability (with optional timestamp)
 archaeo available example.com
+archaeo available --timestamp 20220101 example.com
 # Save a URL
 archaeo save https://example.com/
@@ -226,6 +298,9 @@ archaeo fetch --identity https://example.com/ 20220615120000
 # Download all snapshots
 archaeo download example.com --output ./archive
+# Parallel downloads
+archaeo download --concurrency 4 example.com --output ./archive
 # Resume interrupted download
 archaeo download example.com --resume
@@ -267,7 +342,7 @@ Archaeo follows a model-driven, OOP design:
 | *URL Processing*
 | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
-| URL sanitization, filtering, and rewriting
+| URL sanitization, validated filtering with composition, and rewriting
 | *Asset Extraction*
 | `AssetExtractor`, `AssetList`
@@ -283,7 +358,7 @@ Archaeo follows a model-driven, OOP design:
 | *Infrastructure*
 | `HttpClient`
-| HTTP transport with retries, gzip, connection pooling
+| HTTP transport with retries, gzip, 429/503 handling, connection pooling with eviction
 |===
 All API classes accept an `HttpClient` via dependency injection for testability.

data/lib/archaeo/archive_url.rb CHANGED Viewed

@@ -36,6 +36,18 @@ module Archaeo
       @identity
     end
+    def ==(other)
+      other.is_a?(self.class) &&
+        original_url == other.original_url &&
+        timestamp == other.timestamp &&
+        identity? == other.identity?
+    end
+    alias_method :eql?, :==
+    def hash
+      [original_url, timestamp, identity?].hash
+    end
     def to_s
       suffix = identity? ? "id_" : ""
       "#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"

data/lib/archaeo/asset_extractor.rb CHANGED Viewed

@@ -10,6 +10,20 @@ module Archaeo
   # and media resources referenced by the page. Optionally resolves
   # relative URLs against a base URL.
   class AssetExtractor
+    FONT_CDN_PATTERNS = %w[
+      fonts.googleapis.com
+      fonts.gstatic.com
+      use.typekit.net
+      fast.fonts.net
+      cloud.typography.com
+    ].freeze
+    CSS_URL_PATTERN = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
+    CSS_IMAGE_PROPS = Regexp.new(
+      "(?:background-image|background|list-style-image|content|cursor)" \
+      "\\s*:[^;]*#{CSS_URL_PATTERN.source}",
+    )
     def initialize(html, base_url: nil)
       @doc = Nokogiri::HTML(html.to_s)
       @base_url = base_url
@@ -23,6 +37,7 @@ module Archaeo
       extract_fonts(list)
       extract_media(list)
       extract_inline_css(list)
+      extract_inline_styles(list)
       list
     end
@@ -32,9 +47,6 @@ module Archaeo
       @doc.css('link[rel="stylesheet"]').each do |el|
         list.add(resolve(el["href"]), type: :css)
       end
-      @doc.css('link[rel="icon"], link[rel="shortcut icon"]').each do |el|
-        list.add(resolve(el["href"]), type: :image)
-      end
     end
     def extract_js(list)
@@ -44,8 +56,42 @@ module Archaeo
     end
     def extract_images(list)
+      extract_img_tags(list)
+      extract_picture_sources(list)
+      extract_lazy_images(list)
+      extract_icon_links(list)
+    end
+    def extract_img_tags(list)
       @doc.css("img[src]").each do |el|
         list.add(resolve(el["src"]), type: :image)
+        extract_srcset(el["srcset"], list, :image)
+      end
+    end
+    def extract_picture_sources(list)
+      @doc.css("picture source[srcset]").each do |el|
+        extract_srcset(el["srcset"], list, :image)
+      end
+    end
+    def extract_lazy_images(list)
+      @doc.css("img[data-src]").each do |el|
+        list.add(resolve(el["data-src"]), type: :image)
+      end
+    end
+    def extract_icon_links(list)
+      @doc.css(
+        'link[rel~="icon"], link[rel="apple-touch-icon"], ' \
+        'link[rel="apple-touch-icon-precomposed"], ' \
+        'link[rel="mask-icon"]',
+      ).each do |el|
+        list.add(resolve(el["href"]), type: :image)
+      end
+      @doc.css('link[rel="manifest"]').each do |el|
+        list.add(resolve(el["href"]), type: :media)
       end
     end
@@ -55,29 +101,92 @@ module Archaeo
       end
       @doc.css('link[rel="stylesheet"]').each do |el|
         if font_stylesheet?(el["href"])
-          list.add(resolve(el["href"]),
-                   type: :font)
+          list.add(resolve(el["href"]), type: :font)
         end
       end
     end
     def extract_media(list)
+      extract_media_sources(list)
+      extract_video_posters(list)
+      extract_embeds(list)
+    end
+    def extract_media_sources(list)
       @doc.css("source[src], video[src], audio[src]").each do |el|
         list.add(resolve(el["src"]), type: :media)
       end
     end
+    def extract_video_posters(list)
+      @doc.css("video[poster]").each do |el|
+        list.add(resolve(el["poster"]), type: :image)
+      end
+    end
+    def extract_embeds(list)
+      @doc.css("iframe[src], embed[src]").each do |el|
+        list.add(resolve(el["src"]), type: :media)
+      end
+    end
     def extract_inline_css(list)
       @doc.css("style").each do |el|
-        extract_css_urls(el.text).each do |url|
+        text = el.text
+        extract_css_at_imports(text, list)
+        extract_css_font_urls(text, list)
+        extract_css_image_urls(text, list)
+      end
+    end
+    def extract_inline_styles(list)
+      @doc.css("[style]").each do |el|
+        style = el["style"]
+        next unless style
+        style.scan(/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/).flatten.each do |url|
+          list.add(resolve(url), type: :image)
+        end
+      end
+    end
+    def extract_srcset(srcset_value, list, type)
+      return if srcset_value.nil?
+      srcset_value.split(",").each do |entry|
+        url = entry.strip.split(/\s+/, 2).first
+        list.add(resolve(url), type: type) if url && !url.empty?
+      end
+    end
+    def extract_css_at_imports(text, list)
+      text.scan(
+        /@import\s+(?:url\(\s*['"]?([^'")\s]+)['"]?\s*\)|['"]([^'"]+)['"])/,
+      ).flatten.compact.each do |url|
+        next if url.nil? || url.empty?
+        list.add(resolve(url), type: :css)
+      end
+    end
+    def extract_css_font_urls(text, list)
+      text.scan(/@font-face\s*\{[^}]*\}/m).each do |font_block|
+        extract_css_urls(font_block).each do |url|
           list.add(resolve(url), type: :font)
         end
       end
     end
+    def extract_css_image_urls(text, list)
+      text.scan(CSS_IMAGE_PROPS).flatten.each do |url|
+        list.add(resolve(url), type: :image)
+      end
+    end
     def font_stylesheet?(href)
-      href.to_s.include?("fonts.googleapis.com") ||
-        href.to_s.include?("font")
+      return false if href.nil?
+      FONT_CDN_PATTERNS.any? { |pattern| href.include?(pattern) }
     end
     def extract_css_urls(css_text)

data/lib/archaeo/asset_list.rb CHANGED Viewed

@@ -1,11 +1,15 @@
 # frozen_string_literal: true
+require "json"
 module Archaeo
   # Categorized collection of asset URLs extracted from an archived page.
   #
   # Assets are grouped by type (css, js, image, font, media) for
   # convenient access during bulk download or local archiving.
   class AssetList
+    include Enumerable
     CATEGORIES = %i[css js image font media].freeze
     def initialize
@@ -14,7 +18,14 @@ module Archaeo
     end
     def add(url, type:)
-      @urls_by_type[type] << url unless url.nil? || url.empty?
+      return if url.nil? || url.empty?
+      return if @urls_by_type[type].include?(url)
+      @urls_by_type[type] << url
+    end
+    def each(&block)
+      all.each(&block)
     end
     def css
@@ -48,5 +59,17 @@ module Archaeo
     def empty?
       all.empty?
     end
+    def to_h
+      @urls_by_type.transform_values(&:dup)
+    end
+    def to_json(*args)
+      to_h.to_json(*args)
+    end
+    def counts
+      @urls_by_type.transform_values(&:size)
+    end
   end
 end

data/lib/archaeo/availability_api.rb CHANGED Viewed

@@ -68,12 +68,14 @@ module Archaeo
     def build_result(closest, url)
       archive_url = closest["url"].to_s.sub(%r{^http://}, "https://")
       ts = Timestamp.parse(closest["timestamp"])
+      archived_status = closest["status"].to_i
       AvailabilityResult.new(
         url: url,
-        available: closest["status"].to_s == "200",
+        available: true,
         archive_url: archive_url,
         timestamp: ts,
+        archived_status: archived_status,
       )
     end
   end

data/lib/archaeo/availability_result.rb CHANGED Viewed

@@ -6,17 +6,31 @@ module Archaeo
   # Indicates whether a URL is archived and, if so, provides
   # the closest snapshot's archive URL and timestamp.
   class AvailabilityResult
-    attr_reader :url, :archive_url, :timestamp
+    attr_reader :url, :archive_url, :timestamp, :archived_status
-    def initialize(url:, available:, archive_url: nil, timestamp: nil)
+    def initialize(url:, available:, archive_url: nil,
+                   timestamp: nil, archived_status: nil)
       @url = url
       @available = available
       @archive_url = archive_url
       @timestamp = timestamp
+      @archived_status = archived_status
     end
     def available?
       @available
     end
+    def unavailable?
+      !@available
+    end
+    def to_s
+      if available?
+        "#{url} -> #{archive_url} (#{timestamp})"
+      else
+        "#{url} -> not available"
+      end
+    end
   end
 end

data/lib/archaeo/bulk_downloader.rb CHANGED Viewed

@@ -10,27 +10,26 @@ module Archaeo
   # for interrupted download recovery.
   class BulkDownloader
     def initialize(client: HttpClient.new, output_dir: "archive",
-                   cdx_api: nil)
+                   cdx_api: nil, concurrency: 1)
       @client = client
       @output_dir = output_dir
       @cdx_api = cdx_api
+      @concurrency = [1, concurrency.to_i].max
     end
-    def download(url, from: nil, to: nil, resume: false)
+    def download(url, from: nil, to: nil, resume: false, &block)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir)
       state = DownloadState.new(@output_dir)
       snapshots = fetch_snapshots(url, from: from, to: to)
       total = snapshots.size
+      progress = block
-      snapshots.each_with_index do |snap, index|
-        next if resume && state.completed?(snap.timestamp)
-        fetch_and_save(snap)
-        state.mark_completed(snap.timestamp)
-        yield index + 1, total, snap if block_given?
+      if @concurrency == 1
+        download_sequential(snapshots, total, state, resume, progress)
+      else
+        download_concurrent(snapshots, total, state, resume, progress)
       end
     end
@@ -45,6 +44,54 @@ module Archaeo
         .select { |snap| !snap.blocked? && snap.status_code == 200 }
     end
+    def download_sequential(snapshots, total, state, resume, progress)
+      snapshots.each_with_index do |snap, index|
+        next if resume && state.completed?(snap.timestamp)
+        fetch_and_save(snap)
+        state.mark_completed(snap.timestamp)
+        progress&.call(index + 1, total, snap)
+      end
+    end
+    def download_concurrent(snapshots, total, state, resume, progress)
+      queue = snapshots.each_with_index.to_a
+      mutex = Mutex.new
+      errors = []
+      threads = Array.new(@concurrency) do
+        Thread.new do
+          process_queue(queue, total, state, resume, progress, mutex, errors)
+        end
+      end
+      threads.each(&:join)
+      return unless errors.any?
+      raise Error,
+            "#{errors.size} download(s) failed: " \
+            "#{errors.map { |s, _| s.timestamp }.join(', ')}"
+    end
+    def process_queue(queue, total, state, resume, progress, mutex, errors)
+      loop do
+        snap, index = mutex.synchronize { queue.shift }
+        break unless snap
+        next if resume && state.completed?(snap.timestamp)
+        begin
+          fetch_and_save(snap)
+          state.mark_completed(snap.timestamp)
+        rescue StandardError => e
+          mutex.synchronize { errors << [snap, e] }
+        end
+        progress&.call(index + 1, total, snap)
+      end
+    end
     def fetch_and_save(snapshot)
       fetcher = Fetcher.new(client: @client)
       page = fetcher.fetch(snapshot.original_url,
@@ -52,40 +99,61 @@ module Archaeo
       filename = build_filename(snapshot)
       FileUtils.mkdir_p(File.dirname(filename))
-      File.binwrite(filename, page.content)
+      tmp_path = "#{filename}.tmp"
+      File.binwrite(tmp_path, page.content)
+      File.rename(tmp_path, filename)
+    rescue StandardError
+      FileUtils.rm_f(tmp_path) if defined?(tmp_path)
+      raise
     end
     EXTENSION_MAP = {
       "text/html" => ".html",
       "text/css" => ".css",
+      "text/plain" => ".txt",
+      "text/javascript" => ".js",
       "application/javascript" => ".js",
+      "application/x-javascript" => ".js",
       "application/json" => ".json",
+      "application/xml" => ".xml",
       "application/pdf" => ".pdf",
+      "application/octet-stream" => ".bin",
       "image/png" => ".png",
       "image/jpeg" => ".jpg",
       "image/gif" => ".gif",
       "image/svg+xml" => ".svg",
       "image/webp" => ".webp",
+      "image/x-icon" => ".ico",
+      "image/bmp" => ".bmp",
       "font/woff2" => ".woff2",
       "font/woff" => ".woff",
+      "font/ttf" => ".ttf",
+      "font/eot" => ".eot",
       "video/mp4" => ".mp4",
       "audio/mpeg" => ".mp3",
     }.freeze
     def extension_for(snapshot)
-      EXTENSION_MAP[snapshot.mimetype] || ".bin"
+      mime = snapshot.mimetype.to_s.split(";").first.strip.downcase
+      EXTENSION_MAP[mime] || ".bin"
     end
     def build_filename(snapshot)
       ts = snapshot.timestamp.to_s
       safe_path = snapshot.original_url
         .sub(%r{\Ahttps?://}, "")
-        .gsub(%r{/}, File::SEPARATOR)
+        .gsub(%r{[<>:"|?*#]}, "_")
+        .gsub(%r{[/\\]}, File::SEPARATOR)
         .gsub(%r{[?&=]}, "_")
       safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
       safe_path = "#{safe_path}index" if safe_path.empty?
-      File.join(@output_dir, safe_path, "#{ts}#{extension_for(snapshot)}")
+      segments = safe_path.split(File::SEPARATOR).map do |seg|
+        seg.length > 200 ? seg[0..200] : seg
+      end
+      File.join(@output_dir, *segments, "#{ts}#{extension_for(snapshot)}")
     end
   end
 end

data/lib/archaeo/cdx_api.rb CHANGED Viewed

@@ -99,6 +99,13 @@ module Archaeo
             "No snapshot found after #{ts} for #{url}"
     end
+    def between(url, from:, to:, **options)
+      snapshots(url,
+                from: Timestamp.coerce(from).to_s,
+                to: Timestamp.coerce(to).to_s,
+                **options)
+    end
     # Returns the number of pages for a paginated query.
     def num_pages(url, **options)
       url = UrlNormalizer.normalize(url)

data/lib/archaeo/cdx_filter.rb CHANGED Viewed

@@ -59,10 +59,30 @@ module Archaeo
       new("urlkey:#{pattern}")
     end
+    def and(other)
+      [self, other]
+    end
+    def self.combine(*filters)
+      filters.flatten
+    end
+    def self.only_successful
+      [by_status(200)]
+    end
+    def self.excluding_errors
+      [excluding_status(404), excluding_status(500),
+       excluding_status(502), excluding_status(503)]
+    end
     private
     def validate!
-      return if @expression.empty?
+      if @expression.empty?
+        raise ArgumentError,
+              "CDX filter expression cannot be empty"
+      end
       field_name = field
       return if VALID_FIELDS.include?(field_name)