archaeo 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +128 -7
- data/lib/archaeo/availability_api.rb +1 -0
- data/lib/archaeo/bulk_downloader.rb +1 -0
- data/lib/archaeo/cdx_api.rb +9 -0
- data/lib/archaeo/cli.rb +2 -1
- data/lib/archaeo/fetcher.rb +8 -0
- data/lib/archaeo/page_bundle.rb +16 -0
- data/lib/archaeo/save_api.rb +1 -0
- data/lib/archaeo/url_normalizer.rb +17 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cac7a475384c04aaa8a1879a207ac2bb5fad40347f21142de904cc78f5525717
|
|
4
|
+
data.tar.gz: 4282ca2795d7d8baefd750e0c283302ba5c8138d3105fcc94b66162d2671dceb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e1802542e03ca5f467d383897297593dc3e20232774d71c1fca57a56b8610d6b3e0985d8692554c7a7e4532604f66088c50782433cee3730276e8ac1c4de9e4
|
|
7
|
+
data.tar.gz: d763469fabd810b6c81deca4f2463982ec1e92016a029edf33871ea62e105c231ed43eb7dab0524a374a5782d0ab3cd2e8f53f9a6e0a01f61c5cfdcbcb3a3724
|
data/README.adoc
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
|
|
6
6
|
|
|
7
|
-
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs,
|
|
7
|
+
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
|
|
8
8
|
|
|
9
9
|
== Installation
|
|
10
10
|
|
|
@@ -33,7 +33,7 @@ require "archaeo"
|
|
|
33
33
|
----
|
|
34
34
|
cdx = Archaeo::CdxApi.new
|
|
35
35
|
|
|
36
|
-
# Enumerate all snapshots
|
|
36
|
+
# Enumerate all snapshots (auto-paginates via resume key)
|
|
37
37
|
cdx.snapshots("example.com").each do |snapshot|
|
|
38
38
|
puts snapshot.timestamp
|
|
39
39
|
puts snapshot.original_url
|
|
@@ -48,6 +48,23 @@ near = cdx.near("example.com", timestamp: "20220101")
|
|
|
48
48
|
# Filter by time
|
|
49
49
|
before = cdx.before("example.com", timestamp: "20220101")
|
|
50
50
|
after = cdx.after("example.com", timestamp: "20220101")
|
|
51
|
+
|
|
52
|
+
# Filter by status code, mimetype, or URL pattern
|
|
53
|
+
cdx.snapshots("example.com",
|
|
54
|
+
filters: [Archaeo::CdxFilter.by_status(200)],
|
|
55
|
+
collapse: ["digest"],
|
|
56
|
+
match_type: "domain",
|
|
57
|
+
sort: "reverse",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Page-based pagination
|
|
61
|
+
cdx.snapshots("example.com", page: 0)
|
|
62
|
+
|
|
63
|
+
# Count pages
|
|
64
|
+
cdx.num_pages("example.com")
|
|
65
|
+
|
|
66
|
+
# Discover all known URLs for a domain
|
|
67
|
+
cdx.known_urls("example.com")
|
|
51
68
|
----
|
|
52
69
|
|
|
53
70
|
=== Check Availability
|
|
@@ -94,6 +111,65 @@ page = fetcher.fetch("https://example.com/",
|
|
|
94
111
|
identity: true)
|
|
95
112
|
----
|
|
96
113
|
|
|
114
|
+
=== Fetch Page with Assets
|
|
115
|
+
|
|
116
|
+
[source,ruby]
|
|
117
|
+
----
|
|
118
|
+
fetcher = Archaeo::Fetcher.new
|
|
119
|
+
bundle = fetcher.fetch_page_with_assets("https://example.com/",
|
|
120
|
+
timestamp: "20220615000000")
|
|
121
|
+
|
|
122
|
+
bundle.page # => Archaeo::Page
|
|
123
|
+
bundle.assets # => Archaeo::AssetList
|
|
124
|
+
bundle.assets.css # => ["https://example.com/style.css", ...]
|
|
125
|
+
bundle.assets.js # => ["https://example.com/app.js", ...]
|
|
126
|
+
bundle.assets.images
|
|
127
|
+
bundle.assets.fonts
|
|
128
|
+
bundle.assets.media
|
|
129
|
+
----
|
|
130
|
+
|
|
131
|
+
=== Bulk Download with Resume
|
|
132
|
+
|
|
133
|
+
[source,ruby]
|
|
134
|
+
----
|
|
135
|
+
downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
|
|
136
|
+
downloader.download("example.com") do |current, total, snapshot|
|
|
137
|
+
puts "[#{current}/#{total}] #{snapshot.original_url}"
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# Resume interrupted download
|
|
141
|
+
downloader.download("example.com", resume: true)
|
|
142
|
+
|
|
143
|
+
# Filter by date range
|
|
144
|
+
downloader.download("example.com",
|
|
145
|
+
from: "20220101", to: "20221231")
|
|
146
|
+
----
|
|
147
|
+
|
|
148
|
+
=== URL Normalization
|
|
149
|
+
|
|
150
|
+
[source,ruby]
|
|
151
|
+
----
|
|
152
|
+
Archaeo::UrlNormalizer.normalize(" https://example.com/ ")
|
|
153
|
+
# => "https://example.com/"
|
|
154
|
+
|
|
155
|
+
Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
|
|
156
|
+
# => "https://example.com/%2F"
|
|
157
|
+
|
|
158
|
+
Archaeo::UrlNormalizer.with_scheme("example.com")
|
|
159
|
+
# => "https://example.com"
|
|
160
|
+
----
|
|
161
|
+
|
|
162
|
+
=== CDX Filters
|
|
163
|
+
|
|
164
|
+
[source,ruby]
|
|
165
|
+
----
|
|
166
|
+
# Build validated filter expressions
|
|
167
|
+
Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
|
|
168
|
+
Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
|
|
169
|
+
Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
|
|
170
|
+
Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
|
|
171
|
+
----
|
|
172
|
+
|
|
97
173
|
=== Timestamps
|
|
98
174
|
|
|
99
175
|
[source,ruby]
|
|
@@ -121,8 +197,13 @@ ts1 < ts2 # => true/false
|
|
|
121
197
|
|
|
122
198
|
[source,bash]
|
|
123
199
|
----
|
|
124
|
-
#
|
|
200
|
+
# Show version
|
|
201
|
+
archaeo --version
|
|
202
|
+
|
|
203
|
+
# List snapshots (table, json, or csv format)
|
|
125
204
|
archaeo snapshots example.com
|
|
205
|
+
archaeo snapshots --format json example.com
|
|
206
|
+
archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
|
|
126
207
|
|
|
127
208
|
# Find closest snapshot
|
|
128
209
|
archaeo near example.com 20220101
|
|
@@ -136,8 +217,40 @@ archaeo save https://example.com/
|
|
|
136
217
|
# Fetch archived content
|
|
137
218
|
archaeo fetch https://example.com/ 20220615120000
|
|
138
219
|
|
|
220
|
+
# Fetch and save to file
|
|
221
|
+
archaeo fetch --output page.html https://example.com/ 20220615120000
|
|
222
|
+
|
|
139
223
|
# Fetch raw (identity) content
|
|
140
224
|
archaeo fetch --identity https://example.com/ 20220615120000
|
|
225
|
+
|
|
226
|
+
# Download all snapshots
|
|
227
|
+
archaeo download example.com --output ./archive
|
|
228
|
+
|
|
229
|
+
# Resume interrupted download
|
|
230
|
+
archaeo download example.com --resume
|
|
231
|
+
|
|
232
|
+
# Discover all known URLs for a domain
|
|
233
|
+
archaeo known_urls example.com
|
|
234
|
+
----
|
|
235
|
+
|
|
236
|
+
=== Error Handling
|
|
237
|
+
|
|
238
|
+
[source,ruby]
|
|
239
|
+
----
|
|
240
|
+
# Blocked site (robots.txt)
|
|
241
|
+
Archaeo::BlockedSiteError
|
|
242
|
+
|
|
243
|
+
# No snapshot found
|
|
244
|
+
Archaeo::NoSnapshotFound
|
|
245
|
+
|
|
246
|
+
# Rate limited by Wayback Machine
|
|
247
|
+
Archaeo::RateLimitError
|
|
248
|
+
|
|
249
|
+
# Maximum retries exceeded
|
|
250
|
+
Archaeo::MaximumRetriesExceeded
|
|
251
|
+
|
|
252
|
+
# SavePageNow session limit
|
|
253
|
+
Archaeo::SaveFailed
|
|
141
254
|
----
|
|
142
255
|
|
|
143
256
|
== Architecture
|
|
@@ -149,20 +262,28 @@ Archaeo follows a model-driven, OOP design:
|
|
|
149
262
|
| Layer | Classes | Purpose
|
|
150
263
|
|
|
151
264
|
| *Models*
|
|
152
|
-
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `SaveResult`, `AvailabilityResult`
|
|
265
|
+
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
|
|
153
266
|
| Domain value objects
|
|
154
267
|
|
|
268
|
+
| *URL Processing*
|
|
269
|
+
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
|
|
270
|
+
| URL sanitization, filtering, and rewriting
|
|
271
|
+
|
|
272
|
+
| *Asset Extraction*
|
|
273
|
+
| `AssetExtractor`, `AssetList`
|
|
274
|
+
| Parse HTML for resource URLs
|
|
275
|
+
|
|
155
276
|
| *APIs*
|
|
156
277
|
| `CdxApi`, `AvailabilityApi`, `SaveApi`
|
|
157
278
|
| Query and mutate the archive
|
|
158
279
|
|
|
159
280
|
| *Operations*
|
|
160
|
-
| `Fetcher`
|
|
161
|
-
| Download
|
|
281
|
+
| `Fetcher`, `BulkDownloader`, `DownloadState`
|
|
282
|
+
| Download content with resume support
|
|
162
283
|
|
|
163
284
|
| *Infrastructure*
|
|
164
285
|
| `HttpClient`
|
|
165
|
-
| HTTP transport with retries
|
|
286
|
+
| HTTP transport with retries, gzip, connection pooling
|
|
166
287
|
|===
|
|
167
288
|
|
|
168
289
|
All API classes accept an `HttpClient` via dependency injection for testability.
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -47,6 +47,7 @@ module Archaeo
|
|
|
47
47
|
# Returns an Enumerator of Snapshot objects, auto-paginating
|
|
48
48
|
# via resume key unless an explicit page is requested.
|
|
49
49
|
def snapshots(url, **options)
|
|
50
|
+
url = UrlNormalizer.normalize(url)
|
|
50
51
|
validate_options!(options)
|
|
51
52
|
|
|
52
53
|
Enumerator.new do |yielder|
|
|
@@ -59,9 +60,15 @@ module Archaeo
|
|
|
59
60
|
end
|
|
60
61
|
|
|
61
62
|
def near(url, timestamp:)
|
|
63
|
+
url = UrlNormalizer.normalize(url)
|
|
62
64
|
ts = Timestamp.coerce(timestamp)
|
|
63
65
|
result = snapshots(url, sort: "closest",
|
|
64
66
|
closest: ts.to_s, limit: 1).first
|
|
67
|
+
if result&.blocked?
|
|
68
|
+
raise BlockedSiteError,
|
|
69
|
+
"Site is blocked: #{url}"
|
|
70
|
+
end
|
|
71
|
+
|
|
65
72
|
result || raise(NoSnapshotFound,
|
|
66
73
|
"No snapshot found near #{ts} for #{url}")
|
|
67
74
|
end
|
|
@@ -94,6 +101,7 @@ module Archaeo
|
|
|
94
101
|
|
|
95
102
|
# Returns the number of pages for a paginated query.
|
|
96
103
|
def num_pages(url, **options)
|
|
104
|
+
url = UrlNormalizer.normalize(url)
|
|
97
105
|
params = { "url" => url, "showNumPages" => "true" }
|
|
98
106
|
merge_scalar_params!(params, options)
|
|
99
107
|
response = @client.get(
|
|
@@ -109,6 +117,7 @@ module Archaeo
|
|
|
109
117
|
|
|
110
118
|
# Returns all unique original URLs under a domain.
|
|
111
119
|
def known_urls(domain, match_type: "domain")
|
|
120
|
+
domain = UrlNormalizer.normalize(domain)
|
|
112
121
|
snapshots(domain, match_type: match_type,
|
|
113
122
|
collapse: ["urlkey"]).map(&:original_url).uniq
|
|
114
123
|
end
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -158,13 +158,14 @@ module Archaeo
|
|
|
158
158
|
end
|
|
159
159
|
|
|
160
160
|
def output_csv(snaps)
|
|
161
|
-
|
|
161
|
+
csv = CSV.generate do |csv|
|
|
162
162
|
csv << %w[timestamp status_code url archive_url]
|
|
163
163
|
snaps.each do |snap|
|
|
164
164
|
csv << [snap.timestamp.to_s, snap.status_code,
|
|
165
165
|
snap.original_url, snap.archive_url]
|
|
166
166
|
end
|
|
167
167
|
end
|
|
168
|
+
puts csv
|
|
168
169
|
end
|
|
169
170
|
|
|
170
171
|
def write_output(path, content)
|
data/lib/archaeo/fetcher.rb
CHANGED
|
@@ -14,6 +14,7 @@ module Archaeo
|
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def fetch(url, timestamp:, identity: false)
|
|
17
|
+
url = UrlNormalizer.normalize(url)
|
|
17
18
|
ts = Timestamp.coerce(timestamp)
|
|
18
19
|
archive_url = ArchiveUrl.new(url, timestamp: ts,
|
|
19
20
|
identity: identity)
|
|
@@ -21,6 +22,13 @@ module Archaeo
|
|
|
21
22
|
build_page(response, archive_url.to_s, url, ts)
|
|
22
23
|
end
|
|
23
24
|
|
|
25
|
+
def fetch_page_with_assets(url, timestamp:)
|
|
26
|
+
page = fetch(url, timestamp: timestamp)
|
|
27
|
+
assets = AssetExtractor.new(page.content,
|
|
28
|
+
base_url: page.archive_url).extract
|
|
29
|
+
PageBundle.new(page: page, assets: assets)
|
|
30
|
+
end
|
|
31
|
+
|
|
24
32
|
private
|
|
25
33
|
|
|
26
34
|
def build_page(response, archive_url, url, timestamp)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# A fetched page together with all its extracted asset URLs.
|
|
5
|
+
#
|
|
6
|
+
# Bundles a Page with the AssetList discovered from its HTML,
|
|
7
|
+
# providing a single object for complete page archival.
|
|
8
|
+
class PageBundle
|
|
9
|
+
attr_reader :page, :assets
|
|
10
|
+
|
|
11
|
+
def initialize(page:, assets:)
|
|
12
|
+
@page = page
|
|
13
|
+
@assets = assets
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -22,6 +22,23 @@ module Archaeo
|
|
|
22
22
|
normalized.match?(%r{\A[a-z][a-z0-9+\-.]*://}) ? normalized : "https://#{normalized}"
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
+
VALID_URL_RE = %r{\A([a-z][a-z0-9+\-.]*://)?[^\s]+\z}
|
|
26
|
+
|
|
27
|
+
def self.valid?(url)
|
|
28
|
+
normalized = normalize(url)
|
|
29
|
+
return false if normalized.empty?
|
|
30
|
+
|
|
31
|
+
normalized.match?(VALID_URL_RE)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def self.validate!(url)
|
|
35
|
+
normalized = normalize(url)
|
|
36
|
+
raise ArgumentError, "URL cannot be empty" if normalized.empty?
|
|
37
|
+
raise ArgumentError, "Invalid URL: #{url}" unless valid?(url)
|
|
38
|
+
|
|
39
|
+
normalized
|
|
40
|
+
end
|
|
41
|
+
|
|
25
42
|
def to_s
|
|
26
43
|
@normalized
|
|
27
44
|
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -19,6 +19,7 @@ module Archaeo
|
|
|
19
19
|
autoload :ArchiveUrl, "archaeo/archive_url"
|
|
20
20
|
autoload :Snapshot, "archaeo/snapshot"
|
|
21
21
|
autoload :Page, "archaeo/page"
|
|
22
|
+
autoload :PageBundle, "archaeo/page_bundle"
|
|
22
23
|
autoload :SaveResult, "archaeo/save_result"
|
|
23
24
|
autoload :AvailabilityResult, "archaeo/availability_result"
|
|
24
25
|
autoload :UrlNormalizer, "archaeo/url_normalizer"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -84,6 +84,7 @@ files:
|
|
|
84
84
|
- lib/archaeo/fetcher.rb
|
|
85
85
|
- lib/archaeo/http_client.rb
|
|
86
86
|
- lib/archaeo/page.rb
|
|
87
|
+
- lib/archaeo/page_bundle.rb
|
|
87
88
|
- lib/archaeo/save_api.rb
|
|
88
89
|
- lib/archaeo/save_result.rb
|
|
89
90
|
- lib/archaeo/snapshot.rb
|