RubyGems - archaeo - Versions diffs - 0.2.5 → 0.2.6 - Mend

archaeo 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/README.adoc +160 -6
data/lib/archaeo/archive_url.rb +16 -0
data/lib/archaeo/asset_extractor.rb +19 -0
data/lib/archaeo/asset_list.rb +31 -0
data/lib/archaeo/availability_result.rb +24 -0
data/lib/archaeo/bulk_downloader.rb +97 -28
data/lib/archaeo/cdx_api.rb +4 -0
data/lib/archaeo/cdx_filter.rb +12 -0
data/lib/archaeo/cli.rb +96 -10
data/lib/archaeo/download_state.rb +46 -15
data/lib/archaeo/fetcher.rb +16 -1
data/lib/archaeo/http_client.rb +43 -11
data/lib/archaeo/page.rb +32 -0
data/lib/archaeo/page_bundle.rb +28 -0
data/lib/archaeo/save_result.rb +19 -0
data/lib/archaeo/snapshot.rb +22 -0
data/lib/archaeo/timestamp.rb +14 -0
data/lib/archaeo/url_normalizer.rb +7 -1
data/lib/archaeo/url_rewriter.rb +46 -0
data/lib/archaeo/version.rb +1 -1
data/lib/archaeo.rb +1 -0
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e318dfb4a6478af2e663418fda9952308323be35ef9fc6582a5fa3a327cdbb6d
-  data.tar.gz: 2f745ac2ea371e6b64d4f83ca39d0f247991882d3104bcff90e523b73e421f9b
+  metadata.gz: dff73d8ab14a3b75bf98281d20b5427b55757b330d57e8899a2ffb04d9046c6d
+  data.tar.gz: d92c2f8e77d6ba5c51283f0581bb51182ebe84aab74ffe4e4971e0d405eab2cc
 SHA512:
-  metadata.gz: dc20f6483c99aba0059a224dba1758cec00d3d5921e7f8296b9826554f8d45780981571df3bdd8c05d0704066e14163c7e1a192339da30b3b98a367b0860a669
-  data.tar.gz: f4ca21a9c5d5f68d29bfe24ff5caf598a9e8819c7bfc920e46cba3d3a9980f4a086433c0305897fc7506e8cc002d943b94b5c4ea15e12372d0b92389df30f3c3
+  metadata.gz: ed1a823e5f358e53ec653e5eee902f95787b9bdedc5670a214f3c1784f4c1829f705d5a53ded5cfb6777f16ed33e32430f2aca13cb9c1fad628885daa55a60a1
+  data.tar.gz: 3b809e1aad60db5e04a356dff5ab450333ff88f56bf481630887f45b3b3f09035ee1ba959807d22129b827b30324b7730e7cf5c248656b62c38a0beab8ad2581

data/README.adoc CHANGED Viewed

@@ -54,6 +54,9 @@ cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
   puts snap.timestamp
 end
+# Count snapshots
+cdx.count("example.com")  # => Integer
 # Filter by status code, mimetype, or URL pattern
 cdx.snapshots("example.com",
   filters: [Archaeo::CdxFilter.by_status(200)],
@@ -69,6 +72,11 @@ filters = Archaeo::CdxFilter.combine(
 )
 cdx.snapshots("example.com", filters: filters)
+# Convenience filter factories
+Archaeo::CdxFilter.only_html              # text/html only
+Archaeo::CdxFilter.by_mimetype_prefix("image")  # any image/*
+Archaeo::CdxFilter.excluding_redirects    # exclude 3xx
 # Page-based pagination
 cdx.snapshots("example.com", page: 0)
@@ -90,6 +98,8 @@ result.available?   # => true/false
 result.archive_url  # => "https://web.archive.org/web/..."
 result.timestamp    # => Archaeo::Timestamp
 result.archived_status  # => HTTP status code of the archived page
+result.to_h         # => Hash representation
+result.as_json      # => JSON-serializable Hash
 api.available?("example.com")  # => true/false
 ----
@@ -104,6 +114,8 @@ result.url          # => "https://example.com/"
 result.archive_url  # => "https://web.archive.org/web/..."
 result.timestamp    # => Archaeo::Timestamp
 result.cached?      # => true if already archived
+result.to_h         # => Hash representation
+result.as_json      # => JSON-serializable Hash
 ----
 === Fetch Archived Content
@@ -120,13 +132,22 @@ page.status_code    # => 200
 page.archive_url    # => full archive URL
 page.title          # => "Example Domain"
 page.html?          # => true
+page.css?           # => true for text/css
 page.json?          # => false
 page.size           # => content length in bytes
+page.to_h           # => Hash with all fields
+page.as_json        # => JSON-serializable Hash
+page.inspect        # => "#<Archaeo::Page text/html 1234 bytes>"
 # Raw (identity) mode -- no Wayback Machine rewriting
 page = fetcher.fetch("https://example.com/",
                      timestamp: "20220615000000",
                      identity: true)
+# With digest verification (raises IntegrityError on mismatch)
+page = fetcher.fetch("https://example.com/",
+                     timestamp: "20220615000000",
+                     snapshot: snap)
 ----
 === Fetch Page with Assets
@@ -146,10 +167,25 @@ bundle.assets.fonts
 bundle.assets.media
 bundle.size        # => total count (page + assets)
 bundle.asset_count # => number of assets
+bundle.to_h        # => Hash representation
+bundle.to_json     # => JSON string
 # Serialize asset list
 bundle.assets.to_json
 bundle.assets.counts  # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
+# Filter assets by type
+css_only = bundle.assets.filter(:css)
+images_and_fonts = bundle.assets.filter(:image, :font)
+# Merge asset lists (deduplicates)
+merged = bundle.assets.merge(other_assets)
+# Reconstruct from JSON
+restored = Archaeo::AssetList.from_json(json_string)
+# Safe type access
+bundle.assets.urls_by_type(:image)  # works for any type key
 ----
 === Bulk Download with Resume
@@ -157,13 +193,22 @@ bundle.assets.counts  # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
 [source,ruby]
 ----
 downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
-downloader.download("example.com") do |current, total, snapshot|
+summary = downloader.download("example.com") do |current, total, snapshot|
   puts "[#{current}/#{total}] #{snapshot.original_url}"
 end
+summary.total          # => total snapshots found
+summary.downloaded     # => successfully downloaded
+summary.skipped        # => skipped (already downloaded with resume)
+summary.bytes_written  # => total bytes written
+summary.elapsed        # => seconds elapsed
 # Resume interrupted download
 downloader.download("example.com", resume: true)
+# Dry run (preview without fetching)
+summary = downloader.download("example.com", dry_run: true)
 # Filter by date range
 downloader.download("example.com",
                     from: "20220101", to: "20221231")
@@ -175,6 +220,27 @@ downloader = Archaeo::BulkDownloader.new(
 downloader.download("example.com")
 ----
+=== Download State (Resume Tracking)
+[source,ruby]
+----
+state = Archaeo::DownloadState.new("archive")
+# Check if a snapshot was already downloaded
+state.completed?("20220615000000")  # => true/false
+# Get metadata for a completed snapshot
+entry = state.entry_for("20220615000000")
+# => { "ts" => "20220615000000", "at" => "2022-06-15T12:00:00Z",
+#      "url" => "https://example.com/", "bytes" => 12345 }
+# Total bytes downloaded
+state.total_bytes  # => Integer
+# Clear state for a fresh download
+state.clear
+----
 === URL Normalization
 [source,ruby]
@@ -187,6 +253,10 @@ Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
 Archaeo::UrlNormalizer.with_scheme("example.com")
 # => "https://example.com"
+# Default ports are stripped
+Archaeo::UrlNormalizer.normalize("https://example.com:443/path")
+# => "https://example.com/path"
 ----
 === CDX Filters
@@ -202,6 +272,33 @@ Archaeo::CdxFilter.by_url("example.com")    # => "original:example.com"
 # Compose filters
 filters = Archaeo::CdxFilter.only_successful
 error_filters = Archaeo::CdxFilter.excluding_errors
+# Mimetype prefix matching
+Archaeo::CdxFilter.by_mimetype_prefix("image")  # => matches image/*
+# Convenience factories
+Archaeo::CdxFilter.only_html            # => text/html only
+Archaeo::CdxFilter.excluding_redirects  # => excludes 3xx statuses
+----
+=== URL Rewriting
+[source,ruby]
+----
+rewriter = Archaeo::UrlRewriter.new(
+  "https://web.archive.org/web/20220615000000/",
+  "local",
+)
+# Rewrite single URL
+rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
+# => "local/style.css"
+# Rewrite batch
+rewriter.rewrite_batch(["url1", "url2"])
+# Rewrite URLs within HTML (src, href, srcset, data-src, poster)
+rewritten_html = rewriter.rewrite_html(html_content)
 ----
 === Snapshot Convenience
@@ -217,6 +314,14 @@ snap.client_error?  # => true for 4xx
 snap.server_error?  # => true for 5xx
 snap.error?         # => true for 4xx/5xx
+# Age helpers
+snap.age            # => seconds since capture
+snap.older_than?(3600)  # => true if older than 1 hour
+snap.newer_than?(3600)  # => true if newer than 1 hour
+# Identity URL (raw content, no Wayback rewriting)
+snap.identity_url
 # Fetch content directly from a snapshot
 page = snap.fetch
@@ -225,6 +330,7 @@ bundle = snap.fetch_with_assets
 # JSON-serializable representation
 snap.as_json  # => Hash with primitive values only
+snap.inspect  # => "#<Archaeo::Snapshot 20220101 ...>"
 ----
 === Timestamps
@@ -250,6 +356,10 @@ ts.to_s  # => "20220615000000"
 ts.to_iso8601  # => "2022-06-15T00:00:00Z"
 ts.to_rfc3339  # => "2022-06-15T00:00:00+00:00"
+# Decompose
+ts.to_h  # => { year: 2022, month: 6, day: 15, hour: 0, minute: 0, second: 0 }
+ts.to_a  # => [2022, 6, 15, 0, 0, 0]
 # Arithmetic
 ts + 3600          # => Timestamp one hour later
 ts - 3600          # => Timestamp one hour earlier
@@ -257,6 +367,27 @@ ts1 - ts2          # => seconds between timestamps
 # Comparison
 ts1 < ts2   # => true/false
+# Immutable -- frozen on creation
+ts.frozen?  # => true
+----
+=== HTTP Client Observability
+[source,ruby]
+----
+# Track every request with a callback
+client = Archaeo::HttpClient.new(
+  on_request: ->(uri, elapsed, status, retries) {
+    puts "#{status} #{uri} (#{elapsed.round(3)}s, #{retries} retries)"
+  },
+)
+# Inspect connection pool state
+client.pool_stats
+# => { active_connections: 2, max_pool_size: 8,
+#      hosts: ["web.archive.org"],
+#      idle_times: { "web.archive.org": 12 } }
 ----
 === Command-Line Interface
@@ -279,6 +410,16 @@ archaeo near --format json example.com 20220101
 archaeo oldest example.com
 archaeo newest --format json example.com
+# Find before/after a timestamp
+archaeo before example.com 20220101
+archaeo after example.com 20220101
+# List snapshots in a date range
+archaeo between example.com 20220101 20221231
+# Count snapshots
+archaeo count example.com
 # Check availability (with optional timestamp)
 archaeo available example.com
 archaeo available --timestamp 20220101 example.com
@@ -295,15 +436,25 @@ archaeo fetch --output page.html https://example.com/ 20220615120000
 # Fetch raw (identity) content
 archaeo fetch --identity https://example.com/ 20220615120000
+# Fetch a page and list its extracted assets
+archaeo fetch-assets https://example.com/ 20220615120000
+archaeo fetch-assets --format json https://example.com/ 20220615120000
 # Download all snapshots
 archaeo download example.com --output ./archive
+# Dry run (preview without fetching)
+archaeo download --dry_run example.com
 # Parallel downloads
 archaeo download --concurrency 4 example.com --output ./archive
 # Resume interrupted download
 archaeo download example.com --resume
+# Suppress progress messages
+archaeo --quiet download example.com
 # Discover all known URLs for a domain
 archaeo known_urls example.com
 ----
@@ -326,6 +477,9 @@ Archaeo::MaximumRetriesExceeded
 # SavePageNow session limit
 Archaeo::SaveFailed
+# Content digest mismatch
+Archaeo::IntegrityError
 ----
 == Architecture
@@ -338,15 +492,15 @@ Archaeo follows a model-driven, OOP design:
 | *Models*
 | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
-| Domain value objects
+| Domain value objects with `to_h`, `as_json`, `inspect` support
 | *URL Processing*
 | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
-| URL sanitization, validated filtering with composition, and rewriting
+| URL sanitization, validated filtering with composition, and HTML URL rewriting
 | *Asset Extraction*
 | `AssetExtractor`, `AssetList`
-| Parse HTML for resource URLs
+| Parse HTML for resource URLs including preloads and modulepreload
 | *APIs*
 | `CdxApi`, `AvailabilityApi`, `SaveApi`
@@ -354,11 +508,11 @@ Archaeo follows a model-driven, OOP design:
 | *Operations*
 | `Fetcher`, `BulkDownloader`, `DownloadState`
-| Download content with resume support
+| Download content with resume, dry-run, digest verification, and download summaries
 | *Infrastructure*
 | `HttpClient`
-| HTTP transport with retries, gzip, 429/503 handling, connection pooling with eviction
+| HTTP transport with retries, gzip, 429/503 handling, connection pooling, and per-request observability
 |===
 All API classes accept an `HttpClient` via dependency injection for testability.

data/lib/archaeo/archive_url.rb CHANGED Viewed

@@ -53,6 +53,22 @@ module Archaeo
       "#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
     end
+    def identity_url
+      return to_s if identity?
+      self.class.new(@original_url, timestamp: @timestamp, identity: true).to_s
+    end
+    def to_h
+      { original_url: @original_url, timestamp: @timestamp,
+        identity: @identity }
+    end
+    def as_json(*)
+      { original_url: @original_url, timestamp: @timestamp.to_s,
+        identity: @identity, url: to_s }
+    end
     def self.extract_original_url(string, ts_str, identity)
       marker = identity ? "#{ts_str}id_/" : "#{ts_str}/"
       idx = string.index(marker)

data/lib/archaeo/asset_extractor.rb CHANGED Viewed

@@ -24,6 +24,12 @@ module Archaeo
       "\\s*:[^;]*#{CSS_URL_PATTERN.source}",
     )
+    PRELOAD_TYPE_MAP = {
+      "style" => :css,
+      "script" => :js,
+      "image" => :image,
+    }.freeze
     def initialize(html, base_url: nil)
       @doc = Nokogiri::HTML(html.to_s)
       @base_url = base_url
@@ -38,6 +44,7 @@ module Archaeo
       extract_media(list)
       extract_inline_css(list)
       extract_inline_styles(list)
+      extract_preloads(list)
       list
     end
@@ -53,6 +60,9 @@ module Archaeo
       @doc.css("script[src]").each do |el|
         list.add(resolve(el["src"]), type: :js)
       end
+      @doc.css('link[rel="modulepreload"]').each do |el|
+        list.add(resolve(el["href"]), type: :js)
+      end
     end
     def extract_images(list)
@@ -202,5 +212,14 @@ module Archaeo
     rescue URI::InvalidURIError
       url
     end
+    def extract_preloads(list)
+      @doc.css('link[rel="preload"][as]').each do |el|
+        type = PRELOAD_TYPE_MAP[el["as"]]
+        next unless type
+        list.add(resolve(el["href"]), type: type)
+      end
+    end
   end
 end

data/lib/archaeo/asset_list.rb CHANGED Viewed

@@ -40,6 +40,10 @@ module Archaeo
       @urls_by_type[:image]
     end
+    def urls_by_type(type)
+      @urls_by_type[type] || []
+    end
     def fonts
       @urls_by_type[:font]
     end
@@ -71,5 +75,32 @@ module Archaeo
     def counts
       @urls_by_type.transform_values(&:size)
     end
+    def filter(*types)
+      result = self.class.new
+      types.each do |type|
+        @urls_by_type[type]&.each { |url| result.add(url, type: type) }
+      end
+      result
+    end
+    def merge(other)
+      CATEGORIES.each do |type|
+        other.urls_by_type(type).each { |url| add(url, type: type) }
+      end
+      self
+    end
+    def self.from_json(json_string)
+      data = JSON.parse(json_string)
+      list = new
+      data.each do |type, urls|
+        sym = type.to_sym
+        next unless CATEGORIES.include?(sym)
+        Array(urls).each { |url| list.add(url, type: sym) }
+      end
+      list
+    end
   end
 end

data/lib/archaeo/availability_result.rb CHANGED Viewed

@@ -32,5 +32,29 @@ module Archaeo
         "#{url} -> not available"
       end
     end
+    def to_h
+      {
+        url: @url,
+        available: @available,
+        archive_url: @archive_url,
+        timestamp: @timestamp,
+        archived_status: @archived_status,
+      }
+    end
+    def as_json(*)
+      {
+        url: @url,
+        available: @available,
+        archive_url: @archive_url,
+        timestamp: @timestamp&.to_s,
+        archived_status: @archived_status,
+      }
+    end
+    def inspect
+      "#<#{self.class.name} #{@url} available=#{@available}>"
+    end
   end
 end

data/lib/archaeo/bulk_downloader.rb CHANGED Viewed

@@ -3,6 +3,11 @@
 require "fileutils"
 module Archaeo
+  DownloadSummary = Struct.new(
+    :total, :downloaded, :skipped, :failed, :bytes_written, :elapsed,
+    keyword_init: true
+  )
   # Downloads all archived snapshots of a URL with resume support.
   #
   # Queries the CDX API for matching snapshots, fetches each page,
@@ -17,20 +22,17 @@ module Archaeo
       @concurrency = [1, concurrency.to_i].max
     end
-    def download(url, from: nil, to: nil, resume: false, &block)
+    def download(url, from: nil, to: nil, resume: false,
+                 dry_run: false, &block)
+      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
-      FileUtils.mkdir_p(@output_dir)
-      state = DownloadState.new(@output_dir)
+      FileUtils.mkdir_p(@output_dir) unless dry_run
       snapshots = fetch_snapshots(url, from: from, to: to)
-      total = snapshots.size
-      progress = block
+      downloaded, skipped, bytes =
+        run_download(snapshots, resume, dry_run, block)
-      if @concurrency == 1
-        download_sequential(snapshots, total, state, resume, progress)
-      else
-        download_concurrent(snapshots, total, state, resume, progress)
-      end
+      build_summary(start_time, snapshots.size, downloaded, skipped, bytes)
     end
     private
@@ -44,29 +46,75 @@ module Archaeo
         .select { |snap| !snap.blocked? && snap.status_code == 200 }
     end
-    def download_sequential(snapshots, total, state, resume, progress)
-      snapshots.each_with_index do |snap, index|
-        next if resume && state.completed?(snap.timestamp)
+    def run_download(snapshots, resume, dry_run, progress)
+      state = DownloadState.new(@output_dir)
+      total = snapshots.size
-        fetch_and_save(snap)
-        state.mark_completed(snap.timestamp)
+      if @concurrency == 1
+        download_sequential(snapshots, total, state, resume,
+                            dry_run, progress)
+      else
+        download_concurrent(snapshots, total, state, resume,
+                            dry_run, progress)
+      end
+    end
+    def build_summary(start_time, total, downloaded, skipped, bytes)
+      elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
+      DownloadSummary.new(
+        total: total, downloaded: downloaded, skipped: skipped,
+        failed: 0, bytes_written: bytes, elapsed: elapsed
+      )
+    end
+    def download_sequential(snapshots, total, state, resume,
+                            dry_run, progress)
+      counters = { downloaded: 0, skipped: 0, bytes: 0 }
+      snapshots.each_with_index do |snap, index|
+        process_sequential(snap, state, resume, dry_run, counters)
         progress&.call(index + 1, total, snap)
       end
+      [counters[:downloaded], counters[:skipped], counters[:bytes]]
+    end
+    def process_sequential(snap, state, resume, dry_run, counters)
+      if resume && state.completed?(snap.timestamp)
+        counters[:skipped] += 1
+        return
+      end
+      counters[:bytes] += download_snapshot(snap, state) unless dry_run
+      counters[:downloaded] += 1
     end
-    def download_concurrent(snapshots, total, state, resume, progress)
+    def download_snapshot(snap, state)
+      content = fetch_and_save(snap)
+      state.mark_completed(snap.timestamp, url: snap.original_url,
+                                           bytes: content.bytesize)
+      content.bytesize
+    end
+    def download_concurrent(snapshots, total, state, resume,
+                            dry_run, progress)
       queue = snapshots.each_with_index.to_a
-      mutex = Mutex.new
-      errors = []
+      shared = { mutex: Mutex.new, errors: [],
+                 downloaded: 0, skipped: 0, bytes: 0 }
       threads = Array.new(@concurrency) do
         Thread.new do
-          process_queue(queue, total, state, resume, progress, mutex, errors)
+          process_queue(queue, total, state, resume,
+                        dry_run, progress, shared)
         end
       end
       threads.each(&:join)
+      raise_on_errors(shared[:errors])
+      [shared[:downloaded], shared[:skipped], shared[:bytes]]
+    end
+    def raise_on_errors(errors)
       return unless errors.any?
       raise Error,
@@ -74,24 +122,44 @@ module Archaeo
             "#{errors.map { |s, _| s.timestamp }.join(', ')}"
     end
-    def process_queue(queue, total, state, resume, progress, mutex, errors)
+    def process_queue(queue, total, state, resume, dry_run,
+                      progress, shared)
       loop do
-        snap, index = mutex.synchronize { queue.shift }
+        snap, index = shared[:mutex].synchronize { queue.shift }
         break unless snap
-        next if resume && state.completed?(snap.timestamp)
-        begin
-          fetch_and_save(snap)
-          state.mark_completed(snap.timestamp)
-        rescue StandardError => e
-          mutex.synchronize { errors << [snap, e] }
+        if skip_snapshot?(snap, state, resume, shared)
+          progress&.call(index + 1, total, snap)
+          next
         end
+        concurrent_fetch(snap, state, dry_run, shared)
         progress&.call(index + 1, total, snap)
       end
     end
+    def skip_snapshot?(snap, state, resume, shared)
+      return false unless resume && state.completed?(snap.timestamp)
+      shared[:mutex].synchronize { shared[:skipped] += 1 }
+      true
+    end
+    def concurrent_fetch(snap, state, dry_run, shared)
+      unless dry_run
+        content = fetch_and_save(snap)
+        shared[:mutex].synchronize do
+          state.mark_completed(snap.timestamp,
+                               url: snap.original_url,
+                               bytes: content.bytesize)
+          shared[:bytes] += content.bytesize
+        end
+      end
+      shared[:mutex].synchronize { shared[:downloaded] += 1 }
+    rescue StandardError => e
+      shared[:mutex].synchronize { shared[:errors] << [snap, e] }
+    end
     def fetch_and_save(snapshot)
       fetcher = Fetcher.new(client: @client)
       page = fetcher.fetch(snapshot.original_url,
@@ -102,6 +170,7 @@ module Archaeo
       tmp_path = "#{filename}.tmp"
       File.binwrite(tmp_path, page.content)
       File.rename(tmp_path, filename)
+      page.content
     rescue StandardError
       FileUtils.rm_f(tmp_path) if defined?(tmp_path)
       raise

data/lib/archaeo/cdx_api.rb CHANGED Viewed

@@ -106,6 +106,10 @@ module Archaeo
                 **options)
     end
+    def count(url, **options)
+      snapshots(url, **options).count
+    end
     # Returns the number of pages for a paginated query.
     def num_pages(url, **options)
       url = UrlNormalizer.normalize(url)

data/lib/archaeo/cdx_filter.rb CHANGED Viewed

@@ -76,6 +76,18 @@ module Archaeo
        excluding_status(502), excluding_status(503)]
     end
+    def self.only_html
+      [by_mimetype("text/html")]
+    end
+    def self.by_mimetype_prefix(prefix)
+      new("mimetype:#{Regexp.escape(prefix)}.*")
+    end
+    def self.excluding_redirects
+      %w[301 302 303 307 308].map { |c| excluding_status(c) }
+    end
     private
     def validate!