RubyGems - crawlscope - Versions diffs - 0.1.0 → 0.2.0 - Mend

crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

checksums.yaml +4 -4
data/CHANGELOG.md +7 -11
data/README.md +20 -13
data/lib/crawlscope/browser.rb +8 -0
data/lib/crawlscope/cli.rb +10 -10
data/lib/crawlscope/configuration.rb +20 -5
data/lib/crawlscope/context.rb +9 -0
data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
data/lib/crawlscope/crawler.rb +19 -1
data/lib/crawlscope/http.rb +1 -1
data/lib/crawlscope/rake_tasks.rb +28 -0
data/lib/crawlscope/rules/links.rb +76 -43
data/lib/crawlscope/rules/structured_data.rb +14 -1
data/lib/crawlscope/run.rb +60 -0
data/lib/crawlscope/schema_registry.rb +3 -349
data/lib/crawlscope/schemas.rb +355 -0
data/lib/crawlscope/sitemap.rb +18 -6
data/lib/crawlscope/structured_data/audit.rb +7 -7
data/lib/crawlscope/structured_data/check.rb +35 -0
data/lib/crawlscope/structured_data/reporter.rb +69 -0
data/lib/crawlscope/url.rb +14 -0
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +12 -23
data/test/crawlscope/browser_test.rb +155 -0
data/test/crawlscope/cli_test.rb +128 -6
data/test/crawlscope/configuration_test.rb +49 -0
data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
data/test/crawlscope/crawler_test.rb +34 -0
data/test/crawlscope/http_test.rb +56 -0
data/test/crawlscope/links_rule_test.rb +110 -5
data/test/crawlscope/rule_registry_test.rb +32 -0
data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
data/test/crawlscope/schema_registry_test.rb +19 -0
data/test/crawlscope/sitemap_test.rb +55 -0
data/test/crawlscope/structured_data_document_test.rb +36 -0
data/test/crawlscope/structured_data_report_test.rb +3 -3
data/test/crawlscope/structured_data_reporter_test.rb +2 -2
data/test/crawlscope/structured_data_rule_test.rb +20 -0
data/test/crawlscope/structured_data_writer_test.rb +2 -2
data/test/crawlscope/url_test.rb +31 -0
metadata +14 -5
data/lib/crawlscope/task.rb +0 -131

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 58a83d74a7b2b8422df4f161db9d3a7fe3ff213495f0837fd29a08cc13715b86
-  data.tar.gz: 02bd5743bcaae94bfdcc169fb6fe782257527984da68b091f6b75db3420b4244
+  metadata.gz: ba21d55a2d9b787d7bb9d4e90f39e655a5fe2a884769dbef6f866d1e5779e076
+  data.tar.gz: b7c6b829412f8e436cd81d2d28bcd5fe22327f0bb9fcc34af307b4b5feac722c
 SHA512:
-  metadata.gz: c566f6899f45633db13a8ee47ac15f5e6054a4adff087774ce17ef15c26b10340694bd395e0de0efbdb5b652cf8ea04e3cbbb452d9467fd8167143f3675d5642
-  data.tar.gz: 1c087e1f4233224ea2c6b9b14de3bf34f4007b4689cd4fa8b9a3ea7ba688f78beb12431d0ffc7b6f54cae1eead319e3ba8293cef325440c614ca191b6ebf0e8b
+  metadata.gz: d4a6e75c44c7cff4e238ff50168b7807fec8542074bbcbe838c50cf5eba02f181576291f1033620f268484b4c75f588215789515bd6c3ee9d7e76e8e5b94ceaf
+  data.tar.gz: 5576d6a31853ebf3e6662e4bbc8f97d4da918a24352e02c4d9c7569e4300ae102d79c9a348e55ce884273c12dfa1717b8b22c16091fa26eb0d69c19b4b7dca36

data/CHANGELOG.md CHANGED Viewed

@@ -5,27 +5,23 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [0.1.0] - 2026-04-23
+## [0.2.0] - 2026-04-24
-### Added
-- add crawlkit release-ready audit gem
-- add standalone validation commands
+### Changed
-- move default schema rules into crawlkit
+- simplify crawl and structured data boundaries
+- harden validation boundaries
-### Changed
-- strengthen public API coverage
+### Fixed
-- load shared test dependencies
+- handle child sitemaps
-- rename crawlkit to crawlscope
+- use URL for sitemap validation

data/README.md CHANGED Viewed

@@ -58,16 +58,16 @@ gem "ferrum"
 ## CLI Usage
-Validate a site directly from the gem:
+Validate a site from its default sitemap:
 ```bash
-crawlscope validate --base-url https://example.com
+crawlscope validate --url https://example.com
 ```
 Validate only specific rules:
 ```bash
-crawlscope validate --base-url https://example.com --rules metadata,links
+crawlscope validate --url https://example.com --rules metadata,links
 ```
 Validate structured data on one or more URLs:
@@ -77,10 +77,11 @@ crawlscope ldjson --url https://example.com/article
 crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
 ```
-If you do not pass `--sitemap`, `crawlscope` defaults to:
+To use a non-default sitemap, pass `--sitemap`:
-- `https://example.com/sitemap.xml` for real site URLs
-- `public/sitemap.xml` for localhost-style development URLs when that file exists
+```bash
+crawlscope validate --url https://example.com --sitemap https://example.com/sitemap.xml
+```
 Child sitemap indexes are supported automatically.
@@ -89,14 +90,14 @@ Child sitemap indexes are supported automatically.
 ```ruby
 require "crawlscope"
-audit = Crawlscope::Audit.new(
+crawl = Crawlscope::Crawl.new(
   base_url: "https://example.com",
   sitemap_path: "https://example.com/sitemap.xml",
   rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
   schema_registry: Crawlscope::SchemaRegistry.default
 )
-result = audit.call
+result = crawl.call
 puts result.ok?
 puts result.issues.to_a.map(&:message)
@@ -104,7 +105,7 @@ puts result.issues.to_a.map(&:message)
 ## Result Shape
-`Crawlscope::Audit` returns a `Crawlscope::Result` with:
+`Crawlscope::Crawl` returns a `Crawlscope::Result` with:
 - `urls`: sitemap URLs selected for validation
 - `pages`: fetched page snapshots
@@ -133,7 +134,7 @@ bin/rails crawlscope:validate
 Available environment overrides:
-- `BASE_URL`
+- `URL`
 - `SITEMAP`
 - `RULES=metadata,links`
 - `JS=1` or `RENDERER=browser`
@@ -155,11 +156,15 @@ bin/rails crawlscope:validate:ldjson URL=https://example.com/article
 The same validation surface is also available in the gem repository itself through plain `rake`:
 ```bash
-bundle exec rake crawlscope:validate BASE_URL=https://example.com
-bundle exec rake crawlscope:validate:metadata BASE_URL=https://example.com
+bundle exec rake crawlscope:validate URL=https://example.com
+bundle exec rake crawlscope:validate:metadata URL=https://example.com
 bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
 ```
+`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
+`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
 ### Structured Data URL Audit
 For one-off structured-data checks:
@@ -174,7 +179,7 @@ Optional flags:
 - `DEBUG=1`: print detected items
 - `SUMMARY=1`: print grouped failures
-- `REPORT_PATH=...`: write a JSON report
+- `REPORT_PATH=...`: write a JSON report. Treat this as trusted operator input; Crawlscope writes to the path the task process can access.
 - `JS=1` or `RENDERER=browser`: render with Ferrum
 ## Rules
@@ -237,6 +242,8 @@ Checks:
 - `WebApplication`
 - `WebSite`
+The default schema definitions live in `Crawlscope::Schemas`; `Crawlscope::SchemaRegistry` owns registration and validation.
 Host apps can replace or extend the registry:
 ```ruby

data/lib/crawlscope/browser.rb CHANGED Viewed

@@ -45,6 +45,8 @@ module Crawlscope
         doc: Nokogiri::HTML(body)
       )
     rescue => error
+      raise unless browser_error?(error)
       Page.new(
         url: url,
         normalized_url: Url.normalize(url, base_url: @base_url),
@@ -84,5 +86,11 @@ module Crawlscope
     rescue Ferrum::TimeoutError
       raise Timeout::Error, "Timed out waiting for browser network idle"
     end
+    def browser_error?(error)
+      error.is_a?(Timeout::Error) ||
+        error.is_a?(SystemCallError) ||
+        error.class.name.to_s.start_with?("Ferrum::")
+    end
   end
 end

data/lib/crawlscope/cli.rb CHANGED Viewed

@@ -37,7 +37,7 @@ module Crawlscope
         @err.puts(general_usage)
         1
       end
-    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
+    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
       @err.puts(error.message)
       @err.puts("")
       @err.puts(general_usage)
@@ -49,12 +49,12 @@ module Crawlscope
     def general_usage
       <<~TEXT
         Usage:
-          crawlscope validate --base-url https://example.com [options]
+          crawlscope validate --url https://example.com [options]
           crawlscope ldjson --url https://example.com/page [options]
           crawlscope version
         Commands:
-          validate    Audit sitemap URLs for metadata, structured data, uniqueness, and links
+          validate    Audit URLs for metadata, structured data, uniqueness, and links
           ldjson      Validate structured data on one or more URLs
           version     Print the gem version
       TEXT
@@ -109,7 +109,7 @@ module Crawlscope
       configure_renderer(options[:renderer])
-      result = task.validate_ldjson(
+      result = task.validate_json_ld(
         urls: urls,
         debug: options[:debug],
         renderer: options[:renderer],
@@ -123,7 +123,7 @@ module Crawlscope
     def run_validate
       options = {
-        base_url: normalized_string(ENV["BASE_URL"]),
+        url: normalized_string(ENV["URL"]),
         rule_names: normalized_string(ENV["RULES"]),
         sitemap_path: normalized_string(ENV["SITEMAP"])
       }
@@ -134,10 +134,10 @@ module Crawlscope
       @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
       parser = OptionParser.new do |opts|
-        opts.banner = "Usage: crawlscope validate --base-url https://example.com [options]"
+        opts.banner = "Usage: crawlscope validate --url https://example.com [options]"
-        opts.on("--base-url URL", "Set the site base URL") do |value|
-          options[:base_url] = value
+        opts.on("--url URL", "Set the site URL") do |value|
+          options[:url] = value
         end
         opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
@@ -168,7 +168,7 @@ module Crawlscope
       parser.parse!(@argv)
       result = task.validate(
-        base_url: options[:base_url],
+        base_url: options[:url],
         sitemap_path: options[:sitemap_path],
         rule_names: options[:rule_names]
       )
@@ -239,7 +239,7 @@ module Crawlscope
     end
     def task
-      @task ||= Task.new(configuration: @configuration, reporter: Reporter.new(io: @out))
+      @task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
     end
   end
 end

data/lib/crawlscope/configuration.rb CHANGED Viewed

@@ -7,6 +7,7 @@ module Crawlscope
     DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
     DEFAULT_BROWSER_SCROLL_PAGE = true
     DEFAULT_CONCURRENCY = 10
+    RENDERERS = %i[http browser].freeze
     DEFAULT_TIMEOUT_SECONDS = 20
     attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
@@ -26,7 +27,7 @@ module Crawlscope
     def concurrency
       value = resolve(@concurrency)
-      value.nil? ? DEFAULT_CONCURRENCY : value.to_i
+      positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
     end
     def browser_concurrency
@@ -42,7 +43,7 @@ module Crawlscope
     def network_idle_timeout_seconds
       value = resolve(@network_idle_timeout_seconds)
-      value.nil? ? DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS : value.to_i
+      positive_integer(value, default: DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, name: "network_idle_timeout_seconds")
     end
     def output
@@ -55,7 +56,10 @@ module Crawlscope
       normalized_value = value.to_s.strip
       normalized_value = "http" if normalized_value.empty?
-      normalized_value.to_sym
+      renderer = normalized_value.to_sym
+      return renderer if RENDERERS.include?(renderer)
+      raise ConfigurationError, "Crawlscope renderer must be http or browser"
     end
     def rule_registry
@@ -74,7 +78,7 @@ module Crawlscope
         raise ConfigurationError, "Crawlscope sitemap_path is not configured"
       end
-      Audit.new(
+      Crawl.new(
         base_url: base_url,
         sitemap_path: sitemap_path,
         browser_factory: browser_factory,
@@ -111,7 +115,7 @@ module Crawlscope
     def timeout_seconds
       value = resolve(@timeout_seconds)
-      value.nil? ? DEFAULT_TIMEOUT_SECONDS : value.to_i
+      positive_integer(value, default: DEFAULT_TIMEOUT_SECONDS, name: "timeout_seconds")
     end
     private
@@ -119,5 +123,16 @@ module Crawlscope
     def resolve(value)
       value.respond_to?(:call) ? value.call : value
     end
+    def positive_integer(value, default:, name:)
+      return default if value.nil?
+      integer = value.is_a?(Integer) ? value : Integer(value, 10)
+      raise ArgumentError if integer < 1
+      integer
+    rescue ArgumentError, TypeError
+      raise ConfigurationError, "Crawlscope #{name} must be an integer >= 1"
+    end
   end
 end

data/lib/crawlscope/context.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+module Crawlscope
+  Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
+    def fetch(name)
+      public_send(name)
+    end
+  end
+end

data/lib/crawlscope/{audit.rb → crawl.rb} RENAMED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Crawlscope
-  class Audit
+  class Crawl
     def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
       @base_url = base_url
       @sitemap_path = sitemap_path
@@ -17,28 +17,15 @@ module Crawlscope
     end
     def call
-      urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
-      raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
-      @page_fetcher = build_page
-      pages = Crawler.new(
-        page_fetcher: @page_fetcher,
-        concurrency: @concurrency
-      ).call(urls)
+      urls = sitemap_urls
+      @page_fetcher = page
+      pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
       issues = IssueCollection.new
-      collect_crawl_issues(pages, issues)
-      cache_pages(pages)
-      context = {
-        allowed_statuses: @allowed_statuses,
-        base_url: @base_url,
-        resolve_target: method(:resolve_target),
-        schema_registry: @schema_registry
-      }
-      @rules.each do |rule|
-        rule.call(urls: urls, pages: pages, issues: issues, context: context)
-      end
+      collect(pages, issues)
+      cache(pages)
+      scan(urls, pages, issues)
       Result.new(
         base_url: @base_url,
@@ -53,8 +40,15 @@ module Crawlscope
     private
-    def build_browser
-      Crawlscope::Browser.new(
+    def sitemap_urls
+      urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
+      raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
+      urls
+    end
+    def browser
+      Browser.new(
         base_url: @base_url,
         timeout_seconds: @timeout_seconds,
         network_idle_timeout_seconds: @network_idle_timeout_seconds,
@@ -64,35 +58,24 @@ module Crawlscope
       raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
     end
-    def build_page
+    def page
       if @renderer == :browser
-        browser_factory = @browser_factory || method(:build_browser)
-        browser_factory.call
+        (@browser_factory || method(:browser)).call
       else
         Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
       end
     end
-    def build_target_resolution(page, normalized_target_url, crawled:)
-      {
-        crawled: crawled,
-        error: page.error,
-        final_url: page.normalized_final_url || normalized_target_url,
-        status: page.status
-      }
-    end
-    def cache_pages(pages)
-      @page_by_url = {}
-      @target_resolution_cache = {}
-      pages.each do |page|
-        @page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
-        @page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
-      end
+    def context
+      Context.new(
+        allowed_statuses: @allowed_statuses,
+        base_url: @base_url,
+        resolve_target: method(:resolve),
+        schema_registry: @schema_registry
+      )
     end
-    def collect_crawl_issues(pages, issues)
+    def collect(pages, issues)
       pages.each do |page|
         if page.error
           issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
@@ -102,27 +85,48 @@ module Crawlscope
       end
     end
-    def resolve_target(target_url)
-      normalized_target_url = Url.normalize(target_url, base_url: @base_url)
-      return @target_resolution_cache[normalized_target_url] if @target_resolution_cache.key?(normalized_target_url)
+    def cache(pages)
+      @pages = {}
+      @targets = {}
+      pages.each do |page|
+        @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
+        @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
+      end
+    end
+    def scan(urls, pages, issues)
+      @rules.each do |rule|
+        rule.call(urls: urls, pages: pages, issues: issues, context: context)
+      end
+    end
+    def resolve(target_url)
+      normalized_url = Url.normalize(target_url, base_url: @base_url)
+      return @targets[normalized_url] if @targets.key?(normalized_url)
-      resolution = resolve_from_crawled_page(normalized_target_url)
-      resolution ||= resolve_by_fetching_target(normalized_target_url)
-      @target_resolution_cache[normalized_target_url] = resolution
+      @targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
     end
-    def resolve_by_fetching_target(normalized_target_url)
-      page = @page_fetcher.fetch(normalized_target_url)
-      @page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
-      @page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
-      build_target_resolution(page, normalized_target_url, crawled: false)
+    def fetched_page(normalized_url)
+      page = @page_fetcher.fetch(normalized_url)
+      @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
+      @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
+      resolution(page, normalized_url, crawled: false)
     end
-    def resolve_from_crawled_page(normalized_target_url)
-      page = @page_by_url[normalized_target_url]
-      return if page.nil?
+    def resolved_page(normalized_url)
+      page = @pages[normalized_url]
+      resolution(page, normalized_url, crawled: true) if page
+    end
-      build_target_resolution(page, normalized_target_url, crawled: true)
+    def resolution(page, normalized_url, crawled:)
+      {
+        crawled: crawled,
+        error: page.error,
+        final_url: page.normalized_final_url || normalized_url,
+        status: page.status
+      }
     end
   end
 end

data/lib/crawlscope/crawler.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module Crawlscope
       urls.each do |url|
         pool.post do
-          pages << @page_fetcher.fetch(url)
+          pages << fetch(url)
         end
       end
@@ -24,5 +24,23 @@ module Crawlscope
       pages.to_a
     end
+    private
+    def fetch(url)
+      @page_fetcher.fetch(url)
+    rescue => error
+      Page.new(
+        url: url,
+        normalized_url: Url.normalize(url, base_url: url),
+        final_url: url,
+        normalized_final_url: Url.normalize(url, base_url: url),
+        status: nil,
+        headers: {},
+        body: nil,
+        doc: nil,
+        error: "#{error.class}: #{error.message}"
+      )
+    end
   end
 end

data/lib/crawlscope/http.rb CHANGED Viewed

@@ -43,7 +43,7 @@ module Crawlscope
         body: body,
         doc: doc
       )
-    rescue => error
+    rescue Faraday::Error, SocketError, SystemCallError, Timeout::Error => error
       Page.new(
         url: url,
         normalized_url: Url.normalize(url, base_url: @base_url),

data/lib/crawlscope/rake_tasks.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+module Crawlscope
+  module RakeTasks
+    module_function
+    def validate
+      run("validate")
+    end
+    def ldjson
+      run("ldjson")
+    end
+    def validate_rule(rule)
+      original_rules = ENV["RULES"]
+      ENV["RULES"] = rule
+      validate
+    ensure
+      ENV["RULES"] = original_rules
+    end
+    def run(command)
+      status = Cli.start([command], out: $stdout, err: $stderr)
+      exit(status) unless status.zero?
+    end
+  end
+end