RubyGems - crawlscope - Versions diffs - 0.1.0 → 0.2.0 - Mend

crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

checksums.yaml +4 -4
data/CHANGELOG.md +7 -11
data/README.md +20 -13
data/lib/crawlscope/browser.rb +8 -0
data/lib/crawlscope/cli.rb +10 -10
data/lib/crawlscope/configuration.rb +20 -5
data/lib/crawlscope/context.rb +9 -0
data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
data/lib/crawlscope/crawler.rb +19 -1
data/lib/crawlscope/http.rb +1 -1
data/lib/crawlscope/rake_tasks.rb +28 -0
data/lib/crawlscope/rules/links.rb +76 -43
data/lib/crawlscope/rules/structured_data.rb +14 -1
data/lib/crawlscope/run.rb +60 -0
data/lib/crawlscope/schema_registry.rb +3 -349
data/lib/crawlscope/schemas.rb +355 -0
data/lib/crawlscope/sitemap.rb +18 -6
data/lib/crawlscope/structured_data/audit.rb +7 -7
data/lib/crawlscope/structured_data/check.rb +35 -0
data/lib/crawlscope/structured_data/reporter.rb +69 -0
data/lib/crawlscope/url.rb +14 -0
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +12 -23
data/test/crawlscope/browser_test.rb +155 -0
data/test/crawlscope/cli_test.rb +128 -6
data/test/crawlscope/configuration_test.rb +49 -0
data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
data/test/crawlscope/crawler_test.rb +34 -0
data/test/crawlscope/http_test.rb +56 -0
data/test/crawlscope/links_rule_test.rb +110 -5
data/test/crawlscope/rule_registry_test.rb +32 -0
data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
data/test/crawlscope/schema_registry_test.rb +19 -0
data/test/crawlscope/sitemap_test.rb +55 -0
data/test/crawlscope/structured_data_document_test.rb +36 -0
data/test/crawlscope/structured_data_report_test.rb +3 -3
data/test/crawlscope/structured_data_reporter_test.rb +2 -2
data/test/crawlscope/structured_data_rule_test.rb +20 -0
data/test/crawlscope/structured_data_writer_test.rb +2 -2
data/test/crawlscope/url_test.rb +31 -0
metadata +14 -5
data/lib/crawlscope/task.rb +0 -131

data/test/crawlscope/browser_test.rb ADDED

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeBrowserTest < Minitest::Test
+  Response = Data.define(:url, :headers)
+  class FakeBrowser
+    attr_reader :quit_called
+    def quit
+      @quit_called = true
+    end
+  end
+  class FakeNetwork
+    attr_reader :cleared, :idle_waits, :status
+    def initialize(response:, status: 200)
+      @response = response
+      @status = status
+      @cleared = []
+      @idle_waits = []
+    end
+    def clear(scope)
+      @cleared << scope
+    end
+    attr_reader :response
+    def wait_for_idle(duration:, timeout:)
+      @idle_waits << {duration: duration, timeout: timeout}
+    end
+  end
+  class FakePage
+    attr_reader :evaluations, :network, :visited_url
+    def initialize(network:, body: "<html></html>", current_url: "", url: "")
+      @network = network
+      @body = body
+      @current_url = current_url
+      @url = url
+      @evaluations = []
+    end
+    attr_reader :body
+    attr_reader :current_url
+    def evaluate(script)
+      @evaluations << script
+    end
+    def go_to(url)
+      @visited_url = url
+    end
+    attr_reader :url
+  end
+  def test_fetch_returns_rendered_page
+    network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
+    page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
+    browser = browser_with(page: page, scroll_page: false)
+    result = browser.fetch("https://example.com/start")
+    assert_equal "https://example.com/start", page.visited_url
+    assert_equal [:traffic], network.cleared
+    assert_equal "https://example.com/final", result.final_url
+    assert_equal "https://example.com/final", result.normalized_final_url
+    assert_equal 200, result.status
+    assert result.html?
+    assert_equal [], page.evaluations
+  end
+  def test_fetch_scrolls_when_enabled
+    network = FakeNetwork.new(response: Response.new(url: "", headers: {}))
+    page = FakePage.new(network: network, current_url: "https://example.com/current")
+    browser = browser_with(page: page, scroll_page: true)
+    result = browser.fetch("https://example.com/start")
+    assert_equal "https://example.com/current", result.final_url
+    assert_equal 3, page.evaluations.size
+    assert_equal 4, network.idle_waits.size
+  end
+  def test_fetch_falls_back_to_page_url_and_original_url
+    page_url_network = FakeNetwork.new(response: nil)
+    page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
+    page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
+    original_url_network = FakeNetwork.new(response: nil)
+    original_url = FakePage.new(network: original_url_network)
+    original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
+    assert_equal "https://example.com/page", page_url_result.final_url
+    assert_equal "https://example.com/start", original_url_result.final_url
+  end
+  def test_fetch_returns_error_page_when_navigation_fails
+    page = Object.new
+    def page.network
+      raise Timeout::Error, "browser failed"
+    end
+    result = browser_with(page: page).fetch("https://example.com/start")
+    assert_equal "https://example.com/start", result.final_url
+    assert_nil result.status
+    assert_equal "Timeout::Error: browser failed", result.error
+  end
+  def test_fetch_reraises_programmer_errors
+    page = Object.new
+    def page.network
+      raise NoMethodError, "bad call"
+    end
+    browser = browser_with(page: page)
+    assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
+  end
+  def test_close_quits_browser
+    fake_browser = FakeBrowser.new
+    browser = browser_with(browser: fake_browser)
+    browser.close
+    assert fake_browser.quit_called
+  end
+  def test_close_allows_missing_browser
+    browser = browser_with(browser: nil)
+    assert_nil browser.close
+  end
+  private
+  def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false)
+    Crawlscope::Browser.allocate.tap do |instance|
+      instance.instance_variable_set(:@base_url, "https://example.com")
+      instance.instance_variable_set(:@timeout_seconds, 20)
+      instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
+      instance.instance_variable_set(:@scroll_page, scroll_page)
+      instance.instance_variable_set(:@browser, browser)
+      instance.instance_variable_set(:@page, page)
+    end
+  end
+end

data/test/crawlscope/cli_test.rb CHANGED

@@ -19,7 +19,7 @@ class CrawlscopeCliTest < Minitest::Test
   end
   class FakeTask
-    attr_reader :validate_arguments, :ldjson_arguments
+    attr_reader :validate_arguments, :json_ld_arguments
     def validate(base_url:, sitemap_path:, rule_names:)
       @validate_arguments = {
@@ -31,8 +31,8 @@ class CrawlscopeCliTest < Minitest::Test
       success_result
     end
-    def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
-      @ldjson_arguments = {
+    def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
+      @json_ld_arguments = {
         urls: urls,
         debug: debug,
         renderer: renderer,
@@ -51,6 +51,20 @@ class CrawlscopeCliTest < Minitest::Test
     end
   end
+  class FailingTask < FakeTask
+    private
+    def success_result
+      Struct.new(:ok?).new(false)
+    end
+  end
+  class InvalidTask < FakeTask
+    def validate(base_url:, sitemap_path:, rule_names:)
+      raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
+    end
+  end
   def test_version_prints_current_version
     out = StringIO.new
     err = StringIO.new
@@ -70,7 +84,7 @@ class CrawlscopeCliTest < Minitest::Test
     assert_equal 1, status
     assert_includes err.string, "Unknown command: unknown"
-    assert_includes err.string, "crawlscope validate --base-url"
+    assert_includes err.string, "crawlscope validate --url"
   end
   def test_validate_passes_arguments_to_task
@@ -80,7 +94,7 @@ class CrawlscopeCliTest < Minitest::Test
     err = StringIO.new
     status = Crawlscope::Cli.start(
-      ["validate", "--base-url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
+      ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
       out: out,
       err: err,
       configuration: configuration,
@@ -125,12 +139,120 @@ class CrawlscopeCliTest < Minitest::Test
         summary: true,
         timeout_seconds: 20
       },
-      task.ldjson_arguments
+      task.json_ld_arguments
     )
     assert_same out, configuration.output
     assert_empty err.string
   end
+  def test_validate_caps_default_browser_concurrency
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    with_env("JS" => "1") do
+      status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal :browser, configuration.renderer
+    assert_equal 4, configuration.concurrency
+    assert_includes out.string, "Default JS concurrency capped at 4"
+  end
+  def test_validate_uses_url_environment_as_base_url_for_default_sitemap
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    with_env("URL" => "https://example.com") do
+      status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal "https://example.com", task.validate_arguments[:base_url]
+    assert_nil task.validate_arguments[:sitemap_path]
+  end
+  def test_validate_uses_sitemap_mode_when_sitemap_is_configured
+    task = FakeTask.new
+    with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
+      status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
+      assert_equal 0, status
+    end
+    assert_equal "https://example.com", task.validate_arguments[:base_url]
+    assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
+  end
+  def test_ldjson_accepts_repeated_urls_and_options
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(
+      ["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
+      out: out,
+      err: err,
+      configuration: configuration,
+      task: task
+    )
+    assert_equal 0, status
+    assert_equal(
+      {
+        urls: ["https://example.com/a", "https://example.com/b"],
+        debug: true,
+        renderer: :browser,
+        report_path: "report.json",
+        summary: true,
+        timeout_seconds: 12
+      },
+      task.json_ld_arguments
+    )
+    assert_equal 3, configuration.network_idle_timeout_seconds
+  end
+  def test_ldjson_requires_urls
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
+    assert_equal 1, status
+    assert_includes err.string, "Crawlscope URL is not configured"
+  end
+  def test_invalid_integer_option_returns_error
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
+    assert_equal 1, status
+    assert_includes err.string, "timeout must be >= 1"
+  end
+  def test_failed_result_returns_failed_status
+    status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
+    assert_equal 1, status
+  end
+  def test_validation_errors_return_failed_status_without_reraising
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
+    assert_equal 1, status
+    assert_includes err.string, "No URLs found in sitemap"
+  end
   private
   def with_env(overrides)

data/test/crawlscope/configuration_test.rb CHANGED

@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal "Crawlscope sitemap_path is not configured", error.message
   end
+  def test_defaults_are_normalized
+    config = Crawlscope::Configuration.new
+    assert_equal [200, 301, 302], config.allowed_statuses
+    assert_equal 10, config.concurrency
+    assert_equal 4, config.browser_concurrency
+    assert_equal 5, config.network_idle_timeout_seconds
+    assert_equal :http, config.renderer
+    assert_equal 20, config.timeout_seconds
+    assert_equal $stdout, config.output
+    assert config.scroll_page?
+  end
+  def test_configured_values_are_normalized
+    config = Crawlscope::Configuration.new
+    config.allowed_statuses = ["200", "404"]
+    config.concurrency = "2"
+    config.network_idle_timeout_seconds = "7"
+    config.renderer = "browser"
+    config.timeout_seconds = "9"
+    config.scroll_page = false
+    assert_equal [200, 404], config.allowed_statuses
+    assert_equal 2, config.concurrency
+    assert_equal 2, config.browser_concurrency
+    assert_equal 7, config.network_idle_timeout_seconds
+    assert_equal :browser, config.renderer
+    assert_equal 9, config.timeout_seconds
+    refute config.scroll_page?
+  end
+  def test_renderer_must_be_supported
+    config = Crawlscope::Configuration.new
+    config.renderer = "webkit"
+    error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
+    assert_equal "Crawlscope renderer must be http or browser", error.message
+  end
+  def test_numeric_values_must_be_positive_integers
+    config = Crawlscope::Configuration.new
+    config.concurrency = "0"
+    error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
+    assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
+  end
 end

data/test/crawlscope/{audit_test.rb → crawl_test.rb} RENAMED

@@ -2,7 +2,7 @@
 require "test_helper"
-class CrawlscopeAuditTest < Minitest::Test
+class CrawlscopeCrawlTest < Minitest::Test
   def setup
     @tmp_dir = Dir.mktmpdir
     @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -33,6 +33,9 @@ class CrawlscopeAuditTest < Minitest::Test
               <title>Pricing</title>
               <meta name="description" content="Plans for hotels and restaurants">
               <link rel="canonical" href="https://example.com/pricing">
+              <script type="application/ld+json">
+                {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
+              </script>
             </head>
             <body>
               <main>
@@ -43,7 +46,7 @@ class CrawlscopeAuditTest < Minitest::Test
         HTML
       )
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -84,7 +87,7 @@ class CrawlscopeAuditTest < Minitest::Test
         HTML
       )
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -92,7 +95,7 @@ class CrawlscopeAuditTest < Minitest::Test
     ).call
     refute result.ok?
-    assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
+    assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
   end
   def test_uses_browser_when_renderer_is_browser
@@ -127,6 +130,9 @@ class CrawlscopeAuditTest < Minitest::Test
               <title>Pricing</title>
               <meta name="description" content="Plans for hotels and restaurants">
               <link rel="canonical" href="https://example.com/pricing">
+              <script type="application/ld+json">
+                {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
+              </script>
             </head>
             <body>
               <main>
@@ -149,7 +155,7 @@ class CrawlscopeAuditTest < Minitest::Test
       end
     end.new
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,

data/test/crawlscope/crawler_test.rb ADDED

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeCrawlerTest < Minitest::Test
+  class RaisingFetcher
+    def fetch(url)
+      raise Timeout::Error, "fetch timed out" if url.include?("timeout")
+      Crawlscope::Page.new(
+        url: url,
+        normalized_url: url,
+        final_url: url,
+        normalized_final_url: url,
+        status: 200,
+        headers: {},
+        body: "<html></html>",
+        doc: Nokogiri::HTML("<html></html>")
+      )
+    end
+  end
+  def test_returns_error_page_when_fetcher_raises
+    pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
+      ["https://example.com/ok", "https://example.com/timeout"]
+    )
+    assert_equal 2, pages.size
+    error_page = pages.find { |page| page.url == "https://example.com/timeout" }
+    assert_nil error_page.status
+    assert_equal "Timeout::Error: fetch timed out", error_page.error
+  end
+end

data/test/crawlscope/http_test.rb ADDED

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeHttpTest < Minitest::Test
+  def test_fetch_parses_html_response
+    stub_request(:get, "https://example.com/page")
+      .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
+    assert_equal 200, page.status
+    assert page.html?
+    assert_equal "Hello", page.doc.at_css("body").text
+  end
+  def test_fetch_parses_responses_without_content_type_as_html
+    stub_request(:get, "https://example.com/page")
+      .to_return(status: 200, body: "<html><body>Hello</body></html>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
+    assert page.html?
+  end
+  def test_fetch_leaves_non_html_response_unparsed
+    stub_request(:get, "https://example.com/feed.xml")
+      .to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
+    assert_equal 200, page.status
+    refute page.html?
+    assert_equal "<feed></feed>", page.body
+  end
+  def test_fetch_returns_error_page_for_failed_requests
+    stub_request(:get, "https://example.com/down").to_timeout
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
+    assert_nil page.status
+    assert_includes page.error, "Faraday::ConnectionFailed"
+    assert_equal "https://example.com/down", page.final_url
+  end
+  def test_fetch_reraises_programmer_errors
+    http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
+    def http.connection
+      raise NoMethodError, "bad call"
+    end
+    assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
+  end
+end

data/test/crawlscope/links_rule_test.rb CHANGED

@@ -38,19 +38,124 @@ class CrawlscopeLinksRuleTest < Minitest::Test
       urls: ["https://example.com/guide", "https://example.com/pricing"],
       pages: pages,
       issues: issues,
-      context: {
-        allowed_statuses: [200, 301, 302],
-        base_url: "https://example.com",
-        resolve_target: method(:resolve_target)
-      }
+      context: context
     )
     assert_equal [:broken_internal_link], issues.to_a.map(&:code)
     assert_includes issues.to_a.first.message, "HTTP 404"
   end
+  def test_reports_unresolved_internal_links
+    issues = Crawlscope::IssueCollection.new
+    Crawlscope::Rules::Links.new.call(
+      urls: [],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/unknown\">Unknown</a></main>")],
+      issues: issues,
+      context: context(resolver: ->(_target_url) {})
+    )
+    assert_includes issues.to_a.map(&:code), :unresolved_internal_link
+    assert_includes issues.to_a.find { |issue| issue.code == :unresolved_internal_link }.message, "unable to validate internal link"
+  end
+  def test_ignores_fetch_errors_for_urls_already_crawled
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: true,
+        error: "Timeout::Error: timed out",
+        final_url: target_url,
+        status: nil
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: [],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    assert_empty issues.to_a
+  end
+  def test_reports_fetch_errors_for_uncrawled_targets
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        error: "Timeout::Error: timed out",
+        final_url: target_url,
+        status: nil
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: [],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    assert_equal [:unresolved_internal_link], issues.to_a.map(&:code)
+  end
+  def test_reports_low_inbound_anchor_links
+    issues = Crawlscope::IssueCollection.new
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide", "https://example.com/pricing"],
+      pages: [
+        page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
+        page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
+      ],
+      issues: issues,
+      context: context
+    )
+    assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
+    assert_equal "https://example.com/guide", issues.to_a.first.url
+  end
+  def test_ignores_links_that_should_not_be_crawled
+    issues = Crawlscope::IssueCollection.new
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [
+        page(
+          url: "https://example.com/guide",
+          body: <<~HTML
+            <html>
+              <body>
+                <a href="#section">Jump</a>
+                <a href="mailto:test@example.com">Email</a>
+                <a href="https://other.example.com/page">External</a>
+                <a href="/rails/info">Rails</a>
+                <a href="/empty">   </a>
+              </body>
+            </html>
+          HTML
+        )
+      ],
+      issues: issues,
+      context: context
+    )
+    assert_empty issues.to_a
+  end
   private
+  def context(resolver: method(:resolve_target))
+    {
+      allowed_statuses: [200, 301, 302],
+      base_url: "https://example.com",
+      resolve_target: resolver
+    }
+  end
   def page(url:, body:)
     doc = Nokogiri::HTML(body)