RubyGems - crawlscope - Versions diffs - 0.5.0 → 0.6.0 - Mend

crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

checksums.yaml +4 -4
data/CHANGELOG.md +27 -0
data/README.md +32 -0
data/lib/crawlscope/cli.rb +16 -0
data/lib/crawlscope/configuration.rb +10 -1
data/lib/crawlscope/context.rb +1 -1
data/lib/crawlscope/crawl.rb +72 -14
data/lib/crawlscope/crawler.rb +3 -17
data/lib/crawlscope/document_text.rb +7 -2
data/lib/crawlscope/fetch_executor/async.rb +32 -0
data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
data/lib/crawlscope/fetch_executor.rb +43 -0
data/lib/crawlscope/http.rb +7 -1
data/lib/crawlscope/reporter.rb +123 -14
data/lib/crawlscope/result.rb +1 -1
data/lib/crawlscope/rules/content_quality.rb +1 -1
data/lib/crawlscope/rules/indexability.rb +28 -6
data/lib/crawlscope/rules/links.rb +80 -16
data/lib/crawlscope/rules/uniqueness.rb +23 -4
data/lib/crawlscope/sitemap.rb +30 -11
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +1 -1
data/test/crawlscope/cli_test.rb +28 -2
data/test/crawlscope/configuration_test.rb +21 -0
data/test/crawlscope/content_quality_rule_test.rb +18 -0
data/test/crawlscope/crawl_test.rb +142 -4
data/test/crawlscope/crawler_test.rb +61 -0
data/test/crawlscope/fetch_executor_test.rb +44 -0
data/test/crawlscope/links_rule_test.rb +101 -0
data/test/crawlscope/reporter_test.rb +136 -11
data/test/crawlscope/result_test.rb +35 -0
data/test/crawlscope/sitemap_test.rb +52 -0
data/test/performance/async_fetch_benchmark.rb +127 -0
data/test/performance/fetch_executor_matrix.rb +162 -0
data/test/performance/sitemap_expansion_benchmark.rb +121 -0
metadata +38 -2

data/test/crawlscope/content_quality_rule_test.rb CHANGED

@@ -27,6 +27,24 @@ class CrawlscopeContentQualityRuleTest < Minitest::Test
     refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
   end
+  def test_visible_text_ratio_ignores_form_payload_markup
+    issues = Crawlscope::IssueCollection.new
+    page = page_with(
+      main: <<~HTML
+        <p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
+        <form>
+          <div data-select-autocomplete-options-value="#{"x" * 50_000}">
+            <input type="text" name="country">
+          </div>
+        </form>
+      HTML
+    )
+    Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
+    refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
+  end
   def test_reports_low_unique_token_ratio_for_repetitive_content
     issues = Crawlscope::IssueCollection.new
     page = page_with(main: ("hotel location service " * 100).strip)

data/test/crawlscope/crawl_test.rb CHANGED

@@ -3,6 +3,36 @@
 require "test_helper"
 class CrawlscopeCrawlTest < Minitest::Test
+  class RecordingExecutor
+    attr_reader :batches
+    def initialize
+      @batches = []
+    end
+    def call(urls)
+      @batches << urls
+      urls.map { |url| yield(url) }
+    end
+  end
+  class PageMapFetcher
+    attr_reader :closed
+    def initialize(pages)
+      @pages = pages
+      @closed = false
+    end
+    def close
+      @closed = true
+    end
+    def fetch(url)
+      @pages.fetch(url)
+    end
+  end
   def setup
     @tmp_dir = Dir.mktmpdir
     @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -12,6 +42,29 @@ class CrawlscopeCrawlTest < Minitest::Test
     FileUtils.rm_rf(@tmp_dir)
   end
+  def test_http_renderer_defaults_to_async_executor
+    crawl = Crawlscope::Crawl.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: [],
+      schema_registry: Crawlscope::SchemaRegistry.default
+    )
+    assert_equal :async, crawl.instance_variable_get(:@fetch_executor)
+  end
+  def test_browser_renderer_defaults_to_threaded_executor
+    crawl = Crawlscope::Crawl.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: [],
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      renderer: :browser
+    )
+    assert_equal :threaded, crawl.instance_variable_get(:@fetch_executor)
+  end
   def test_returns_ok_when_metadata_is_valid
     File.write(
       @sitemap_path,
@@ -56,7 +109,8 @@ class CrawlscopeCrawlTest < Minitest::Test
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
-      schema_registry: Crawlscope::SchemaRegistry.default
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      fetch_executor: :threaded
     ).call
     assert result.ok?
@@ -97,10 +151,11 @@ class CrawlscopeCrawlTest < Minitest::Test
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
-      schema_registry: Crawlscope::SchemaRegistry.default
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      fetch_executor: :threaded
     ).call
-    refute result.ok?
+    assert result.ok?
     assert_equal %i[
       incomplete_open_graph_tags
       meta_description_too_long
@@ -189,6 +244,31 @@ class CrawlscopeCrawlTest < Minitest::Test
     assert fake_browser.closed
   end
+  def test_async_executor_requires_http_renderer
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/pricing</loc></url>
+        </urlset>
+      XML
+    )
+    error = assert_raises(Crawlscope::ConfigurationError) do
+      Crawlscope::Crawl.new(
+        base_url: "https://example.com",
+        sitemap_path: @sitemap_path,
+        rules: [],
+        schema_registry: Crawlscope::SchemaRegistry.default,
+        renderer: :browser,
+        fetch_executor: :async
+      ).call
+    end
+    assert_equal "Async fetch execution is only supported with http rendering", error.message
+  end
   def test_reports_sitemap_redirect_url
     File.write(
       @sitemap_path,
@@ -209,9 +289,67 @@ class CrawlscopeCrawlTest < Minitest::Test
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: [],
-      schema_registry: Crawlscope::SchemaRegistry.default
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      fetch_executor: :threaded
     ).call
     assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
   end
+  def test_resolves_uncrawled_link_targets_as_a_bounded_batch
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/guide</loc></url>
+        </urlset>
+      XML
+    )
+    executor = RecordingExecutor.new
+    fetcher = PageMapFetcher.new(
+      "https://example.com/guide" => page(
+        "https://example.com/guide",
+        "<main><a href=\"/one\">One</a><a href=\"/two\">Two</a></main>"
+      ),
+      "https://example.com/one" => page("https://example.com/one", "<main>One</main>"),
+      "https://example.com/two" => page("https://example.com/two", "<main>Two</main>")
+    )
+    Crawlscope::Crawl.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: [Crawlscope::Rules::Links.new],
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      renderer: :browser,
+      browser_factory: -> { fetcher },
+      fetch_executor: executor,
+      concurrency: 2
+    ).call
+    assert_equal(
+      [
+        ["https://example.com/guide"],
+        ["https://example.com/one", "https://example.com/two"]
+      ],
+      executor.batches
+    )
+    assert fetcher.closed
+  end
+  private
+  def page(url, body)
+    Crawlscope::Page.new(
+      url: url,
+      normalized_url: url,
+      final_url: url,
+      normalized_final_url: url,
+      status: 200,
+      headers: {"content-type" => "text/html"},
+      body: body,
+      doc: Nokogiri::HTML(body)
+    )
+  end
 end

data/test/crawlscope/crawler_test.rb CHANGED

@@ -31,4 +31,65 @@ class CrawlscopeCrawlerTest < Minitest::Test
     assert_nil error_page.status
     assert_equal "Timeout::Error: fetch timed out", error_page.error
   end
+  def test_preserves_input_order
+    pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
+      ["https://example.com/one", "https://example.com/two", "https://example.com/three"]
+    )
+    assert_equal(
+      ["https://example.com/one", "https://example.com/two", "https://example.com/three"],
+      pages.map(&:url)
+    )
+  end
+  class AsyncFetcher
+    attr_reader :active_fetches
+    def initialize
+      @active_fetches = 0
+      @max_active_fetches = 0
+      @mutex = Mutex.new
+    end
+    def fetch(url)
+      @mutex.synchronize do
+        @active_fetches += 1
+        @max_active_fetches = [@max_active_fetches, @active_fetches].max
+      end
+      Async::Task.current.sleep(0.01)
+      Crawlscope::Page.new(
+        url: url,
+        normalized_url: url,
+        final_url: url,
+        normalized_final_url: url,
+        status: 200,
+        headers: {},
+        body: "<html></html>",
+        doc: Nokogiri::HTML("<html></html>")
+      )
+    ensure
+      @mutex.synchronize { @active_fetches -= 1 }
+    end
+    def max_active_fetches
+      @mutex.synchronize { @max_active_fetches }
+    end
+  end
+  def test_async_executor_respects_concurrency_and_preserves_order
+    fetcher = AsyncFetcher.new
+    pages = Crawlscope::Crawler.new(page_fetcher: fetcher, concurrency: 2, fetch_executor: :async).call(
+      ["https://example.com/one", "https://example.com/two", "https://example.com/three"]
+    )
+    assert_equal(
+      ["https://example.com/one", "https://example.com/two", "https://example.com/three"],
+      pages.map(&:url)
+    )
+    assert_operator fetcher.max_active_fetches, :<=, 2
+  end
 end

data/test/crawlscope/fetch_executor_test.rb ADDED

@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeFetchExecutorTest < Minitest::Test
+  class RecordingExecutor
+    attr_reader :items
+    def call(items)
+      @items = items
+      items.map { |item| yield(item) }
+    end
+  end
+  def test_map_preserves_input_order
+    results = Crawlscope::FetchExecutor.map(name: :threaded, concurrency: 2, items: [3, 1, 2]) do |item|
+      item * 10
+    end
+    assert_equal [30, 10, 20], results
+  end
+  def test_map_uses_sequential_fallback_for_single_item
+    executor = RecordingExecutor.new
+    results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: ["one"]) do |item|
+      item.upcase
+    end
+    assert_equal ["ONE"], results
+    assert_nil executor.items
+  end
+  def test_map_uses_injected_executor_for_parallel_work
+    executor = RecordingExecutor.new
+    results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: %w[a b]) do |item|
+      item.upcase
+    end
+    assert_equal %w[A B], results
+    assert_equal %w[a b], executor.items
+  end
+end

data/test/crawlscope/links_rule_test.rb CHANGED

@@ -218,6 +218,33 @@ class CrawlscopeLinksRuleTest < Minitest::Test
     assert_includes codes, :canonical_points_to_redirect
   end
+  def test_does_not_report_missing_inlinks_for_root_canonical
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {crawled: true, error: nil, final_url: target_url, html: true, status: 200}
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/", "https://example.com/about"],
+      pages: [
+        page(
+          url: "https://example.com/",
+          body: <<~HTML
+            <html>
+              <head><link rel="canonical" href="https://example.com/"></head>
+              <body><main><a href="/about">About</a></main></body>
+            </html>
+          HTML
+        ),
+        page(url: "https://example.com/about", body: "<main><a href=\"/\">Home</a></main>")
+      ],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    refute_includes issues.to_a.map(&:code), :canonical_no_internal_inlinks
+  end
   def test_reports_indexable_internal_pages_missing_from_sitemap
     issues = Crawlscope::IssueCollection.new
     resolver = lambda do |target_url|
@@ -242,6 +269,54 @@ class CrawlscopeLinksRuleTest < Minitest::Test
     assert_equal "https://example.com/hidden", issue.url
   end
+  def test_does_not_report_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
+        error: nil,
+        final_url: target_url,
+        headers: {},
+        html: true,
+        status: 200
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
+  def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<main>Hidden</main>"),
+        error: nil,
+        final_url: target_url,
+        headers: {"X-Robots-Tag" => "noindex"},
+        html: true,
+        status: 200
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
+  end
   def test_reports_url_hygiene_issues
     issues = Crawlscope::IssueCollection.new
     long_path = "a" * 2_050
@@ -300,6 +375,32 @@ class CrawlscopeLinksRuleTest < Minitest::Test
     assert_includes redirect_issue.message, "https://example.com/pricing"
   end
+  def test_reuses_link_target_resolution_for_later_link_checks
+    issues = Crawlscope::IssueCollection.new
+    resolution_counts = Hash.new(0)
+    resolver = lambda do |target_url|
+      resolution_counts[target_url] += 1
+      {
+        crawled: false,
+        doc: Nokogiri::HTML("<main>Hidden</main>"),
+        error: nil,
+        final_url: target_url,
+        headers: {},
+        html: true,
+        status: 200
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    assert_equal 1, resolution_counts.fetch("https://example.com/hidden")
+  end
   def test_ignores_links_that_should_not_be_crawled
     issues = Crawlscope::IssueCollection.new

data/test/crawlscope/reporter_test.rb CHANGED

@@ -23,11 +23,25 @@ class CrawlscopeReporterTest < Minitest::Test
     refute_includes output, "Status: FAILED"
   end
-  def test_reports_failed_result_with_grouped_counts_and_offenses
+  def test_reports_warning_result_with_grouped_one_line_issues
     io = StringIO.new
     issues = Crawlscope::IssueCollection.new
+    4.times do |index|
+      issues.add(
+        code: :low_dofollow_inlinks,
+        severity: :warning,
+        category: :links,
+        url: "https://example.com/page-#{index + 1}",
+        message: "dofollow inbound links 1 below 2",
+        details: {
+          dofollow_inbound_count: 1,
+          minimum: 2,
+          source_urls: ["https://example.com/source-#{index + 1}"]
+        }
+      )
+    end
     issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
-    issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {})
     result = Crawlscope::Result.new(
       base_url: "https://example.com",
       sitemap_path: "/tmp/sitemap.xml",
@@ -40,15 +54,126 @@ class CrawlscopeReporterTest < Minitest::Test
     output = io.string
+    assert_includes output, "Status: WARNINGS"
+    refute_includes output, "Status: FAILED"
+    assert_includes output, "Issues: 5 total (5 warnings)"
+    assert_includes output, "Summary:"
+    assert_includes output, "links / low_dofollow_inlinks: 4"
+    assert_includes output, "  - /page-1  inbound 1/2  sources: /source-1"
+    assert_includes output, "  - /page-4  inbound 1/2  sources: /source-4"
+    assert_includes output, "metadata / missing_title: 1"
+    refute_includes output, "Severity:"
+    refute_includes output, "Category:"
+    refute_includes output, "... 1 more"
+  end
+  def test_reports_failed_status_when_errors_are_present
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com/a", message: "timeout", details: {})
+    issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com/a"],
+      pages: [Object.new],
+      issues: issues
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    output = io.string
     assert_includes output, "Status: FAILED"
-    assert_includes output, "Issues: 2"
-    assert_includes output, "Severity:"
-    assert_includes output, "notice: 1"
-    assert_includes output, "warning: 1"
-    assert_includes output, "Category:"
-    assert_includes output, "links: 1"
-    assert_includes output, "metadata: 1"
-    assert_includes output, "  - [warning] missing_title https://example.com/a missing <title>"
-    assert_includes output, "  - [notice] broken_internal_link https://example.com/b broken internal link"
+    assert_includes output, "Issues: 2 total (1 error, 1 warning)"
+  end
+  def test_limits_large_issue_groups
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    21.times do |index|
+      issues.add(
+        code: :low_dofollow_inlinks,
+        severity: :warning,
+        category: :links,
+        url: "https://example.com/page-#{index + 1}",
+        message: "dofollow inbound links 1 below 2",
+        details: {dofollow_inbound_count: 1, minimum: 2}
+      )
+    end
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com"],
+      pages: [Object.new],
+      issues: issues
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    output = io.string
+    assert_includes output, "links / low_dofollow_inlinks: 21"
+    assert_includes output, "  - /page-20  inbound 1/2"
+    refute_includes output, "  - /page-21"
+    assert_includes output, "  ... 1 more"
+  end
+  def test_reports_ratio_with_enough_precision_to_show_threshold_difference
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    issues.add(
+      code: :low_unique_token_ratio,
+      severity: :warning,
+      category: :content_quality,
+      url: "https://example.com/a",
+      message: "visible text has low token variety",
+      details: {ratio: 0.249, threshold: 0.25}
+    )
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com/a"],
+      pages: [Object.new],
+      issues: issues
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    assert_includes io.string, "ratio 0.249/0.250"
+  end
+  def test_reports_source_details_on_one_line
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    4.times do |index|
+      issues.add(
+        code: :indexable_page_missing_from_sitemap,
+        severity: :warning,
+        category: :sitemaps,
+        url: "https://example.com/overview-#{index + 1}",
+        message: "indexable internal page is missing from sitemap",
+        details: {source_url: "https://example.com/source-#{index + 1}"}
+      )
+    end
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com"],
+      pages: [Object.new],
+      issues: issues
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    output = io.string
+    assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4"
+    assert_includes output, "  - /overview-1  indexable internal page is missing from sitemap  source: /source-1"
+    assert_includes output, "  - /overview-4  indexable internal page is missing from sitemap  source: /source-4"
   end
 end

data/test/crawlscope/result_test.rb ADDED

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeResultTest < Minitest::Test
+  def test_ok_when_result_has_warnings_only
+    issues = Crawlscope::IssueCollection.new
+    issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com", message: "missing <title>", details: {})
+    result = result_with(issues)
+    assert result.ok?
+  end
+  def test_not_ok_when_result_has_errors
+    issues = Crawlscope::IssueCollection.new
+    issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com", message: "timeout", details: {})
+    result = result_with(issues)
+    refute result.ok?
+  end
+  private
+  def result_with(issues)
+    Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com"],
+      pages: [Object.new],
+      issues: issues
+    )
+  end
+end