RubyGems - crawlscope - Versions diffs - 0.5.0 → 0.6.0 - Mend

crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

checksums.yaml +4 -4
data/CHANGELOG.md +27 -0
data/README.md +32 -0
data/lib/crawlscope/cli.rb +16 -0
data/lib/crawlscope/configuration.rb +10 -1
data/lib/crawlscope/context.rb +1 -1
data/lib/crawlscope/crawl.rb +72 -14
data/lib/crawlscope/crawler.rb +3 -17
data/lib/crawlscope/document_text.rb +7 -2
data/lib/crawlscope/fetch_executor/async.rb +32 -0
data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
data/lib/crawlscope/fetch_executor.rb +43 -0
data/lib/crawlscope/http.rb +7 -1
data/lib/crawlscope/reporter.rb +123 -14
data/lib/crawlscope/result.rb +1 -1
data/lib/crawlscope/rules/content_quality.rb +1 -1
data/lib/crawlscope/rules/indexability.rb +28 -6
data/lib/crawlscope/rules/links.rb +80 -16
data/lib/crawlscope/rules/uniqueness.rb +23 -4
data/lib/crawlscope/sitemap.rb +30 -11
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +1 -1
data/test/crawlscope/cli_test.rb +28 -2
data/test/crawlscope/configuration_test.rb +21 -0
data/test/crawlscope/content_quality_rule_test.rb +18 -0
data/test/crawlscope/crawl_test.rb +142 -4
data/test/crawlscope/crawler_test.rb +61 -0
data/test/crawlscope/fetch_executor_test.rb +44 -0
data/test/crawlscope/links_rule_test.rb +101 -0
data/test/crawlscope/reporter_test.rb +136 -11
data/test/crawlscope/result_test.rb +35 -0
data/test/crawlscope/sitemap_test.rb +52 -0
data/test/performance/async_fetch_benchmark.rb +127 -0
data/test/performance/fetch_executor_matrix.rb +162 -0
data/test/performance/sitemap_expansion_benchmark.rb +121 -0
metadata +38 -2

data/lib/crawlscope/reporter.rb CHANGED Viewed

@@ -1,7 +1,11 @@
 # frozen_string_literal: true
+require "uri"
 module Crawlscope
   class Reporter
+    MAX_ISSUES_PER_GROUP = 20
     def initialize(io:)
       @io = io
     end
@@ -13,36 +17,141 @@ module Crawlscope
       @io.puts("URLs: #{result.urls.size}")
       @io.puts("Pages: #{result.pages.size}")
-      if result.ok?
+      if result.issues.size.zero?
         @io.puts("Status: OK")
         return
       end
-      @io.puts("Status: FAILED")
-      @io.puts("Issues: #{result.issues.size}")
+      @io.puts("Status: #{status_for(result.issues)}")
+      @io.puts("Issues: #{result.issues.size} total (#{severity_summary(result.issues)})")
       @io.puts("")
-      report_grouped_issues("Severity", result.issues.by_severity)
+      report_summary(result.issues)
       @io.puts("")
-      report_grouped_issues("Category", result.issues.by_category)
+      report_issue_groups(result.issues, base_url: result.base_url)
     end
     private
-    def report_grouped_issues(title, grouped_issues)
-      @io.puts("#{title}:")
+    def status_for(issues)
+      grouped = issues.by_severity
+      if grouped.key?(:error)
+        "FAILED"
+      elsif grouped.key?(:warning)
+        "WARNINGS"
+      else
+        "NOTICES"
+      end
+    end
+    def severity_summary(issues)
+      grouped = issues.by_severity
+      return "" if grouped.empty?
+      grouped
+        .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
+        .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
+        .join(", ")
+    end
-      grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
-        @io.puts("#{name}: #{issues.size}")
-        issues.each do |issue|
-          @io.puts("  - #{offense(issue)}")
+    def report_summary(issues)
+      @io.puts("Summary:")
+      issues.by_category
+        .sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
+        .each do |category, category_issues|
+          @io.puts("  #{category.to_s.ljust(16)} #{category_issues.size}")
         end
+    end
+    def report_issue_groups(issues, base_url:)
+      grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
+      grouped
+        .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
+        .each do |(category, code), grouped_issues|
+          @io.puts("#{category} / #{code}: #{grouped_issues.size}")
+          grouped_issues.first(MAX_ISSUES_PER_GROUP).each do |issue|
+            @io.puts("  - #{compact_issue(issue, base_url: base_url)}")
+          end
+          remaining_count = grouped_issues.size - MAX_ISSUES_PER_GROUP
+          @io.puts("  ... #{remaining_count} more") if remaining_count.positive?
+          @io.puts("")
+        end
+    end
+    def compact_issue(issue, base_url:)
+      parts = []
+      parts << relative_url(issue.url, base_url: base_url) if issue.url
+      detail = compact_detail(issue, base_url: base_url)
+      parts << detail unless detail.empty?
+      parts.compact.join("  ")
+    end
+    def compact_detail(issue, base_url:)
+      details = issue.details || {}
+      fragments = []
+      inbound = details[:dofollow_inbound_count] || details[:inbound_count]
+      fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
+      if details[:ratio] && details[:threshold]
+        fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
+      end
+      fragments << "count #{details[:count]}" if details[:count]
+      fragments << "length #{details[:length]}" if details[:length]
+      fragments << "status #{details[:status]}" if details[:status]
+      fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
+      fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
+      fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
+      fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
+      return issue.message if fragments.empty?
+      case issue.code
+      when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
+        fragments.join("  ")
+      else
+        ([issue.message] + fragments).join("  ")
       end
     end
-    def offense(issue)
-      parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
-      parts.compact.join(" ")
+    def relative_urls(urls, base_url:)
+      Array(urls).map { |url| relative_url(url, base_url: base_url) }
+    end
+    def relative_url(url, base_url:)
+      return url unless url && base_url
+      uri = URI.parse(url)
+      base_uri = URI.parse(base_url)
+      return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
+      relative = uri.path.to_s.empty? ? "/" : uri.path
+      relative += "?#{uri.query}" if uri.query
+      relative += "##{uri.fragment}" if uri.fragment
+      relative
+    rescue URI::InvalidURIError
+      url
+    end
+    def format_number(value)
+      return format("%.3f", value) if value.is_a?(Float)
+      value.to_s
+    end
+    def pluralize(word, count)
+      return word.to_s if count == 1
+      "#{word}s"
     end
   end
 end

data/lib/crawlscope/result.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module Crawlscope
   Result = Data.define(:base_url, :sitemap_path, :urls, :pages, :issues) do
     def ok?
-      issues.none?(&:error?) && issues.none?(&:warning?) && issues.none?(&:notice?)
+      issues.none?(&:error?)
     end
   end
 end

data/lib/crawlscope/rules/content_quality.rb CHANGED Viewed

@@ -55,7 +55,7 @@ module Crawlscope
       end
       def validate_visible_text_ratio(page, issues)
-        html_bytes = DocumentText.html_for(page.doc).bytesize
+        html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
         return if html_bytes.zero?
         visible_text = DocumentText.text_for(page.doc)

data/lib/crawlscope/rules/indexability.rb CHANGED Viewed

@@ -6,6 +6,31 @@ module Crawlscope
       ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
       X_ROBOTS_TAG_HEADER = "x-robots-tag"
+      def self.noindex_header?(headers)
+        noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
+      end
+      def self.noindex_meta?(doc)
+        return false unless doc
+        doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
+      end
+      def self.header_value(headers, name)
+        headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+      end
+      def self.directives(value)
+        value
+          .split(",")
+          .map { |directive| directive.split(":", 2).last.to_s.strip }
+          .reject(&:empty?)
+      end
+      def self.noindex?(value)
+        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+      end
       attr_reader :code
       def initialize
@@ -28,18 +53,15 @@ module Crawlscope
       end
       def header_value(page, name)
-        page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
+        self.class.header_value(page.headers, name)
       end
       def directives(value)
-        value
-          .split(",")
-          .map { |directive| directive.split(":", 2).last.to_s.strip }
-          .reject(&:empty?)
+        self.class.directives(value)
       end
       def noindex?(value)
-        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+        self.class.noindex?(value)
       end
       def follow?(value)

data/lib/crawlscope/rules/links.rb CHANGED Viewed

@@ -21,8 +21,13 @@ module Crawlscope
       def call(urls:, pages:, issues:, context:)
         @allowed_statuses = context.fetch(:allowed_statuses)
         @base_url = context.fetch(:base_url)
+        @concurrency = context_value(context, :concurrency, default: 1)
+        @fetch_executor = context_value(context, :fetch_executor)
         @resolve_target = context.fetch(:resolve_target)
+        @resolve_targets = context.resolve_targets if context.respond_to?(:resolve_targets)
+        @resolve_targets ||= context[:resolve_targets] if context.respond_to?(:[])
         @base_host = URI.parse(@base_url).host
+        @resolved_targets_by_url = {}
         links = extract_links(pages)
         validate_url_hygiene(urls, links, issues)
@@ -42,7 +47,12 @@ module Crawlscope
       end
       def extract_links(pages)
-        pages.select(&:html?).flat_map { |page| page_links(page) }
+        html_pages = pages.select(&:html?)
+        FetchExecutor.map(
+          name: @fetch_executor,
+          concurrency: @concurrency,
+          items: html_pages
+        ) { |page| page_links(page) }.flatten
       end
       def page_links(page)
@@ -175,9 +185,12 @@ module Crawlscope
       def resolve_links(links, issues)
         resolved_links = []
+        grouped_links_by_target = links.group_by { |link| link[:target_url] }
+        targets_by_url = resolve_targets(grouped_links_by_target.keys)
+        @resolved_targets_by_url.merge!(targets_by_url)
-        links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
-          target = resolve_target(target_url)
+        grouped_links_by_target.each do |target_url, grouped_links|
+          target = target_for(target_url)
           if target.unresolved?
             report_unresolved_target(target_url, grouped_links, issues, target.resolution)
@@ -221,6 +234,23 @@ module Crawlscope
         LinkTarget.new(target_url: target_url, resolution: resolution)
       end
+      def resolve_targets(target_urls)
+        if @resolve_targets
+          @resolve_targets.call(target_urls).to_h do |target_url, resolution|
+            [target_url, LinkTarget.new(target_url: target_url, resolution: resolution)]
+          end
+        else
+          target_urls.to_h { |target_url| [target_url, resolve_target(target_url)] }
+        end
+      end
+      def target_for(target_url)
+        @resolved_targets_by_url.fetch(target_url) do
+          target = resolve_target(target_url)
+          @resolved_targets_by_url[target_url] = target
+        end
+      end
       LinkTarget = Data.define(:target_url, :resolution) do
         def allowed?(statuses)
           statuses.include?(status)
@@ -243,6 +273,11 @@ module Crawlscope
           resolution && resolution[:html]
         end
+        def noindex?
+          Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
+            Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
+        end
         def status
           resolution && resolution[:status]
         end
@@ -419,8 +454,9 @@ module Crawlscope
           next if reported_urls.include?(final_url)
           next unless crawlable_path?(link[:final_path])
-          target = resolve_target(final_url)
+          target = target_for(final_url)
           next unless target.allowed?(@allowed_statuses) && target.html?
+          next if target.noindex?
           reported_urls << final_url
@@ -500,16 +536,14 @@ module Crawlscope
         return if sitemap_pages.size < 2
         dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
-        sitemap_pages.each do |page|
-          canonical_url = canonical_url_for(page)
-          next if canonical_url.nil?
-          target_uri = URI.parse(canonical_url)
-          next if target_uri.host != @base_host
-          canonical_path = Url.path(canonical_url)
-          if canonical_path && dofollow_counts_by_path[canonical_path].zero?
+        canonical_entries = canonical_entries_for(sitemap_pages)
+        @resolved_targets_by_url.merge!(resolve_targets(canonical_entries.map { |entry| entry.fetch(:canonical_url) }))
+        canonical_entries.each do |entry|
+          page = entry.fetch(:page)
+          canonical_url = entry.fetch(:canonical_url)
+          canonical_path = entry.fetch(:canonical_path)
+          if canonical_path && !root_path?(canonical_path) && dofollow_counts_by_path[canonical_path].zero?
             issues.add(
               code: :canonical_no_internal_inlinks,
               severity: :warning,
@@ -521,8 +555,24 @@ module Crawlscope
           end
           validate_canonical_target_status(page, canonical_url, issues)
+        end
+      end
+      def canonical_entries_for(pages)
+        pages.filter_map do |page|
+          canonical_url = canonical_url_for(page)
+          next if canonical_url.nil?
+          target_uri = URI.parse(canonical_url)
+          next if target_uri.host != @base_host
+          {
+            canonical_path: Url.path(canonical_url),
+            canonical_url: canonical_url,
+            page: page
+          }
         rescue URI::InvalidURIError
-          next
+          nil
         end
       end
@@ -542,8 +592,12 @@ module Crawlscope
         Url.normalize(canonical, base_url: page.url)
       end
+      def root_path?(path)
+        path == "/"
+      end
       def validate_canonical_target_status(page, canonical_url, issues)
-        target = resolve_target(canonical_url)
+        target = target_for(canonical_url)
         if target.unresolved? || target.ignored_error?
           return
@@ -569,6 +623,16 @@ module Crawlscope
           )
         end
       end
+      def context_value(context, name, default: nil)
+        if context.respond_to?(name)
+          context.public_send(name)
+        elsif context.respond_to?(:key?) && context.key?(name)
+          context[name]
+        else
+          default
+        end
+      end
     end
   end
 end

data/lib/crawlscope/rules/uniqueness.rb CHANGED Viewed

@@ -26,11 +26,10 @@ module Crawlscope
       end
       def call(urls:, pages:, issues:, context:)
-        page_summaries = pages.filter_map do |page|
-          next unless page.html?
+        @concurrency = context_value(context, :concurrency, default: 1)
+        @fetch_executor = context_value(context, :fetch_executor)
-          summary_for(page)
-        end
+        page_summaries = summarize_pages(pages)
         validate_duplicates(page_summaries, issues)
         validate_near_duplicates(page_summaries, issues)
@@ -66,6 +65,16 @@ module Crawlscope
         }
       end
+      def summarize_pages(pages)
+        html_pages = pages.select(&:html?)
+        FetchExecutor.map(
+          name: @fetch_executor,
+          concurrency: @concurrency,
+          items: html_pages
+        ) { |page| summary_for(page) }
+      end
       def validate_duplicates(page_summaries, issues)
         duplicates_for(page_summaries, :title).each do |value, urls|
           issues.add(
@@ -177,6 +186,16 @@ module Crawlscope
         intersection_size.to_f / smaller_set_size
       end
+      def context_value(context, name, default: nil)
+        if context.respond_to?(name)
+          context.public_send(name)
+        elsif context.respond_to?(:key?) && context.key?(name)
+          context[name]
+        else
+          default
+        end
+      end
     end
   end
 end

data/lib/crawlscope/sitemap.rb CHANGED Viewed

@@ -9,20 +9,31 @@ module Crawlscope
   class Sitemap
     SITEMAP_NAMESPACE = {"xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"}.freeze
-    def initialize(path:)
+    def initialize(path:, adapter: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS)
       @path = path
+      @adapter = adapter
+      @concurrency = concurrency
+      @fetch_executor = fetch_executor
+      @timeout_seconds = timeout_seconds
     end
     def urls(base_url:)
-      collect_urls(@path, base_url: base_url, visited: Set.new).uniq
+      collect_urls(@path, base_url: base_url, visited: Set.new, visited_mutex: Mutex.new).uniq
     end
     private
-    def collect_urls(source, base_url:, visited:)
-      return [] if visited.include?(source)
+    def collect_urls(source, base_url:, visited:, visited_mutex:)
+      already_visited = visited_mutex.synchronize do
+        if visited.include?(source)
+          true
+        else
+          visited.add(source)
+          false
+        end
+      end
+      return [] if already_visited
-      visited.add(source)
       document = Nokogiri::XML(read(source))
       root_name = document.root&.name
       unless %w[sitemapindex urlset].include?(root_name)
@@ -30,10 +41,13 @@ module Crawlscope
       end
       if root_name == "sitemapindex"
-        document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
-          child_source = resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
-          collect_urls(child_source, base_url: base_url, visited: visited)
+        child_sources = document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
+          resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
         end
+        fetch_executor.call(child_sources) do |child_source|
+          collect_urls(child_source, base_url: base_url, visited: visited, visited_mutex: visited_mutex)
+        end.flatten
       else
         document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
           Url.normalize_for_base(node.text.to_s.strip, base_url: base_url)
@@ -77,11 +91,16 @@ module Crawlscope
     end
     def connection
-      @connection ||= Faraday.new do |faraday|
+      Faraday.new do |faraday|
         faraday.response :follow_redirects, limit: Http::MAX_REDIRECTS
-        faraday.options.timeout = 20
-        faraday.options.open_timeout = 20
+        faraday.options.timeout = @timeout_seconds
+        faraday.options.open_timeout = @timeout_seconds
+        faraday.adapter @adapter if @adapter
       end
     end
+    def fetch_executor
+      @fetch_executor_instance ||= FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency)
+    end
   end
 end

data/lib/crawlscope/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Crawlscope
-  VERSION = "0.5.0"
+  VERSION = "0.6.0"
 end

data/lib/tasks/crawlscope_tasks.rake CHANGED Viewed

@@ -1,5 +1,5 @@
 namespace :crawlscope do
-  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR"
   task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
     Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
   end

data/test/crawlscope/cli_test.rb CHANGED Viewed

@@ -4,11 +4,12 @@ require "test_helper"
 class CrawlscopeCliTest < Minitest::Test
   class FakeConfiguration
-    attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
+    attr_accessor :base_url, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
     def initialize
       @base_url = nil
       @concurrency = 10
+      @fetch_executor = :async
       @network_idle_timeout_seconds = 5
       @renderer = :http
       @timeout_seconds = 20
@@ -95,7 +96,7 @@ class CrawlscopeCliTest < Minitest::Test
     err = StringIO.new
     status = Crawlscope::Cli.start(
-      ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
+      ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3", "--fetch-executor", "async"],
       out: out,
       err: err,
       configuration: configuration,
@@ -115,10 +116,35 @@ class CrawlscopeCliTest < Minitest::Test
     assert_equal 30, configuration.timeout_seconds
     assert_equal 9, configuration.network_idle_timeout_seconds
     assert_equal 3, configuration.concurrency
+    assert_equal "async", configuration.fetch_executor
     assert_same out, configuration.output
     assert_empty err.string
   end
+  def test_validate_reads_fetch_executor_from_environment
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    with_env("FETCH_EXECUTOR" => "async") do
+      status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal "async", configuration.fetch_executor
+  end
+  def test_validate_uses_threaded_executor_for_browser_rendering_by_default
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    status = Crawlscope::Cli.start(["validate", "--url", "https://example.com", "--renderer", "browser"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+    assert_equal 0, status
+    assert_equal :browser, configuration.renderer
+    assert_equal :threaded, configuration.fetch_executor
+  end
   def test_ldjson_reads_urls_from_environment
     configuration = FakeConfiguration.new
     task = FakeTask.new

data/test/crawlscope/configuration_test.rb CHANGED Viewed

@@ -13,6 +13,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
       config.sitemap_path = -> { "/tmp/sitemap.xml" }
       config.site_name = -> { "Example" }
       config.concurrency = -> { 4 }
+      config.fetch_executor = -> { :threaded }
     end
     audit = Crawlscope.configuration.audit
@@ -20,6 +21,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
     assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
     assert_equal 4, audit.instance_variable_get(:@concurrency)
+    assert_equal :threaded, audit.instance_variable_get(:@fetch_executor)
     assert_equal %i[
       indexability
       metadata
@@ -55,6 +57,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal [200, 301, 302], config.allowed_statuses
     assert_equal 10, config.concurrency
+    assert_equal :async, config.fetch_executor
     assert_equal 4, config.browser_concurrency
     assert_equal 5, config.network_idle_timeout_seconds
     assert_equal :http, config.renderer
@@ -63,10 +66,18 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert config.scroll_page?
   end
+  def test_browser_renderer_defaults_to_threaded_fetch_executor
+    config = Crawlscope::Configuration.new
+    config.renderer = :browser
+    assert_equal :threaded, config.fetch_executor
+  end
   def test_configured_values_are_normalized
     config = Crawlscope::Configuration.new
     config.allowed_statuses = ["200", "404"]
     config.concurrency = "2"
+    config.fetch_executor = "async"
     config.network_idle_timeout_seconds = "7"
     config.renderer = "browser"
     config.timeout_seconds = "9"
@@ -74,6 +85,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal [200, 404], config.allowed_statuses
     assert_equal 2, config.concurrency
+    assert_equal :async, config.fetch_executor
     assert_equal 2, config.browser_concurrency
     assert_equal 7, config.network_idle_timeout_seconds
     assert_equal :browser, config.renderer
@@ -90,6 +102,15 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal "Crawlscope renderer must be http or browser", error.message
   end
+  def test_fetch_executor_must_be_supported
+    config = Crawlscope::Configuration.new
+    config.fetch_executor = "processes"
+    error = assert_raises(Crawlscope::ConfigurationError) { config.fetch_executor }
+    assert_equal "Crawlscope fetch_executor must be threaded or async", error.message
+  end
   def test_numeric_values_must_be_positive_integers
     config = Crawlscope::Configuration.new
     config.concurrency = "0"