RubyGems - crawlscope - Versions diffs - 0.5.0 → 0.6.0 - Mend

crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

checksums.yaml +4 -4
data/CHANGELOG.md +27 -0
data/README.md +32 -0
data/lib/crawlscope/cli.rb +16 -0
data/lib/crawlscope/configuration.rb +10 -1
data/lib/crawlscope/context.rb +1 -1
data/lib/crawlscope/crawl.rb +72 -14
data/lib/crawlscope/crawler.rb +3 -17
data/lib/crawlscope/document_text.rb +7 -2
data/lib/crawlscope/fetch_executor/async.rb +32 -0
data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
data/lib/crawlscope/fetch_executor.rb +43 -0
data/lib/crawlscope/http.rb +7 -1
data/lib/crawlscope/reporter.rb +123 -14
data/lib/crawlscope/result.rb +1 -1
data/lib/crawlscope/rules/content_quality.rb +1 -1
data/lib/crawlscope/rules/indexability.rb +28 -6
data/lib/crawlscope/rules/links.rb +80 -16
data/lib/crawlscope/rules/uniqueness.rb +23 -4
data/lib/crawlscope/sitemap.rb +30 -11
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +1 -1
data/test/crawlscope/cli_test.rb +28 -2
data/test/crawlscope/configuration_test.rb +21 -0
data/test/crawlscope/content_quality_rule_test.rb +18 -0
data/test/crawlscope/crawl_test.rb +142 -4
data/test/crawlscope/crawler_test.rb +61 -0
data/test/crawlscope/fetch_executor_test.rb +44 -0
data/test/crawlscope/links_rule_test.rb +101 -0
data/test/crawlscope/reporter_test.rb +136 -11
data/test/crawlscope/result_test.rb +35 -0
data/test/crawlscope/sitemap_test.rb +52 -0
data/test/performance/async_fetch_benchmark.rb +127 -0
data/test/performance/fetch_executor_matrix.rb +162 -0
data/test/performance/sitemap_expansion_benchmark.rb +121 -0
metadata +38 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7d9e56ae9a55e3c4bb6b079585b8a302edcc1bfad9110938c9421c5224bf27f9
-  data.tar.gz: ab1908aa4a1bef4c15f055800ca9862ba973c9257f39020309de1f5554923cca
+  metadata.gz: cb1ed58c5dc558d7d7efcb357870fc4764a1d6d5caeb5ddc30e466334c986421
+  data.tar.gz: 9e90845271e781a0586c30c5c3f2c770b4a0c837474d78e8a19afa89c5b2fb6d
 SHA512:
-  metadata.gz: 8981de1e7bc19737df3048b1e19f28d585f22eed8f2b32ea4eea473ba377d3a261e08df8165114c8d967b7ec7d14a48c47a6b83cfe14261f6c83b56b39134766
-  data.tar.gz: 3df1e21bf74c12e994c0a932f9c581e4a6e12b55dd14975e43c5eca073bcc8eb4a49337d3c3c2e52137c4436fb5ddf52accd6fc11954171d77e75ce8f75e69a5
+  metadata.gz: cd645a628045089499e213491a08157e5268b2238007c86e20eeff996dcb0246037915dce0019554995ade10c57eae31ffc79d1b256a123160f728a7f6e74722
+  data.tar.gz: d12723425911cc2c6184f6f2f31f8d0dbe6fde8bc021c3c45c0b5490e500906be6f49da74c3457e04778bb6ed0c50438489fd1875191090de702cf6d0ed494f0

data/CHANGELOG.md CHANGED

@@ -5,6 +5,33 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.6.0] - 2026-06-01
+### Added
+- add bounded async crawl execution
+### Changed
+- default HTTP crawling to async
+- update Ruby CI matrix
+### Fixed
+- respect noindex targets in sitemap link audit
+- improve validation report readability
 ## [0.5.0] - 2026-05-31

data/README.md CHANGED

@@ -32,6 +32,8 @@ The default rule set includes:
 ## Installation
+Crawlscope requires Ruby 3.3 or newer.
 Add this line to your application's Gemfile:
 ```ruby
@@ -87,6 +89,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
 Child sitemap indexes are supported automatically.
+Validation output is grouped for terminal scanning:
+```text
+Crawlscope validation
+Base URL: https://example.com
+Sitemap: https://example.com/sitemap.xml
+URLs: 24
+Pages: 24
+Status: FAILED
+Issues: 3 3 warnings
+Summary:
+  links            2
+  metadata         1
+links / low_dofollow_inlinks: 2
+  - /pricing  inbound 1/2  sources: /
+  - /features  inbound 1/2  sources: /
+metadata / missing_title: 1
+  - /draft  missing <title>
+```
 ## Ruby Usage
 ```ruby
@@ -143,6 +168,7 @@ Available environment overrides:
 - `TIMEOUT=30`
 - `NETWORK_IDLE_TIMEOUT=10`
 - `CONCURRENCY=5`
+- `FETCH_EXECUTOR=threaded` or `FETCH_EXECUTOR=async`
 Available tasks:
@@ -173,6 +199,12 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
 Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
 task-argument form above instead.
+`FETCH_EXECUTOR=async` is the default for HTTP crawling. It uses Ruby's fiber
+scheduler and Async::HTTP through Faraday, preserving the same `CONCURRENCY`
+bound. Use `FETCH_EXECUTOR=threaded` or `--fetch-executor threaded` for the
+thread-pool executor. Browser rendering uses the threaded executor by default
+because async fetch execution is only supported with HTTP rendering.
 `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 ### Structured Data URL Audit

data/lib/crawlscope/cli.rb CHANGED

@@ -134,6 +134,8 @@ module Crawlscope
       configure_renderer(resolved_renderer)
       @configuration.concurrency = resolved_concurrency
+      fetch_executor_configured = !normalized_string(ENV["FETCH_EXECUTOR"]).nil?
+      @configuration.fetch_executor = resolved_fetch_executor
       @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
       @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
@@ -167,9 +169,15 @@ module Crawlscope
         opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
           @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
         end
+        opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value|
+          fetch_executor_configured = true
+          @configuration.fetch_executor = value
+        end
       end
       parser.parse!(@argv)
+      @configuration.fetch_executor = :threaded if @configuration.renderer == :browser && !fetch_executor_configured
       result = task.validate(
         base_url: options[:url],
@@ -221,6 +229,14 @@ module Crawlscope
       end
     end
+    def resolved_fetch_executor
+      configured_executor = normalized_string(ENV["FETCH_EXECUTOR"])
+      return configured_executor if configured_executor
+      return :threaded if @configuration.renderer == :browser
+      @configuration.fetch_executor
+    end
     def resolved_integer(name, default:, minimum:)
       raw_value = normalized_string(ENV[name])
       return default if raw_value.nil?

data/lib/crawlscope/configuration.rb CHANGED

@@ -7,10 +7,11 @@ module Crawlscope
     DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
     DEFAULT_BROWSER_SCROLL_PAGE = true
     DEFAULT_CONCURRENCY = 10
+    DEFAULT_FETCH_EXECUTOR = :async
     RENDERERS = %i[http browser].freeze
     DEFAULT_TIMEOUT_SECONDS = 20
-    attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
+    attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
     def allowed_statuses
       value = resolve(@allowed_statuses)
@@ -30,6 +31,13 @@ module Crawlscope
       positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
     end
+    def fetch_executor
+      value = resolve(@fetch_executor)
+      default = (renderer == :browser) ? :threaded : DEFAULT_FETCH_EXECUTOR
+      FetchExecutor.normalize(value.nil? ? default : value)
+    end
     def browser_concurrency
       value = concurrency
       default_value = DEFAULT_BROWSER_CONCURRENCY
@@ -83,6 +91,7 @@ module Crawlscope
         sitemap_path: sitemap_path,
         browser_factory: browser_factory,
         concurrency: concurrency,
+        fetch_executor: fetch_executor,
         network_idle_timeout_seconds: network_idle_timeout_seconds,
         renderer: renderer,
         timeout_seconds: timeout_seconds,

data/lib/crawlscope/context.rb CHANGED

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Crawlscope
-  Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
+  Context = Data.define(:allowed_statuses, :base_url, :concurrency, :fetch_executor, :resolve_target, :resolve_targets, :schema_registry) do
     def fetch(name)
       public_send(name)
     end

data/lib/crawlscope/crawl.rb CHANGED

@@ -2,7 +2,7 @@
 module Crawlscope
   class Crawl
-    def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
+    def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: nil, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
       @base_url = base_url
       @sitemap_path = sitemap_path
       @rules = Array(rules)
@@ -11,16 +11,19 @@ module Crawlscope
       @concurrency = concurrency
       @network_idle_timeout_seconds = network_idle_timeout_seconds
       @renderer = renderer.to_sym
+      @fetch_executor = fetch_executor || default_fetch_executor
       @scroll_page = scroll_page
       @timeout_seconds = timeout_seconds
       @allowed_statuses = allowed_statuses
     end
     def call
+      validate_fetch_executor!
       urls = sitemap_urls
       @page_fetcher = page
-      pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
+      pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(urls)
       issues = IssueCollection.new
       collect(pages, issues)
@@ -41,7 +44,13 @@ module Crawlscope
     private
     def sitemap_urls
-      urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
+      urls = Sitemap.new(
+        path: @sitemap_path,
+        adapter: http_adapter,
+        concurrency: @concurrency,
+        fetch_executor: @fetch_executor,
+        timeout_seconds: @timeout_seconds
+      ).urls(base_url: @base_url)
       raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
       urls
@@ -62,15 +71,35 @@ module Crawlscope
       if @renderer == :browser
         (@browser_factory || method(:browser)).call
       else
-        Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
+        Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds, adapter: http_adapter)
       end
     end
+    def http_adapter
+      return unless FetchExecutor.normalize(@fetch_executor) == :async
+      require "async/http/faraday"
+      :async_http
+    end
+    def validate_fetch_executor!
+      return unless @renderer == :browser && FetchExecutor.normalize(@fetch_executor) == :async
+      raise ConfigurationError, "Async fetch execution is only supported with http rendering"
+    end
+    def default_fetch_executor
+      (@renderer == :browser) ? :threaded : Configuration::DEFAULT_FETCH_EXECUTOR
+    end
     def context
       Context.new(
         allowed_statuses: @allowed_statuses,
         base_url: @base_url,
+        concurrency: @concurrency,
+        fetch_executor: @fetch_executor,
         resolve_target: method(:resolve),
+        resolve_targets: method(:resolve_all),
         schema_registry: @schema_registry
       )
     end
@@ -93,11 +122,15 @@ module Crawlscope
       @targets = {}
       pages.each do |page|
-        @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
-        @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
+        cache_page(page)
       end
     end
+    def cache_page(page)
+      @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
+      @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
+    end
     def scan(urls, pages, issues)
       @rules.each do |rule|
         rule.call(urls: urls, pages: pages, issues: issues, context: context)
@@ -105,17 +138,40 @@ module Crawlscope
     end
     def resolve(target_url)
-      normalized_url = Url.normalize(target_url, base_url: @base_url)
-      return @targets[normalized_url] if @targets.key?(normalized_url)
+      resolve_all([target_url]).fetch(target_url)
+    end
+    def resolve_all(target_urls)
+      normalized_by_url = Array(target_urls).to_h do |target_url|
+        [target_url, Url.normalize(target_url, base_url: @base_url)]
+      end
+      normalized_urls = normalized_by_url.values.compact.uniq
+      missing_urls = []
+      normalized_urls.each do |normalized_url|
+        next if @targets.key?(normalized_url)
+        resolved = resolved_page(normalized_url)
+        if resolved
+          @targets[normalized_url] = resolved
+        else
+          missing_urls << normalized_url
+        end
+      end
+      fetched_pages(missing_urls).each do |page|
+        normalized_url = Url.normalize(page.url, base_url: @base_url)
+        cache_page(page)
+        @targets[normalized_url] = resolution(page, normalized_url, crawled: false)
+      end
-      @targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
+      normalized_by_url.to_h { |target_url, normalized_url| [target_url, @targets[normalized_url]] }
     end
-    def fetched_page(normalized_url)
-      page = @page_fetcher.fetch(normalized_url)
-      @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
-      @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
-      resolution(page, normalized_url, crawled: false)
+    def fetched_pages(normalized_urls)
+      return [] if normalized_urls.empty?
+      Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(normalized_urls)
     end
     def resolved_page(normalized_url)
@@ -126,8 +182,10 @@ module Crawlscope
     def resolution(page, normalized_url, crawled:)
       {
         crawled: crawled,
+        doc: page.doc,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        headers: page.headers,
         html: page.html?,
         status: page.status
       }

data/lib/crawlscope/crawler.rb CHANGED

@@ -1,28 +1,14 @@
 # frozen_string_literal: true
-require "concurrent"
 module Crawlscope
   class Crawler
-    def initialize(page_fetcher:, concurrency:)
+    def initialize(page_fetcher:, concurrency:, fetch_executor: :threaded)
       @page_fetcher = page_fetcher
-      @concurrency = concurrency
+      @fetch_executor = FetchExecutor.build(name: fetch_executor, concurrency: concurrency)
     end
     def call(urls)
-      pages = Concurrent::Array.new
-      pool = Concurrent::FixedThreadPool.new(@concurrency)
-      urls.each do |url|
-        pool.post do
-          pages << fetch(url)
-        end
-      end
-      pool.shutdown
-      pool.wait_for_termination
-      pages.to_a
+      @fetch_executor.call(urls) { |url| fetch(url) }
     end
     private

data/lib/crawlscope/document_text.rb CHANGED

@@ -3,6 +3,7 @@
 module Crawlscope
   module DocumentText
     REMOVED_SELECTORS = "script, style, noscript, template, svg"
+    CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
     TOKEN_PATTERN = /[[:alnum:]]+/
     module_function
@@ -15,6 +16,10 @@ module Crawlscope
       root_for(doc, selector: selector)&.to_html.to_s
     end
+    def content_ratio_html_for(doc, selector: "main")
+      root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
+    end
     def text_for(doc, selector: "main")
       normalize(root_for(doc, selector: selector)&.text)
     end
@@ -27,11 +32,11 @@ module Crawlscope
       text.to_s.gsub(/\s+/, " ").strip
     end
-    def root_for(doc, selector:)
+    def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
       return unless doc
       copy = doc.dup
-      copy.css(REMOVED_SELECTORS).remove
+      copy.css(removed_selectors).remove
       root = selector.to_s.empty? ? nil : copy.at_css(selector)
       root || copy.at_css("body") || copy

data/lib/crawlscope/fetch_executor/async.rb ADDED

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+require "async"
+require "async/semaphore"
+module Crawlscope
+  module FetchExecutor
+    class Async
+      def initialize(concurrency:)
+        @concurrency = concurrency
+      end
+      def call(items)
+        indexed_items = Array(items).each_with_index.to_a
+        results = Array.new(indexed_items.size)
+        Sync do |parent|
+          semaphore = ::Async::Semaphore.new(@concurrency)
+          tasks = indexed_items.map do |item, index|
+            semaphore.async(parent: parent) do
+              results[index] = yield(item)
+            end
+          end
+          tasks.each(&:wait)
+        end
+        results
+      end
+    end
+  end
+end

data/lib/crawlscope/fetch_executor/threaded.rb ADDED

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+require "concurrent"
+module Crawlscope
+  module FetchExecutor
+    class Threaded
+      def initialize(concurrency:)
+        @concurrency = concurrency
+      end
+      def call(items)
+        indexed_items = Array(items).each_with_index.to_a
+        results = Array.new(indexed_items.size)
+        mutex = Mutex.new
+        pool = Concurrent::FixedThreadPool.new(@concurrency)
+        indexed_items.each do |item, index|
+          pool.post do
+            result = yield(item)
+            mutex.synchronize { results[index] = result }
+          end
+        end
+        pool.shutdown
+        pool.wait_for_termination
+        results
+      end
+    end
+  end
+end

data/lib/crawlscope/fetch_executor.rb ADDED

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+module Crawlscope
+  module FetchExecutor
+    NAMES = %i[threaded async].freeze
+    module_function
+    def build(name:, concurrency:)
+      return name if name.respond_to?(:call)
+      case normalized_name(name)
+      when :threaded
+        Threaded.new(concurrency: concurrency)
+      when :async
+        Async.new(concurrency: concurrency)
+      end
+    end
+    def map(name:, concurrency:, items:, &block)
+      items = Array(items)
+      return items.map(&block) if items.size < 2 || concurrency.to_i <= 1
+      build(name: name, concurrency: concurrency).call(items, &block)
+    end
+    def normalize(name)
+      return name if name.respond_to?(:call)
+      normalized_name(name)
+    end
+    def normalized_name(name)
+      normalized = name.to_s.strip
+      normalized = "threaded" if normalized.empty?
+      value = normalized.to_sym
+      return value if NAMES.include?(value)
+      raise ConfigurationError, "Crawlscope fetch_executor must be threaded or async"
+    end
+  end
+end

data/lib/crawlscope/http.rb CHANGED

@@ -10,13 +10,18 @@ module Crawlscope
     MAX_REDIRECTS = 5
     USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)"
-    def initialize(base_url:, timeout_seconds:)
+    def initialize(base_url:, timeout_seconds:, adapter: nil)
       @base_url = base_url
       @timeout_seconds = timeout_seconds
+      @adapter = adapter
       @connections_by_thread = Concurrent::Map.new
     end
     def close
+      @connections_by_thread.each_value do |connection|
+        connection.close if connection.respond_to?(:close)
+      end
       @connections_by_thread.clear
     end
@@ -65,6 +70,7 @@ module Crawlscope
           faraday.response :follow_redirects, limit: MAX_REDIRECTS
           faraday.options.timeout = @timeout_seconds
           faraday.options.open_timeout = @timeout_seconds
+          faraday.adapter @adapter if @adapter
         end
       end
     end