RubyGems - crawlscope - Versions diffs - 0.4.0 → 0.5.0 - Mend

crawlscope 0.4.0 → 0.5.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/README.md +6 -0
data/lib/crawlscope/cli.rb +4 -1
data/lib/crawlscope/crawl.rb +2 -0
data/lib/crawlscope/rake_tasks.rb +27 -12
data/lib/crawlscope/reporter.rb +20 -5
data/lib/crawlscope/rules/indexability.rb +130 -17
data/lib/crawlscope/rules/links.rb +312 -9
data/lib/crawlscope/rules/metadata.rb +61 -6
data/lib/crawlscope/rules/structured_data.rb +31 -0
data/lib/crawlscope/rules/uniqueness.rb +22 -0
data/lib/crawlscope/sitemap.rb +9 -1
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +24 -24
data/test/crawlscope/cli_test.rb +1 -0
data/test/crawlscope/crawl_test.rb +26 -0
data/test/crawlscope/indexability_rule_test.rb +33 -0
data/test/crawlscope/links_rule_test.rb +148 -3
data/test/crawlscope/metadata_rule_test.rb +36 -0
data/test/crawlscope/rake_tasks_test.rb +70 -0
data/test/crawlscope/reporter_test.rb +7 -3
data/test/crawlscope/sitemap_test.rb +24 -0
data/test/crawlscope/structured_data_rule_test.rb +56 -0
data/test/crawlscope/uniqueness_rule_test.rb +17 -2
metadata +2 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 79e8c8f3993c545bf7647c28b8540d3757c7d9c91eeaf885cde6d55c4935ebb5
-  data.tar.gz: d9b6a987e04546c2d3ee7bb3cc6e1d5510e78963df035cb24d7c8783064afa45
+  metadata.gz: 7d9e56ae9a55e3c4bb6b079585b8a302edcc1bfad9110938c9421c5224bf27f9
+  data.tar.gz: ab1908aa4a1bef4c15f055800ca9862ba973c9257f39020309de1f5554923cca
 SHA512:
-  metadata.gz: eb49361b9f26992682db7622796c4b262a12fca37254aca5e1f1c49c85702b7e4fc347a880af0665f10238f5340cb61bc44433060ba7b3fbde0bdd379c85c763
-  data.tar.gz: 5fa53f930ef529279e063bd11f9becd112c8abb266078027486f22ad37e968bad744c5a35c9432ccb170ceb51e45d858e23a47c649c6ede1d4dd89fb331fd9f3
+  metadata.gz: 8981de1e7bc19737df3048b1e19f28d585f22eed8f2b32ea4eea473ba377d3a261e08df8165114c8d967b7ec7d14a48c47a6b83cfe14261f6c83b56b39134766
+  data.tar.gz: 3df1e21bf74c12e994c0a932f9c581e4a6e12b55dd14975e43c5eca073bcc8eb4a49337d3c3c2e52137c4436fb5ddf52accd6fc11954171d77e75ce8f75e69a5

data/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.5.0] - 2026-05-31
+### Added
+- expand SEO audit checks
 ## [0.4.0] - 2026-05-21

data/README.md CHANGED Viewed

@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
 ```bash
 bundle exec rake crawlscope:validate URL=https://example.com
+bundle exec rake 'crawlscope:validate[https://example.com]'
 bundle exec rake crawlscope:validate:metadata URL=https://example.com
+bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
 bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
+bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
 ```
 `crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
+Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
+task-argument form above instead.
 `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 ### Structured Data URL Audit

data/lib/crawlscope/cli.rb CHANGED Viewed

@@ -37,11 +37,14 @@ module Crawlscope
         @err.puts(general_usage)
         1
       end
-    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
+    rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
       @err.puts(error.message)
       @err.puts("")
       @err.puts(general_usage)
       1
+    rescue ValidationError => error
+      @err.puts(error.message)
+      1
     end
     private

data/lib/crawlscope/crawl.rb CHANGED Viewed

@@ -83,6 +83,7 @@ module Crawlscope
           issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
         elsif redirected?(page)
           issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
+          issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
         end
       end
     end
@@ -127,6 +128,7 @@ module Crawlscope
         crawled: crawled,
         error: page.error,
         final_url: page.normalized_final_url || normalized_url,
+        html: page.html?,
         status: page.status
       }
     end

data/lib/crawlscope/rake_tasks.rb CHANGED Viewed

@@ -4,25 +4,40 @@ module Crawlscope
   module RakeTasks
     module_function
-    def validate
-      run("validate")
+    def validate(url: nil, sitemap_path: nil, rule_names: nil)
+      run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
     end
-    def ldjson
-      run("ldjson")
+    def ldjson(urls: nil)
+      run("ldjson", argv: ldjson_argv(urls: urls))
     end
-    def validate_rule(rule)
-      original_rules = ENV["RULES"]
-      ENV["RULES"] = rule
-      validate
-    ensure
-      ENV["RULES"] = original_rules
+    def validate_rule(rule, url: nil, sitemap_path: nil)
+      validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
     end
-    def run(command)
-      status = Cli.start([command], out: $stdout, err: $stderr)
+    def run(command, argv: [])
+      status = Cli.start([command, *argv], out: $stdout, err: $stderr)
       exit(status) unless status.zero?
     end
+    def validate_argv(url:, sitemap_path:, rule_names:)
+      [
+        option_pair("--url", url),
+        option_pair("--sitemap", sitemap_path),
+        option_pair("--rules", rule_names)
+      ].compact.flatten
+    end
+    def ldjson_argv(urls:)
+      Array(urls).flat_map { |url| option_pair("--url", url) }.compact
+    end
+    def option_pair(name, value)
+      value = value.to_s.strip
+      return if value.empty?
+      [name, value]
+    end
   end
 end

data/lib/crawlscope/reporter.rb CHANGED Viewed

@@ -20,14 +20,29 @@ module Crawlscope
       @io.puts("Status: FAILED")
       @io.puts("Issues: #{result.issues.size}")
+      @io.puts("")
-      result.issues.by_severity.sort_by { |severity, _issues| severity.to_s }.each do |severity, issues|
-        @io.puts("#{severity}: #{issues.size}")
-      end
+      report_grouped_issues("Severity", result.issues.by_severity)
+      @io.puts("")
+      report_grouped_issues("Category", result.issues.by_category)
+    end
+    private
-      result.issues.each do |issue|
-        @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}")
+    def report_grouped_issues(title, grouped_issues)
+      @io.puts("#{title}:")
+      grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
+        @io.puts("#{name}: #{issues.size}")
+        issues.each do |issue|
+          @io.puts("  - #{offense(issue)}")
+        end
       end
     end
+    def offense(issue)
+      parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
+      parts.compact.join(" ")
+    end
   end
 end

data/lib/crawlscope/rules/indexability.rb CHANGED Viewed

@@ -13,45 +13,110 @@ module Crawlscope
       end
       def call(urls:, pages:, issues:, context: nil)
+        sitemap_urls = normalized_sitemap_urls(urls)
         pages.each do |page|
-          validate_meta_robots(page, issues) if page.html?
-          validate_x_robots_tag(page, issues)
+          validate_meta_robots(page, issues, sitemap_urls) if page.html?
+          validate_x_robots_tag(page, issues, sitemap_urls)
         end
       end
       private
+      def normalized_sitemap_urls(urls)
+        urls.map { |url| Url.normalize(url, base_url: url) }.compact
+      end
       def header_value(page, name)
         page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
       end
-      def noindex?(value)
+      def directives(value)
         value
           .split(",")
           .map { |directive| directive.split(":", 2).last.to_s.strip }
-          .any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+          .reject(&:empty?)
+      end
+      def noindex?(value)
+        directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
+      end
+      def follow?(value)
+        directives(value).any? { |directive| directive.casecmp?("follow") }
       end
-      def validate_meta_robots(page, issues)
+      def nofollow?(value)
+        directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") }
+      end
+      def validate_meta_robots(page, issues, sitemap_urls)
         page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
           content = tag["content"].to_s
-          next unless noindex?(content)
-          issues.add(
-            code: :noindex_meta,
-            severity: :error,
-            category: :indexability,
-            url: page.url,
-            message: "robots meta tag prevents indexing",
-            details: {content: content, name: tag["name"].to_s}
-          )
+          report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content)
+          report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content)
+          report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content)
+          report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content)
         end
       end
-      def validate_x_robots_tag(page, issues)
+      def validate_x_robots_tag(page, issues, sitemap_urls)
         content = header_value(page, X_ROBOTS_TAG_HEADER)
-        return unless noindex?(content)
+        return if content.empty?
+        report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content)
+        report_nofollow_header(page, issues, content) if nofollow?(content)
+        report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content)
+        report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content)
+      end
+      def report_noindex_meta(page, issues, content, name, sitemap_urls)
+        issues.add(
+          code: :noindex_meta,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing",
+          details: {content: content, name: name}
+        )
+        report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls)
+      end
+      def report_nofollow_meta(page, issues, content, name)
+        issues.add(
+          code: :nofollow_meta,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents following links",
+          details: {content: content, name: name}
+        )
+      end
+      def report_noindex_follow_meta(page, issues, content, name)
+        issues.add(
+          code: :noindex_follow_meta,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing but allows following links",
+          details: {content: content, name: name}
+        )
+      end
+      def report_noindex_nofollow_meta(page, issues, content, name)
+        issues.add(
+          code: :noindex_nofollow_meta,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "robots meta tag prevents indexing and following links",
+          details: {content: content, name: name}
+        )
+      end
+      def report_noindex_header(page, issues, content, sitemap_urls)
         issues.add(
           code: :noindex_header,
           severity: :error,
@@ -60,6 +125,54 @@ module Crawlscope
           message: "X-Robots-Tag header prevents indexing",
           details: {content: content}
         )
+        report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls)
+      end
+      def report_nofollow_header(page, issues, content)
+        issues.add(
+          code: :nofollow_header,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents following links",
+          details: {content: content}
+        )
+      end
+      def report_noindex_follow_header(page, issues, content)
+        issues.add(
+          code: :noindex_follow_header,
+          severity: :warning,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents indexing but allows following links",
+          details: {content: content}
+        )
+      end
+      def report_noindex_nofollow_header(page, issues, content)
+        issues.add(
+          code: :noindex_nofollow_header,
+          severity: :error,
+          category: :indexability,
+          url: page.url,
+          message: "X-Robots-Tag header prevents indexing and following links",
+          details: {content: content}
+        )
+      end
+      def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:)
+        normalized_url = Url.normalize(page.url, base_url: page.url)
+        return unless sitemap_urls.include?(normalized_url)
+        issues.add(
+          code: :sitemap_noindex_url,
+          severity: :error,
+          category: :sitemaps,
+          url: page.url,
+          message: "sitemap URL is noindex",
+          details: {content: content, source: source}
+        )
       end
     end
   end