RubyGems - crawlscope - Versions diffs - 0.2.0 → 0.3.0 - Mend

crawlscope 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/CHANGELOG.md +10 -7
data/README.md +2 -2
data/lib/crawlscope/cli.rb +5 -0
data/lib/crawlscope/crawl.rb +6 -0
data/lib/crawlscope/rules/links.rb +24 -6
data/lib/crawlscope/rules/metadata.rb +57 -11
data/lib/crawlscope/rules/structured_data.rb +47 -0
data/lib/crawlscope/schemas.rb +52 -1
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +1 -1
data/test/crawlscope/cli_test.rb +19 -5
data/test/crawlscope/crawl_test.rb +13 -3
data/test/crawlscope/links_rule_test.rb +39 -0
data/test/crawlscope/metadata_rule_test.rb +77 -0
data/test/crawlscope/structured_data_rule_test.rb +91 -0
metadata +2 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ba21d55a2d9b787d7bb9d4e90f39e655a5fe2a884769dbef6f866d1e5779e076
-  data.tar.gz: b7c6b829412f8e436cd81d2d28bcd5fe22327f0bb9fcc34af307b4b5feac722c
+  metadata.gz: b49aaaa6fdb5f7d5bd4dc63713d8c0090411e7063363645a900d8f59d803aaaa
+  data.tar.gz: 5dfcc35d60745c25db6faf3acaa4344e29e438c758740613d6216e2f47aeac6e
 SHA512:
-  metadata.gz: d4a6e75c44c7cff4e238ff50168b7807fec8542074bbcbe838c50cf5eba02f181576291f1033620f268484b4c75f588215789515bd6c3ee9d7e76e8e5b94ceaf
-  data.tar.gz: 5576d6a31853ebf3e6662e4bbc8f97d4da918a24352e02c4d9c7569e4300ae102d79c9a348e55ce884273c12dfa1717b8b22c16091fa26eb0d69c19b4b7dca36
+  metadata.gz: 9f66627274ce2ea969b5bb9b53a339215718c37baf47393c75bcf3a528c5c73658c6a71903fdbbf9e53796aaf3680be5f99ab4151b834efbf9450e05abbab83b
+  data.tar.gz: 3cf2e2c7f251a6af7b931f00da63436eaa7e09f078d73de112852a10665cf16eefb561c7d61d6bc8b0c3c014ca0db2df217d31c00b9f0ed321565ed554574261

data/CHANGELOG.md CHANGED Viewed

@@ -5,23 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [0.2.0] - 2026-04-24
+## [0.3.0] - 2026-04-28
-### Changed
+### Added
-- simplify crawl and structured data boundaries
+- add JobPost structured data
-- harden validation boundaries
+### Documentation
+- fix missing changelog entry
-### Fixed
-- handle child sitemaps
-- use URL for sitemap validation
+### Fixed
+- ldjson check now uses the same convention for default URL

data/README.md CHANGED Viewed

@@ -150,7 +150,7 @@ bin/rails crawlscope:validate:metadata
 bin/rails crawlscope:validate:structured_data
 bin/rails crawlscope:validate:uniqueness
 bin/rails crawlscope:validate:links
-bin/rails crawlscope:validate:ldjson URL=https://example.com/article
+bin/rails crawlscope:validate:ldjson
 ```
 The same validation surface is also available in the gem repository itself through plain `rake`:
@@ -163,7 +163,7 @@ bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
 `crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
-`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
+`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
 ### Structured Data URL Audit

data/lib/crawlscope/cli.rb CHANGED Viewed

@@ -105,6 +105,7 @@ module Crawlscope
       parser.parse!(@argv)
       urls = options[:urls].map(&:strip).reject(&:empty?)
+      urls = default_urls if urls.empty?
       raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?
       configure_renderer(options[:renderer])
@@ -238,6 +239,10 @@ module Crawlscope
       raw_urls.split(";").map(&:strip).reject(&:empty?)
     end
+    def default_urls
+      [normalized_string(@configuration.base_url) || "http://localhost:3000"]
+    end
     def task
       @task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
     end

data/lib/crawlscope/crawl.rb CHANGED Viewed

@@ -81,6 +81,8 @@ module Crawlscope
           issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
         elsif !@allowed_statuses.include?(page.status)
           issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
+        elsif redirected?(page)
+          issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
         end
       end
     end
@@ -128,5 +130,9 @@ module Crawlscope
         status: page.status
       }
     end
+    def redirected?(page)
+      page.normalized_url.to_s != page.normalized_final_url.to_s
+    end
   end
 end

data/lib/crawlscope/rules/links.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require "uri"
 module Crawlscope
   module Rules
     class Links
-      CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
+      LINK_SELECTORS = "a[href]"
       INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
       LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
       MAX_SOURCES_IN_ERROR = 3
@@ -33,10 +33,7 @@ module Crawlscope
       private
       def contextual_links(doc)
-        links = doc.css(CONTEXTUAL_LINK_SELECTORS)
-        return links unless links.empty?
-        doc.css("a[href]")
+        doc.css(LINK_SELECTORS)
       end
       def extract_links(pages)
@@ -45,7 +42,7 @@ module Crawlscope
       def page_links(page)
         source_path = Url.path(page.normalized_url)
-        return [] unless crawlable_path?(source_path)
+        return [] unless crawlable_source_path?(source_path)
         contextual_links(page.doc).filter_map do |node|
           link_for(page: page, source_path: source_path, node: node)
@@ -146,6 +143,7 @@ module Crawlscope
             next
           end
+          report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
           next unless crawlable_path?(target.final_path)
           grouped_links.each do |link|
@@ -156,6 +154,18 @@ module Crawlscope
         resolved_links
       end
+      def report_redirect_target(target_url, grouped_links, issues, target)
+        source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+        issues.add(
+          code: :internal_link_redirects,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
+          details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
+        )
+      end
       def resolve_target(target_url)
         resolution = @resolve_target.call(target_url)
         LinkTarget.new(target_url: target_url, resolution: resolution)
@@ -183,11 +193,19 @@ module Crawlscope
           resolution && resolution[:status]
         end
+        def redirect?
+          (status && (300..399).cover?(status.to_i)) || final_url != target_url
+        end
         def unresolved?
           resolution.nil? || (status.nil? && !ignored_error?)
         end
       end
+      def crawlable_source_path?(path)
+        !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
+      end
       def skip_internal_path?(path)
         return true if path == "/"

data/lib/crawlscope/rules/metadata.rb CHANGED Viewed

@@ -1,10 +1,14 @@
 # frozen_string_literal: true
+require "uri"
 module Crawlscope
   module Rules
     class Metadata
       TITLE_MAX_LENGTH = 72
+      DESCRIPTION_MIN_LENGTH = 110
       DESCRIPTION_MAX_LENGTH = 160
+      REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
       attr_reader :code
@@ -21,22 +25,35 @@ module Crawlscope
           validate_title(page, issues)
           validate_description(page, issues)
           validate_canonical(page, issues)
+          validate_open_graph(page, issues)
         end
       end
       private
       def validate_h1(page, issues)
-        return unless page.doc.at_css("h1").nil?
-        issues.add(
-          code: :missing_h1,
-          severity: :warning,
-          category: :metadata,
-          url: page.url,
-          message: "missing <h1>",
-          details: {}
-        )
+        h1s = page.doc.css("h1")
+        return if h1s.one?
+        if h1s.empty?
+          issues.add(
+            code: :missing_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "missing <h1>",
+            details: {}
+          )
+        else
+          issues.add(
+            code: :multiple_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple <h1> tags (#{h1s.size})",
+            details: {count: h1s.size}
+          )
+        end
       end
       def validate_title(page, issues)
@@ -56,6 +73,8 @@ module Crawlscope
         if description.empty?
           issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
+        elsif description.length < DESCRIPTION_MIN_LENGTH
+          issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
         elsif description.length > DESCRIPTION_MAX_LENGTH
           issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
         end
@@ -71,7 +90,7 @@ module Crawlscope
         normalized_canonical = Url.normalize(canonical, base_url: page.url)
         normalized_page_url = Url.normalize(page.url, base_url: page.url)
-        return if normalized_canonical == normalized_page_url
+        return if canonical_matches_page?(normalized_canonical, normalized_page_url)
         issues.add(
           code: :canonical_mismatch,
@@ -88,6 +107,33 @@ module Crawlscope
         title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
       end
+      def validate_open_graph(page, issues)
+        missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
+          page.doc.at_css(%(meta[property="#{property}"][content]))
+        end
+        return if missing.empty?
+        issues.add(
+          code: :incomplete_open_graph_tags,
+          severity: :warning,
+          category: :metadata,
+          url: page.url,
+          message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
+          details: {missing: missing}
+        )
+      end
+      def canonical_matches_page?(canonical, page_url)
+        canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
+      end
+      def local_url?(url)
+        host = URI.parse(url.to_s).host.to_s
+        ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
+      rescue URI::InvalidURIError
+        false
+      end
     end
   end
 end

data/lib/crawlscope/rules/structured_data.rb CHANGED Viewed

@@ -3,6 +3,8 @@
 module Crawlscope
   module Rules
     class StructuredData
+      CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
       attr_reader :code
       def initialize
@@ -65,6 +67,51 @@ module Crawlscope
             details: {errors: errors, source: source}
           )
         end
+        validate_job_posting_count(page, items, issues)
+      end
+      def validate_job_posting_count(page, items, issues)
+        job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") }
+        return if job_postings.size == 1
+        if job_postings.size > 1
+          issues.add(
+            code: :multiple_job_postings,
+            severity: :warning,
+            category: :structured_data,
+            url: page.url,
+            message: "multiple JobPosting structured data blocks found",
+            details: {count: job_postings.size}
+          )
+        elsif career_detail_page?(page.url)
+          issues.add(
+            code: :missing_job_posting,
+            severity: :warning,
+            category: :structured_data,
+            url: page.url,
+            message: "career detail page missing JobPosting structured data",
+            details: {expected_type: "JobPosting"}
+          )
+        end
+      end
+      def structured_data_types(data)
+        return [] unless data.is_a?(Hash)
+        types = Array(data["@type"]).map(&:to_s)
+        if data["@graph"].is_a?(Array)
+          types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) })
+        end
+        types
+      end
+      def career_detail_page?(url)
+        URI(url).path.match?(CAREER_DETAIL_PATH)
+      rescue URI::InvalidURIError
+        false
       end
     end
   end

data/lib/crawlscope/schemas.rb CHANGED Viewed

@@ -330,6 +330,56 @@ module Crawlscope
       }
     }.freeze
+    JOB_POSTING = {
+      type: "object",
+      additionalProperties: true,
+      required: ["@type", "title", "description", "datePosted", "hiringOrganization"],
+      properties: {
+        "@context" => {enum: ["https://schema.org", "https://schema.org/"]},
+        "@type" => {const: "JobPosting"},
+        :title => {type: "string"},
+        :description => {type: "string"},
+        :identifier => {type: "object"},
+        :datePosted => {type: "string"},
+        :validThrough => {type: "string"},
+        :employmentType => {
+          anyOf: [
+            {type: "string"},
+            {type: "array", minItems: 1, items: {type: "string"}}
+          ]
+        },
+        :directApply => {type: "boolean"},
+        :hiringOrganization => {
+          type: "object",
+          required: ["@type", "name"],
+          properties: {
+            "@type" => {const: "Organization"},
+            :name => {type: "string"},
+            :sameAs => {type: "string", format: "uri"},
+            :logo => {type: "string", format: "uri"}
+          }
+        },
+        :applicantLocationRequirements => {
+          anyOf: [
+            {type: "object"},
+            {type: "array", minItems: 1, items: {type: "object"}}
+          ]
+        },
+        :jobLocationType => {type: "string"},
+        :jobLocation => {
+          anyOf: [
+            {type: "object"},
+            {type: "array", minItems: 1, items: {type: "object"}}
+          ]
+        },
+        :baseSalary => {type: "object"}
+      },
+      anyOf: [
+        {required: ["jobLocation"]},
+        {required: ["jobLocationType", "applicantLocationRequirements"]}
+      ]
+    }.freeze
     def self.schemas
       {
         "FAQPage" => FAQ_PAGE,
@@ -348,7 +398,8 @@ module Crawlscope
         "Recipe" => RECIPE,
         "Event" => EVENT,
         "VideoObject" => VIDEO_OBJECT,
-        "WebPage" => WEB_PAGE
+        "WebPage" => WEB_PAGE,
+        "JobPosting" => JOB_POSTING
       }
     end
   end

data/lib/crawlscope/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Crawlscope
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

data/lib/tasks/crawlscope_tasks.rake CHANGED Viewed

@@ -5,7 +5,7 @@ namespace :crawlscope do
   end
   namespace :validate do
-    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
     task ldjson: :environment do
       Crawlscope::RakeTasks.ldjson
     end

data/test/crawlscope/cli_test.rb CHANGED Viewed

@@ -4,9 +4,10 @@ require "test_helper"
 class CrawlscopeCliTest < Minitest::Test
   class FakeConfiguration
-    attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
+    attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
     def initialize
+      @base_url = nil
       @concurrency = 10
       @network_idle_timeout_seconds = 5
       @renderer = :http
@@ -145,6 +146,17 @@ class CrawlscopeCliTest < Minitest::Test
     assert_empty err.string
   end
+  def test_ldjson_defaults_to_configured_base_url
+    configuration = FakeConfiguration.new
+    configuration.base_url = "https://example.com"
+    task = FakeTask.new
+    status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+    assert_equal 0, status
+    assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
+  end
   def test_validate_caps_default_browser_concurrency
     configuration = FakeConfiguration.new
     task = FakeTask.new
@@ -218,14 +230,16 @@ class CrawlscopeCliTest < Minitest::Test
     assert_equal 3, configuration.network_idle_timeout_seconds
   end
-  def test_ldjson_requires_urls
+  def test_ldjson_defaults_to_localhost
     out = StringIO.new
     err = StringIO.new
+    task = FakeTask.new
-    status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
+    status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
-    assert_equal 1, status
-    assert_includes err.string, "Crawlscope URL is not configured"
+    assert_equal 0, status
+    assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
+    assert_empty err.string
   end
   def test_invalid_integer_option_returns_error

data/test/crawlscope/crawl_test.rb CHANGED Viewed

@@ -31,8 +31,13 @@ class CrawlscopeCrawlTest < Minitest::Test
           <html>
             <head>
               <title>Pricing</title>
-              <meta name="description" content="Plans for hotels and restaurants">
+              <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
               <link rel="canonical" href="https://example.com/pricing">
+              <meta property="og:title" content="Pricing">
+              <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
+              <meta property="og:url" content="https://example.com/pricing">
+              <meta property="og:type" content="website">
+              <meta property="og:image" content="https://example.com/icon.png">
               <script type="application/ld+json">
                 {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
               </script>
@@ -95,7 +100,7 @@ class CrawlscopeCrawlTest < Minitest::Test
     ).call
     refute result.ok?
-    assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
+    assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
   end
   def test_uses_browser_when_renderer_is_browser
@@ -128,8 +133,13 @@ class CrawlscopeCrawlTest < Minitest::Test
           <html>
             <head>
               <title>Pricing</title>
-              <meta name="description" content="Plans for hotels and restaurants">
+              <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
               <link rel="canonical" href="https://example.com/pricing">
+              <meta property="og:title" content="Pricing">
+              <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
+              <meta property="og:url" content="https://example.com/pricing">
+              <meta property="og:type" content="website">
+              <meta property="og:image" content="https://example.com/icon.png">
               <script type="application/ld+json">
                 {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
               </script>

data/test/crawlscope/links_rule_test.rb CHANGED Viewed

@@ -118,6 +118,45 @@ class CrawlscopeLinksRuleTest < Minitest::Test
     assert_equal "https://example.com/guide", issues.to_a.first.url
   end
+  def test_counts_root_page_links_as_inbound_links
+    issues = Crawlscope::IssueCollection.new
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/", "https://example.com/about"],
+      pages: [
+        page(url: "https://example.com/", body: "<main><a href=\"/about\">About</a></main>"),
+        page(url: "https://example.com/about", body: "<main><p>About</p></main>")
+      ],
+      issues: issues,
+      context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
+    )
+    refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
+  end
+  def test_reports_internal_links_that_redirect
+    issues = Crawlscope::IssueCollection.new
+    resolver = lambda do |target_url|
+      {
+        crawled: false,
+        error: nil,
+        final_url: "https://example.com/pricing",
+        status: 200
+      }
+    end
+    Crawlscope::Rules::Links.new.call(
+      urls: ["https://example.com/guide"],
+      pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/plans\">Plans</a></main>")],
+      issues: issues,
+      context: context(resolver: resolver)
+    )
+    redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
+    assert redirect_issue
+    assert_includes redirect_issue.message, "https://example.com/pricing"
+  end
   def test_ignores_links_that_should_not_be_crawled
     issues = Crawlscope::IssueCollection.new

data/test/crawlscope/metadata_rule_test.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeMetadataRuleTest < Minitest::Test
+  def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
+    issues = Crawlscope::IssueCollection.new
+    Crawlscope::Rules::Metadata.new.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues
+    )
+    codes = issues.to_a.map(&:code)
+    assert_includes codes, :meta_description_too_short
+    assert_includes codes, :multiple_h1
+    assert_includes codes, :incomplete_open_graph_tags
+  end
+  def test_allows_localhost_page_with_matching_production_canonical_path
+    issues = Crawlscope::IssueCollection.new
+    local_page = page(
+      url: "http://localhost:3000/about",
+      body: <<~HTML
+        <html>
+          <head>
+            <title>About</title>
+            <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
+            <link rel="canonical" href="https://www.example.com/about">
+            <meta property="og:title" content="About">
+            <meta property="og:description" content="About page">
+            <meta property="og:url" content="https://www.example.com/about">
+            <meta property="og:type" content="website">
+            <meta property="og:image" content="https://www.example.com/icon.png">
+          </head>
+          <body><main><h1>About</h1></main></body>
+        </html>
+      HTML
+    )
+    Crawlscope::Rules::Metadata.new.call(
+      urls: [local_page.url],
+      pages: [local_page],
+      issues: issues
+    )
+    refute_includes issues.to_a.map(&:code), :canonical_mismatch
+  end
+  private
+  def page(url: "https://example.com/about", body: nil)
+    body ||= <<~HTML
+      <html>
+        <head>
+          <title>About</title>
+          <meta name="description" content="Too short">
+          <link rel="canonical" href="https://example.com/about">
+          <meta property="og:title" content="About">
+        </head>
+        <body><main><h1>About</h1><h1>Team</h1></main></body>
+      </html>
+    HTML
+    Crawlscope::Page.new(
+      url: url,
+      normalized_url: Crawlscope::Url.normalize(url, base_url: url),
+      final_url: url,
+      normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
+      status: 200,
+      headers: {"content-type" => "text/html"},
+      body: body,
+      doc: Nokogiri::HTML(body)
+    )
+  end
+end

data/test/crawlscope/structured_data_rule_test.rb CHANGED Viewed

@@ -79,6 +79,97 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
     assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
   end
+  def test_validates_job_posting_markup
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::StructuredData.new
+    page = page(
+      url: "https://example.com/careers/sales-partner",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {
+                "@context":"https://schema.org/",
+                "@type":"JobPosting",
+                "title":"Sales Partner",
+                "description":"A real role description.",
+                "datePosted":"2026-04-28",
+                "hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
+                "jobLocationType":"TELECOMMUTE",
+                "applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
+              }
+            </script>
+          </head>
+          <body><h1>Sales Partner</h1></body>
+        </html>
+      HTML
+    )
+    rule.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues,
+      context: {schema_registry: Crawlscope::SchemaRegistry.default}
+    )
+    assert_empty issues.to_a
+  end
+  def test_reports_schema_errors_for_invalid_job_posting_markup
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::StructuredData.new
+    page = page(
+      url: "https://example.com/careers/sales-partner",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
+            </script>
+          </head>
+          <body><h1>Sales Partner</h1></body>
+        </html>
+      HTML
+    )
+    rule.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues,
+      context: {schema_registry: Crawlscope::SchemaRegistry.default}
+    )
+    assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
+    assert_includes issues.to_a.first.message, "description"
+  end
+  def test_reports_missing_job_posting_for_career_detail_pages
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::StructuredData.new
+    page = page(
+      url: "https://example.com/careers/sales-partner",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
+            </script>
+          </head>
+          <body><h1>Sales Partner</h1></body>
+        </html>
+      HTML
+    )
+    rule.call(
+      urls: [page.url],
+      pages: [page],
+      issues: issues,
+      context: {schema_registry: Crawlscope::SchemaRegistry.default}
+    )
+    assert_equal [:missing_job_posting], issues.to_a.map(&:code)
+  end
   private
   def page(url:, body:)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlscope
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Paulo Fidalgo
@@ -233,6 +233,7 @@ files:
 - test/crawlscope/http_test.rb
 - test/crawlscope/links_rule_test.rb
 - test/crawlscope/loader_test.rb
+- test/crawlscope/metadata_rule_test.rb
 - test/crawlscope/reporter_test.rb
 - test/crawlscope/rule_registry_test.rb
 - test/crawlscope/run_test.rb