RubyGems - crawlscope - Versions diffs - 0.1.0 - Mend

crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +31 -0
data/LICENSE.txt +21 -0
data/README.md +323 -0
data/exe/crawlscope +6 -0
data/lib/crawlscope/audit.rb +128 -0
data/lib/crawlscope/browser.rb +88 -0
data/lib/crawlscope/cli.rb +245 -0
data/lib/crawlscope/configuration.rb +123 -0
data/lib/crawlscope/crawler.rb +28 -0
data/lib/crawlscope/http.rb +77 -0
data/lib/crawlscope/issue.rb +17 -0
data/lib/crawlscope/issue_collection.rb +41 -0
data/lib/crawlscope/page.rb +23 -0
data/lib/crawlscope/railtie.rb +9 -0
data/lib/crawlscope/reporter.rb +33 -0
data/lib/crawlscope/result.rb +9 -0
data/lib/crawlscope/rule_registry.rb +39 -0
data/lib/crawlscope/rules/links.rb +220 -0
data/lib/crawlscope/rules/metadata.rb +93 -0
data/lib/crawlscope/rules/structured_data.rb +58 -0
data/lib/crawlscope/rules/uniqueness.rb +88 -0
data/lib/crawlscope/schema_registry.rb +431 -0
data/lib/crawlscope/sitemap.rb +67 -0
data/lib/crawlscope/structured_data/audit.rb +150 -0
data/lib/crawlscope/structured_data/document.rb +93 -0
data/lib/crawlscope/structured_data/report.rb +77 -0
data/lib/crawlscope/structured_data/reporter.rb +73 -0
data/lib/crawlscope/structured_data/writer.rb +26 -0
data/lib/crawlscope/task.rb +131 -0
data/lib/crawlscope/url.rb +43 -0
data/lib/crawlscope/version.rb +5 -0
data/lib/crawlscope.rb +34 -0
data/lib/tasks/crawlscope_tasks.rake +44 -0
data/test/crawlscope/audit_test.rb +165 -0
data/test/crawlscope/cli_test.rb +157 -0
data/test/crawlscope/configuration_test.rb +45 -0
data/test/crawlscope/links_rule_test.rb +87 -0
data/test/crawlscope/loader_test.rb +11 -0
data/test/crawlscope/reporter_test.rb +50 -0
data/test/crawlscope/schema_registry_test.rb +89 -0
data/test/crawlscope/sitemap_test.rb +51 -0
data/test/crawlscope/structured_data_audit_test.rb +118 -0
data/test/crawlscope/structured_data_document_test.rb +28 -0
data/test/crawlscope/structured_data_report_test.rb +37 -0
data/test/crawlscope/structured_data_reporter_test.rb +32 -0
data/test/crawlscope/structured_data_rule_test.rb +78 -0
data/test/crawlscope/structured_data_writer_test.rb +32 -0
data/test/crawlscope/task_test.rb +206 -0
data/test/crawlscope/uniqueness_rule_test.rb +46 -0
data/test/test_helper.rb +23 -0
metadata +271 -0

data/lib/crawlscope/structured_data/document.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+require "json"
+require "nokogiri"
+module Crawlscope
+  module StructuredData
+    class Document
+      Item = Data.define(:source, :data)
+      def initialize(html:)
+        @doc = Nokogiri::HTML(html.to_s)
+      end
+      def items
+        @items ||= extract_json_ld_items + extract_microdata_items
+      end
+      def json_ld_items
+        items.filter_map do |item|
+          next unless item.source == "json-ld"
+          next unless item.data.is_a?(Hash)
+          next if item.data.key?(:error)
+          item.data
+        end
+      end
+      private
+      def extract_json_ld_items
+        @doc.css('script[type="application/ld+json"]').flat_map do |node|
+          parse_json_ld(node.content)
+        end
+      end
+      def parse_json_ld(content)
+        payload = JSON.parse(content)
+        entries = payload.is_a?(Array) ? payload : [payload]
+        entries.filter_map do |entry|
+          next unless entry.is_a?(Hash)
+          Item.new(source: "json-ld", data: entry)
+        end
+      rescue JSON::ParserError => error
+        [Item.new(source: "json-ld", data: {error: "Invalid JSON-LD", message: error.message})]
+      end
+      def extract_microdata_items
+        @doc.css("[itemtype]").filter_map do |node|
+          type = node["itemtype"].to_s
+          next unless type.start_with?("http://schema.org", "https://schema.org")
+          item = extract_microdata_item(node)
+          item["@type"] = type.sub(%r{.*/}, "")
+          Item.new(source: "microdata", data: item)
+        end
+      end
+      def extract_microdata_item(node)
+        item = {}
+        node.css("[itemprop]").each do |prop_node|
+          prop = prop_node["itemprop"]
+          value = extract_microdata_value(prop_node)
+          item[prop] = value
+        end
+        node.css("[itemtype]").select { |entry| entry["itemprop"].nil? }.each do |nested|
+          type = nested["itemtype"].to_s.sub(%r{.*/}, "")
+          nested_item = extract_microdata_item(nested)
+          nested_item["@type"] = type
+          item[type] ||= []
+          item[type] << nested_item
+        end
+        item
+      end
+      def extract_microdata_value(node)
+        return if node["itemprop"].nil?
+        return node["content"] if node["content"]
+        return node["datetime"] if node["datetime"]
+        return node["href"] || node["src"] if node["href"] || node["src"]
+        return node["value"] if node["value"]
+        return node["content"] if node.name == "meta"
+        node.text.strip.empty? ? nil : node.text.strip
+      end
+    end
+  end
+end

data/lib/crawlscope/structured_data/report.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+module Crawlscope
+  module StructuredData
+    class Report
+      def initialize(result)
+        @result = result
+      end
+      def all_valid?
+        http_errors.empty? && missing_data.empty? && validation_errors.empty?
+      end
+      def failure_count
+        http_errors.size + missing_data.size + validation_errors.size
+      end
+      def http_errors
+        entries.select { |entry| entry.fetch_error && entry.status != 200 }
+      end
+      def missing_data
+        entries.select { |entry| entry.status == 200 && !entry.structured_data_found? }
+      end
+      def results
+        entries.each_with_object({}) do |entry, collection|
+          collection[entry.url] = result_for(entry)
+        end
+      end
+      def total
+        entries.size
+      end
+      def validation_errors
+        entries.select { |entry| entry.status == 200 && entry.errors.any? }
+      end
+      private
+      def entries
+        @result.entries
+      end
+      def result_for(entry)
+        if entry.fetch_error && entry.status == 200
+          {
+            status: entry.status,
+            error: entry.fetch_error,
+            structured_data_found: false,
+            validation_errors: [],
+            json_ld_count: 0
+          }
+        elsif entry.fetch_error
+          {
+            status: entry.status || "exception",
+            error: entry.fetch_error,
+            structured_data_found: false,
+            validation_errors: [],
+            json_ld_count: 0
+          }
+        else
+          {
+            status: entry.status || 200,
+            error: nil,
+            structured_data_found: entry.structured_data_found?,
+            validation_errors: entry.errors.flat_map { |error| error[:errors] },
+            json_ld_count: entry.json_ld_count,
+            skipped_reason: entry.skipped_reason,
+            content_type: entry.content_type
+          }.compact
+        end
+      end
+    end
+  end
+end

data/lib/crawlscope/structured_data/reporter.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+require "uri"
+module Crawlscope
+  module StructuredData
+    class Reporter
+      def initialize(io:, report_path: nil)
+        @io = io
+        @report_path = report_path
+      end
+      def report(result)
+        report = Report.new(result)
+        if report.all_valid?
+          @io.puts("")
+          @io.puts("All #{report.total} URLs passed validation.")
+        else
+          report_failures(report)
+        end
+      end
+      private
+      def extract_path(url)
+        URI.parse(url).path
+      rescue URI::InvalidURIError
+        url
+      end
+      def print_category(name, items)
+        return if items.empty?
+        @io.puts("#{name} (#{items.size}):")
+        items.each { |item| yield item }
+        @io.puts("")
+      end
+      def report_failures(report)
+        @io.puts("")
+        @io.puts("VALIDATION FAILED (#{report.failure_count}/#{report.total} URLs)")
+        @io.puts("")
+        print_category("HTTP ERRORS", report.http_errors) do |entry|
+          @io.puts("• #{extract_path(entry.url)} (#{entry.status}: #{entry.fetch_error})")
+        end
+        print_category("MISSING STRUCTURED DATA", report.missing_data) do |entry|
+          @io.puts("• #{extract_path(entry.url)}")
+        end
+        print_category("VALIDATION ERRORS", report.validation_errors) do |entry|
+          @io.puts("• #{extract_path(entry.url)}")
+          entry.errors.each do |error|
+            error[:errors].each do |validation_error|
+              field = validation_error[:field] || validation_error["field"] || "$"
+              issue = validation_error[:issue] || validation_error["issue"] || "Unknown error"
+              @io.puts("    - #{field}: #{issue}")
+            end
+          end
+        end
+        if @report_path
+          @io.puts("Full details available in: #{@report_path}")
+        end
+        @io.puts("#{report.failure_count} of #{report.total} URLs failed validation.")
+      end
+    end
+  end
+end

data/lib/crawlscope/structured_data/writer.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+require "json"
+require "fileutils"
+require "time"
+module Crawlscope
+  module StructuredData
+    class Writer
+      def initialize(path:)
+        @path = path
+      end
+      def write(result)
+        FileUtils.mkdir_p(File.dirname(@path))
+        File.write(
+          @path,
+          JSON.pretty_generate(
+            generated_at: Time.now.iso8601,
+            results: Report.new(result).results
+          )
+        )
+      end
+    end
+  end
+end

data/lib/crawlscope/task.rb ADDED Viewed

@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+require "json"
+module Crawlscope
+  class Task
+    def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
+      @configuration = configuration
+      @reporter = reporter
+    end
+    def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
+      resolved_base_url = base_url || default_base_url
+      audit = @configuration.audit(
+        base_url: resolved_base_url,
+        sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
+        rule_names: rule_names
+      )
+      result = audit.call
+      @reporter.report(result)
+      result
+    end
+    def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
+      audit = StructuredData::Audit.new(
+        browser_factory: @configuration.browser_factory,
+        network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
+        renderer: renderer,
+        schema_registry: @configuration.schema_registry,
+        scroll_page: @configuration.scroll_page?,
+        timeout_seconds: timeout_seconds
+      )
+      result = audit.call(urls: urls)
+      report_ldjson_result(result, debug: debug, renderer: renderer)
+      StructuredData::Writer.new(path: report_path).write(result) if report_path
+      StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result) if summary
+      result
+    end
+    private
+    def default_base_url
+      value = @configuration.base_url
+      return value unless value.to_s.strip.empty?
+      "http://localhost:3000"
+    end
+    def default_sitemap_path(base_url:)
+      value = @configuration.sitemap_path
+      return value unless value.to_s.strip.empty?
+      local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
+      if local_path_default?(base_url: base_url) && File.exist?(local_path)
+        return local_path
+      end
+      "#{base_url.to_s.chomp("/")}/sitemap.xml"
+    end
+    def local_path_default?(base_url:)
+      host = URI.parse(base_url.to_s).host.to_s
+      ["localhost", "127.0.0.1"].include?(host)
+    rescue URI::InvalidURIError
+      false
+    end
+    def report_ldjson_result(result, debug:, renderer:)
+      if renderer == :browser
+        @configuration.output.puts("JavaScript mode enabled (Ferrum)")
+      end
+      @configuration.output.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
+      @configuration.output.puts("")
+      result.entries.each do |entry|
+        @configuration.output.puts("=" * 80)
+        @configuration.output.puts("URL: #{entry.url}")
+        @configuration.output.puts("=" * 80)
+        if entry.fetch_error
+          @configuration.output.puts("Error: #{entry.fetch_error}")
+          @configuration.output.puts("")
+          next
+        end
+        if entry.status
+          @configuration.output.puts("Status: #{entry.status}")
+        else
+          @configuration.output.puts("Status: JS runtime fetch")
+        end
+        @configuration.output.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
+        if debug && entry.structured_items.any?
+          @configuration.output.puts("")
+          @configuration.output.puts("--- Detected Structured Data ---")
+          entry.structured_items.each_with_index do |item, index|
+            @configuration.output.puts("")
+            @configuration.output.puts("## Item #{index + 1} [#{item[:source]}]")
+            @configuration.output.puts(JSON.pretty_generate(item[:data]))
+          end
+          @configuration.output.puts("")
+          @configuration.output.puts("--- End ---")
+        end
+        @configuration.output.puts("")
+        @configuration.output.puts("Validation results:")
+        if entry.errors.empty?
+          @configuration.output.puts("  All valid!")
+        else
+          entry.errors.each do |error|
+            @configuration.output.puts("  #{error[:type]}: INVALID [#{error[:source]}]")
+            error[:errors].each do |validation_error|
+              @configuration.output.puts("    - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
+            end
+          end
+        end
+        @configuration.output.puts("")
+      end
+      @configuration.output.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
+    end
+  end
+end

data/lib/crawlscope/url.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+require "uri"
+module Crawlscope
+  module Url
+    module_function
+    def normalize(url, base_url:)
+      uri = URI.parse(url.to_s)
+      uri = URI.join(base_url.to_s, url.to_s) if uri.host.nil?
+      normalized_path = uri.path.to_s
+      normalized_path = "/" if normalized_path.empty?
+      normalized_path = normalized_path.chomp("/")
+      normalized_path = "/" if normalized_path.empty?
+      host = uri.host.to_s
+      host = "#{host}:#{uri.port}" if uri.port && uri.port != uri.default_port
+      "#{uri.scheme}://#{host}#{normalized_path}"
+    rescue URI::InvalidURIError
+      url.to_s
+    end
+    def path(url)
+      uri = URI.parse(url.to_s)
+      value = uri.path.to_s
+      value = "/" if value.empty?
+      value = value.chomp("/")
+      value.empty? ? "/" : value
+    rescue URI::InvalidURIError
+      nil
+    end
+    def remote?(value)
+      uri = URI.parse(value.to_s)
+      !uri.scheme.nil? && !uri.host.nil?
+    rescue URI::InvalidURIError
+      false
+    end
+  end
+end

data/lib/crawlscope/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Crawlscope
+  # Version string of the gem.
+  VERSION = "0.1.0"
+end

data/lib/crawlscope.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+require "uri"
+require "zeitwerk"
+module Crawlscope
+  class Error < StandardError; end
+  class ConfigurationError < Error; end
+  class ValidationError < Error; end
+  class << self
+    attr_reader :loader
+    def configuration
+      @configuration ||= Configuration.new
+    end
+    def configure
+      yield(configuration)
+    end
+    def reset!
+      @configuration = Configuration.new
+    end
+  end
+end
+# Create the gem's Zeitwerk loader and expose it through Crawlscope.loader.
+Crawlscope.instance_variable_set(:@loader, Zeitwerk::Loader.for_gem)
+# Keep the rake tasks directory and the railtie out of autoloading; both
+# ignores are registered before setup is called.
+Crawlscope.loader.ignore("#{__dir__}/tasks")
+Crawlscope.loader.ignore("#{__dir__}/crawlscope/railtie.rb")
+Crawlscope.loader.setup
+# The railtie is required explicitly, and only when Rails::Railtie is defined.
+require "crawlscope/railtie" if defined?(Rails::Railtie)

data/lib/tasks/crawlscope_tasks.rake ADDED Viewed

@@ -0,0 +1,44 @@
+namespace :crawlscope do
+  desc "Validate sitemap URLs with the default Crawlscope rules. ENV: BASE_URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  task validate: :environment do
+    status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
+    exit(status) unless status.zero?
+  end
+  namespace :validate do
+    desc "Validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    task ldjson: :environment do
+      status = Crawlscope::Cli.start(["ldjson"], out: $stdout, err: $stderr)
+      exit(status) unless status.zero?
+    end
+    desc "Validate sitemap URLs with the metadata rule. ENV: BASE_URL, SITEMAP, JS=1"
+    task metadata: :environment do
+      crawlscope_task_with_rules("metadata")
+    end
+    desc "Validate sitemap URLs with the structured_data rule. ENV: BASE_URL, SITEMAP, JS=1"
+    task structured_data: :environment do
+      crawlscope_task_with_rules("structured_data")
+    end
+    desc "Validate sitemap URLs with the uniqueness rule. ENV: BASE_URL, SITEMAP, JS=1"
+    task uniqueness: :environment do
+      crawlscope_task_with_rules("uniqueness")
+    end
+    desc "Validate sitemap URLs with the links rule. ENV: BASE_URL, SITEMAP, JS=1"
+    task links: :environment do
+      crawlscope_task_with_rules("links")
+    end
+  end
+  def crawlscope_task_with_rules(rules)
+    original_rules = ENV["RULES"]
+    ENV["RULES"] = rules
+    status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
+    exit(status) unless status.zero?
+  ensure
+    ENV["RULES"] = original_rules
+  end
+end

data/test/crawlscope/audit_test.rb ADDED Viewed

@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeAuditTest < Minitest::Test
+  def setup
+    @tmp_dir = Dir.mktmpdir
+    @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
+  end
+  def teardown
+    FileUtils.rm_rf(@tmp_dir)
+  end
+  def test_returns_ok_when_metadata_is_valid
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/pricing</loc></url>
+        </urlset>
+      XML
+    )
+    stub_request(:get, "https://example.com/pricing")
+      .to_return(
+        status: 200,
+        headers: {"Content-Type" => "text/html"},
+        body: <<~HTML
+          <html>
+            <head>
+              <title>Pricing</title>
+              <meta name="description" content="Plans for hotels and restaurants">
+              <link rel="canonical" href="https://example.com/pricing">
+            </head>
+            <body>
+              <main>
+                <h1>Pricing</h1>
+              </main>
+            </body>
+          </html>
+        HTML
+      )
+    result = Crawlscope::Audit.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
+      schema_registry: Crawlscope::SchemaRegistry.default
+    ).call
+    assert result.ok?
+    assert_empty result.issues.to_a
+  end
+  def test_collects_metadata_issues_for_invalid_page
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/about</loc></url>
+        </urlset>
+      XML
+    )
+    stub_request(:get, "https://example.com/about")
+      .to_return(
+        status: 200,
+        headers: {"Content-Type" => "text/html"},
+        body: <<~HTML
+          <html>
+            <head>
+              <title>Example About Example</title>
+              <meta name="description" content="#{"a" * 161}">
+            </head>
+            <body>
+              <main>
+                <p>About</p>
+              </main>
+            </body>
+          </html>
+        HTML
+      )
+    result = Crawlscope::Audit.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
+      schema_registry: Crawlscope::SchemaRegistry.default
+    ).call
+    refute result.ok?
+    assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
+  end
+  def test_uses_browser_when_renderer_is_browser
+    File.write(
+      @sitemap_path,
+      <<~XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/pricing</loc></url>
+        </urlset>
+      XML
+    )
+    fake_browser = Class.new do
+      attr_reader :closed, :urls
+      def initialize
+        @closed = false
+        @urls = []
+      end
+      def close
+        @closed = true
+      end
+      def fetch(url)
+        @urls << url
+        body = <<~HTML
+          <html>
+            <head>
+              <title>Pricing</title>
+              <meta name="description" content="Plans for hotels and restaurants">
+              <link rel="canonical" href="https://example.com/pricing">
+            </head>
+            <body>
+              <main>
+                <h1>Pricing</h1>
+              </main>
+            </body>
+          </html>
+        HTML
+        Crawlscope::Page.new(
+          url: url,
+          normalized_url: url,
+          final_url: url,
+          normalized_final_url: url,
+          status: 200,
+          headers: {"content-type" => "text/html"},
+          body: body,
+          doc: Nokogiri::HTML(body)
+        )
+      end
+    end.new
+    result = Crawlscope::Audit.new(
+      base_url: "https://example.com",
+      sitemap_path: @sitemap_path,
+      rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      renderer: :browser,
+      browser_factory: -> { fake_browser }
+    ).call
+    assert result.ok?
+    assert_equal ["https://example.com/pricing"], fake_browser.urls
+    assert fake_browser.closed
+  end
+end