RubyGems - crawlscope - Versions diffs - 0.1.0 - Mend

crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

checksums.yaml +7 -0
data/CHANGELOG.md +31 -0
data/LICENSE.txt +21 -0
data/README.md +323 -0
data/exe/crawlscope +6 -0
data/lib/crawlscope/audit.rb +128 -0
data/lib/crawlscope/browser.rb +88 -0
data/lib/crawlscope/cli.rb +245 -0
data/lib/crawlscope/configuration.rb +123 -0
data/lib/crawlscope/crawler.rb +28 -0
data/lib/crawlscope/http.rb +77 -0
data/lib/crawlscope/issue.rb +17 -0
data/lib/crawlscope/issue_collection.rb +41 -0
data/lib/crawlscope/page.rb +23 -0
data/lib/crawlscope/railtie.rb +9 -0
data/lib/crawlscope/reporter.rb +33 -0
data/lib/crawlscope/result.rb +9 -0
data/lib/crawlscope/rule_registry.rb +39 -0
data/lib/crawlscope/rules/links.rb +220 -0
data/lib/crawlscope/rules/metadata.rb +93 -0
data/lib/crawlscope/rules/structured_data.rb +58 -0
data/lib/crawlscope/rules/uniqueness.rb +88 -0
data/lib/crawlscope/schema_registry.rb +431 -0
data/lib/crawlscope/sitemap.rb +67 -0
data/lib/crawlscope/structured_data/audit.rb +150 -0
data/lib/crawlscope/structured_data/document.rb +93 -0
data/lib/crawlscope/structured_data/report.rb +77 -0
data/lib/crawlscope/structured_data/reporter.rb +73 -0
data/lib/crawlscope/structured_data/writer.rb +26 -0
data/lib/crawlscope/task.rb +131 -0
data/lib/crawlscope/url.rb +43 -0
data/lib/crawlscope/version.rb +5 -0
data/lib/crawlscope.rb +34 -0
data/lib/tasks/crawlscope_tasks.rake +44 -0
data/test/crawlscope/audit_test.rb +165 -0
data/test/crawlscope/cli_test.rb +157 -0
data/test/crawlscope/configuration_test.rb +45 -0
data/test/crawlscope/links_rule_test.rb +87 -0
data/test/crawlscope/loader_test.rb +11 -0
data/test/crawlscope/reporter_test.rb +50 -0
data/test/crawlscope/schema_registry_test.rb +89 -0
data/test/crawlscope/sitemap_test.rb +51 -0
data/test/crawlscope/structured_data_audit_test.rb +118 -0
data/test/crawlscope/structured_data_document_test.rb +28 -0
data/test/crawlscope/structured_data_report_test.rb +37 -0
data/test/crawlscope/structured_data_reporter_test.rb +32 -0
data/test/crawlscope/structured_data_rule_test.rb +78 -0
data/test/crawlscope/structured_data_writer_test.rb +32 -0
data/test/crawlscope/task_test.rb +206 -0
data/test/crawlscope/uniqueness_rule_test.rb +46 -0
data/test/test_helper.rb +23 -0
metadata +271 -0

data/test/crawlscope/cli_test.rb ADDED Viewed

@@ -0,0 +1,157 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeCliTest < Minitest::Test
+  class FakeConfiguration
+    attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
+    def initialize
+      @concurrency = 10
+      @network_idle_timeout_seconds = 5
+      @renderer = :http
+      @timeout_seconds = 20
+    end
+    def browser_concurrency
+      4
+    end
+  end
+  class FakeTask
+    attr_reader :validate_arguments, :ldjson_arguments
+    def validate(base_url:, sitemap_path:, rule_names:)
+      @validate_arguments = {
+        base_url: base_url,
+        sitemap_path: sitemap_path,
+        rule_names: rule_names
+      }
+      success_result
+    end
+    def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
+      @ldjson_arguments = {
+        urls: urls,
+        debug: debug,
+        renderer: renderer,
+        report_path: report_path,
+        summary: summary,
+        timeout_seconds: timeout_seconds
+      }
+      success_result
+    end
+    private
+    def success_result
+      Struct.new(:ok?).new(true)
+    end
+  end
+  def test_version_prints_current_version
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["version"], out: out, err: err)
+    assert_equal 0, status
+    assert_equal "#{Crawlscope::VERSION}\n", out.string
+    assert_empty err.string
+  end
+  def test_unknown_command_returns_error
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["unknown"], out: out, err: err)
+    assert_equal 1, status
+    assert_includes err.string, "Unknown command: unknown"
+    assert_includes err.string, "crawlscope validate --base-url"
+  end
+  def test_validate_passes_arguments_to_task
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(
+      ["validate", "--base-url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
+      out: out,
+      err: err,
+      configuration: configuration,
+      task: task
+    )
+    assert_equal 0, status
+    assert_equal(
+      {
+        base_url: "https://example.com",
+        sitemap_path: "https://example.com/sitemap-pages.xml",
+        rule_names: "metadata,links"
+      },
+      task.validate_arguments
+    )
+    assert_equal :browser, configuration.renderer
+    assert_equal 30, configuration.timeout_seconds
+    assert_equal 9, configuration.network_idle_timeout_seconds
+    assert_equal 3, configuration.concurrency
+    assert_same out, configuration.output
+    assert_empty err.string
+  end
+  def test_ldjson_reads_urls_from_environment
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    with_env("URL" => "https://example.com/a; https://example.com/b", "SUMMARY" => "1", "DEBUG" => "1") do
+      status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal(
+      {
+        urls: ["https://example.com/a", "https://example.com/b"],
+        debug: true,
+        renderer: :http,
+        report_path: nil,
+        summary: true,
+        timeout_seconds: 20
+      },
+      task.ldjson_arguments
+    )
+    assert_same out, configuration.output
+    assert_empty err.string
+  end
+  private
+  def with_env(overrides)
+    original_values = overrides.to_h { |key, _value| [key, ENV[key]] }
+    overrides.each do |key, value|
+      if value.nil?
+        ENV.delete(key)
+      else
+        ENV[key] = value
+      end
+    end
+    yield
+  ensure
+    original_values.each do |key, value|
+      if value.nil?
+        ENV.delete(key)
+      else
+        ENV[key] = value
+      end
+    end
+  end
+end

data/test/crawlscope/configuration_test.rb ADDED Viewed

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeConfigurationTest < Minitest::Test
+  def teardown
+    Crawlscope.reset!
+  end
+  def test_audit_builds_from_configured_callables
+    Crawlscope.configure do |config|
+      config.base_url = -> { "https://example.com" }
+      config.sitemap_path = -> { "/tmp/sitemap.xml" }
+      config.site_name = -> { "Example" }
+      config.concurrency = -> { 4 }
+    end
+    audit = Crawlscope.configuration.audit
+    assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
+    assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
+    assert_equal 4, audit.instance_variable_get(:@concurrency)
+    assert_equal %i[metadata structured_data uniqueness links], audit.instance_variable_get(:@rules).map(&:code)
+  end
+  def test_audit_raises_without_base_url
+    Crawlscope.configure do |config|
+      config.sitemap_path = "/tmp/sitemap.xml"
+    end
+    error = assert_raises(Crawlscope::ConfigurationError) { Crawlscope.configuration.audit }
+    assert_equal "Crawlscope base_url is not configured", error.message
+  end
+  def test_audit_raises_without_sitemap_path
+    Crawlscope.configure do |config|
+      config.base_url = "https://example.com"
+    end
+    error = assert_raises(Crawlscope::ConfigurationError) { Crawlscope.configuration.audit }
+    assert_equal "Crawlscope sitemap_path is not configured", error.message
+  end
+end

data/test/crawlscope/links_rule_test.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeLinksRuleTest < Minitest::Test
+  def test_reports_broken_internal_links
+    issues = Crawlscope::IssueCollection.new
+    rule = Crawlscope::Rules::Links.new
+    pages = [
+      page(
+        url: "https://example.com/guide",
+        body: <<~HTML
+          <html>
+            <body>
+              <main>
+                <a href="/pricing">Pricing</a>
+                <a href="/missing">Missing</a>
+              </main>
+            </body>
+          </html>
+        HTML
+      ),
+      page(
+        url: "https://example.com/pricing",
+        body: <<~HTML
+          <html>
+            <body>
+              <main>
+                <a href="/guide">Guide</a>
+              </main>
+            </body>
+          </html>
+        HTML
+      )
+    ]
+    rule.call(
+      urls: ["https://example.com/guide", "https://example.com/pricing"],
+      pages: pages,
+      issues: issues,
+      context: {
+        allowed_statuses: [200, 301, 302],
+        base_url: "https://example.com",
+        resolve_target: method(:resolve_target)
+      }
+    )
+    assert_equal [:broken_internal_link], issues.to_a.map(&:code)
+    assert_includes issues.to_a.first.message, "HTTP 404"
+  end
+  private
+  def page(url:, body:)
+    doc = Nokogiri::HTML(body)
+    Crawlscope::Page.new(
+      url: url,
+      normalized_url: url,
+      final_url: url,
+      normalized_final_url: url,
+      status: 200,
+      headers: {"content-type" => "text/html"},
+      body: body,
+      doc: doc
+    )
+  end
+  def resolve_target(target_url)
+    case target_url
+    when "https://example.com/guide", "https://example.com/pricing"
+      {
+        crawled: true,
+        error: nil,
+        final_url: target_url,
+        status: 200
+      }
+    when "https://example.com/missing"
+      {
+        crawled: false,
+        error: nil,
+        final_url: target_url,
+        status: 404
+      }
+    end
+  end
+end

data/test/crawlscope/loader_test.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeLoaderTest < Minitest::Test
+  def test_eager_loads_cleanly
+    assert_silent do
+      Crawlscope.loader.eager_load
+    end
+  end
+end

data/test/crawlscope/reporter_test.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+require "stringio"
+require "test_helper"
+class CrawlscopeReporterTest < Minitest::Test
+  def test_reports_ok_result
+    io = StringIO.new
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com"],
+      pages: [Object.new],
+      issues: Crawlscope::IssueCollection.new
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    output = io.string
+    assert_includes output, "Crawlscope validation"
+    assert_includes output, "Status: OK"
+    refute_includes output, "Status: FAILED"
+  end
+  def test_reports_failed_result_with_severity_counts
+    io = StringIO.new
+    issues = Crawlscope::IssueCollection.new
+    issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
+    issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {})
+    result = Crawlscope::Result.new(
+      base_url: "https://example.com",
+      sitemap_path: "/tmp/sitemap.xml",
+      urls: ["https://example.com/a", "https://example.com/b"],
+      pages: [Object.new, Object.new],
+      issues: issues
+    )
+    Crawlscope::Reporter.new(io: io).report(result)
+    output = io.string
+    assert_includes output, "Status: FAILED"
+    assert_includes output, "Issues: 2"
+    assert_includes output, "notice: 1"
+    assert_includes output, "warning: 1"
+    assert_includes output, "- [warning] https://example.com/a missing <title>"
+    assert_includes output, "- [notice] https://example.com/b broken internal link"
+  end
+end

data/test/crawlscope/schema_registry_test.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeSchemaRegistryTest < Minitest::Test
+  def test_registers_and_fetches_schema_by_type
+    registry = Crawlscope::SchemaRegistry.default
+    schema = {"type" => "object"}
+    registry.register("Article", schema)
+    assert registry.registered?("Article")
+    assert_equal schema, registry.fetch("Article")
+  end
+  def test_dup_copies_registered_schemas
+    registry = Crawlscope::SchemaRegistry.new(schemas: {"ThingOne" => {"type" => "object"}})
+    copy = registry.dup
+    copy.register("ThingTwo", {"type" => "object"})
+    assert registry.registered?("ThingOne")
+    refute registry.registered?("ThingTwo")
+    assert copy.registered?("ThingTwo")
+  end
+  def test_validate_reports_default_schema_errors
+    errors = Crawlscope::SchemaRegistry.default.validate(
+      {
+        "@context" => "https://schema.org",
+        "@type" => "Article"
+      }
+    )
+    assert_predicate errors, :any?
+    assert_equal "Article", errors.first[:type]
+    assert_includes errors.first[:issue], "headline"
+  end
+  def test_default_registry_includes_extended_schema_types
+    registry = Crawlscope::SchemaRegistry.default
+    assert registry.registered?("HowTo")
+    assert registry.registered?("Recipe")
+    assert registry.registered?("Event")
+    assert registry.registered?("VideoObject")
+  end
+  def test_web_application_review_requires_review_rating
+    errors = Crawlscope::SchemaRegistry.default.validate(
+      {
+        "@context" => "https://schema.org",
+        "@type" => "WebApplication",
+        "name" => "ROI Calculator",
+        "url" => "https://example.com/tools/uplift",
+        "review" => {
+          "@type" => "Review",
+          "reviewBody" => "Helpful tool."
+        }
+      }
+    )
+    assert errors.any? { |error| error[:issue].include?("did not contain a required property of 'reviewRating'") }
+  end
+  def test_product_allows_image_object_variants
+    errors = Crawlscope::SchemaRegistry.default.validate(
+      {
+        "@context" => "https://schema.org",
+        "@type" => "Product",
+        "name" => "Example Product",
+        "image" => {
+          "@type" => "ImageObject",
+          "url" => "https://example.com/image.png"
+        }
+      }
+    )
+    assert_empty errors
+  end
+  def test_rule_registry_raises_for_unknown_rules
+    error = assert_raises(Crawlscope::ConfigurationError) do
+      Crawlscope::RuleRegistry.default.rules_for("metadata,unknown")
+    end
+    assert_equal "Unknown Crawlscope rules: unknown", error.message
+  end
+end

data/test/crawlscope/sitemap_test.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeSitemapTest < Minitest::Test
+  def test_parses_remote_sitemap_urlset
+    stub_request(:get, "https://www.example.com/sitemap.xml")
+      .to_return(
+        status: 200,
+        body: <<~XML
+          <?xml version="1.0" encoding="UTF-8"?>
+          <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+            <url><loc>https://www.example.com/</loc></url>
+            <url><loc>/pricing</loc></url>
+          </urlset>
+        XML
+      )
+    parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
+    assert_equal ["https://www.example.com/", "https://www.example.com/pricing"], parser.urls(base_url: "https://www.example.com")
+  end
+  def test_parses_remote_sitemap_index_with_child_sitemap
+    stub_request(:get, "https://www.example.com/sitemap.xml")
+      .to_return(
+        status: 200,
+        body: <<~XML
+          <?xml version="1.0" encoding="UTF-8"?>
+          <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+            <sitemap><loc>/sitemaps/content.xml</loc></sitemap>
+          </sitemapindex>
+        XML
+      )
+    stub_request(:get, "https://www.example.com/sitemaps/content.xml")
+      .to_return(
+        status: 200,
+        body: <<~XML
+          <?xml version="1.0" encoding="UTF-8"?>
+          <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+            <url><loc>https://www.example.com/features/reviews</loc></url>
+          </urlset>
+        XML
+      )
+    parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
+    assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
+  end
+end

data/test/crawlscope/structured_data_audit_test.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeStructuredDataAuditTest < Minitest::Test
+  class FakeBrowser
+    attr_reader :closed
+    def initialize(page:)
+      @page = page
+      @closed = false
+    end
+    def close
+      @closed = true
+    end
+    def fetch(_url)
+      @page
+    end
+  end
+  def test_reports_schema_errors_for_invalid_article_markup
+    page = html_page(
+      url: "https://example.com/articles/test",
+      body: <<~HTML
+        <html>
+          <head>
+            <script type="application/ld+json">
+              {"@context":"https://schema.org","@type":"Article"}
+            </script>
+          </head>
+        </html>
+      HTML
+    )
+    browser = FakeBrowser.new(page: page)
+    audit = Crawlscope::StructuredData::Audit.new(
+      browser_factory: -> { browser },
+      renderer: :browser,
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      timeout_seconds: 20
+    )
+    result = audit.call(urls: [page.url])
+    refute result.ok?
+    assert_equal 1, result.entries.size
+    assert_equal "Article", result.entries.first.errors.first[:type]
+    assert browser.closed
+  end
+  def test_reports_fetch_errors_for_non_success_statuses
+    page = Crawlscope::Page.new(
+      url: "https://example.com/missing",
+      normalized_url: "https://example.com/missing",
+      final_url: "https://example.com/missing",
+      normalized_final_url: "https://example.com/missing",
+      status: 404,
+      headers: {"content-type" => "text/html"},
+      body: "",
+      doc: Nokogiri::HTML("")
+    )
+    browser = FakeBrowser.new(page: page)
+    audit = Crawlscope::StructuredData::Audit.new(
+      browser_factory: -> { browser },
+      renderer: :browser,
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      timeout_seconds: 20
+    )
+    result = audit.call(urls: [page.url])
+    refute result.ok?
+    assert_equal "Non-success status", result.entries.first.fetch_error
+  end
+  def test_skips_non_html_responses_without_treating_them_as_missing_data
+    page = Crawlscope::Page.new(
+      url: "https://example.com/feed.xml",
+      normalized_url: "https://example.com/feed.xml",
+      final_url: "https://example.com/feed.xml",
+      normalized_final_url: "https://example.com/feed.xml",
+      status: 200,
+      headers: {"content-type" => "application/xml"},
+      body: "<feed></feed>",
+      doc: nil
+    )
+    browser = FakeBrowser.new(page: page)
+    audit = Crawlscope::StructuredData::Audit.new(
+      browser_factory: -> { browser },
+      renderer: :browser,
+      schema_registry: Crawlscope::SchemaRegistry.default,
+      timeout_seconds: 20
+    )
+    result = audit.call(urls: [page.url])
+    assert result.ok?
+    assert_equal "application/xml", result.entries.first.content_type
+    assert_equal "non-html", result.entries.first.skipped_reason
+    assert_predicate result.entries.first, :structured_data_found?
+  end
+  private
+  def html_page(url:, body:)
+    Crawlscope::Page.new(
+      url: url,
+      normalized_url: url,
+      final_url: url,
+      normalized_final_url: url,
+      status: 200,
+      headers: {"content-type" => "text/html"},
+      body: body,
+      doc: Nokogiri::HTML(body)
+    )
+  end
+end

data/test/crawlscope/structured_data_document_test.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeStructuredDataDocumentTest < Minitest::Test
+  def test_items_returns_json_ld_and_microdata_entries
+    html = <<~HTML
+      <html>
+        <body>
+          <script type="application/ld+json">
+            {"@type":"Hotel","name":"Hotel Test"}
+          </script>
+          <div itemscope itemtype="https://schema.org/Organization">
+            <span itemprop="name">Acme Hospitality</span>
+          </div>
+        </body>
+      </html>
+    HTML
+    document = Crawlscope::StructuredData::Document.new(html: html)
+    items = document.items
+    assert_equal 2, items.size
+    assert_equal ["json-ld", "microdata"], items.map(&:source)
+    assert_equal "Hotel Test", document.json_ld_items.first["name"]
+  end
+end

data/test/crawlscope/structured_data_report_test.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeStructuredDataReportTest < Minitest::Test
+  def test_results_maps_validation_errors_and_skips
+    result = Crawlscope::StructuredData::Audit::Result.new(
+      entries: [
+        Crawlscope::StructuredData::Audit::Entry.new(
+          url: "https://example.com/article",
+          status: 200,
+          structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
+          errors: [{type: "Article", source: "json-ld", errors: [{field: "headline", issue: "is required"}]}],
+          fetch_error: nil,
+          content_type: "text/html",
+          skipped_reason: nil
+        ),
+        Crawlscope::StructuredData::Audit::Entry.new(
+          url: "https://example.com/feed.xml",
+          status: 200,
+          structured_items: [],
+          errors: [],
+          fetch_error: nil,
+          content_type: "application/xml",
+          skipped_reason: "non-html"
+        )
+      ]
+    )
+    report = Crawlscope::StructuredData::Report.new(result)
+    assert_equal [{field: "headline", issue: "is required"}], report.results["https://example.com/article"][:validation_errors]
+    assert_equal "non-html", report.results["https://example.com/feed.xml"][:skipped_reason]
+    assert_empty report.missing_data
+    assert_equal 1, report.validation_errors.size
+  end
+end