RubyGems - crawlscope - Versions diffs - 0.1.0 - Mend

crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +31 -0
data/LICENSE.txt +21 -0
data/README.md +323 -0
data/exe/crawlscope +6 -0
data/lib/crawlscope/audit.rb +128 -0
data/lib/crawlscope/browser.rb +88 -0
data/lib/crawlscope/cli.rb +245 -0
data/lib/crawlscope/configuration.rb +123 -0
data/lib/crawlscope/crawler.rb +28 -0
data/lib/crawlscope/http.rb +77 -0
data/lib/crawlscope/issue.rb +17 -0
data/lib/crawlscope/issue_collection.rb +41 -0
data/lib/crawlscope/page.rb +23 -0
data/lib/crawlscope/railtie.rb +9 -0
data/lib/crawlscope/reporter.rb +33 -0
data/lib/crawlscope/result.rb +9 -0
data/lib/crawlscope/rule_registry.rb +39 -0
data/lib/crawlscope/rules/links.rb +220 -0
data/lib/crawlscope/rules/metadata.rb +93 -0
data/lib/crawlscope/rules/structured_data.rb +58 -0
data/lib/crawlscope/rules/uniqueness.rb +88 -0
data/lib/crawlscope/schema_registry.rb +431 -0
data/lib/crawlscope/sitemap.rb +67 -0
data/lib/crawlscope/structured_data/audit.rb +150 -0
data/lib/crawlscope/structured_data/document.rb +93 -0
data/lib/crawlscope/structured_data/report.rb +77 -0
data/lib/crawlscope/structured_data/reporter.rb +73 -0
data/lib/crawlscope/structured_data/writer.rb +26 -0
data/lib/crawlscope/task.rb +131 -0
data/lib/crawlscope/url.rb +43 -0
data/lib/crawlscope/version.rb +5 -0
data/lib/crawlscope.rb +34 -0
data/lib/tasks/crawlscope_tasks.rake +44 -0
data/test/crawlscope/audit_test.rb +165 -0
data/test/crawlscope/cli_test.rb +157 -0
data/test/crawlscope/configuration_test.rb +45 -0
data/test/crawlscope/links_rule_test.rb +87 -0
data/test/crawlscope/loader_test.rb +11 -0
data/test/crawlscope/reporter_test.rb +50 -0
data/test/crawlscope/schema_registry_test.rb +89 -0
data/test/crawlscope/sitemap_test.rb +51 -0
data/test/crawlscope/structured_data_audit_test.rb +118 -0
data/test/crawlscope/structured_data_document_test.rb +28 -0
data/test/crawlscope/structured_data_report_test.rb +37 -0
data/test/crawlscope/structured_data_reporter_test.rb +32 -0
data/test/crawlscope/structured_data_rule_test.rb +78 -0
data/test/crawlscope/structured_data_writer_test.rb +32 -0
data/test/crawlscope/task_test.rb +206 -0
data/test/crawlscope/uniqueness_rule_test.rb +46 -0
data/test/test_helper.rb +23 -0
metadata +271 -0

data/lib/crawlscope/rules/links.rb ADDED Viewed

@@ -0,0 +1,220 @@
+# frozen_string_literal: true
+require "uri"
require "set"

module Crawlscope
  module Rules
    # Rule that audits the internal anchor links found on crawled HTML pages.
    #
    # All issues are added with severity :warning and category :links:
    #
    # * :broken_internal_link     - the target resolved to a status that is not
    #   in the allowed list
    # * :unresolved_internal_link - the target could not be resolved to a status
    # * :low_inbound_anchor_links - a sitemap URL that was crawled as HTML is
    #   linked from other pages fewer than MIN_INBOUND_ANCHOR_LINKS times
    class Links
      # Links inside <main>/<article> are preferred; see #contextual_links.
      CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
      # Paths starting with one of these prefixes (and the root path "/") are
      # ignored both as link targets and as sitemap entries.
      INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
      # hrefs starting with one of these prefixes are never followed.
      LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
      # Upper bound on the number of source URLs quoted in a single issue.
      MAX_SOURCES_IN_ERROR = 3
      # Minimum number of inbound links a sitemap page needs to avoid a warning.
      MIN_INBOUND_ANCHOR_LINKS = 1

      attr_reader :code

      def initialize
        @code = :links
      end

      # Runs the rule.
      #
      # @param urls    enumerable of sitemap URLs (each is passed to Url.normalize)
      # @param pages   enumerable of page objects responding to #html?,
      #                #normalized_url, #normalized_final_url and #doc
      # @param issues  collector responding to #add(code:, severity:, category:,
      #                url:, message:, details:)
      # @param context Hash providing :allowed_statuses, :base_url and
      #                :resolve_target (a callable that takes a URL and returns
      #                nil or a Hash-like object read via :status, :final_url,
      #                :crawled and :error)
      # @raise [KeyError] when one of the context keys is missing
      def call(urls:, pages:, issues:, context:)
        @allowed_statuses = context.fetch(:allowed_statuses)
        @base_url = context.fetch(:base_url)
        @resolve_target = context.fetch(:resolve_target)
        @base_host = URI.parse(@base_url).host

        links = extract_links(pages)
        # With no links at all the rule stops here; the inbound-count check is
        # not run in that case.
        return if links.empty?

        resolved_links = resolve_links(links, issues)
        validate_inbound_counts(urls, pages, resolved_links, issues)
      end

      private

      # Anchors inside <main>/<article>, or every anchor with an href when the
      # document has none in those containers.
      def contextual_links(doc)
        scoped = doc.css(CONTEXTUAL_LINK_SELECTORS)
        scoped.empty? ? doc.css("a[href]") : scoped
      end

      # Collects one Hash per followable internal link on every HTML page.
      def extract_links(pages)
        pages.flat_map do |page|
          next [] unless page.html?

          source_path = Url.path(page.normalized_url)
          next [] if source_path.nil?

          contextual_links(page.doc).filter_map { |node| build_link(page, source_path, node) }
        end
      end

      # Builds the link Hash for a single anchor node, or returns nil when the
      # anchor should be ignored (empty/fragment/skipped-scheme href, empty
      # anchor text, external or unparseable target, skipped target path).
      def build_link(page, source_path, node)
        href = node["href"].to_s.strip
        return if href.empty? || href.start_with?("#")
        return if href.start_with?(*LINK_SCHEMES_TO_SKIP)

        anchor_text = normalize_anchor_text(node.text)
        return if anchor_text.empty?

        target_url = normalize_internal_link(page.normalized_url, href)
        return if target_url.nil?

        target_path = Url.path(target_url)
        return if target_path.nil? || skip_internal_path?(target_path)

        {
          anchor_text: anchor_text,
          source_path: source_path,
          source_url: page.normalized_url,
          target_path: target_path,
          target_url: target_url
        }
      end

      # Collapses runs of whitespace into single spaces and trims the ends.
      def normalize_anchor_text(text)
        text.to_s.gsub(/\s+/, " ").strip
      end

      # Resolves href against source_url and returns the normalized URL without
      # its fragment, or nil when the link points at another host or cannot be
      # parsed.
      def normalize_internal_link(source_url, href)
        uri = URI.parse(URI.join(source_url, href).to_s)
        return unless same_host?(uri.host)

        uri.fragment = nil
        Url.normalize(uri.to_s, base_url: @base_url)
      rescue URI::Error
        # URI::Error is the parent of InvalidURIError and of the other errors
        # URI.join can raise (e.g. BadURIError); an href that cannot be
        # resolved is simply not an internal link.
        nil
      end

      # Host names are case-insensitive, so "EXAMPLE.com" must be treated as
      # the same site as "example.com".
      def same_host?(host)
        host.to_s.casecmp?(@base_host.to_s)
      end

      # Distinct source URLs of the given links, capped at MAX_SOURCES_IN_ERROR.
      def sample_source_urls(grouped_links)
        grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
      end

      def report_broken_target(target_url, grouped_links, issues, status)
        source_urls = sample_source_urls(grouped_links)
        issues.add(
          code: :broken_internal_link,
          severity: :warning,
          category: :links,
          url: target_url,
          message: "broken internal link (HTTP #{status}, sources: #{source_urls.join(", ")})",
          details: {source_urls: source_urls, status: status}
        )
      end

      def report_unresolved_target(target_url, grouped_links, issues, resolution)
        source_urls = sample_source_urls(grouped_links)
        error = resolution && resolution[:error]
        suffix = error ? " (#{error})" : ""
        issues.add(
          code: :unresolved_internal_link,
          severity: :warning,
          category: :links,
          url: target_url,
          message: "unable to validate internal link#{suffix} (sources: #{source_urls.join(", ")})",
          details: {error: error, source_urls: source_urls}
        )
      end

      # Resolves every distinct target once and returns the links whose target
      # answered with an allowed status, each extended with :final_url and
      # :final_path. Broken and unresolved targets are reported on issues.
      def resolve_links(links, issues)
        links.group_by { |link| link[:target_url] }.flat_map do |target_url, grouped_links|
          resolution = @resolve_target.call(target_url)
          status = resolution && resolution[:status]

          if status.nil?
            # A crawled target that carries an error is skipped silently.
            # NOTE(review): presumably the crawl reports that error itself -
            # confirm against the code that builds the resolution.
            unless resolution && resolution[:crawled] && resolution[:error]
              report_unresolved_target(target_url, grouped_links, issues, resolution)
            end
            next []
          end

          unless @allowed_statuses.include?(status)
            report_broken_target(target_url, grouped_links, issues, status)
            next []
          end

          # Fall back to the requested URL when no final URL was recorded.
          final_url = resolution[:final_url].to_s.empty? ? target_url : resolution[:final_url]
          final_path = Url.path(final_url)
          next [] if final_path.nil? || skip_internal_path?(final_path)

          grouped_links.map { |link| link.merge(final_path: final_path, final_url: final_url) }
        end
      end

      # True for the root path and for paths under a skipped prefix.
      def skip_internal_path?(path)
        path == "/" || path.start_with?(*INTERNAL_PATH_PREFIXES_TO_SKIP)
      end

      # Adds :low_inbound_anchor_links for every sitemap path that was crawled
      # as HTML and is linked from other pages fewer than
      # MIN_INBOUND_ANCHOR_LINKS times.
      def validate_inbound_counts(urls, pages, resolved_links, issues)
        sitemap_paths = sitemap_paths_for(urls)
        html_paths = html_paths_for(pages)
        inbound_counts, sources_by_target = tally_inbound_links(resolved_links, sitemap_paths)

        sitemap_paths.each do |path, target_url|
          next unless html_paths.include?(path)

          inbound_count = inbound_counts[path]
          next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS

          source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
          source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
          issues.add(
            code: :low_inbound_anchor_links,
            severity: :warning,
            category: :links,
            url: target_url,
            message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
            details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
          )
        end
      end

      # Maps path => normalized URL for every sitemap URL that is not skipped.
      def sitemap_paths_for(urls)
        urls.each_with_object({}) do |url, paths|
          normalized_url = Url.normalize(url, base_url: @base_url)
          path = Url.path(normalized_url)
          next if path.nil? || skip_internal_path?(path)

          paths[path] = normalized_url
        end
      end

      # Set of paths (requested and final) of every page crawled as HTML.
      def html_paths_for(pages)
        pages.each_with_object(Set.new) do |page, paths|
          next unless page.html?

          [page.normalized_url, page.normalized_final_url].compact.each do |url|
            path = Url.path(url)
            next if path.nil? || skip_internal_path?(path)

            paths << path
          end
        end
      end

      # Counts links per sitemap target path, ignoring self-links, and records
      # the distinct source URLs seen for each target.
      #
      # @return [Array(Hash, Hash)] counts by path and source URLs by path
      def tally_inbound_links(resolved_links, sitemap_paths)
        counts = Hash.new(0)
        sources = Hash.new { |hash, key| hash[key] = [] }

        resolved_links.each do |link|
          target_path = link[:final_path]
          next unless sitemap_paths.key?(target_path)
          next if link[:source_path] == target_path

          counts[target_path] += 1
          samples = sources[target_path]
          samples << link[:source_url] unless samples.include?(link[:source_url])
        end

        [counts, sources]
      end
    end
  end
end

data/lib/crawlscope/rules/metadata.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
module Crawlscope
  module Rules
    # Rule that checks the basic head/heading metadata of every HTML page:
    # presence of an <h1>, presence and length of <title> and the meta
    # description, a title that repeats the site name, and the canonical link.
    #
    # All issues are added with severity :warning and category :metadata.
    class Metadata
      TITLE_MAX_LENGTH = 72
      DESCRIPTION_MAX_LENGTH = 160
      # Anything that is not a letter or digit separates words in a title.
      WORD_SEPARATOR = /[^[:alnum:]]+/

      attr_reader :code

      # @param site_name [#to_s, nil] name used for the "title repeats site
      #   name" check; the check is disabled when it is blank
      def initialize(site_name: nil)
        @site_name = site_name.to_s.strip
        @site_name_words = words(@site_name)
        @code = :metadata
      end

      # Runs every check against each HTML page.
      #
      # @param urls    unused by this rule
      # @param pages   enumerable of page objects responding to #html?, #doc
      #                and #url
      # @param issues  collector responding to #add(code:, severity:,
      #                category:, url:, message:, details:)
      # @param context unused by this rule
      def call(urls:, pages:, issues:, context: nil)
        pages.each do |page|
          next unless page.html?

          validate_h1(page, issues)
          validate_title(page, issues)
          validate_description(page, issues)
          validate_canonical(page, issues)
        end
      end

      private

      # Adds one metadata warning for the page.
      def report(issues, page, code, message, details = {})
        issues.add(
          code: code,
          severity: :warning,
          category: :metadata,
          url: page.url,
          message: message,
          details: details
        )
      end

      def validate_h1(page, issues)
        return if page.doc.at_css("h1")

        report(issues, page, :missing_h1, "missing <h1>")
      end

      # At most one title issue is reported per page, in this order of
      # precedence: missing, too long, repeats the site name.
      def validate_title(page, issues)
        title = page.doc.at_css("title")&.text.to_s.strip

        if title.empty?
          report(issues, page, :missing_title, "missing <title>")
        elsif title.length > TITLE_MAX_LENGTH
          report(issues, page, :title_too_long, "title too long (#{title.length})", {length: title.length})
        elsif repeated_site_name?(title)
          report(issues, page, :title_repeats_site_name, "title repeats #{@site_name}", {site_name: @site_name})
        end
      end

      def validate_description(page, issues)
        description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip

        if description.empty?
          report(issues, page, :missing_meta_description, "missing meta description")
        elsif description.length > DESCRIPTION_MAX_LENGTH
          report(
            issues, page, :meta_description_too_long,
            "meta description too long (#{description.length})", {length: description.length}
          )
        end
      end

      # Reports a missing canonical link, or one whose normalized form differs
      # from the normalized page URL.
      def validate_canonical(page, issues)
        canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
        if canonical.empty?
          report(issues, page, :missing_canonical, "missing canonical link")
          return
        end

        normalized_canonical = Url.normalize(canonical, base_url: page.url)
        normalized_page_url = Url.normalize(page.url, base_url: page.url)
        return if normalized_canonical == normalized_page_url

        report(issues, page, :canonical_mismatch, "canonical mismatch (#{canonical})", {canonical: canonical})
      end

      # True when the site name occurs more than once in the title.
      #
      # The comparison is done on word sequences so that multi-word site names
      # ("Acme Corp") are detected too; comparing single words against the
      # whole name can never match such a name.
      def repeated_site_name?(title)
        # Also guards each_cons below, which rejects a window size of zero.
        return false if @site_name_words.empty?

        occurrences = words(title).each_cons(@site_name_words.size).count do |window|
          window.zip(@site_name_words).all? { |word, expected| word.casecmp?(expected) }
        end
        occurrences > 1
      end

      # Splits text into its alphanumeric words.
      def words(text)
        text.split(WORD_SEPARATOR).reject(&:empty?)
      end
    end
  end
end

data/lib/crawlscope/rules/structured_data.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
# Array#to_json / Hash#to_json (used for the schema error message) are only
# defined once the json library has been loaded.
require "json"

module Crawlscope
  module Rules
    # Rule that validates the structured-data items embedded in every HTML
    # page against a schema registry.
    #
    # All issues are added with severity :warning and category
    # :structured_data:
    #
    # * :structured_data_parse_error  - the item data is a Hash flagged with
    #   an :error key
    # * :structured_data_schema_error - the registry returned a non-empty
    #   error list for the item
    class StructuredData
      attr_reader :code

      def initialize
        @code = :structured_data
      end

      # Runs the rule.
      #
      # @param urls    unused by this rule
      # @param pages   enumerable of page objects responding to #html?, #body
      #                and #url
      # @param issues  collector responding to #add(code:, severity:,
      #                category:, url:, message:, details:)
      # @param context Hash providing :schema_registry, an object responding
      #                to #validate(data) and returning a collection of errors
      # @raise [KeyError] when :schema_registry is missing from context
      def call(urls:, pages:, issues:, context:)
        schema_registry = context.fetch(:schema_registry)

        pages.each do |page|
          validate_page(page, issues, schema_registry) if page.html?
        end
      end

      private

      # Checks every structured-data item extracted from the page body.
      def validate_page(page, issues, schema_registry)
        document = Crawlscope::StructuredData::Document.new(html: page.body)

        document.items.each do |item|
          if parse_error?(item.data)
            report_parse_error(page, issues, item)
          else
            report_schema_errors(page, issues, item, schema_registry.validate(item.data))
          end
        end
      end

      # Items that failed to parse carry a Hash with a truthy :error entry;
      # they are reported as parse errors and never sent to the registry.
      def parse_error?(data)
        data.is_a?(Hash) && data[:error]
      end

      def report_parse_error(page, issues, item)
        issues.add(
          code: :structured_data_parse_error,
          severity: :warning,
          category: :structured_data,
          url: page.url,
          message: "#{item.source} parse error: #{item.data[:message]}",
          details: {source: item.source}
        )
      end

      # Adds a schema issue unless the registry reported no errors.
      def report_schema_errors(page, issues, item, errors)
        return if errors.empty?

        issues.add(
          code: :structured_data_schema_error,
          severity: :warning,
          category: :structured_data,
          url: page.url,
          message: "#{item.source} schema errors: #{errors.to_json}",
          details: {errors: errors, source: item.source}
        )
      end
    end
  end
end

data/lib/crawlscope/rules/uniqueness.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+require "digest"
module Crawlscope
  module Rules
    # Rule that reports HTML pages sharing the same <title>, the same meta
    # description, or the same main-content fingerprint.
    #
    # All issues are added with severity :warning, category :uniqueness and a
    # nil url (the affected URLs are listed in the message and details).
    class Uniqueness
      # Pages whose normalized text is shorter than this many characters get
      # no content fingerprint and so never count as duplicate content.
      MIN_CONTENT_LENGTH = 200

      attr_reader :code

      # @param min_content_length [Integer] minimum normalized text length a
      #   page needs before its content is fingerprinted
      def initialize(min_content_length: MIN_CONTENT_LENGTH)
        @min_content_length = min_content_length
        @code = :uniqueness
      end

      # Runs the rule.
      #
      # @param urls    unused by this rule
      # @param pages   enumerable of page objects responding to #html?, #doc
      #                and #url
      # @param issues  collector responding to #add(code:, severity:,
      #                category:, url:, message:, details:)
      # @param context unused by this rule
      def call(urls:, pages:, issues:, context:)
        page_summaries = pages.filter_map { |page| summary_for(page) if page.html? }
        validate_duplicates(page_summaries, issues)
      end

      private

      # SHA-256 hex digest of the page's whitespace-normalized text, taken
      # from <main> and falling back to <body> when <main> is missing or
      # contains only whitespace. Returns nil for text below the minimum
      # length.
      def content_fingerprint_digest(doc)
        normalized = normalized_text(doc.at_css("main"))
        normalized = normalized_text(doc.at_css("body")) if normalized.empty?
        return if normalized.length < @min_content_length

        Digest::SHA256.hexdigest(normalized)
      end

      # Text of the node with whitespace runs collapsed and the ends trimmed;
      # an empty String for a nil node.
      def normalized_text(node)
        node&.text.to_s.gsub(/\s+/, " ").strip
      end

      # Groups the summaries by field and returns value => [urls] for every
      # non-blank value shared by more than one page.
      def duplicates_for(pages, field)
        pages
          .reject { |page| page[field].to_s.empty? }
          .group_by { |page| page[field] }
          .transform_values { |items| items.map { |item| item[:url] } }
          .select { |_value, urls| urls.size > 1 }
      end

      # The values compared across pages, plus the page URL.
      def summary_for(page)
        doc = page.doc
        {
          content_fingerprint_digest: content_fingerprint_digest(doc),
          description: doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
          title: doc.at_css("title")&.text.to_s.strip,
          url: page.url
        }
      end

      # Adds one uniqueness warning.
      def report(issues, code, message, details)
        issues.add(
          code: code,
          severity: :warning,
          category: :uniqueness,
          url: nil,
          message: message,
          details: details
        )
      end

      # Reports duplicate titles, then descriptions, then content fingerprints.
      def validate_duplicates(page_summaries, issues)
        duplicates_for(page_summaries, :title).each do |value, urls|
          report(issues, :duplicate_title, "duplicate title '#{value}' => #{urls.join(", ")}", {urls: urls, value: value})
        end

        duplicates_for(page_summaries, :description).each do |value, urls|
          report(
            issues, :duplicate_meta_description,
            "duplicate meta description '#{value}' => #{urls.join(", ")}", {urls: urls, value: value}
          )
        end

        # The digest itself is not useful to a reader, so only URLs are listed.
        duplicates_for(page_summaries, :content_fingerprint_digest).each_value do |urls|
          report(
            issues, :duplicate_content_fingerprint,
            "duplicate page content fingerprint => #{urls.join(", ")}", {urls: urls}
          )
        end
      end
    end
  end
end