RubyGems - crawlscope - Versions diffs - 0.1.0 → 0.3.0 - Mend

crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

checksums.yaml +4 -4
data/CHANGELOG.md +7 -8
data/README.md +21 -14
data/lib/crawlscope/browser.rb +8 -0
data/lib/crawlscope/cli.rb +15 -10
data/lib/crawlscope/configuration.rb +20 -5
data/lib/crawlscope/context.rb +9 -0
data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
data/lib/crawlscope/crawler.rb +19 -1
data/lib/crawlscope/http.rb +1 -1
data/lib/crawlscope/rake_tasks.rb +28 -0
data/lib/crawlscope/rules/links.rb +99 -48
data/lib/crawlscope/rules/metadata.rb +57 -11
data/lib/crawlscope/rules/structured_data.rb +61 -1
data/lib/crawlscope/run.rb +60 -0
data/lib/crawlscope/schema_registry.rb +3 -349
data/lib/crawlscope/schemas.rb +406 -0
data/lib/crawlscope/sitemap.rb +18 -6
data/lib/crawlscope/structured_data/audit.rb +7 -7
data/lib/crawlscope/structured_data/check.rb +35 -0
data/lib/crawlscope/structured_data/reporter.rb +69 -0
data/lib/crawlscope/url.rb +14 -0
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +12 -23
data/test/crawlscope/browser_test.rb +155 -0
data/test/crawlscope/cli_test.rb +143 -7
data/test/crawlscope/configuration_test.rb +49 -0
data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
data/test/crawlscope/crawler_test.rb +34 -0
data/test/crawlscope/http_test.rb +56 -0
data/test/crawlscope/links_rule_test.rb +149 -5
data/test/crawlscope/metadata_rule_test.rb +77 -0
data/test/crawlscope/rule_registry_test.rb +32 -0
data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
data/test/crawlscope/schema_registry_test.rb +19 -0
data/test/crawlscope/sitemap_test.rb +55 -0
data/test/crawlscope/structured_data_document_test.rb +36 -0
data/test/crawlscope/structured_data_report_test.rb +3 -3
data/test/crawlscope/structured_data_reporter_test.rb +2 -2
data/test/crawlscope/structured_data_rule_test.rb +111 -0
data/test/crawlscope/structured_data_writer_test.rb +2 -2
data/test/crawlscope/url_test.rb +31 -0
metadata +15 -5
data/lib/crawlscope/task.rb +0 -131

data/lib/crawlscope/rules/links.rb CHANGED

@@ -5,7 +5,7 @@ require "uri"
 module Crawlscope
   module Rules
     class Links
-      CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
+      LINK_SELECTORS = "a[href]"
       INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
       LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
       MAX_SOURCES_IN_ERROR = 3
@@ -33,48 +33,53 @@ module Crawlscope
       private
       def contextual_links(doc)
-        links = doc.css(CONTEXTUAL_LINK_SELECTORS)
-        return links unless links.empty?
-        doc.css("a[href]")
+        doc.css(LINK_SELECTORS)
       end
       def extract_links(pages)
-        links = []
+        pages.select(&:html?).flat_map { |page| page_links(page) }
+      end
-        pages.each do |page|
-          next unless page.html?
+      def page_links(page)
+        source_path = Url.path(page.normalized_url)
+        return [] unless crawlable_source_path?(source_path)
-          source_path = Url.path(page.normalized_url)
-          next if source_path.nil?
+        contextual_links(page.doc).filter_map do |node|
+          link_for(page: page, source_path: source_path, node: node)
+        end
+      end
-          contextual_links(page.doc).each do |node|
-            href = node["href"].to_s.strip
-            next if href.empty?
-            next if href.start_with?("#")
-            next if LINK_SCHEMES_TO_SKIP.any? { |prefix| href.start_with?(prefix) }
+      def link_for(page:, source_path:, node:)
+        href = node["href"].to_s.strip
+        return unless crawlable_href?(href)
-            anchor_text = normalize_anchor_text(node.text)
-            next if anchor_text.empty?
+        anchor_text = normalize_anchor_text(node.text)
+        return if anchor_text.empty?
-            target_url = normalize_internal_link(page.normalized_url, href)
-            next if target_url.nil?
+        target_url = normalize_internal_link(page.normalized_url, href)
+        return if target_url.nil?
-            target_path = Url.path(target_url)
-            next if target_path.nil?
-            next if skip_internal_path?(target_path)
+        target_path = Url.path(target_url)
+        return unless crawlable_path?(target_path)
-            links << {
-              anchor_text: anchor_text,
-              source_path: source_path,
-              source_url: page.normalized_url,
-              target_path: target_path,
-              target_url: target_url
-            }
-          end
-        end
+        {
+          anchor_text: anchor_text,
+          source_path: source_path,
+          source_url: page.normalized_url,
+          target_path: target_path,
+          target_url: target_url
+        }
+      end
+      def crawlable_href?(href)
+        return false if href.empty?
+        return false if href.start_with?("#")
+        LINK_SCHEMES_TO_SKIP.none? { |prefix| href.start_with?(prefix) }
+      end
-        links
+      def crawlable_path?(path)
+        !path.nil? && !skip_internal_path?(path)
       end
       def normalize_anchor_text(text)
@@ -122,39 +127,85 @@ module Crawlscope
         resolved_links = []
         links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
-          resolution = @resolve_target.call(target_url)
-          if resolution.nil?
-            report_unresolved_target(target_url, grouped_links, issues, resolution)
+          target = resolve_target(target_url)
+          if target.unresolved?
+            report_unresolved_target(target_url, grouped_links, issues, target.resolution)
             next
           end
-          status = resolution[:status]
-          if status.nil?
-            next if resolution[:crawled] && resolution[:error]
-            report_unresolved_target(target_url, grouped_links, issues, resolution)
+          if target.ignored_error?
             next
           end
-          unless @allowed_statuses.include?(status)
-            report_broken_target(target_url, grouped_links, issues, status)
+          unless target.allowed?(@allowed_statuses)
+            report_broken_target(target_url, grouped_links, issues, target.status)
             next
           end
-          final_url = resolution[:final_url].to_s.empty? ? target_url : resolution[:final_url]
-          final_path = Url.path(final_url)
-          next if final_path.nil?
-          next if skip_internal_path?(final_path)
+          report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
+          next unless crawlable_path?(target.final_path)
           grouped_links.each do |link|
-            resolved_links << link.merge(final_path: final_path, final_url: final_url)
+            resolved_links << link.merge(final_path: target.final_path, final_url: target.final_url)
           end
         end
         resolved_links
       end
+      def report_redirect_target(target_url, grouped_links, issues, target)
+        source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+        issues.add(
+          code: :internal_link_redirects,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
+          details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
+        )
+      end
+      def resolve_target(target_url)
+        resolution = @resolve_target.call(target_url)
+        LinkTarget.new(target_url: target_url, resolution: resolution)
+      end
+      LinkTarget = Data.define(:target_url, :resolution) do
+        def allowed?(statuses)
+          statuses.include?(status)
+        end
+        def final_path
+          Url.path(final_url)
+        end
+        def final_url
+          value = resolution[:final_url].to_s
+          value.empty? ? target_url : value
+        end
+        def ignored_error?
+          resolution && status.nil? && resolution[:crawled] && resolution[:error]
+        end
+        def status
+          resolution && resolution[:status]
+        end
+        def redirect?
+          (status && (300..399).cover?(status.to_i)) || final_url != target_url
+        end
+        def unresolved?
+          resolution.nil? || (status.nil? && !ignored_error?)
+        end
+      end
+      def crawlable_source_path?(path)
+        !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
+      end
       def skip_internal_path?(path)
         return true if path == "/"

data/lib/crawlscope/rules/metadata.rb CHANGED

@@ -1,10 +1,14 @@
 # frozen_string_literal: true
+require "uri"
 module Crawlscope
   module Rules
     class Metadata
       TITLE_MAX_LENGTH = 72
+      DESCRIPTION_MIN_LENGTH = 110
       DESCRIPTION_MAX_LENGTH = 160
+      REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
       attr_reader :code
@@ -21,22 +25,35 @@ module Crawlscope
           validate_title(page, issues)
           validate_description(page, issues)
           validate_canonical(page, issues)
+          validate_open_graph(page, issues)
         end
       end
       private
       def validate_h1(page, issues)
-        return unless page.doc.at_css("h1").nil?
-        issues.add(
-          code: :missing_h1,
-          severity: :warning,
-          category: :metadata,
-          url: page.url,
-          message: "missing <h1>",
-          details: {}
-        )
+        h1s = page.doc.css("h1")
+        return if h1s.one?
+        if h1s.empty?
+          issues.add(
+            code: :missing_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "missing <h1>",
+            details: {}
+          )
+        else
+          issues.add(
+            code: :multiple_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple <h1> tags (#{h1s.size})",
+            details: {count: h1s.size}
+          )
+        end
       end
       def validate_title(page, issues)
@@ -56,6 +73,8 @@ module Crawlscope
         if description.empty?
           issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
+        elsif description.length < DESCRIPTION_MIN_LENGTH
+          issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
         elsif description.length > DESCRIPTION_MAX_LENGTH
           issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
         end
@@ -71,7 +90,7 @@ module Crawlscope
         normalized_canonical = Url.normalize(canonical, base_url: page.url)
         normalized_page_url = Url.normalize(page.url, base_url: page.url)
-        return if normalized_canonical == normalized_page_url
+        return if canonical_matches_page?(normalized_canonical, normalized_page_url)
         issues.add(
           code: :canonical_mismatch,
@@ -88,6 +107,33 @@ module Crawlscope
         title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
       end
+      def validate_open_graph(page, issues)
+        missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
+          page.doc.at_css(%(meta[property="#{property}"][content]))
+        end
+        return if missing.empty?
+        issues.add(
+          code: :incomplete_open_graph_tags,
+          severity: :warning,
+          category: :metadata,
+          url: page.url,
+          message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
+          details: {missing: missing}
+        )
+      end
+      def canonical_matches_page?(canonical, page_url)
+        canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
+      end
+      def local_url?(url)
+        host = URI.parse(url.to_s).host.to_s
+        ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
+      rescue URI::InvalidURIError
+        false
+      end
     end
   end
 end

data/lib/crawlscope/rules/structured_data.rb CHANGED

@@ -3,6 +3,8 @@
 module Crawlscope
   module Rules
     class StructuredData
+      CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
       attr_reader :code
       def initialize
@@ -23,8 +25,21 @@ module Crawlscope
       def validate_page(page, issues, schema_registry)
         document = Crawlscope::StructuredData::Document.new(html: page.body)
+        items = document.items
+        if items.empty?
+          issues.add(
+            code: :missing_structured_data,
+            severity: :warning,
+            category: :structured_data,
+            url: page.url,
+            message: "no structured data found; add JSON-LD or microdata markup",
+            details: {expected_sources: ["json-ld", "microdata"]}
+          )
+          return
+        end
-        document.items.each do |item|
+        items.each do |item|
           data = item.data
           source = item.source
@@ -52,6 +67,51 @@ module Crawlscope
             details: {errors: errors, source: source}
           )
         end
+        validate_job_posting_count(page, items, issues)
+      end
+      def validate_job_posting_count(page, items, issues)
+        job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") }
+        return if job_postings.size == 1
+        if job_postings.size > 1
+          issues.add(
+            code: :multiple_job_postings,
+            severity: :warning,
+            category: :structured_data,
+            url: page.url,
+            message: "multiple JobPosting structured data blocks found",
+            details: {count: job_postings.size}
+          )
+        elsif career_detail_page?(page.url)
+          issues.add(
+            code: :missing_job_posting,
+            severity: :warning,
+            category: :structured_data,
+            url: page.url,
+            message: "career detail page missing JobPosting structured data",
+            details: {expected_type: "JobPosting"}
+          )
+        end
+      end
+      def structured_data_types(data)
+        return [] unless data.is_a?(Hash)
+        types = Array(data["@type"]).map(&:to_s)
+        if data["@graph"].is_a?(Array)
+          types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) })
+        end
+        types
+      end
+      def career_detail_page?(url)
+        URI(url).path.match?(CAREER_DETAIL_PATH)
+      rescue URI::InvalidURIError
+        false
       end
     end
   end

data/lib/crawlscope/run.rb ADDED

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module Crawlscope
+  class Run
+    def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
+      @configuration = configuration
+      @reporter = reporter
+    end
+    def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
+      resolved_base_url = base_url || default_base_url
+      crawl = @configuration.audit(
+        base_url: resolved_base_url,
+        sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
+        rule_names: rule_names
+      )
+      result = crawl.call
+      @reporter.report(result)
+      result
+    end
+    def validate_json_ld(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
+      StructuredData::Check.new(configuration: @configuration).call(
+        urls: urls,
+        debug: debug,
+        renderer: renderer,
+        timeout_seconds: timeout_seconds,
+        report_path: report_path,
+        summary: summary
+      )
+    end
+    private
+    def default_base_url
+      value = @configuration.base_url
+      return value unless value.to_s.strip.empty?
+      "http://localhost:3000"
+    end
+    def default_sitemap_path(base_url:)
+      value = @configuration.sitemap_path
+      return value unless value.to_s.strip.empty?
+      local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
+      return local_path if local_path_default?(base_url: base_url) && File.exist?(local_path)
+      "#{base_url.to_s.chomp("/")}/sitemap.xml"
+    end
+    def local_path_default?(base_url:)
+      host = URI.parse(base_url.to_s).host.to_s
+      ["localhost", "127.0.0.1"].include?(host)
+    rescue URI::InvalidURIError
+      false
+    end
+  end
+end