RubyGems - crawlscope - Versions diffs - 0.4.0 → 0.5.0 - Mend

crawlscope 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/README.md +6 -0
data/lib/crawlscope/cli.rb +4 -1
data/lib/crawlscope/crawl.rb +2 -0
data/lib/crawlscope/rake_tasks.rb +27 -12
data/lib/crawlscope/reporter.rb +20 -5
data/lib/crawlscope/rules/indexability.rb +130 -17
data/lib/crawlscope/rules/links.rb +312 -9
data/lib/crawlscope/rules/metadata.rb +61 -6
data/lib/crawlscope/rules/structured_data.rb +31 -0
data/lib/crawlscope/rules/uniqueness.rb +22 -0
data/lib/crawlscope/sitemap.rb +9 -1
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +24 -24
data/test/crawlscope/cli_test.rb +1 -0
data/test/crawlscope/crawl_test.rb +26 -0
data/test/crawlscope/indexability_rule_test.rb +33 -0
data/test/crawlscope/links_rule_test.rb +148 -3
data/test/crawlscope/metadata_rule_test.rb +36 -0
data/test/crawlscope/rake_tasks_test.rb +70 -0
data/test/crawlscope/reporter_test.rb +7 -3
data/test/crawlscope/sitemap_test.rb +24 -0
data/test/crawlscope/structured_data_rule_test.rb +56 -0
data/test/crawlscope/uniqueness_rule_test.rb +17 -2
metadata +2 -1

data/lib/crawlscope/rules/links.rb CHANGED Viewed

@@ -10,6 +10,7 @@ module Crawlscope
       LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
       MAX_SOURCES_IN_ERROR = 3
       MIN_INBOUND_ANCHOR_LINKS = 1
+      MIN_DOFOLLOW_INBOUND_LINKS = 2
       attr_reader :code
@@ -24,10 +25,14 @@ module Crawlscope
         @base_host = URI.parse(@base_url).host
         links = extract_links(pages)
-        return if links.empty?
+        validate_url_hygiene(urls, links, issues)
         resolved_links = resolve_links(links, issues)
+        validate_nofollow_outgoing_links(links, issues)
+        validate_http_internal_links(links, issues)
+        validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+        validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
         validate_inbound_counts(urls, pages, resolved_links, issues)
+        validate_canonical_targets(urls, pages, resolved_links, issues)
       end
       private
@@ -64,6 +69,8 @@ module Crawlscope
         {
           anchor_text: anchor_text,
+          http_internal_link: http_internal_link?(page.normalized_url, href),
+          nofollow: nofollow_link?(node),
           source_path: source_path,
           source_url: page.normalized_url,
           target_path: target_path,
@@ -86,6 +93,19 @@ module Crawlscope
         text.to_s.gsub(/\s+/, " ").strip
       end
+      def nofollow_link?(node)
+        node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") }
+      end
+      def http_internal_link?(source_url, href)
+        source_uri = URI.parse(source_url.to_s)
+        target_uri = URI.parse(URI.join(source_url, href).to_s)
+        source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host
+      rescue URI::InvalidURIError
+        false
+      end
       def normalize_internal_link(source_url, href)
         absolute_url = URI.join(source_url, href).to_s
         uri = URI.parse(absolute_url)
@@ -109,6 +129,36 @@ module Crawlscope
         )
       end
+      def validate_nofollow_outgoing_links(links, issues)
+        links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+          target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+          issues.add(
+            code: :nofollow_internal_outlinks,
+            severity: :warning,
+            category: :links,
+            url: source_url,
+            message: "page has nofollow outgoing internal links",
+            details: {target_urls: target_urls}
+          )
+        end
+      end
+      def validate_http_internal_links(links, issues)
+        links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
+          target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+          issues.add(
+            code: :http_internal_link,
+            severity: :warning,
+            category: :links,
+            url: source_url,
+            message: "HTTPS page links to internal HTTP URL",
+            details: {target_urls: target_urls}
+          )
+        end
+      end
       def report_unresolved_target(target_url, grouped_links, issues, resolution)
         source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
         suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : ""
@@ -189,6 +239,10 @@ module Crawlscope
           resolution && status.nil? && resolution[:crawled] && resolution[:error]
         end
+        def html?
+          resolution && resolution[:html]
+        end
         def status
           resolution && resolution[:status]
         end
@@ -221,6 +275,7 @@ module Crawlscope
           memo[path] = normalized_url
         end
+        return if sitemap_paths.size < 2
         html_paths = pages.each_with_object(Set.new) do |page, result|
           next unless page.html?
@@ -235,7 +290,11 @@ module Crawlscope
         end
         inbound_anchor_counts = Hash.new(0)
+        dofollow_inbound_counts = Hash.new(0)
+        nofollow_inbound_counts = Hash.new(0)
         sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+        dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
+        nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
         resolved_links.each do |link|
           target_path = link[:final_path]
@@ -245,24 +304,268 @@ module Crawlscope
           inbound_anchor_counts[target_path] += 1
           source_samples = sample_sources_by_target[target_path]
           source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
+          if link[:nofollow]
+            nofollow_inbound_counts[target_path] += 1
+            nofollow_sources = nofollow_sources_by_target[target_path]
+            nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url])
+          else
+            dofollow_inbound_counts[target_path] += 1
+            dofollow_sources = dofollow_sources_by_target[target_path]
+            dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url])
+          end
         end
         sitemap_paths.each do |path, target_url|
           next unless html_paths.include?(path)
           inbound_count = inbound_anchor_counts[path]
-          next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS
+          dofollow_count = dofollow_inbound_counts[path]
+          nofollow_count = nofollow_inbound_counts[path]
+          report_orphan_page(target_url, issues) if inbound_count.zero?
+          if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS
+            source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+            source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+            issues.add(
+              code: :low_inbound_anchor_links,
+              severity: :warning,
+              category: :links,
+              url: target_url,
+              message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
+              details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+            )
+          end
-          source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
-          source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+          report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues)
+          report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues)
+          report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues)
+        end
+      end
+      def validate_url_hygiene(urls, links, issues)
+        checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }
+        checked_urls.concat(links.map { |link| link[:target_url] })
+        checked_urls.compact.uniq.each do |url|
+          report_url_double_slash(url, issues)
+          report_url_too_long(url, issues)
+        end
+      end
+      def report_url_double_slash(url, issues)
+        path = URI.parse(url).path.to_s
+        return unless path.match?(%r{//+})
+        issues.add(
+          code: :url_double_slash,
+          severity: :notice,
+          category: :url,
+          url: url,
+          message: "URL path contains duplicate slashes",
+          details: {path: path}
+        )
+      rescue URI::InvalidURIError
+        nil
+      end
+      def report_url_too_long(url, issues)
+        return unless url.length > 2_048
+        issues.add(
+          code: :url_too_long,
+          severity: :notice,
+          category: :url,
+          url: url,
+          message: "URL too long (#{url.length})",
+          details: {length: url.length, maximum: 2_048}
+        )
+      end
+      def validate_pages_with_no_outgoing_links(urls, pages, links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+        return if sitemap_urls.size < 2
+        source_paths_with_links = links.map { |link| link[:source_path] }.to_set
+        pages.each do |page|
+          next unless page.html?
+          next unless sitemap_urls.include?(page.normalized_url)
+          source_path = Url.path(page.normalized_url)
+          next unless crawlable_source_path?(source_path)
+          next if source_paths_with_links.include?(source_path)
           issues.add(
-            code: :low_inbound_anchor_links,
+            code: :page_has_no_outgoing_links,
             severity: :warning,
             category: :links,
-            url: target_url,
-            message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
-            details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
+            url: page.url,
+            message: "page has no outgoing internal links",
+            details: {}
+          )
+        end
+      end
+      def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
+        reported_urls = Set.new
+        resolved_links.each do |link|
+          final_url = link[:final_url]
+          next if sitemap_urls.include?(final_url)
+          next if reported_urls.include?(final_url)
+          next unless crawlable_path?(link[:final_path])
+          target = resolve_target(final_url)
+          next unless target.allowed?(@allowed_statuses) && target.html?
+          reported_urls << final_url
+          issues.add(
+            code: :indexable_page_missing_from_sitemap,
+            severity: :warning,
+            category: :sitemaps,
+            url: final_url,
+            message: "indexable internal page is missing from sitemap",
+            details: {source_url: link[:source_url]}
+          )
+        end
+      end
+      def report_orphan_page(target_url, issues)
+        issues.add(
+          code: :orphan_page,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "page has no incoming internal links",
+          details: {}
+        )
+      end
+      def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues)
+        return if dofollow_count.zero?
+        return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS
+        source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
+        source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
+        issues.add(
+          code: :low_dofollow_inlinks,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}",
+          details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples}
+        )
+      end
+      def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues)
+        return unless nofollow_count.positive? && dofollow_count.zero?
+        issues.add(
+          code: :only_nofollow_internal_inlinks,
+          severity: :warning,
+          category: :links,
+          url: target_url,
+          message: "page has nofollow incoming internal links only",
+          details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)}
+        )
+      end
+      def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues)
+        return unless nofollow_count.positive? && dofollow_count.positive?
+        issues.add(
+          code: :mixed_follow_internal_inlinks,
+          severity: :notice,
+          category: :links,
+          url: target_url,
+          message: "page has nofollow and dofollow incoming internal links",
+          details: {
+            dofollow_inbound_count: dofollow_count,
+            nofollow_inbound_count: nofollow_count,
+            dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR),
+            nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)
+          }
+        )
+      end
+      def validate_canonical_targets(urls, pages, resolved_links, issues)
+        sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact
+        sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) }
+        return if sitemap_pages.size < 2
+        dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
+        sitemap_pages.each do |page|
+          canonical_url = canonical_url_for(page)
+          next if canonical_url.nil?
+          target_uri = URI.parse(canonical_url)
+          next if target_uri.host != @base_host
+          canonical_path = Url.path(canonical_url)
+          if canonical_path && dofollow_counts_by_path[canonical_path].zero?
+            issues.add(
+              code: :canonical_no_internal_inlinks,
+              severity: :warning,
+              category: :links,
+              url: canonical_url,
+              message: "canonical URL has no incoming internal links",
+              details: {source_url: page.url}
+            )
+          end
+          validate_canonical_target_status(page, canonical_url, issues)
+        rescue URI::InvalidURIError
+          next
+        end
+      end
+      def dofollow_counts_by_final_path(resolved_links)
+        resolved_links.each_with_object(Hash.new(0)) do |link, counts|
+          next if link[:nofollow]
+          next if link[:source_path] == link[:final_path]
+          counts[link[:final_path]] += 1
+        end
+      end
+      def canonical_url_for(page)
+        canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
+        return if canonical.empty?
+        Url.normalize(canonical, base_url: page.url)
+      end
+      def validate_canonical_target_status(page, canonical_url, issues)
+        target = resolve_target(canonical_url)
+        if target.unresolved? || target.ignored_error?
+          return
+        end
+        if target.redirect?
+          issues.add(
+            code: :canonical_points_to_redirect,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "canonical points to redirect",
+            details: {canonical: canonical_url, final_url: target.final_url, status: target.status}
+          )
+        elsif !target.allowed?(@allowed_statuses)
+          issues.add(
+            code: :canonical_points_to_error,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "canonical points to HTTP #{target.status}",
+            details: {canonical: canonical_url, status: target.status}
           )
         end
       end

data/lib/crawlscope/rules/metadata.rb CHANGED Viewed

@@ -18,22 +18,41 @@ module Crawlscope
       end
       def call(urls:, pages:, issues:, context: nil)
+        sitemap_urls = normalized_sitemap_urls(urls)
         pages.each do |page|
           next unless page.html?
           validate_h1(page, issues)
           validate_title(page, issues)
           validate_description(page, issues)
-          validate_canonical(page, issues)
+          validate_canonical(page, issues, sitemap_urls)
           validate_open_graph(page, issues)
         end
       end
       private
+      def normalized_sitemap_urls(urls)
+        urls.map { |url| Url.normalize(url, base_url: url) }.compact
+      end
       def validate_h1(page, issues)
         h1s = page.doc.css("h1")
-        return if h1s.one?
+        empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? }
+        if empty_h1s.any?
+          issues.add(
+            code: :empty_h1,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "empty <h1>",
+            details: {count: empty_h1s.size}
+          )
+        end
+        return if h1s.one? && empty_h1s.empty?
         if h1s.empty?
           issues.add(
@@ -57,7 +76,19 @@ module Crawlscope
       end
       def validate_title(page, issues)
-        title = page.doc.at_css("title")&.text.to_s.strip
+        titles = page.doc.css("head > title")
+        title = titles.first&.text.to_s.strip
+        if titles.size > 1
+          issues.add(
+            code: :multiple_title_tags,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple <title> tags (#{titles.size})",
+            details: {count: titles.size}
+          )
+        end
         if title.empty?
           issues.add(code: :missing_title, severity: :warning, category: :metadata, url: page.url, message: "missing <title>", details: {})
@@ -69,7 +100,19 @@ module Crawlscope
       end
       def validate_description(page, issues)
-        description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip
+        descriptions = page.doc.css('head > meta[name="description"]')
+        description = descriptions.first&.[]("content").to_s.strip
+        if descriptions.size > 1
+          issues.add(
+            code: :multiple_meta_descriptions,
+            severity: :warning,
+            category: :metadata,
+            url: page.url,
+            message: "multiple meta description tags (#{descriptions.size})",
+            details: {count: descriptions.size}
+          )
+        end
         if description.empty?
           issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
@@ -80,7 +123,7 @@ module Crawlscope
         end
       end
-      def validate_canonical(page, issues)
+      def validate_canonical(page, issues, sitemap_urls)
         canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
         if canonical.empty?
@@ -92,13 +135,25 @@ module Crawlscope
         normalized_page_url = Url.normalize(page.url, base_url: page.url)
         return if canonical_matches_page?(normalized_canonical, normalized_page_url)
+        details = {canonical: canonical}
         issues.add(
           code: :canonical_mismatch,
           severity: :warning,
           category: :metadata,
           url: page.url,
           message: "canonical mismatch (#{canonical})",
-          details: {canonical: canonical}
+          details: details
+        )
+        return unless sitemap_urls.include?(normalized_page_url)
+        issues.add(
+          code: :non_canonical_page_in_sitemap,
+          severity: :warning,
+          category: :sitemaps,
+          url: page.url,
+          message: "non-canonical page is included in sitemap",
+          details: details
         )
       end

data/lib/crawlscope/rules/structured_data.rb CHANGED Viewed

@@ -55,6 +55,8 @@ module Crawlscope
             next
           end
+          validate_type_presence(page, source, data, issues)
           errors = schema_registry.validate(data)
           next if errors.empty?
@@ -96,6 +98,35 @@ module Crawlscope
         end
       end
+      def validate_type_presence(page, source, data, issues)
+        missing_paths = missing_type_paths(data)
+        return if missing_paths.empty?
+        issues.add(
+          code: :structured_data_missing_type,
+          severity: :warning,
+          category: :structured_data,
+          url: page.url,
+          message: "#{source} structured data missing @type",
+          details: {paths: missing_paths, source: source}
+        )
+      end
+      def missing_type_paths(data, path = "$")
+        return [] unless data.is_a?(Hash)
+        paths = []
+        paths << path if data["@type"].to_s.strip.empty?
+        if data["@graph"].is_a?(Array)
+          data["@graph"].each_with_index do |entry, index|
+            paths.concat(missing_type_paths(entry, "#{path}.@graph[#{index}]"))
+          end
+        end
+        paths
+      end
       def structured_data_types(data)
         return [] unless data.is_a?(Hash)

data/lib/crawlscope/rules/uniqueness.rb CHANGED Viewed

@@ -58,6 +58,7 @@ module Crawlscope
         {
           content_fingerprint_digest: content_fingerprint_digest(page.doc),
+          canonical: page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip,
           description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
           shingles: shingles_for(tokens),
           title: page.doc.at_css("title")&.text.to_s.strip,
@@ -98,6 +99,27 @@ module Crawlscope
             details: {urls: urls}
           )
         end
+        duplicate_content_clusters_without_canonical(page_summaries).each do |urls|
+          issues.add(
+            code: :duplicate_pages_without_canonical,
+            severity: :warning,
+            category: :uniqueness,
+            url: nil,
+            message: "duplicate pages without canonical => #{urls.join(", ")}",
+            details: {urls: urls}
+          )
+        end
+      end
+      def duplicate_content_clusters_without_canonical(page_summaries)
+        page_summaries
+          .select { |page| !page[:content_fingerprint_digest].nil? }
+          .group_by { |page| page[:content_fingerprint_digest] }
+          .values
+          .select { |pages| pages.size > 1 }
+          .select { |pages| pages.any? { |page| page[:canonical].to_s.empty? } }
+          .map { |pages| pages.map { |page| page[:url] } }
       end
       def shingles_for(tokens)

data/lib/crawlscope/sitemap.rb CHANGED Viewed

@@ -25,6 +25,9 @@ module Crawlscope
       visited.add(source)
       document = Nokogiri::XML(read(source))
       root_name = document.root&.name
+      unless %w[sitemapindex urlset].include?(root_name)
+        raise ValidationError, "Sitemap #{source} has unexpected root #{root_name.inspect}"
+      end
       if root_name == "sitemapindex"
         document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
@@ -40,7 +43,12 @@ module Crawlscope
     def read(source)
       if Url.remote?(source)
-        connection.get(source).body
+        response = connection.get(source)
+        unless response.status.to_i.between?(200, 299)
+          raise ValidationError, "Sitemap #{source} returned HTTP #{response.status}"
+        end
+        response.body
       else
         File.read(source)
       end

data/lib/crawlscope/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Crawlscope
-  VERSION = "0.4.0"
+  VERSION = "0.5.0"
 end