crawlscope 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,8 @@
3
3
  module Crawlscope
4
4
  module Rules
5
5
  class StructuredData
6
+ CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
7
+
6
8
  attr_reader :code
7
9
 
8
10
  def initialize
@@ -65,6 +67,51 @@ module Crawlscope
65
67
  details: {errors: errors, source: source}
66
68
  )
67
69
  end
70
+
71
+ validate_job_posting_count(page, items, issues)
72
+ end
73
+
74
# Checks how many structured data items on a page declare the JobPosting type.
#
# Exactly one such item is accepted silently. More than one records a
# :multiple_job_postings warning. None records a :missing_job_posting warning,
# but only when career_detail_page? recognises the page URL.
#
# @param page [#url] page being validated
# @param items [Enumerable<#data>] structured data items found on the page
# @param issues [#add] collector that receives the warnings
def validate_job_posting_count(page, items, issues)
  posting_count = items.count { |item| structured_data_types(item.data).include?("JobPosting") }

  case posting_count
  when 1
    nil
  when 0
    # Pages outside the career detail URL shape are not expected to carry a posting.
    return unless career_detail_page?(page.url)

    issues.add(
      code: :missing_job_posting,
      severity: :warning,
      category: :structured_data,
      url: page.url,
      message: "career detail page missing JobPosting structured data",
      details: {expected_type: "JobPosting"}
    )
  else
    issues.add(
      code: :multiple_job_postings,
      severity: :warning,
      category: :structured_data,
      url: page.url,
      message: "multiple JobPosting structured data blocks found",
      details: {count: posting_count}
    )
  end
end
98
+
99
# Collects every "@type" value declared by a parsed JSON-LD payload.
#
# Types are gathered from the node itself and, recursively, from anything
# under its "@graph" key. A payload that is an array of nodes is walked
# element by element, and a "@graph" holding a single node object is treated
# like a one-element graph, so neither shape silently yields no types.
#
# @param data [Object] parsed JSON-LD value (Hash, Array, or anything else)
# @return [Array<String>] type names in document order; duplicates are kept,
#   and the result is empty for values that are neither Hash nor Array
def structured_data_types(data)
  case data
  when Array
    data.flat_map { |entry| structured_data_types(entry) }
  when Hash
    # "@type" may be a single value or a list; normalise both to strings.
    own_types = Array(data["@type"]).map(&:to_s)
    own_types + structured_data_types(data["@graph"])
  else
    []
  end
end
110
+
111
# Tells whether a URL's path matches CAREER_DETAIL_PATH.
#
# The URL is coerced with to_s so that nil is treated as an empty URL, and
# the parsed path is coerced as well because opaque URIs (for example
# "mailto:" addresses) have no path component at all.
#
# @param url [String, nil] URL to inspect
# @return [Boolean] false when the URL cannot be parsed or has no matching path
def career_detail_page?(url)
  URI(url.to_s).path.to_s.match?(CAREER_DETAIL_PATH)
rescue URI::InvalidURIError
  false
end
69
116
  end
70
117
  end
@@ -5,10 +5,24 @@ require "digest"
5
5
  module Crawlscope
6
6
  module Rules
7
7
  class Uniqueness
8
+ MINIMUM_SHINGLES = 10
9
+ MAX_NEAR_DUPLICATE_PAGES = 250
10
+ NEAR_DUPLICATE_THRESHOLD = 0.9
11
+ SHINGLE_SIZE = 5
12
+
8
13
  attr_reader :code
9
14
 
10
# Builds the uniqueness rule; every tuning knob defaults to the matching
# class constant.
#
# @param near_duplicate_threshold [Numeric] similarity at or above which a
#   page pair is reported as a near duplicate
# @param max_near_duplicate_pages [Integer, nil] page count above which the
#   near-duplicate scan is skipped; nil removes the limit
# @param minimum_shingles [Integer] shingle count a page needs before it takes
#   part in near-duplicate comparison
# @param shingle_size [Integer] number of consecutive tokens per shingle
def initialize(
  near_duplicate_threshold: NEAR_DUPLICATE_THRESHOLD,
  max_near_duplicate_pages: MAX_NEAR_DUPLICATE_PAGES,
  minimum_shingles: MINIMUM_SHINGLES,
  shingle_size: SHINGLE_SIZE
)
  @code = :uniqueness
  @max_near_duplicate_pages = max_near_duplicate_pages
  @minimum_shingles = minimum_shingles
  @near_duplicate_threshold = near_duplicate_threshold
  @shingle_size = shingle_size
end
13
27
 
14
28
  def call(urls:, pages:, issues:, context:)
@@ -19,14 +33,13 @@ module Crawlscope
19
33
  end
20
34
 
21
35
  validate_duplicates(page_summaries, issues)
36
+ validate_near_duplicates(page_summaries, issues)
22
37
  end
23
38
 
24
39
  private
25
40
 
26
41
  def content_fingerprint_digest(doc)
27
- text = doc.at_css("main")&.text.to_s
28
- text = doc.at_css("body")&.text.to_s if text.empty?
29
- normalized = text.gsub(/\s+/, " ").strip
42
+ normalized = DocumentText.text_for(doc)
30
43
  return if normalized.length < 200
31
44
 
32
45
  Digest::SHA256.hexdigest(normalized)
@@ -41,9 +54,12 @@ module Crawlscope
41
54
  end
42
55
 
43
56
  def summary_for(page)
57
+ tokens = DocumentText.tokens(DocumentText.text_for(page.doc))
58
+
44
59
  {
45
60
  content_fingerprint_digest: content_fingerprint_digest(page.doc),
46
61
  description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
62
+ shingles: shingles_for(tokens),
47
63
  title: page.doc.at_css("title")&.text.to_s.strip,
48
64
  url: page.url
49
65
  }
@@ -83,6 +99,62 @@ module Crawlscope
83
99
  )
84
100
  end
85
101
  end
102
+
103
# Turns a token list into its distinct shingles: every run of @shingle_size
# consecutive tokens, joined with single spaces.
#
# @param tokens [Array<String>] tokens in reading order
# @return [Array<String>] unique shingles in first-seen order; empty when
#   there are fewer tokens than one shingle needs
def shingles_for(tokens)
  width = @shingle_size
  return [] if width > tokens.size

  windows = tokens.each_cons(width)
  windows.map { |window| window.join(" ") }.uniq
end
108
+
109
# Compares every pair of page summaries and records a :near_duplicate_content
# warning for each pair whose shingle similarity reaches the threshold.
#
# When the page count exceeds the configured limit, no pairs are compared and
# a single :near_duplicate_scan_skipped warning is recorded instead. Pairs
# that share a content fingerprint are skipped, as are pairs in which either
# page has fewer than @minimum_shingles shingles.
#
# @param page_summaries [Array<Hash>] summaries with :url, :shingles and
#   :content_fingerprint_digest keys
# @param issues [#add] collector that receives the warnings
def validate_near_duplicates(page_summaries, issues)
  if near_duplicate_scan_limit_exceeded?(page_summaries)
    page_count = page_summaries.size
    issues.add(
      code: :near_duplicate_scan_skipped,
      severity: :warning,
      category: :uniqueness,
      url: nil,
      message: "near duplicate scan skipped for #{page_count} pages",
      details: {max_pages: @max_near_duplicate_pages, page_count: page_summaries.size}
    )
    return
  end

  page_summaries.combination(2) do |first, second|
    next if same_content_fingerprint?(first, second)

    first_shingles = first[:shingles]
    second_shingles = second[:shingles]
    next if [first_shingles, second_shingles].any? { |shingles| shingles.size < @minimum_shingles }

    score = shingle_similarity(first_shingles, second_shingles)
    next if score < @near_duplicate_threshold

    pair_urls = [first[:url], second[:url]]
    rounded_score = score.round(3)

    issues.add(
      code: :near_duplicate_content,
      severity: :warning,
      category: :uniqueness,
      url: nil,
      message: "near duplicate page content (#{format("%.2f", score)}) => #{pair_urls.join(", ")}",
      details: {similarity: rounded_score, threshold: @near_duplicate_threshold, urls: pair_urls}
    )
  end
end
141
+
142
# Tells whether there are more page summaries than the configured scan limit.
#
# @param page_summaries [#size] summaries about to be compared
# @return [Boolean] always false when no limit is configured (nil)
def near_duplicate_scan_limit_exceeded?(page_summaries)
  limit = @max_near_duplicate_pages
  return false if limit.nil?

  page_summaries.size > limit
end
145
+
146
# Tells whether two page summaries carry the same content fingerprint digest.
#
# @param left [Hash] summary with a :content_fingerprint_digest key
# @param right [Hash] summary with a :content_fingerprint_digest key
# @return [Boolean] false when the left digest is nil, so two pages without a
#   digest never count as identical
def same_content_fingerprint?(left, right)
  digest = left[:content_fingerprint_digest]
  return false if digest.nil?

  digest == right[:content_fingerprint_digest]
end
150
+
151
# Scores how much two shingle lists overlap: the number of shared shingles
# divided by the size of the smaller list.
#
# @param left [Array<String>] shingles of the first page
# @param right [Array<String>] shingles of the second page
# @return [Float] 0.0 when either list is empty
def shingle_similarity(left, right)
  smaller_size = (left.size < right.size) ? left.size : right.size
  return 0.0 if smaller_size.zero?

  shared = left & right
  shared.size.fdiv(smaller_size)
end
86
158
  end
87
159
  end
88
160
  end
@@ -330,6 +330,56 @@ module Crawlscope
330
330
  }
331
331
  }.freeze
332
332
 
333
# JSON Schema for JSON-LD items whose "@type" is "JobPosting"; it is
# registered under the "JobPosting" key of the schema table returned by
# self.schemas.
#
# Required keys are @type, title, description, datePosted and
# hiringOrganization. The closing anyOf additionally demands location
# information: either jobLocation, or both jobLocationType and
# applicantLocationRequirements. Keys not listed under properties are
# accepted (additionalProperties: true).
#
# NOTE(review): freeze is shallow, so the nested hashes and arrays stay mutable.
JOB_POSTING = {
  type: "object",
  additionalProperties: true,
  required: ["@type", "title", "description", "datePosted", "hiringOrganization"],
  properties: {
    "@context" => {enum: ["https://schema.org", "https://schema.org/"]},
    "@type" => {const: "JobPosting"},
    :title => {type: "string"},
    :description => {type: "string"},
    :identifier => {type: "object"},
    :datePosted => {type: "string"},
    :validThrough => {type: "string"},
    # Either a single employment type or a non-empty list of them.
    :employmentType => {
      anyOf: [
        {type: "string"},
        {type: "array", minItems: 1, items: {type: "string"}}
      ]
    },
    :directApply => {type: "boolean"},
    # The hiring organization must be an Organization node with a name.
    :hiringOrganization => {
      type: "object",
      required: ["@type", "name"],
      properties: {
        "@type" => {const: "Organization"},
        :name => {type: "string"},
        :sameAs => {type: "string", format: "uri"},
        :logo => {type: "string", format: "uri"}
      }
    },
    # One object or a non-empty list of objects.
    :applicantLocationRequirements => {
      anyOf: [
        {type: "object"},
        {type: "array", minItems: 1, items: {type: "object"}}
      ]
    },
    :jobLocationType => {type: "string"},
    # One object or a non-empty list of objects.
    :jobLocation => {
      anyOf: [
        {type: "object"},
        {type: "array", minItems: 1, items: {type: "object"}}
      ]
    },
    :baseSalary => {type: "object"}
  },
  # A posting must describe its location one way or the other.
  anyOf: [
    {required: ["jobLocation"]},
    {required: ["jobLocationType", "applicantLocationRequirements"]}
  ]
}.freeze
382
+
333
383
  def self.schemas
334
384
  {
335
385
  "FAQPage" => FAQ_PAGE,
@@ -348,7 +398,8 @@ module Crawlscope
348
398
  "Recipe" => RECIPE,
349
399
  "Event" => EVENT,
350
400
  "VideoObject" => VIDEO_OBJECT,
351
- "WebPage" => WEB_PAGE
401
+ "WebPage" => WEB_PAGE,
402
+ "JobPosting" => JOB_POSTING
352
403
  }
353
404
  end
354
405
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- VERSION = "0.2.0"
4
+ VERSION = "0.4.0"
5
5
  end
@@ -5,11 +5,16 @@ namespace :crawlscope do
5
5
  end
6
6
 
7
7
  namespace :validate do
8
# Delegates to Crawlscope::RakeTasks.ldjson once the :environment task has run.
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
task(ldjson: :environment) { Crawlscope::RakeTasks.ldjson }
12
12
 
13
# Runs the single "indexability" rule through Crawlscope::RakeTasks.validate_rule.
desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1"
task(indexability: :environment) { Crawlscope::RakeTasks.validate_rule("indexability") }
17
+
13
18
  desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
14
19
  task metadata: :environment do
15
20
  Crawlscope::RakeTasks.validate_rule("metadata")
@@ -25,6 +30,11 @@ namespace :crawlscope do
25
30
  Crawlscope::RakeTasks.validate_rule("uniqueness")
26
31
  end
27
32
 
33
# Runs the single "content_quality" rule through Crawlscope::RakeTasks.validate_rule.
desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1"
task(content_quality: :environment) { Crawlscope::RakeTasks.validate_rule("content_quality") }
37
+
28
38
  desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
29
39
  task links: :environment do
30
40
  Crawlscope::RakeTasks.validate_rule("links")
@@ -4,9 +4,10 @@ require "test_helper"
4
4
 
5
5
  class CrawlscopeCliTest < Minitest::Test
6
6
  class FakeConfiguration
7
- attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
7
+ attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
8
8
 
9
9
  def initialize
10
+ @base_url = nil
10
11
  @concurrency = 10
11
12
  @network_idle_timeout_seconds = 5
12
13
  @renderer = :http
@@ -145,6 +146,17 @@ class CrawlscopeCliTest < Minitest::Test
145
146
  assert_empty err.string
146
147
  end
147
148
 
149
# With no URL argument, ldjson should validate the configured base URL.
def test_ldjson_defaults_to_configured_base_url
  task = FakeTask.new
  configuration = FakeConfiguration.new
  configuration.base_url = "https://example.com"

  exit_status = Crawlscope::Cli.start(
    ["ldjson"],
    out: StringIO.new,
    err: StringIO.new,
    configuration: configuration,
    task: task
  )

  assert_equal 0, exit_status
  assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
end
159
+
148
160
  def test_validate_caps_default_browser_concurrency
149
161
  configuration = FakeConfiguration.new
150
162
  task = FakeTask.new
@@ -218,14 +230,16 @@ class CrawlscopeCliTest < Minitest::Test
218
230
  assert_equal 3, configuration.network_idle_timeout_seconds
219
231
  end
220
232
 
221
# With neither a URL argument nor a configured base URL, ldjson should target
# http://localhost:3000, exit with 0 and leave stderr empty.
def test_ldjson_defaults_to_localhost
  stdout = StringIO.new
  stderr = StringIO.new
  task = FakeTask.new

  exit_status = Crawlscope::Cli.start(
    ["ldjson"],
    out: stdout,
    err: stderr,
    configuration: FakeConfiguration.new,
    task: task
  )

  assert_equal 0, exit_status
  assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
  assert_empty stderr.string
end
230
244
 
231
245
  def test_invalid_integer_option_returns_error
@@ -20,7 +20,14 @@ class CrawlscopeConfigurationTest < Minitest::Test
20
20
  assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
21
21
  assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
22
22
  assert_equal 4, audit.instance_variable_get(:@concurrency)
23
- assert_equal %i[metadata structured_data uniqueness links], audit.instance_variable_get(:@rules).map(&:code)
23
+ assert_equal %i[
24
+ indexability
25
+ metadata
26
+ structured_data
27
+ uniqueness
28
+ content_quality
29
+ links
30
+ ], audit.instance_variable_get(:@rules).map(&:code)
24
31
  end
25
32
 
26
33
  def test_audit_raises_without_base_url
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Exercises Crawlscope::Rules::ContentQuality against small synthetic pages.
class CrawlscopeContentQualityRuleTest < Minitest::Test
  # Very little text wrapped around a lot of empty markup should trip both checks.
  def test_reports_thin_visible_text_and_low_html_text_ratio
    page = page_with(main: "Short page <div>#{"<span></span>" * 500}</div>")

    reported_codes = issues_for(page).map(&:code)

    assert_includes reported_codes, :thin_visible_text
    assert_includes reported_codes, :low_visible_text_ratio
  end

  # Heavy <style> and <nav> markup outside <main> must not lower the ratio.
  def test_visible_text_ratio_ignores_markup_outside_main_content
    page = page_with(
      main: Array.new(260) { |index| "word#{index}" }.join(" "),
      head_markup: "<style>#{"body{}" * 10_000}</style>",
      extra_markup: "<nav>#{"<a href=\"/\">Navigation</a>" * 500}</nav>"
    )

    reported_codes = issues_for(page).map(&:code)

    refute_includes reported_codes, :low_visible_text_ratio
  end

  # Three words repeated a hundred times should be flagged as repetitive.
  def test_reports_low_unique_token_ratio_for_repetitive_content
    page = page_with(main: ("hotel location service " * 100).strip)

    repetitive = issues_for(page).find { |item| item.code == :low_unique_token_ratio }

    assert repetitive
    assert_operator repetitive.details[:ratio], :<, repetitive.details[:threshold]
  end

  private

  # Runs the content quality rule on a single page and returns the issues it recorded.
  def issues_for(page)
    collection = Crawlscope::IssueCollection.new
    Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: collection)
    collection.to_a
  end

  # Builds a 200 text/html page whose <main> holds +main+, with optional extra
  # markup placed in <head> and ahead of <main> inside <body>.
  def page_with(main:, extra_markup: "", head_markup: "")
    location = "https://example.com/page"
    markup = <<~HTML
      <html>
      <head>
      <title>Content quality</title>
      #{head_markup}
      </head>
      <body>
      #{extra_markup}
      <main>#{main}</main>
      </body>
      </html>
    HTML

    Crawlscope::Page.new(
      url: location,
      normalized_url: location,
      final_url: location,
      normalized_final_url: location,
      status: 200,
      headers: {"content-type" => "text/html"},
      body: markup,
      doc: Nokogiri::HTML(markup)
    )
  end
end
@@ -31,8 +31,13 @@ class CrawlscopeCrawlTest < Minitest::Test
31
31
  <html>
32
32
  <head>
33
33
  <title>Pricing</title>
34
- <meta name="description" content="Plans for hotels and restaurants">
34
+ <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
35
35
  <link rel="canonical" href="https://example.com/pricing">
36
+ <meta property="og:title" content="Pricing">
37
+ <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
38
+ <meta property="og:url" content="https://example.com/pricing">
39
+ <meta property="og:type" content="website">
40
+ <meta property="og:image" content="https://example.com/icon.png">
36
41
  <script type="application/ld+json">
37
42
  {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
38
43
  </script>
@@ -40,6 +45,7 @@ class CrawlscopeCrawlTest < Minitest::Test
40
45
  <body>
41
46
  <main>
42
47
  <h1>Pricing</h1>
48
+ <p>#{Array.new(260) { |index| "pricing#{index}" }.join(" ")}</p>
43
49
  </main>
44
50
  </body>
45
51
  </html>
@@ -95,7 +101,15 @@ class CrawlscopeCrawlTest < Minitest::Test
95
101
  ).call
96
102
 
97
103
  refute result.ok?
98
- assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
104
+ assert_equal %i[
105
+ incomplete_open_graph_tags
106
+ meta_description_too_long
107
+ missing_canonical
108
+ missing_h1
109
+ missing_structured_data
110
+ thin_visible_text
111
+ title_repeats_site_name
112
+ ].sort, result.issues.to_a.map(&:code).uniq.sort
99
113
  end
100
114
 
101
115
  def test_uses_browser_when_renderer_is_browser
@@ -128,8 +142,13 @@ class CrawlscopeCrawlTest < Minitest::Test
128
142
  <html>
129
143
  <head>
130
144
  <title>Pricing</title>
131
- <meta name="description" content="Plans for hotels and restaurants">
145
+ <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
132
146
  <link rel="canonical" href="https://example.com/pricing">
147
+ <meta property="og:title" content="Pricing">
148
+ <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
149
+ <meta property="og:url" content="https://example.com/pricing">
150
+ <meta property="og:type" content="website">
151
+ <meta property="og:image" content="https://example.com/icon.png">
133
152
  <script type="application/ld+json">
134
153
  {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
135
154
  </script>
@@ -137,6 +156,7 @@ class CrawlscopeCrawlTest < Minitest::Test
137
156
  <body>
138
157
  <main>
139
158
  <h1>Pricing</h1>
159
+ <p>#{Array.new(260) { |index| "pricing#{index}" }.join(" ")}</p>
140
160
  </main>
141
161
  </body>
142
162
  </html>
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Exercises Crawlscope::Rules::Indexability for robots directives delivered
# through a <meta> tag or an X-Robots-Tag response header.
class CrawlscopeIndexabilityRuleTest < Minitest::Test
  def test_reports_meta_noindex
    markup = <<~HTML
      <html>
      <head><meta name="robots" content="noindex, follow"></head>
      <body><main>Visible content</main></body>
      </html>
    HTML

    issue = first_issue_for(page_with(body: markup))

    assert_equal :noindex_meta, issue.code
    assert_equal :error, issue.severity
    assert_equal "noindex, follow", issue.details[:content]
  end

  def test_reports_x_robots_tag_noindex
    issue = first_issue_for(page_with(headers: {"X-Robots-Tag" => "noindex"}))

    assert_equal :noindex_header, issue.code
    assert_equal :error, issue.severity
    assert_equal "noindex", issue.details[:content]
  end

  # The header must still be honoured when there is no parsed document.
  def test_reports_x_robots_tag_noindex_for_non_html_response
    pdf_page = page_with(
      body: "%PDF-1.7",
      doc: nil,
      headers: {"content-type" => "application/pdf", "X-Robots-Tag" => "noindex"}
    )

    issue = first_issue_for(pdf_page)

    assert_equal :noindex_header, issue.code
    assert_equal :error, issue.severity
    assert_equal "noindex", issue.details[:content]
  end

  # A directive scoped to one crawler is reported with the full header value.
  def test_reports_scoped_x_robots_tag_noindex
    issue = first_issue_for(page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"}))

    assert_equal :noindex_header, issue.code
    assert_equal "googlebot: noindex, nofollow", issue.details[:content]
  end

  # "none" is expected to be reported under the same code as "noindex".
  def test_reports_x_robots_tag_none
    issue = first_issue_for(page_with(headers: {"X-Robots-Tag" => "none"}))

    assert_equal :noindex_header, issue.code
    assert_equal "none", issue.details[:content]
  end

  private

  # Runs the indexability rule on one page and returns the first recorded
  # issue; fetch(0) raises when nothing was recorded.
  def first_issue_for(page)
    collection = Crawlscope::IssueCollection.new
    Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: collection)
    collection.to_a.fetch(0)
  end

  # Builds a 200 page at a fixed URL. Leaving +doc+ at :parse parses +body+
  # with Nokogiri; any other value (including nil) is passed through as given.
  def page_with(body: nil, doc: :parse, headers: {"content-type" => "text/html"})
    body ||= <<~HTML
      <html>
      <head><title>Indexable</title></head>
      <body><main>Visible content</main></body>
      </html>
    HTML
    doc = Nokogiri::HTML(body) if doc == :parse
    location = "https://example.com/page"

    Crawlscope::Page.new(
      url: location,
      normalized_url: location,
      final_url: location,
      normalized_final_url: location,
      status: 200,
      headers: headers,
      body: body,
      doc: doc
    )
  end
end
@@ -118,6 +118,45 @@ class CrawlscopeLinksRuleTest < Minitest::Test
118
118
  assert_equal "https://example.com/guide", issues.to_a.first.url
119
119
  end
120
120
 
121
# A link from the root page to /about should count as an inbound link, so no
# :low_inbound_anchor_links issue is expected.
def test_counts_root_page_links_as_inbound_links
  issues = Crawlscope::IssueCollection.new
  root_url = "https://example.com/"
  about_url = "https://example.com/about"
  passthrough = ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }
  crawled_pages = [
    page(url: root_url, body: "<main><a href=\"/about\">About</a></main>"),
    page(url: about_url, body: "<main><p>About</p></main>")
  ]

  Crawlscope::Rules::Links.new.call(
    urls: [root_url, about_url],
    pages: crawled_pages,
    issues: issues,
    context: context(resolver: passthrough)
  )

  reported_codes = issues.to_a.map(&:code)
  refute_includes reported_codes, :low_inbound_anchor_links
end
136
+
137
# When the resolver answers with a final URL different from the link target,
# an :internal_link_redirects issue naming that final URL is expected.
def test_reports_internal_links_that_redirect
  issues = Crawlscope::IssueCollection.new
  guide_url = "https://example.com/guide"
  # Every target resolves to /pricing, whatever was requested.
  redirecting = ->(_target_url) { {crawled: false, error: nil, final_url: "https://example.com/pricing", status: 200} }
  guide = page(url: guide_url, body: "<main><a href=\"/plans\">Plans</a></main>")

  Crawlscope::Rules::Links.new.call(
    urls: [guide_url],
    pages: [guide],
    issues: issues,
    context: context(resolver: redirecting)
  )

  redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
  assert redirect_issue
  assert_includes redirect_issue.message, "https://example.com/pricing"
end
159
+
121
160
  def test_ignores_links_that_should_not_be_crawled
122
161
  issues = Crawlscope::IssueCollection.new
123
162