crawlscope 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +67 -0
- data/README.md +46 -9
- data/lib/crawlscope/cli.rb +5 -0
- data/lib/crawlscope/crawl.rb +6 -0
- data/lib/crawlscope/document_text.rb +40 -0
- data/lib/crawlscope/rule_registry.rb +3 -1
- data/lib/crawlscope/rules/content_quality.rb +99 -0
- data/lib/crawlscope/rules/indexability.rb +66 -0
- data/lib/crawlscope/rules/links.rb +24 -6
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +47 -0
- data/lib/crawlscope/rules/uniqueness.rb +76 -4
- data/lib/crawlscope/schemas.rb +52 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +11 -1
- data/test/crawlscope/cli_test.rb +19 -5
- data/test/crawlscope/configuration_test.rb +8 -1
- data/test/crawlscope/content_quality_rule_test.rb +68 -0
- data/test/crawlscope/crawl_test.rb +23 -3
- data/test/crawlscope/indexability_rule_test.rb +96 -0
- data/test/crawlscope/links_rule_test.rb +39 -0
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/structured_data_rule_test.rb +91 -0
- data/test/crawlscope/uniqueness_rule_test.rb +43 -2
- data/test/release_task_test.rb +86 -0
- metadata +9 -2
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
module Crawlscope
|
|
4
4
|
module Rules
|
|
5
5
|
class StructuredData
|
|
6
|
+
CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
|
|
7
|
+
|
|
6
8
|
attr_reader :code
|
|
7
9
|
|
|
8
10
|
def initialize
|
|
@@ -65,6 +67,51 @@ module Crawlscope
|
|
|
65
67
|
details: {errors: errors, source: source}
|
|
66
68
|
)
|
|
67
69
|
end
|
|
70
|
+
|
|
71
|
+
validate_job_posting_count(page, items, issues)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def validate_job_posting_count(page, items, issues)
|
|
75
|
+
job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") }
|
|
76
|
+
return if job_postings.size == 1
|
|
77
|
+
|
|
78
|
+
if job_postings.size > 1
|
|
79
|
+
issues.add(
|
|
80
|
+
code: :multiple_job_postings,
|
|
81
|
+
severity: :warning,
|
|
82
|
+
category: :structured_data,
|
|
83
|
+
url: page.url,
|
|
84
|
+
message: "multiple JobPosting structured data blocks found",
|
|
85
|
+
details: {count: job_postings.size}
|
|
86
|
+
)
|
|
87
|
+
elsif career_detail_page?(page.url)
|
|
88
|
+
issues.add(
|
|
89
|
+
code: :missing_job_posting,
|
|
90
|
+
severity: :warning,
|
|
91
|
+
category: :structured_data,
|
|
92
|
+
url: page.url,
|
|
93
|
+
message: "career detail page missing JobPosting structured data",
|
|
94
|
+
details: {expected_type: "JobPosting"}
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def structured_data_types(data)
|
|
100
|
+
return [] unless data.is_a?(Hash)
|
|
101
|
+
|
|
102
|
+
types = Array(data["@type"]).map(&:to_s)
|
|
103
|
+
|
|
104
|
+
if data["@graph"].is_a?(Array)
|
|
105
|
+
types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) })
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
types
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def career_detail_page?(url)
|
|
112
|
+
URI(url).path.match?(CAREER_DETAIL_PATH)
|
|
113
|
+
rescue URI::InvalidURIError
|
|
114
|
+
false
|
|
68
115
|
end
|
|
69
116
|
end
|
|
70
117
|
end
|
|
@@ -5,10 +5,24 @@ require "digest"
|
|
|
5
5
|
module Crawlscope
|
|
6
6
|
module Rules
|
|
7
7
|
class Uniqueness
|
|
8
|
+
MINIMUM_SHINGLES = 10
|
|
9
|
+
MAX_NEAR_DUPLICATE_PAGES = 250
|
|
10
|
+
NEAR_DUPLICATE_THRESHOLD = 0.9
|
|
11
|
+
SHINGLE_SIZE = 5
|
|
12
|
+
|
|
8
13
|
attr_reader :code
|
|
9
14
|
|
|
10
|
-
def initialize
|
|
15
|
+
def initialize(
|
|
16
|
+
near_duplicate_threshold: NEAR_DUPLICATE_THRESHOLD,
|
|
17
|
+
max_near_duplicate_pages: MAX_NEAR_DUPLICATE_PAGES,
|
|
18
|
+
minimum_shingles: MINIMUM_SHINGLES,
|
|
19
|
+
shingle_size: SHINGLE_SIZE
|
|
20
|
+
)
|
|
11
21
|
@code = :uniqueness
|
|
22
|
+
@max_near_duplicate_pages = max_near_duplicate_pages
|
|
23
|
+
@minimum_shingles = minimum_shingles
|
|
24
|
+
@near_duplicate_threshold = near_duplicate_threshold
|
|
25
|
+
@shingle_size = shingle_size
|
|
12
26
|
end
|
|
13
27
|
|
|
14
28
|
def call(urls:, pages:, issues:, context:)
|
|
@@ -19,14 +33,13 @@ module Crawlscope
|
|
|
19
33
|
end
|
|
20
34
|
|
|
21
35
|
validate_duplicates(page_summaries, issues)
|
|
36
|
+
validate_near_duplicates(page_summaries, issues)
|
|
22
37
|
end
|
|
23
38
|
|
|
24
39
|
private
|
|
25
40
|
|
|
26
41
|
def content_fingerprint_digest(doc)
|
|
27
|
-
|
|
28
|
-
text = doc.at_css("body")&.text.to_s if text.empty?
|
|
29
|
-
normalized = text.gsub(/\s+/, " ").strip
|
|
42
|
+
normalized = DocumentText.text_for(doc)
|
|
30
43
|
return if normalized.length < 200
|
|
31
44
|
|
|
32
45
|
Digest::SHA256.hexdigest(normalized)
|
|
@@ -41,9 +54,12 @@ module Crawlscope
|
|
|
41
54
|
end
|
|
42
55
|
|
|
43
56
|
def summary_for(page)
|
|
57
|
+
tokens = DocumentText.tokens(DocumentText.text_for(page.doc))
|
|
58
|
+
|
|
44
59
|
{
|
|
45
60
|
content_fingerprint_digest: content_fingerprint_digest(page.doc),
|
|
46
61
|
description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
|
|
62
|
+
shingles: shingles_for(tokens),
|
|
47
63
|
title: page.doc.at_css("title")&.text.to_s.strip,
|
|
48
64
|
url: page.url
|
|
49
65
|
}
|
|
@@ -83,6 +99,62 @@ module Crawlscope
|
|
|
83
99
|
)
|
|
84
100
|
end
|
|
85
101
|
end
|
|
102
|
+
|
|
103
|
+
def shingles_for(tokens)
|
|
104
|
+
return [] if tokens.size < @shingle_size
|
|
105
|
+
|
|
106
|
+
tokens.each_cons(@shingle_size).map { |items| items.join(" ") }.uniq
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def validate_near_duplicates(page_summaries, issues)
|
|
110
|
+
if near_duplicate_scan_limit_exceeded?(page_summaries)
|
|
111
|
+
issues.add(
|
|
112
|
+
code: :near_duplicate_scan_skipped,
|
|
113
|
+
severity: :warning,
|
|
114
|
+
category: :uniqueness,
|
|
115
|
+
url: nil,
|
|
116
|
+
message: "near duplicate scan skipped for #{page_summaries.size} pages",
|
|
117
|
+
details: {max_pages: @max_near_duplicate_pages, page_count: page_summaries.size}
|
|
118
|
+
)
|
|
119
|
+
return
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
page_summaries.combination(2) do |left, right|
|
|
123
|
+
next if same_content_fingerprint?(left, right)
|
|
124
|
+
next if left[:shingles].size < @minimum_shingles || right[:shingles].size < @minimum_shingles
|
|
125
|
+
|
|
126
|
+
similarity = shingle_similarity(left[:shingles], right[:shingles])
|
|
127
|
+
next if similarity < @near_duplicate_threshold
|
|
128
|
+
|
|
129
|
+
urls = [left[:url], right[:url]]
|
|
130
|
+
|
|
131
|
+
issues.add(
|
|
132
|
+
code: :near_duplicate_content,
|
|
133
|
+
severity: :warning,
|
|
134
|
+
category: :uniqueness,
|
|
135
|
+
url: nil,
|
|
136
|
+
message: "near duplicate page content (#{format("%.2f", similarity)}) => #{urls.join(", ")}",
|
|
137
|
+
details: {similarity: similarity.round(3), threshold: @near_duplicate_threshold, urls: urls}
|
|
138
|
+
)
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
def near_duplicate_scan_limit_exceeded?(page_summaries)
|
|
143
|
+
!@max_near_duplicate_pages.nil? && page_summaries.size > @max_near_duplicate_pages
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def same_content_fingerprint?(left, right)
|
|
147
|
+
!left[:content_fingerprint_digest].nil? &&
|
|
148
|
+
left[:content_fingerprint_digest] == right[:content_fingerprint_digest]
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def shingle_similarity(left, right)
|
|
152
|
+
intersection_size = (left & right).size
|
|
153
|
+
smaller_set_size = [left.size, right.size].min
|
|
154
|
+
return 0.0 if smaller_set_size.zero?
|
|
155
|
+
|
|
156
|
+
intersection_size.to_f / smaller_set_size
|
|
157
|
+
end
|
|
86
158
|
end
|
|
87
159
|
end
|
|
88
160
|
end
|
data/lib/crawlscope/schemas.rb
CHANGED
|
@@ -330,6 +330,56 @@ module Crawlscope
|
|
|
330
330
|
}
|
|
331
331
|
}.freeze
|
|
332
332
|
|
|
333
|
+
JOB_POSTING = {
|
|
334
|
+
type: "object",
|
|
335
|
+
additionalProperties: true,
|
|
336
|
+
required: ["@type", "title", "description", "datePosted", "hiringOrganization"],
|
|
337
|
+
properties: {
|
|
338
|
+
"@context" => {enum: ["https://schema.org", "https://schema.org/"]},
|
|
339
|
+
"@type" => {const: "JobPosting"},
|
|
340
|
+
:title => {type: "string"},
|
|
341
|
+
:description => {type: "string"},
|
|
342
|
+
:identifier => {type: "object"},
|
|
343
|
+
:datePosted => {type: "string"},
|
|
344
|
+
:validThrough => {type: "string"},
|
|
345
|
+
:employmentType => {
|
|
346
|
+
anyOf: [
|
|
347
|
+
{type: "string"},
|
|
348
|
+
{type: "array", minItems: 1, items: {type: "string"}}
|
|
349
|
+
]
|
|
350
|
+
},
|
|
351
|
+
:directApply => {type: "boolean"},
|
|
352
|
+
:hiringOrganization => {
|
|
353
|
+
type: "object",
|
|
354
|
+
required: ["@type", "name"],
|
|
355
|
+
properties: {
|
|
356
|
+
"@type" => {const: "Organization"},
|
|
357
|
+
:name => {type: "string"},
|
|
358
|
+
:sameAs => {type: "string", format: "uri"},
|
|
359
|
+
:logo => {type: "string", format: "uri"}
|
|
360
|
+
}
|
|
361
|
+
},
|
|
362
|
+
:applicantLocationRequirements => {
|
|
363
|
+
anyOf: [
|
|
364
|
+
{type: "object"},
|
|
365
|
+
{type: "array", minItems: 1, items: {type: "object"}}
|
|
366
|
+
]
|
|
367
|
+
},
|
|
368
|
+
:jobLocationType => {type: "string"},
|
|
369
|
+
:jobLocation => {
|
|
370
|
+
anyOf: [
|
|
371
|
+
{type: "object"},
|
|
372
|
+
{type: "array", minItems: 1, items: {type: "object"}}
|
|
373
|
+
]
|
|
374
|
+
},
|
|
375
|
+
:baseSalary => {type: "object"}
|
|
376
|
+
},
|
|
377
|
+
anyOf: [
|
|
378
|
+
{required: ["jobLocation"]},
|
|
379
|
+
{required: ["jobLocationType", "applicantLocationRequirements"]}
|
|
380
|
+
]
|
|
381
|
+
}.freeze
|
|
382
|
+
|
|
333
383
|
def self.schemas
|
|
334
384
|
{
|
|
335
385
|
"FAQPage" => FAQ_PAGE,
|
|
@@ -348,7 +398,8 @@ module Crawlscope
|
|
|
348
398
|
"Recipe" => RECIPE,
|
|
349
399
|
"Event" => EVENT,
|
|
350
400
|
"VideoObject" => VIDEO_OBJECT,
|
|
351
|
-
"WebPage" => WEB_PAGE
|
|
401
|
+
"WebPage" => WEB_PAGE,
|
|
402
|
+
"JobPosting" => JOB_POSTING
|
|
352
403
|
}
|
|
353
404
|
end
|
|
354
405
|
end
|
data/lib/crawlscope/version.rb
CHANGED
|
@@ -5,11 +5,16 @@ namespace :crawlscope do
|
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
namespace :validate do
|
|
8
|
-
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (
|
|
8
|
+
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
|
|
9
9
|
task ldjson: :environment do
|
|
10
10
|
Crawlscope::RakeTasks.ldjson
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
+
desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1"
|
|
14
|
+
task indexability: :environment do
|
|
15
|
+
Crawlscope::RakeTasks.validate_rule("indexability")
|
|
16
|
+
end
|
|
17
|
+
|
|
13
18
|
desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
|
|
14
19
|
task metadata: :environment do
|
|
15
20
|
Crawlscope::RakeTasks.validate_rule("metadata")
|
|
@@ -25,6 +30,11 @@ namespace :crawlscope do
|
|
|
25
30
|
Crawlscope::RakeTasks.validate_rule("uniqueness")
|
|
26
31
|
end
|
|
27
32
|
|
|
33
|
+
desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1"
|
|
34
|
+
task content_quality: :environment do
|
|
35
|
+
Crawlscope::RakeTasks.validate_rule("content_quality")
|
|
36
|
+
end
|
|
37
|
+
|
|
28
38
|
desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
|
|
29
39
|
task links: :environment do
|
|
30
40
|
Crawlscope::RakeTasks.validate_rule("links")
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -4,9 +4,10 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeCliTest < Minitest::Test
|
|
6
6
|
class FakeConfiguration
|
|
7
|
-
attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
7
|
+
attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
8
8
|
|
|
9
9
|
def initialize
|
|
10
|
+
@base_url = nil
|
|
10
11
|
@concurrency = 10
|
|
11
12
|
@network_idle_timeout_seconds = 5
|
|
12
13
|
@renderer = :http
|
|
@@ -145,6 +146,17 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
145
146
|
assert_empty err.string
|
|
146
147
|
end
|
|
147
148
|
|
|
149
|
+
def test_ldjson_defaults_to_configured_base_url
|
|
150
|
+
configuration = FakeConfiguration.new
|
|
151
|
+
configuration.base_url = "https://example.com"
|
|
152
|
+
task = FakeTask.new
|
|
153
|
+
|
|
154
|
+
status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
155
|
+
|
|
156
|
+
assert_equal 0, status
|
|
157
|
+
assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
|
|
158
|
+
end
|
|
159
|
+
|
|
148
160
|
def test_validate_caps_default_browser_concurrency
|
|
149
161
|
configuration = FakeConfiguration.new
|
|
150
162
|
task = FakeTask.new
|
|
@@ -218,14 +230,16 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
218
230
|
assert_equal 3, configuration.network_idle_timeout_seconds
|
|
219
231
|
end
|
|
220
232
|
|
|
221
|
-
def
|
|
233
|
+
def test_ldjson_defaults_to_localhost
|
|
222
234
|
out = StringIO.new
|
|
223
235
|
err = StringIO.new
|
|
236
|
+
task = FakeTask.new
|
|
224
237
|
|
|
225
|
-
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task:
|
|
238
|
+
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
|
|
226
239
|
|
|
227
|
-
assert_equal
|
|
228
|
-
|
|
240
|
+
assert_equal 0, status
|
|
241
|
+
assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
|
|
242
|
+
assert_empty err.string
|
|
229
243
|
end
|
|
230
244
|
|
|
231
245
|
def test_invalid_integer_option_returns_error
|
|
@@ -20,7 +20,14 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
20
20
|
assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
|
|
21
21
|
assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
|
|
22
22
|
assert_equal 4, audit.instance_variable_get(:@concurrency)
|
|
23
|
-
assert_equal %i[
|
|
23
|
+
assert_equal %i[
|
|
24
|
+
indexability
|
|
25
|
+
metadata
|
|
26
|
+
structured_data
|
|
27
|
+
uniqueness
|
|
28
|
+
content_quality
|
|
29
|
+
links
|
|
30
|
+
], audit.instance_variable_get(:@rules).map(&:code)
|
|
24
31
|
end
|
|
25
32
|
|
|
26
33
|
def test_audit_raises_without_base_url
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeContentQualityRuleTest < Minitest::Test
|
|
6
|
+
def test_reports_thin_visible_text_and_low_html_text_ratio
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
page = page_with(main: "Short page <div>#{"<span></span>" * 500}</div>")
|
|
9
|
+
|
|
10
|
+
Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
11
|
+
|
|
12
|
+
codes = issues.to_a.map(&:code)
|
|
13
|
+
assert_includes codes, :thin_visible_text
|
|
14
|
+
assert_includes codes, :low_visible_text_ratio
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_visible_text_ratio_ignores_markup_outside_main_content
|
|
18
|
+
issues = Crawlscope::IssueCollection.new
|
|
19
|
+
page = page_with(
|
|
20
|
+
main: Array.new(260) { |index| "word#{index}" }.join(" "),
|
|
21
|
+
head_markup: "<style>#{"body{}" * 10_000}</style>",
|
|
22
|
+
extra_markup: "<nav>#{"<a href=\"/\">Navigation</a>" * 500}</nav>"
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
26
|
+
|
|
27
|
+
refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def test_reports_low_unique_token_ratio_for_repetitive_content
|
|
31
|
+
issues = Crawlscope::IssueCollection.new
|
|
32
|
+
page = page_with(main: ("hotel location service " * 100).strip)
|
|
33
|
+
|
|
34
|
+
Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
35
|
+
|
|
36
|
+
issue = issues.to_a.find { |item| item.code == :low_unique_token_ratio }
|
|
37
|
+
assert issue
|
|
38
|
+
assert_operator issue.details[:ratio], :<, issue.details[:threshold]
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
private
|
|
42
|
+
|
|
43
|
+
def page_with(main:, extra_markup: "", head_markup: "")
|
|
44
|
+
body = <<~HTML
|
|
45
|
+
<html>
|
|
46
|
+
<head>
|
|
47
|
+
<title>Content quality</title>
|
|
48
|
+
#{head_markup}
|
|
49
|
+
</head>
|
|
50
|
+
<body>
|
|
51
|
+
#{extra_markup}
|
|
52
|
+
<main>#{main}</main>
|
|
53
|
+
</body>
|
|
54
|
+
</html>
|
|
55
|
+
HTML
|
|
56
|
+
|
|
57
|
+
Crawlscope::Page.new(
|
|
58
|
+
url: "https://example.com/page",
|
|
59
|
+
normalized_url: "https://example.com/page",
|
|
60
|
+
final_url: "https://example.com/page",
|
|
61
|
+
normalized_final_url: "https://example.com/page",
|
|
62
|
+
status: 200,
|
|
63
|
+
headers: {"content-type" => "text/html"},
|
|
64
|
+
body: body,
|
|
65
|
+
doc: Nokogiri::HTML(body)
|
|
66
|
+
)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
@@ -31,8 +31,13 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
31
31
|
<html>
|
|
32
32
|
<head>
|
|
33
33
|
<title>Pricing</title>
|
|
34
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
34
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
35
35
|
<link rel="canonical" href="https://example.com/pricing">
|
|
36
|
+
<meta property="og:title" content="Pricing">
|
|
37
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
38
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
39
|
+
<meta property="og:type" content="website">
|
|
40
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
36
41
|
<script type="application/ld+json">
|
|
37
42
|
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
38
43
|
</script>
|
|
@@ -40,6 +45,7 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
40
45
|
<body>
|
|
41
46
|
<main>
|
|
42
47
|
<h1>Pricing</h1>
|
|
48
|
+
<p>#{Array.new(260) { |index| "pricing#{index}" }.join(" ")}</p>
|
|
43
49
|
</main>
|
|
44
50
|
</body>
|
|
45
51
|
</html>
|
|
@@ -95,7 +101,15 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
95
101
|
).call
|
|
96
102
|
|
|
97
103
|
refute result.ok?
|
|
98
|
-
assert_equal %i[
|
|
104
|
+
assert_equal %i[
|
|
105
|
+
incomplete_open_graph_tags
|
|
106
|
+
meta_description_too_long
|
|
107
|
+
missing_canonical
|
|
108
|
+
missing_h1
|
|
109
|
+
missing_structured_data
|
|
110
|
+
thin_visible_text
|
|
111
|
+
title_repeats_site_name
|
|
112
|
+
].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
99
113
|
end
|
|
100
114
|
|
|
101
115
|
def test_uses_browser_when_renderer_is_browser
|
|
@@ -128,8 +142,13 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
128
142
|
<html>
|
|
129
143
|
<head>
|
|
130
144
|
<title>Pricing</title>
|
|
131
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
145
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
132
146
|
<link rel="canonical" href="https://example.com/pricing">
|
|
147
|
+
<meta property="og:title" content="Pricing">
|
|
148
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
149
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
150
|
+
<meta property="og:type" content="website">
|
|
151
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
133
152
|
<script type="application/ld+json">
|
|
134
153
|
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
135
154
|
</script>
|
|
@@ -137,6 +156,7 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
137
156
|
<body>
|
|
138
157
|
<main>
|
|
139
158
|
<h1>Pricing</h1>
|
|
159
|
+
<p>#{Array.new(260) { |index| "pricing#{index}" }.join(" ")}</p>
|
|
140
160
|
</main>
|
|
141
161
|
</body>
|
|
142
162
|
</html>
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeIndexabilityRuleTest < Minitest::Test
|
|
6
|
+
def test_reports_meta_noindex
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
page = page_with(
|
|
9
|
+
body: <<~HTML
|
|
10
|
+
<html>
|
|
11
|
+
<head><meta name="robots" content="noindex, follow"></head>
|
|
12
|
+
<body><main>Visible content</main></body>
|
|
13
|
+
</html>
|
|
14
|
+
HTML
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
18
|
+
|
|
19
|
+
issue = issues.to_a.fetch(0)
|
|
20
|
+
assert_equal :noindex_meta, issue.code
|
|
21
|
+
assert_equal :error, issue.severity
|
|
22
|
+
assert_equal "noindex, follow", issue.details[:content]
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def test_reports_x_robots_tag_noindex
|
|
26
|
+
issues = Crawlscope::IssueCollection.new
|
|
27
|
+
page = page_with(headers: {"X-Robots-Tag" => "noindex"})
|
|
28
|
+
|
|
29
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
30
|
+
|
|
31
|
+
issue = issues.to_a.fetch(0)
|
|
32
|
+
assert_equal :noindex_header, issue.code
|
|
33
|
+
assert_equal :error, issue.severity
|
|
34
|
+
assert_equal "noindex", issue.details[:content]
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def test_reports_x_robots_tag_noindex_for_non_html_response
|
|
38
|
+
issues = Crawlscope::IssueCollection.new
|
|
39
|
+
page = page_with(
|
|
40
|
+
body: "%PDF-1.7",
|
|
41
|
+
doc: nil,
|
|
42
|
+
headers: {"content-type" => "application/pdf", "X-Robots-Tag" => "noindex"}
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
46
|
+
|
|
47
|
+
issue = issues.to_a.fetch(0)
|
|
48
|
+
assert_equal :noindex_header, issue.code
|
|
49
|
+
assert_equal :error, issue.severity
|
|
50
|
+
assert_equal "noindex", issue.details[:content]
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def test_reports_scoped_x_robots_tag_noindex
|
|
54
|
+
issues = Crawlscope::IssueCollection.new
|
|
55
|
+
page = page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"})
|
|
56
|
+
|
|
57
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
58
|
+
|
|
59
|
+
issue = issues.to_a.fetch(0)
|
|
60
|
+
assert_equal :noindex_header, issue.code
|
|
61
|
+
assert_equal "googlebot: noindex, nofollow", issue.details[:content]
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def test_reports_x_robots_tag_none
|
|
65
|
+
issues = Crawlscope::IssueCollection.new
|
|
66
|
+
page = page_with(headers: {"X-Robots-Tag" => "none"})
|
|
67
|
+
|
|
68
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
69
|
+
|
|
70
|
+
issue = issues.to_a.fetch(0)
|
|
71
|
+
assert_equal :noindex_header, issue.code
|
|
72
|
+
assert_equal "none", issue.details[:content]
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
def page_with(body: nil, doc: :parse, headers: {"content-type" => "text/html"})
|
|
78
|
+
body ||= <<~HTML
|
|
79
|
+
<html>
|
|
80
|
+
<head><title>Indexable</title></head>
|
|
81
|
+
<body><main>Visible content</main></body>
|
|
82
|
+
</html>
|
|
83
|
+
HTML
|
|
84
|
+
|
|
85
|
+
Crawlscope::Page.new(
|
|
86
|
+
url: "https://example.com/page",
|
|
87
|
+
normalized_url: "https://example.com/page",
|
|
88
|
+
final_url: "https://example.com/page",
|
|
89
|
+
normalized_final_url: "https://example.com/page",
|
|
90
|
+
status: 200,
|
|
91
|
+
headers: headers,
|
|
92
|
+
body: body,
|
|
93
|
+
doc: (doc == :parse) ? Nokogiri::HTML(body) : doc
|
|
94
|
+
)
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -118,6 +118,45 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
118
118
|
assert_equal "https://example.com/guide", issues.to_a.first.url
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
+
def test_counts_root_page_links_as_inbound_links
|
|
122
|
+
issues = Crawlscope::IssueCollection.new
|
|
123
|
+
|
|
124
|
+
Crawlscope::Rules::Links.new.call(
|
|
125
|
+
urls: ["https://example.com/", "https://example.com/about"],
|
|
126
|
+
pages: [
|
|
127
|
+
page(url: "https://example.com/", body: "<main><a href=\"/about\">About</a></main>"),
|
|
128
|
+
page(url: "https://example.com/about", body: "<main><p>About</p></main>")
|
|
129
|
+
],
|
|
130
|
+
issues: issues,
|
|
131
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_reports_internal_links_that_redirect
|
|
138
|
+
issues = Crawlscope::IssueCollection.new
|
|
139
|
+
resolver = lambda do |target_url|
|
|
140
|
+
{
|
|
141
|
+
crawled: false,
|
|
142
|
+
error: nil,
|
|
143
|
+
final_url: "https://example.com/pricing",
|
|
144
|
+
status: 200
|
|
145
|
+
}
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
Crawlscope::Rules::Links.new.call(
|
|
149
|
+
urls: ["https://example.com/guide"],
|
|
150
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/plans\">Plans</a></main>")],
|
|
151
|
+
issues: issues,
|
|
152
|
+
context: context(resolver: resolver)
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
|
|
156
|
+
assert redirect_issue
|
|
157
|
+
assert_includes redirect_issue.message, "https://example.com/pricing"
|
|
158
|
+
end
|
|
159
|
+
|
|
121
160
|
def test_ignores_links_that_should_not_be_crawled
|
|
122
161
|
issues = Crawlscope::IssueCollection.new
|
|
123
162
|
|