crawlscope 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -7
- data/README.md +2 -2
- data/lib/crawlscope/cli.rb +5 -0
- data/lib/crawlscope/crawl.rb +6 -0
- data/lib/crawlscope/rules/links.rb +24 -6
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +47 -0
- data/lib/crawlscope/schemas.rb +52 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +1 -1
- data/test/crawlscope/cli_test.rb +19 -5
- data/test/crawlscope/crawl_test.rb +13 -3
- data/test/crawlscope/links_rule_test.rb +39 -0
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/structured_data_rule_test.rb +91 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b49aaaa6fdb5f7d5bd4dc63713d8c0090411e7063363645a900d8f59d803aaaa
|
|
4
|
+
data.tar.gz: 5dfcc35d60745c25db6faf3acaa4344e29e438c758740613d6216e2f47aeac6e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9f66627274ce2ea969b5bb9b53a339215718c37baf47393c75bcf3a528c5c73658c6a71903fdbbf9e53796aaf3680be5f99ab4151b834efbf9450e05abbab83b
|
|
7
|
+
data.tar.gz: 3cf2e2c7f251a6af7b931f00da63436eaa7e09f078d73de112852a10665cf16eefb561c7d61d6bc8b0c3c014ca0db2df217d31c00b9f0ed321565ed554574261
|
data/CHANGELOG.md
CHANGED
|
@@ -5,23 +5,26 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [0.
|
|
8
|
+
## [0.3.0] - 2026-04-28
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
###
|
|
11
|
+
### Added
|
|
12
12
|
|
|
13
|
-
-
|
|
13
|
+
- add JobPosting structured data
|
|
14
14
|
|
|
15
|
-
- harden validation boundaries
|
|
16
15
|
|
|
17
16
|
|
|
18
17
|
|
|
18
|
+
### Documentation
|
|
19
|
+
|
|
20
|
+
- fix missing changelog entry
|
|
21
|
+
|
|
19
22
|
|
|
20
|
-
### Fixed
|
|
21
23
|
|
|
22
|
-
- handle child sitemaps
|
|
23
24
|
|
|
24
|
-
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- ldjson check now falls back to the configured base URL, then `http://localhost:3000`, when `URL` is not set
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
|
data/README.md
CHANGED
|
@@ -150,7 +150,7 @@ bin/rails crawlscope:validate:metadata
|
|
|
150
150
|
bin/rails crawlscope:validate:structured_data
|
|
151
151
|
bin/rails crawlscope:validate:uniqueness
|
|
152
152
|
bin/rails crawlscope:validate:links
|
|
153
|
-
bin/rails crawlscope:validate:ldjson
|
|
153
|
+
bin/rails crawlscope:validate:ldjson
|
|
154
154
|
```
|
|
155
155
|
|
|
156
156
|
The same validation surface is also available in the gem repository itself through plain `rake`:
|
|
@@ -163,7 +163,7 @@ bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
|
163
163
|
|
|
164
164
|
`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
165
165
|
|
|
166
|
-
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
|
|
166
|
+
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
|
|
167
167
|
|
|
168
168
|
### Structured Data URL Audit
|
|
169
169
|
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -105,6 +105,7 @@ module Crawlscope
|
|
|
105
105
|
parser.parse!(@argv)
|
|
106
106
|
|
|
107
107
|
urls = options[:urls].map(&:strip).reject(&:empty?)
|
|
108
|
+
urls = default_urls if urls.empty?
|
|
108
109
|
raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?
|
|
109
110
|
|
|
110
111
|
configure_renderer(options[:renderer])
|
|
@@ -238,6 +239,10 @@ module Crawlscope
|
|
|
238
239
|
raw_urls.split(";").map(&:strip).reject(&:empty?)
|
|
239
240
|
end
|
|
240
241
|
|
|
242
|
+
def default_urls
|
|
243
|
+
[normalized_string(@configuration.base_url) || "http://localhost:3000"]
|
|
244
|
+
end
|
|
245
|
+
|
|
241
246
|
def task
|
|
242
247
|
@task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
|
|
243
248
|
end
|
data/lib/crawlscope/crawl.rb
CHANGED
|
@@ -81,6 +81,8 @@ module Crawlscope
|
|
|
81
81
|
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
|
|
82
82
|
elsif !@allowed_statuses.include?(page.status)
|
|
83
83
|
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
|
|
84
|
+
elsif redirected?(page)
|
|
85
|
+
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
|
|
84
86
|
end
|
|
85
87
|
end
|
|
86
88
|
end
|
|
@@ -128,5 +130,9 @@ module Crawlscope
|
|
|
128
130
|
status: page.status
|
|
129
131
|
}
|
|
130
132
|
end
|
|
133
|
+
|
|
134
|
+
def redirected?(page)
|
|
135
|
+
page.normalized_url.to_s != page.normalized_final_url.to_s
|
|
136
|
+
end
|
|
131
137
|
end
|
|
132
138
|
end
|
|
@@ -5,7 +5,7 @@ require "uri"
|
|
|
5
5
|
module Crawlscope
|
|
6
6
|
module Rules
|
|
7
7
|
class Links
|
|
8
|
-
|
|
8
|
+
LINK_SELECTORS = "a[href]"
|
|
9
9
|
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
|
|
10
10
|
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
|
|
11
11
|
MAX_SOURCES_IN_ERROR = 3
|
|
@@ -33,10 +33,7 @@ module Crawlscope
|
|
|
33
33
|
private
|
|
34
34
|
|
|
35
35
|
def contextual_links(doc)
|
|
36
|
-
|
|
37
|
-
return links unless links.empty?
|
|
38
|
-
|
|
39
|
-
doc.css("a[href]")
|
|
36
|
+
doc.css(LINK_SELECTORS)
|
|
40
37
|
end
|
|
41
38
|
|
|
42
39
|
def extract_links(pages)
|
|
@@ -45,7 +42,7 @@ module Crawlscope
|
|
|
45
42
|
|
|
46
43
|
def page_links(page)
|
|
47
44
|
source_path = Url.path(page.normalized_url)
|
|
48
|
-
return [] unless
|
|
45
|
+
return [] unless crawlable_source_path?(source_path)
|
|
49
46
|
|
|
50
47
|
contextual_links(page.doc).filter_map do |node|
|
|
51
48
|
link_for(page: page, source_path: source_path, node: node)
|
|
@@ -146,6 +143,7 @@ module Crawlscope
|
|
|
146
143
|
next
|
|
147
144
|
end
|
|
148
145
|
|
|
146
|
+
report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
|
|
149
147
|
next unless crawlable_path?(target.final_path)
|
|
150
148
|
|
|
151
149
|
grouped_links.each do |link|
|
|
@@ -156,6 +154,18 @@ module Crawlscope
|
|
|
156
154
|
resolved_links
|
|
157
155
|
end
|
|
158
156
|
|
|
157
|
+
def report_redirect_target(target_url, grouped_links, issues, target)
|
|
158
|
+
source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
|
|
159
|
+
issues.add(
|
|
160
|
+
code: :internal_link_redirects,
|
|
161
|
+
severity: :warning,
|
|
162
|
+
category: :links,
|
|
163
|
+
url: target_url,
|
|
164
|
+
message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
|
|
165
|
+
details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
|
|
166
|
+
)
|
|
167
|
+
end
|
|
168
|
+
|
|
159
169
|
def resolve_target(target_url)
|
|
160
170
|
resolution = @resolve_target.call(target_url)
|
|
161
171
|
LinkTarget.new(target_url: target_url, resolution: resolution)
|
|
@@ -183,11 +193,19 @@ module Crawlscope
|
|
|
183
193
|
resolution && resolution[:status]
|
|
184
194
|
end
|
|
185
195
|
|
|
196
|
+
def redirect?
|
|
197
|
+
(status && (300..399).cover?(status.to_i)) || final_url != target_url
|
|
198
|
+
end
|
|
199
|
+
|
|
186
200
|
def unresolved?
|
|
187
201
|
resolution.nil? || (status.nil? && !ignored_error?)
|
|
188
202
|
end
|
|
189
203
|
end
|
|
190
204
|
|
|
205
|
+
def crawlable_source_path?(path)
|
|
206
|
+
!path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
|
|
207
|
+
end
|
|
208
|
+
|
|
191
209
|
def skip_internal_path?(path)
|
|
192
210
|
return true if path == "/"
|
|
193
211
|
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
3
5
|
module Crawlscope
|
|
4
6
|
module Rules
|
|
5
7
|
class Metadata
|
|
6
8
|
TITLE_MAX_LENGTH = 72
|
|
9
|
+
DESCRIPTION_MIN_LENGTH = 110
|
|
7
10
|
DESCRIPTION_MAX_LENGTH = 160
|
|
11
|
+
REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
|
|
8
12
|
|
|
9
13
|
attr_reader :code
|
|
10
14
|
|
|
@@ -21,22 +25,35 @@ module Crawlscope
|
|
|
21
25
|
validate_title(page, issues)
|
|
22
26
|
validate_description(page, issues)
|
|
23
27
|
validate_canonical(page, issues)
|
|
28
|
+
validate_open_graph(page, issues)
|
|
24
29
|
end
|
|
25
30
|
end
|
|
26
31
|
|
|
27
32
|
private
|
|
28
33
|
|
|
29
34
|
def validate_h1(page, issues)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
h1s = page.doc.css("h1")
|
|
36
|
+
return if h1s.one?
|
|
37
|
+
|
|
38
|
+
if h1s.empty?
|
|
39
|
+
issues.add(
|
|
40
|
+
code: :missing_h1,
|
|
41
|
+
severity: :warning,
|
|
42
|
+
category: :metadata,
|
|
43
|
+
url: page.url,
|
|
44
|
+
message: "missing <h1>",
|
|
45
|
+
details: {}
|
|
46
|
+
)
|
|
47
|
+
else
|
|
48
|
+
issues.add(
|
|
49
|
+
code: :multiple_h1,
|
|
50
|
+
severity: :warning,
|
|
51
|
+
category: :metadata,
|
|
52
|
+
url: page.url,
|
|
53
|
+
message: "multiple <h1> tags (#{h1s.size})",
|
|
54
|
+
details: {count: h1s.size}
|
|
55
|
+
)
|
|
56
|
+
end
|
|
40
57
|
end
|
|
41
58
|
|
|
42
59
|
def validate_title(page, issues)
|
|
@@ -56,6 +73,8 @@ module Crawlscope
|
|
|
56
73
|
|
|
57
74
|
if description.empty?
|
|
58
75
|
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
|
|
76
|
+
elsif description.length < DESCRIPTION_MIN_LENGTH
|
|
77
|
+
issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
|
|
59
78
|
elsif description.length > DESCRIPTION_MAX_LENGTH
|
|
60
79
|
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
|
|
61
80
|
end
|
|
@@ -71,7 +90,7 @@ module Crawlscope
|
|
|
71
90
|
|
|
72
91
|
normalized_canonical = Url.normalize(canonical, base_url: page.url)
|
|
73
92
|
normalized_page_url = Url.normalize(page.url, base_url: page.url)
|
|
74
|
-
return if normalized_canonical
|
|
93
|
+
return if canonical_matches_page?(normalized_canonical, normalized_page_url)
|
|
75
94
|
|
|
76
95
|
issues.add(
|
|
77
96
|
code: :canonical_mismatch,
|
|
@@ -88,6 +107,33 @@ module Crawlscope
|
|
|
88
107
|
|
|
89
108
|
title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
|
|
90
109
|
end
|
|
110
|
+
|
|
111
|
+
def validate_open_graph(page, issues)
|
|
112
|
+
missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
|
|
113
|
+
page.doc.at_css(%(meta[property="#{property}"][content]))
|
|
114
|
+
end
|
|
115
|
+
return if missing.empty?
|
|
116
|
+
|
|
117
|
+
issues.add(
|
|
118
|
+
code: :incomplete_open_graph_tags,
|
|
119
|
+
severity: :warning,
|
|
120
|
+
category: :metadata,
|
|
121
|
+
url: page.url,
|
|
122
|
+
message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
|
|
123
|
+
details: {missing: missing}
|
|
124
|
+
)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def canonical_matches_page?(canonical, page_url)
|
|
128
|
+
canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def local_url?(url)
|
|
132
|
+
host = URI.parse(url.to_s).host.to_s
|
|
133
|
+
["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
|
|
134
|
+
rescue URI::InvalidURIError
|
|
135
|
+
false
|
|
136
|
+
end
|
|
91
137
|
end
|
|
92
138
|
end
|
|
93
139
|
end
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
module Crawlscope
|
|
4
4
|
module Rules
|
|
5
5
|
class StructuredData
|
|
6
|
+
CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
|
|
7
|
+
|
|
6
8
|
attr_reader :code
|
|
7
9
|
|
|
8
10
|
def initialize
|
|
@@ -65,6 +67,51 @@ module Crawlscope
|
|
|
65
67
|
details: {errors: errors, source: source}
|
|
66
68
|
)
|
|
67
69
|
end
|
|
70
|
+
|
|
71
|
+
validate_job_posting_count(page, items, issues)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def validate_job_posting_count(page, items, issues)
|
|
75
|
+
job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") }
|
|
76
|
+
return if job_postings.size == 1
|
|
77
|
+
|
|
78
|
+
if job_postings.size > 1
|
|
79
|
+
issues.add(
|
|
80
|
+
code: :multiple_job_postings,
|
|
81
|
+
severity: :warning,
|
|
82
|
+
category: :structured_data,
|
|
83
|
+
url: page.url,
|
|
84
|
+
message: "multiple JobPosting structured data blocks found",
|
|
85
|
+
details: {count: job_postings.size}
|
|
86
|
+
)
|
|
87
|
+
elsif career_detail_page?(page.url)
|
|
88
|
+
issues.add(
|
|
89
|
+
code: :missing_job_posting,
|
|
90
|
+
severity: :warning,
|
|
91
|
+
category: :structured_data,
|
|
92
|
+
url: page.url,
|
|
93
|
+
message: "career detail page missing JobPosting structured data",
|
|
94
|
+
details: {expected_type: "JobPosting"}
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def structured_data_types(data)
|
|
100
|
+
return [] unless data.is_a?(Hash)
|
|
101
|
+
|
|
102
|
+
types = Array(data["@type"]).map(&:to_s)
|
|
103
|
+
|
|
104
|
+
if data["@graph"].is_a?(Array)
|
|
105
|
+
types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) })
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
types
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def career_detail_page?(url)
|
|
112
|
+
URI(url).path.match?(CAREER_DETAIL_PATH)
|
|
113
|
+
rescue URI::InvalidURIError
|
|
114
|
+
false
|
|
68
115
|
end
|
|
69
116
|
end
|
|
70
117
|
end
|
data/lib/crawlscope/schemas.rb
CHANGED
|
@@ -330,6 +330,56 @@ module Crawlscope
|
|
|
330
330
|
}
|
|
331
331
|
}.freeze
|
|
332
332
|
|
|
333
|
+
JOB_POSTING = {
|
|
334
|
+
type: "object",
|
|
335
|
+
additionalProperties: true,
|
|
336
|
+
required: ["@type", "title", "description", "datePosted", "hiringOrganization"],
|
|
337
|
+
properties: {
|
|
338
|
+
"@context" => {enum: ["https://schema.org", "https://schema.org/"]},
|
|
339
|
+
"@type" => {const: "JobPosting"},
|
|
340
|
+
:title => {type: "string"},
|
|
341
|
+
:description => {type: "string"},
|
|
342
|
+
:identifier => {type: "object"},
|
|
343
|
+
:datePosted => {type: "string"},
|
|
344
|
+
:validThrough => {type: "string"},
|
|
345
|
+
:employmentType => {
|
|
346
|
+
anyOf: [
|
|
347
|
+
{type: "string"},
|
|
348
|
+
{type: "array", minItems: 1, items: {type: "string"}}
|
|
349
|
+
]
|
|
350
|
+
},
|
|
351
|
+
:directApply => {type: "boolean"},
|
|
352
|
+
:hiringOrganization => {
|
|
353
|
+
type: "object",
|
|
354
|
+
required: ["@type", "name"],
|
|
355
|
+
properties: {
|
|
356
|
+
"@type" => {const: "Organization"},
|
|
357
|
+
:name => {type: "string"},
|
|
358
|
+
:sameAs => {type: "string", format: "uri"},
|
|
359
|
+
:logo => {type: "string", format: "uri"}
|
|
360
|
+
}
|
|
361
|
+
},
|
|
362
|
+
:applicantLocationRequirements => {
|
|
363
|
+
anyOf: [
|
|
364
|
+
{type: "object"},
|
|
365
|
+
{type: "array", minItems: 1, items: {type: "object"}}
|
|
366
|
+
]
|
|
367
|
+
},
|
|
368
|
+
:jobLocationType => {type: "string"},
|
|
369
|
+
:jobLocation => {
|
|
370
|
+
anyOf: [
|
|
371
|
+
{type: "object"},
|
|
372
|
+
{type: "array", minItems: 1, items: {type: "object"}}
|
|
373
|
+
]
|
|
374
|
+
},
|
|
375
|
+
:baseSalary => {type: "object"}
|
|
376
|
+
},
|
|
377
|
+
anyOf: [
|
|
378
|
+
{required: ["jobLocation"]},
|
|
379
|
+
{required: ["jobLocationType", "applicantLocationRequirements"]}
|
|
380
|
+
]
|
|
381
|
+
}.freeze
|
|
382
|
+
|
|
333
383
|
def self.schemas
|
|
334
384
|
{
|
|
335
385
|
"FAQPage" => FAQ_PAGE,
|
|
@@ -348,7 +398,8 @@ module Crawlscope
|
|
|
348
398
|
"Recipe" => RECIPE,
|
|
349
399
|
"Event" => EVENT,
|
|
350
400
|
"VideoObject" => VIDEO_OBJECT,
|
|
351
|
-
"WebPage" => WEB_PAGE
|
|
401
|
+
"WebPage" => WEB_PAGE,
|
|
402
|
+
"JobPosting" => JOB_POSTING
|
|
352
403
|
}
|
|
353
404
|
end
|
|
354
405
|
end
|
data/lib/crawlscope/version.rb
CHANGED
|
@@ -5,7 +5,7 @@ namespace :crawlscope do
|
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
namespace :validate do
|
|
8
|
-
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (
|
|
8
|
+
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
|
|
9
9
|
task ldjson: :environment do
|
|
10
10
|
Crawlscope::RakeTasks.ldjson
|
|
11
11
|
end
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -4,9 +4,10 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeCliTest < Minitest::Test
|
|
6
6
|
class FakeConfiguration
|
|
7
|
-
attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
7
|
+
attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
8
8
|
|
|
9
9
|
def initialize
|
|
10
|
+
@base_url = nil
|
|
10
11
|
@concurrency = 10
|
|
11
12
|
@network_idle_timeout_seconds = 5
|
|
12
13
|
@renderer = :http
|
|
@@ -145,6 +146,17 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
145
146
|
assert_empty err.string
|
|
146
147
|
end
|
|
147
148
|
|
|
149
|
+
def test_ldjson_defaults_to_configured_base_url
|
|
150
|
+
configuration = FakeConfiguration.new
|
|
151
|
+
configuration.base_url = "https://example.com"
|
|
152
|
+
task = FakeTask.new
|
|
153
|
+
|
|
154
|
+
status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
155
|
+
|
|
156
|
+
assert_equal 0, status
|
|
157
|
+
assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
|
|
158
|
+
end
|
|
159
|
+
|
|
148
160
|
def test_validate_caps_default_browser_concurrency
|
|
149
161
|
configuration = FakeConfiguration.new
|
|
150
162
|
task = FakeTask.new
|
|
@@ -218,14 +230,16 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
218
230
|
assert_equal 3, configuration.network_idle_timeout_seconds
|
|
219
231
|
end
|
|
220
232
|
|
|
221
|
-
def
|
|
233
|
+
def test_ldjson_defaults_to_localhost
|
|
222
234
|
out = StringIO.new
|
|
223
235
|
err = StringIO.new
|
|
236
|
+
task = FakeTask.new
|
|
224
237
|
|
|
225
|
-
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task:
|
|
238
|
+
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
|
|
226
239
|
|
|
227
|
-
assert_equal
|
|
228
|
-
|
|
240
|
+
assert_equal 0, status
|
|
241
|
+
assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
|
|
242
|
+
assert_empty err.string
|
|
229
243
|
end
|
|
230
244
|
|
|
231
245
|
def test_invalid_integer_option_returns_error
|
|
@@ -31,8 +31,13 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
31
31
|
<html>
|
|
32
32
|
<head>
|
|
33
33
|
<title>Pricing</title>
|
|
34
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
34
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
35
35
|
<link rel="canonical" href="https://example.com/pricing">
|
|
36
|
+
<meta property="og:title" content="Pricing">
|
|
37
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
38
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
39
|
+
<meta property="og:type" content="website">
|
|
40
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
36
41
|
<script type="application/ld+json">
|
|
37
42
|
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
38
43
|
</script>
|
|
@@ -95,7 +100,7 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
95
100
|
).call
|
|
96
101
|
|
|
97
102
|
refute result.ok?
|
|
98
|
-
assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
103
|
+
assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
99
104
|
end
|
|
100
105
|
|
|
101
106
|
def test_uses_browser_when_renderer_is_browser
|
|
@@ -128,8 +133,13 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
128
133
|
<html>
|
|
129
134
|
<head>
|
|
130
135
|
<title>Pricing</title>
|
|
131
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
136
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
132
137
|
<link rel="canonical" href="https://example.com/pricing">
|
|
138
|
+
<meta property="og:title" content="Pricing">
|
|
139
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
140
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
141
|
+
<meta property="og:type" content="website">
|
|
142
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
133
143
|
<script type="application/ld+json">
|
|
134
144
|
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
135
145
|
</script>
|
|
@@ -118,6 +118,45 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
118
118
|
assert_equal "https://example.com/guide", issues.to_a.first.url
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
+
def test_counts_root_page_links_as_inbound_links
|
|
122
|
+
issues = Crawlscope::IssueCollection.new
|
|
123
|
+
|
|
124
|
+
Crawlscope::Rules::Links.new.call(
|
|
125
|
+
urls: ["https://example.com/", "https://example.com/about"],
|
|
126
|
+
pages: [
|
|
127
|
+
page(url: "https://example.com/", body: "<main><a href=\"/about\">About</a></main>"),
|
|
128
|
+
page(url: "https://example.com/about", body: "<main><p>About</p></main>")
|
|
129
|
+
],
|
|
130
|
+
issues: issues,
|
|
131
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_reports_internal_links_that_redirect
|
|
138
|
+
issues = Crawlscope::IssueCollection.new
|
|
139
|
+
resolver = lambda do |target_url|
|
|
140
|
+
{
|
|
141
|
+
crawled: false,
|
|
142
|
+
error: nil,
|
|
143
|
+
final_url: "https://example.com/pricing",
|
|
144
|
+
status: 200
|
|
145
|
+
}
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
Crawlscope::Rules::Links.new.call(
|
|
149
|
+
urls: ["https://example.com/guide"],
|
|
150
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/plans\">Plans</a></main>")],
|
|
151
|
+
issues: issues,
|
|
152
|
+
context: context(resolver: resolver)
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
|
|
156
|
+
assert redirect_issue
|
|
157
|
+
assert_includes redirect_issue.message, "https://example.com/pricing"
|
|
158
|
+
end
|
|
159
|
+
|
|
121
160
|
def test_ignores_links_that_should_not_be_crawled
|
|
122
161
|
issues = Crawlscope::IssueCollection.new
|
|
123
162
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeMetadataRuleTest < Minitest::Test
|
|
6
|
+
def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
|
|
9
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
10
|
+
urls: [page.url],
|
|
11
|
+
pages: [page],
|
|
12
|
+
issues: issues
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
codes = issues.to_a.map(&:code)
|
|
16
|
+
assert_includes codes, :meta_description_too_short
|
|
17
|
+
assert_includes codes, :multiple_h1
|
|
18
|
+
assert_includes codes, :incomplete_open_graph_tags
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_allows_localhost_page_with_matching_production_canonical_path
|
|
22
|
+
issues = Crawlscope::IssueCollection.new
|
|
23
|
+
local_page = page(
|
|
24
|
+
url: "http://localhost:3000/about",
|
|
25
|
+
body: <<~HTML
|
|
26
|
+
<html>
|
|
27
|
+
<head>
|
|
28
|
+
<title>About</title>
|
|
29
|
+
<meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
|
|
30
|
+
<link rel="canonical" href="https://www.example.com/about">
|
|
31
|
+
<meta property="og:title" content="About">
|
|
32
|
+
<meta property="og:description" content="About page">
|
|
33
|
+
<meta property="og:url" content="https://www.example.com/about">
|
|
34
|
+
<meta property="og:type" content="website">
|
|
35
|
+
<meta property="og:image" content="https://www.example.com/icon.png">
|
|
36
|
+
</head>
|
|
37
|
+
<body><main><h1>About</h1></main></body>
|
|
38
|
+
</html>
|
|
39
|
+
HTML
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
43
|
+
urls: [local_page.url],
|
|
44
|
+
pages: [local_page],
|
|
45
|
+
issues: issues
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
refute_includes issues.to_a.map(&:code), :canonical_mismatch
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
def page(url: "https://example.com/about", body: nil)
|
|
54
|
+
body ||= <<~HTML
|
|
55
|
+
<html>
|
|
56
|
+
<head>
|
|
57
|
+
<title>About</title>
|
|
58
|
+
<meta name="description" content="Too short">
|
|
59
|
+
<link rel="canonical" href="https://example.com/about">
|
|
60
|
+
<meta property="og:title" content="About">
|
|
61
|
+
</head>
|
|
62
|
+
<body><main><h1>About</h1><h1>Team</h1></main></body>
|
|
63
|
+
</html>
|
|
64
|
+
HTML
|
|
65
|
+
|
|
66
|
+
Crawlscope::Page.new(
|
|
67
|
+
url: url,
|
|
68
|
+
normalized_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
69
|
+
final_url: url,
|
|
70
|
+
normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
71
|
+
status: 200,
|
|
72
|
+
headers: {"content-type" => "text/html"},
|
|
73
|
+
body: body,
|
|
74
|
+
doc: Nokogiri::HTML(body)
|
|
75
|
+
)
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -79,6 +79,97 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
|
|
|
79
79
|
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
|
|
80
80
|
end
|
|
81
81
|
|
|
82
|
+
def test_validates_job_posting_markup
|
|
83
|
+
issues = Crawlscope::IssueCollection.new
|
|
84
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
85
|
+
page = page(
|
|
86
|
+
url: "https://example.com/careers/sales-partner",
|
|
87
|
+
body: <<~HTML
|
|
88
|
+
<html>
|
|
89
|
+
<head>
|
|
90
|
+
<script type="application/ld+json">
|
|
91
|
+
{
|
|
92
|
+
"@context":"https://schema.org/",
|
|
93
|
+
"@type":"JobPosting",
|
|
94
|
+
"title":"Sales Partner",
|
|
95
|
+
"description":"A real role description.",
|
|
96
|
+
"datePosted":"2026-04-28",
|
|
97
|
+
"hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
|
|
98
|
+
"jobLocationType":"TELECOMMUTE",
|
|
99
|
+
"applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
|
|
100
|
+
}
|
|
101
|
+
</script>
|
|
102
|
+
</head>
|
|
103
|
+
<body><h1>Sales Partner</h1></body>
|
|
104
|
+
</html>
|
|
105
|
+
HTML
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
rule.call(
|
|
109
|
+
urls: [page.url],
|
|
110
|
+
pages: [page],
|
|
111
|
+
issues: issues,
|
|
112
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
assert_empty issues.to_a
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def test_reports_schema_errors_for_invalid_job_posting_markup
|
|
119
|
+
issues = Crawlscope::IssueCollection.new
|
|
120
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
121
|
+
page = page(
|
|
122
|
+
url: "https://example.com/careers/sales-partner",
|
|
123
|
+
body: <<~HTML
|
|
124
|
+
<html>
|
|
125
|
+
<head>
|
|
126
|
+
<script type="application/ld+json">
|
|
127
|
+
{"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
|
|
128
|
+
</script>
|
|
129
|
+
</head>
|
|
130
|
+
<body><h1>Sales Partner</h1></body>
|
|
131
|
+
</html>
|
|
132
|
+
HTML
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
rule.call(
|
|
136
|
+
urls: [page.url],
|
|
137
|
+
pages: [page],
|
|
138
|
+
issues: issues,
|
|
139
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
|
|
143
|
+
assert_includes issues.to_a.first.message, "description"
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def test_reports_missing_job_posting_for_career_detail_pages
|
|
147
|
+
issues = Crawlscope::IssueCollection.new
|
|
148
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
149
|
+
page = page(
|
|
150
|
+
url: "https://example.com/careers/sales-partner",
|
|
151
|
+
body: <<~HTML
|
|
152
|
+
<html>
|
|
153
|
+
<head>
|
|
154
|
+
<script type="application/ld+json">
|
|
155
|
+
{"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
|
|
156
|
+
</script>
|
|
157
|
+
</head>
|
|
158
|
+
<body><h1>Sales Partner</h1></body>
|
|
159
|
+
</html>
|
|
160
|
+
HTML
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
rule.call(
|
|
164
|
+
urls: [page.url],
|
|
165
|
+
pages: [page],
|
|
166
|
+
issues: issues,
|
|
167
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
assert_equal [:missing_job_posting], issues.to_a.map(&:code)
|
|
171
|
+
end
|
|
172
|
+
|
|
82
173
|
private
|
|
83
174
|
|
|
84
175
|
def page(url:, body:)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -233,6 +233,7 @@ files:
|
|
|
233
233
|
- test/crawlscope/http_test.rb
|
|
234
234
|
- test/crawlscope/links_rule_test.rb
|
|
235
235
|
- test/crawlscope/loader_test.rb
|
|
236
|
+
- test/crawlscope/metadata_rule_test.rb
|
|
236
237
|
- test/crawlscope/reporter_test.rb
|
|
237
238
|
- test/crawlscope/rule_registry_test.rb
|
|
238
239
|
- test/crawlscope/run_test.rb
|