crawlscope 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -8
- data/README.md +21 -14
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +15 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +99 -48
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +61 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +406 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +143 -7
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +149 -5
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +111 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +15 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -38,19 +38,163 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
38
38
|
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
39
39
|
pages: pages,
|
|
40
40
|
issues: issues,
|
|
41
|
-
context:
|
|
42
|
-
allowed_statuses: [200, 301, 302],
|
|
43
|
-
base_url: "https://example.com",
|
|
44
|
-
resolve_target: method(:resolve_target)
|
|
45
|
-
}
|
|
41
|
+
context: context
|
|
46
42
|
)
|
|
47
43
|
|
|
48
44
|
assert_equal [:broken_internal_link], issues.to_a.map(&:code)
|
|
49
45
|
assert_includes issues.to_a.first.message, "HTTP 404"
|
|
50
46
|
end
|
|
51
47
|
|
|
48
|
+
def test_reports_unresolved_internal_links
|
|
49
|
+
issues = Crawlscope::IssueCollection.new
|
|
50
|
+
|
|
51
|
+
Crawlscope::Rules::Links.new.call(
|
|
52
|
+
urls: [],
|
|
53
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/unknown\">Unknown</a></main>")],
|
|
54
|
+
issues: issues,
|
|
55
|
+
context: context(resolver: ->(_target_url) {})
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
assert_includes issues.to_a.map(&:code), :unresolved_internal_link
|
|
59
|
+
assert_includes issues.to_a.find { |issue| issue.code == :unresolved_internal_link }.message, "unable to validate internal link"
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def test_ignores_fetch_errors_for_urls_already_crawled
|
|
63
|
+
issues = Crawlscope::IssueCollection.new
|
|
64
|
+
resolver = lambda do |target_url|
|
|
65
|
+
{
|
|
66
|
+
crawled: true,
|
|
67
|
+
error: "Timeout::Error: timed out",
|
|
68
|
+
final_url: target_url,
|
|
69
|
+
status: nil
|
|
70
|
+
}
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
Crawlscope::Rules::Links.new.call(
|
|
74
|
+
urls: [],
|
|
75
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
|
|
76
|
+
issues: issues,
|
|
77
|
+
context: context(resolver: resolver)
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
assert_empty issues.to_a
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def test_reports_fetch_errors_for_uncrawled_targets
|
|
84
|
+
issues = Crawlscope::IssueCollection.new
|
|
85
|
+
resolver = lambda do |target_url|
|
|
86
|
+
{
|
|
87
|
+
crawled: false,
|
|
88
|
+
error: "Timeout::Error: timed out",
|
|
89
|
+
final_url: target_url,
|
|
90
|
+
status: nil
|
|
91
|
+
}
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
Crawlscope::Rules::Links.new.call(
|
|
95
|
+
urls: [],
|
|
96
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
|
|
97
|
+
issues: issues,
|
|
98
|
+
context: context(resolver: resolver)
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
assert_equal [:unresolved_internal_link], issues.to_a.map(&:code)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def test_reports_low_inbound_anchor_links
|
|
105
|
+
issues = Crawlscope::IssueCollection.new
|
|
106
|
+
|
|
107
|
+
Crawlscope::Rules::Links.new.call(
|
|
108
|
+
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
109
|
+
pages: [
|
|
110
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
|
|
111
|
+
page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
|
|
112
|
+
],
|
|
113
|
+
issues: issues,
|
|
114
|
+
context: context
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
|
|
118
|
+
assert_equal "https://example.com/guide", issues.to_a.first.url
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def test_counts_root_page_links_as_inbound_links
|
|
122
|
+
issues = Crawlscope::IssueCollection.new
|
|
123
|
+
|
|
124
|
+
Crawlscope::Rules::Links.new.call(
|
|
125
|
+
urls: ["https://example.com/", "https://example.com/about"],
|
|
126
|
+
pages: [
|
|
127
|
+
page(url: "https://example.com/", body: "<main><a href=\"/about\">About</a></main>"),
|
|
128
|
+
page(url: "https://example.com/about", body: "<main><p>About</p></main>")
|
|
129
|
+
],
|
|
130
|
+
issues: issues,
|
|
131
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_reports_internal_links_that_redirect
|
|
138
|
+
issues = Crawlscope::IssueCollection.new
|
|
139
|
+
resolver = lambda do |target_url|
|
|
140
|
+
{
|
|
141
|
+
crawled: false,
|
|
142
|
+
error: nil,
|
|
143
|
+
final_url: "https://example.com/pricing",
|
|
144
|
+
status: 200
|
|
145
|
+
}
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
Crawlscope::Rules::Links.new.call(
|
|
149
|
+
urls: ["https://example.com/guide"],
|
|
150
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/plans\">Plans</a></main>")],
|
|
151
|
+
issues: issues,
|
|
152
|
+
context: context(resolver: resolver)
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
|
|
156
|
+
assert redirect_issue
|
|
157
|
+
assert_includes redirect_issue.message, "https://example.com/pricing"
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def test_ignores_links_that_should_not_be_crawled
|
|
161
|
+
issues = Crawlscope::IssueCollection.new
|
|
162
|
+
|
|
163
|
+
Crawlscope::Rules::Links.new.call(
|
|
164
|
+
urls: ["https://example.com/guide"],
|
|
165
|
+
pages: [
|
|
166
|
+
page(
|
|
167
|
+
url: "https://example.com/guide",
|
|
168
|
+
body: <<~HTML
|
|
169
|
+
<html>
|
|
170
|
+
<body>
|
|
171
|
+
<a href="#section">Jump</a>
|
|
172
|
+
<a href="mailto:test@example.com">Email</a>
|
|
173
|
+
<a href="https://other.example.com/page">External</a>
|
|
174
|
+
<a href="/rails/info">Rails</a>
|
|
175
|
+
<a href="/empty"> </a>
|
|
176
|
+
</body>
|
|
177
|
+
</html>
|
|
178
|
+
HTML
|
|
179
|
+
)
|
|
180
|
+
],
|
|
181
|
+
issues: issues,
|
|
182
|
+
context: context
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
assert_empty issues.to_a
|
|
186
|
+
end
|
|
187
|
+
|
|
52
188
|
private
|
|
53
189
|
|
|
190
|
+
def context(resolver: method(:resolve_target))
|
|
191
|
+
{
|
|
192
|
+
allowed_statuses: [200, 301, 302],
|
|
193
|
+
base_url: "https://example.com",
|
|
194
|
+
resolve_target: resolver
|
|
195
|
+
}
|
|
196
|
+
end
|
|
197
|
+
|
|
54
198
|
def page(url:, body:)
|
|
55
199
|
doc = Nokogiri::HTML(body)
|
|
56
200
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeMetadataRuleTest < Minitest::Test
|
|
6
|
+
def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
|
|
9
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
10
|
+
urls: [page.url],
|
|
11
|
+
pages: [page],
|
|
12
|
+
issues: issues
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
codes = issues.to_a.map(&:code)
|
|
16
|
+
assert_includes codes, :meta_description_too_short
|
|
17
|
+
assert_includes codes, :multiple_h1
|
|
18
|
+
assert_includes codes, :incomplete_open_graph_tags
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_allows_localhost_page_with_matching_production_canonical_path
|
|
22
|
+
issues = Crawlscope::IssueCollection.new
|
|
23
|
+
local_page = page(
|
|
24
|
+
url: "http://localhost:3000/about",
|
|
25
|
+
body: <<~HTML
|
|
26
|
+
<html>
|
|
27
|
+
<head>
|
|
28
|
+
<title>About</title>
|
|
29
|
+
<meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
|
|
30
|
+
<link rel="canonical" href="https://www.example.com/about">
|
|
31
|
+
<meta property="og:title" content="About">
|
|
32
|
+
<meta property="og:description" content="About page">
|
|
33
|
+
<meta property="og:url" content="https://www.example.com/about">
|
|
34
|
+
<meta property="og:type" content="website">
|
|
35
|
+
<meta property="og:image" content="https://www.example.com/icon.png">
|
|
36
|
+
</head>
|
|
37
|
+
<body><main><h1>About</h1></main></body>
|
|
38
|
+
</html>
|
|
39
|
+
HTML
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
43
|
+
urls: [local_page.url],
|
|
44
|
+
pages: [local_page],
|
|
45
|
+
issues: issues
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
refute_includes issues.to_a.map(&:code), :canonical_mismatch
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
def page(url: "https://example.com/about", body: nil)
|
|
54
|
+
body ||= <<~HTML
|
|
55
|
+
<html>
|
|
56
|
+
<head>
|
|
57
|
+
<title>About</title>
|
|
58
|
+
<meta name="description" content="Too short">
|
|
59
|
+
<link rel="canonical" href="https://example.com/about">
|
|
60
|
+
<meta property="og:title" content="About">
|
|
61
|
+
</head>
|
|
62
|
+
<body><main><h1>About</h1><h1>Team</h1></main></body>
|
|
63
|
+
</html>
|
|
64
|
+
HTML
|
|
65
|
+
|
|
66
|
+
Crawlscope::Page.new(
|
|
67
|
+
url: url,
|
|
68
|
+
normalized_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
69
|
+
final_url: url,
|
|
70
|
+
normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
71
|
+
status: 200,
|
|
72
|
+
headers: {"content-type" => "text/html"},
|
|
73
|
+
body: body,
|
|
74
|
+
doc: Nokogiri::HTML(body)
|
|
75
|
+
)
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeRuleRegistryTest < Minitest::Test
|
|
6
|
+
Rule = Data.define(:code)
|
|
7
|
+
|
|
8
|
+
def test_rules_for_returns_defaults_when_names_are_blank
|
|
9
|
+
metadata = Rule.new(:metadata)
|
|
10
|
+
links = Rule.new(:links)
|
|
11
|
+
registry = Crawlscope::RuleRegistry.new(rules: [metadata, links], default_codes: %i[links])
|
|
12
|
+
|
|
13
|
+
assert_equal [links], registry.rules_for(nil)
|
|
14
|
+
assert_equal [links], registry.rules_for("")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_rules_for_accepts_csv_and_arrays
|
|
18
|
+
metadata = Rule.new(:metadata)
|
|
19
|
+
links = Rule.new(:links)
|
|
20
|
+
registry = Crawlscope::RuleRegistry.new(rules: [metadata, links])
|
|
21
|
+
|
|
22
|
+
assert_equal [metadata, links], registry.rules_for(["metadata, links"])
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def test_rules_for_rejects_unknown_rules
|
|
26
|
+
registry = Crawlscope::RuleRegistry.new(rules: [Rule.new(:metadata)])
|
|
27
|
+
|
|
28
|
+
error = assert_raises(Crawlscope::ConfigurationError) { registry.rules_for("links") }
|
|
29
|
+
|
|
30
|
+
assert_equal "Unknown Crawlscope rules: links", error.message
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
|
-
class
|
|
5
|
+
class CrawlscopeRunTest < Minitest::Test
|
|
6
6
|
FakeResult = Data.define(:reported) do
|
|
7
7
|
def ok?
|
|
8
8
|
true
|
|
@@ -47,16 +47,29 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
47
47
|
end
|
|
48
48
|
end
|
|
49
49
|
|
|
50
|
-
class
|
|
50
|
+
class JsonLdConfiguration
|
|
51
51
|
attr_reader :output
|
|
52
52
|
|
|
53
|
-
def initialize(output:,
|
|
53
|
+
def initialize(output:, page:)
|
|
54
54
|
@output = output
|
|
55
|
-
@
|
|
55
|
+
@page = page
|
|
56
|
+
@closed = false
|
|
56
57
|
end
|
|
57
58
|
|
|
58
59
|
def browser_factory
|
|
59
|
-
-> {
|
|
60
|
+
-> { self }
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def close
|
|
64
|
+
@closed = true
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def closed?
|
|
68
|
+
@closed
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def fetch(_url)
|
|
72
|
+
@page
|
|
60
73
|
end
|
|
61
74
|
|
|
62
75
|
def network_idle_timeout_seconds
|
|
@@ -80,30 +93,13 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
80
93
|
end
|
|
81
94
|
end
|
|
82
95
|
|
|
83
|
-
class FakeBrowser
|
|
84
|
-
attr_reader :closed
|
|
85
|
-
|
|
86
|
-
def initialize(page:)
|
|
87
|
-
@page = page
|
|
88
|
-
@closed = false
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
def close
|
|
92
|
-
@closed = true
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
def fetch(_url)
|
|
96
|
-
@page
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
|
|
100
96
|
def test_validate_passes_rule_names_to_configuration_audit
|
|
101
97
|
result = FakeResult.new(reported: true)
|
|
102
98
|
configuration = FakeConfiguration.new(result: result)
|
|
103
99
|
reporter = FakeReporter.new
|
|
104
100
|
|
|
105
|
-
|
|
106
|
-
returned_result =
|
|
101
|
+
run = Crawlscope::Run.new(configuration: configuration, reporter: reporter)
|
|
102
|
+
returned_result = run.validate(rule_names: "links")
|
|
107
103
|
|
|
108
104
|
assert_equal(
|
|
109
105
|
{
|
|
@@ -122,7 +118,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
122
118
|
configuration = FakeConfiguration.new(result: result, base_url: "https://example.com", sitemap_path: nil)
|
|
123
119
|
reporter = FakeReporter.new
|
|
124
120
|
|
|
125
|
-
Crawlscope::
|
|
121
|
+
Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
|
|
126
122
|
|
|
127
123
|
assert_equal(
|
|
128
124
|
{
|
|
@@ -144,7 +140,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
144
140
|
File.write(sitemap_path, "<urlset></urlset>")
|
|
145
141
|
|
|
146
142
|
Dir.chdir(tmp_dir) do
|
|
147
|
-
Crawlscope::
|
|
143
|
+
Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
|
|
148
144
|
end
|
|
149
145
|
|
|
150
146
|
assert_equal(
|
|
@@ -159,7 +155,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
159
155
|
FileUtils.rm_rf(tmp_dir) if tmp_dir
|
|
160
156
|
end
|
|
161
157
|
|
|
162
|
-
def
|
|
158
|
+
def test_validate_json_ld_reports_valid_structured_data
|
|
163
159
|
body = <<~HTML
|
|
164
160
|
<html>
|
|
165
161
|
<head>
|
|
@@ -177,15 +173,14 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
177
173
|
status: 200,
|
|
178
174
|
headers: {"content-type" => "text/html"},
|
|
179
175
|
body: body,
|
|
180
|
-
doc:
|
|
176
|
+
doc: nil
|
|
181
177
|
)
|
|
182
|
-
browser = FakeBrowser.new(page: page)
|
|
183
178
|
output = StringIO.new
|
|
184
|
-
configuration =
|
|
179
|
+
configuration = JsonLdConfiguration.new(output: output, page: page)
|
|
185
180
|
report_dir = Dir.mktmpdir
|
|
186
181
|
report_path = File.join(report_dir, "structured-data.json")
|
|
187
182
|
|
|
188
|
-
result = Crawlscope::
|
|
183
|
+
result = Crawlscope::Run.new(configuration: configuration).validate_json_ld(
|
|
189
184
|
urls: [page.url],
|
|
190
185
|
debug: true,
|
|
191
186
|
report_path: report_path,
|
|
@@ -193,9 +188,9 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
193
188
|
)
|
|
194
189
|
|
|
195
190
|
assert result.ok?
|
|
196
|
-
|
|
191
|
+
assert_predicate configuration, :closed?
|
|
197
192
|
assert File.exist?(report_path)
|
|
198
|
-
|
|
193
|
+
assert_equal ["https://example.com"], JSON.parse(File.read(report_path)).fetch("results").keys
|
|
199
194
|
assert_includes output.string, "JavaScript mode enabled (Ferrum)"
|
|
200
195
|
assert_includes output.string, "Validating JSON-LD on 1 URL(s)"
|
|
201
196
|
assert_includes output.string, "All valid!"
|
|
@@ -86,4 +86,23 @@ class CrawlscopeSchemaRegistryTest < Minitest::Test
|
|
|
86
86
|
|
|
87
87
|
assert_equal "Unknown Crawlscope rules: unknown", error.message
|
|
88
88
|
end
|
|
89
|
+
|
|
90
|
+
def test_validate_accepts_arrays_graphs_unknown_types_and_non_hashes
|
|
91
|
+
registry = Crawlscope::SchemaRegistry.default
|
|
92
|
+
|
|
93
|
+
errors = registry.validate(
|
|
94
|
+
[
|
|
95
|
+
"ignored",
|
|
96
|
+
{"@type" => "UnknownThing"},
|
|
97
|
+
{
|
|
98
|
+
"@graph" => [
|
|
99
|
+
{"@type" => "Article"},
|
|
100
|
+
{"@type" => "WebSite", "name" => "Example"}
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
]
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
assert_equal ["Article"], errors.map { |error| error[:type] }
|
|
107
|
+
end
|
|
89
108
|
end
|
|
@@ -48,4 +48,59 @@ class CrawlscopeSitemapTest < Minitest::Test
|
|
|
48
48
|
|
|
49
49
|
assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
|
|
50
50
|
end
|
|
51
|
+
|
|
52
|
+
def test_rebases_remote_sitemap_index_children_to_base_url
|
|
53
|
+
stub_request(:get, "http://localhost:3000/sitemap.xml")
|
|
54
|
+
.to_return(
|
|
55
|
+
status: 200,
|
|
56
|
+
body: <<~XML
|
|
57
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
58
|
+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
59
|
+
<sitemap><loc>https://www.example.com/sitemap-marketing.xml</loc></sitemap>
|
|
60
|
+
</sitemapindex>
|
|
61
|
+
XML
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
stub_request(:get, "http://localhost:3000/sitemap-marketing.xml")
|
|
65
|
+
.to_return(
|
|
66
|
+
status: 200,
|
|
67
|
+
body: <<~XML
|
|
68
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
69
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
70
|
+
<url><loc>https://www.example.com/features/reviews</loc></url>
|
|
71
|
+
</urlset>
|
|
72
|
+
XML
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
parser = Crawlscope::Sitemap.new(path: "http://localhost:3000/sitemap.xml")
|
|
76
|
+
|
|
77
|
+
assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def test_parses_local_sitemap_index_with_absolute_child_sitemap_loc
|
|
81
|
+
Dir.mktmpdir do |dir|
|
|
82
|
+
File.write(
|
|
83
|
+
File.join(dir, "sitemap.xml"),
|
|
84
|
+
<<~XML
|
|
85
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
86
|
+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
87
|
+
<sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
|
|
88
|
+
</sitemapindex>
|
|
89
|
+
XML
|
|
90
|
+
)
|
|
91
|
+
File.write(
|
|
92
|
+
File.join(dir, "sitemap-pages.xml"),
|
|
93
|
+
<<~XML
|
|
94
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
95
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
96
|
+
<url><loc>https://www.example.com/features/reviews</loc></url>
|
|
97
|
+
</urlset>
|
|
98
|
+
XML
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"))
|
|
102
|
+
|
|
103
|
+
assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
|
|
104
|
+
end
|
|
105
|
+
end
|
|
51
106
|
end
|
|
@@ -25,4 +25,40 @@ class CrawlscopeStructuredDataDocumentTest < Minitest::Test
|
|
|
25
25
|
assert_equal ["json-ld", "microdata"], items.map(&:source)
|
|
26
26
|
assert_equal "Hotel Test", document.json_ld_items.first["name"]
|
|
27
27
|
end
|
|
28
|
+
|
|
29
|
+
def test_json_ld_handles_arrays_invalid_json_and_non_object_entries
|
|
30
|
+
html = <<~HTML
|
|
31
|
+
<script type="application/ld+json">
|
|
32
|
+
[{"@type":"WebSite","name":"Example"}, "ignored"]
|
|
33
|
+
</script>
|
|
34
|
+
<script type="application/ld+json">
|
|
35
|
+
{"@type":
|
|
36
|
+
</script>
|
|
37
|
+
HTML
|
|
38
|
+
|
|
39
|
+
document = Crawlscope::StructuredData::Document.new(html: html)
|
|
40
|
+
|
|
41
|
+
assert_equal 2, document.items.size
|
|
42
|
+
assert_equal ["WebSite"], document.json_ld_items.map { |item| item["@type"] }
|
|
43
|
+
assert_equal "Invalid JSON-LD", document.items.last.data[:error]
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def test_microdata_extracts_common_value_attributes
|
|
47
|
+
html = <<~HTML
|
|
48
|
+
<div itemscope itemtype="https://schema.org/Event">
|
|
49
|
+
<meta itemprop="name" content="Launch">
|
|
50
|
+
<time itemprop="startDate" datetime="2026-04-24T10:00:00Z"></time>
|
|
51
|
+
<a itemprop="url" href="https://example.com/event">Event</a>
|
|
52
|
+
<data itemprop="position" value="1"></data>
|
|
53
|
+
</div>
|
|
54
|
+
HTML
|
|
55
|
+
|
|
56
|
+
item = Crawlscope::StructuredData::Document.new(html: html).items.first.data
|
|
57
|
+
|
|
58
|
+
assert_equal "Event", item["@type"]
|
|
59
|
+
assert_equal "Launch", item["name"]
|
|
60
|
+
assert_equal "2026-04-24T10:00:00Z", item["startDate"]
|
|
61
|
+
assert_equal "https://example.com/event", item["url"]
|
|
62
|
+
assert_equal "1", item["position"]
|
|
63
|
+
end
|
|
28
64
|
end
|
|
@@ -4,9 +4,9 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeStructuredDataReportTest < Minitest::Test
|
|
6
6
|
def test_results_maps_validation_errors_and_skips
|
|
7
|
-
result = Crawlscope::StructuredData::Audit::
|
|
7
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
8
8
|
entries: [
|
|
9
|
-
Crawlscope::StructuredData::Audit::
|
|
9
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
10
10
|
url: "https://example.com/article",
|
|
11
11
|
status: 200,
|
|
12
12
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -15,7 +15,7 @@ class CrawlscopeStructuredDataReportTest < Minitest::Test
|
|
|
15
15
|
content_type: "text/html",
|
|
16
16
|
skipped_reason: nil
|
|
17
17
|
),
|
|
18
|
-
Crawlscope::StructuredData::Audit::
|
|
18
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
19
19
|
url: "https://example.com/feed.xml",
|
|
20
20
|
status: 200,
|
|
21
21
|
structured_items: [],
|
|
@@ -5,9 +5,9 @@ require "test_helper"
|
|
|
5
5
|
|
|
6
6
|
class CrawlscopeStructuredDataReporterTest < Minitest::Test
|
|
7
7
|
def test_reports_failures_and_report_path
|
|
8
|
-
result = Crawlscope::StructuredData::Audit::
|
|
8
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
9
9
|
entries: [
|
|
10
|
-
Crawlscope::StructuredData::Audit::
|
|
10
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
11
11
|
url: "https://example.com/article",
|
|
12
12
|
status: 200,
|
|
13
13
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -59,6 +59,117 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
|
|
|
59
59
|
assert_equal [:structured_data_parse_error], issues.to_a.map(&:code)
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
+
def test_reports_missing_structured_data_for_html_pages
|
|
63
|
+
issues = Crawlscope::IssueCollection.new
|
|
64
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
65
|
+
page = page(
|
|
66
|
+
url: "https://example.com/articles/test",
|
|
67
|
+
body: "<html><body><main><h1>Article</h1></main></body></html>"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
rule.call(
|
|
71
|
+
urls: [page.url],
|
|
72
|
+
pages: [page],
|
|
73
|
+
issues: issues,
|
|
74
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
assert_equal [:missing_structured_data], issues.to_a.map(&:code)
|
|
78
|
+
assert_equal "no structured data found; add JSON-LD or microdata markup", issues.to_a.first.message
|
|
79
|
+
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def test_validates_job_posting_markup
|
|
83
|
+
issues = Crawlscope::IssueCollection.new
|
|
84
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
85
|
+
page = page(
|
|
86
|
+
url: "https://example.com/careers/sales-partner",
|
|
87
|
+
body: <<~HTML
|
|
88
|
+
<html>
|
|
89
|
+
<head>
|
|
90
|
+
<script type="application/ld+json">
|
|
91
|
+
{
|
|
92
|
+
"@context":"https://schema.org/",
|
|
93
|
+
"@type":"JobPosting",
|
|
94
|
+
"title":"Sales Partner",
|
|
95
|
+
"description":"A real role description.",
|
|
96
|
+
"datePosted":"2026-04-28",
|
|
97
|
+
"hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
|
|
98
|
+
"jobLocationType":"TELECOMMUTE",
|
|
99
|
+
"applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
|
|
100
|
+
}
|
|
101
|
+
</script>
|
|
102
|
+
</head>
|
|
103
|
+
<body><h1>Sales Partner</h1></body>
|
|
104
|
+
</html>
|
|
105
|
+
HTML
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
rule.call(
|
|
109
|
+
urls: [page.url],
|
|
110
|
+
pages: [page],
|
|
111
|
+
issues: issues,
|
|
112
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
assert_empty issues.to_a
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def test_reports_schema_errors_for_invalid_job_posting_markup
|
|
119
|
+
issues = Crawlscope::IssueCollection.new
|
|
120
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
121
|
+
page = page(
|
|
122
|
+
url: "https://example.com/careers/sales-partner",
|
|
123
|
+
body: <<~HTML
|
|
124
|
+
<html>
|
|
125
|
+
<head>
|
|
126
|
+
<script type="application/ld+json">
|
|
127
|
+
{"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
|
|
128
|
+
</script>
|
|
129
|
+
</head>
|
|
130
|
+
<body><h1>Sales Partner</h1></body>
|
|
131
|
+
</html>
|
|
132
|
+
HTML
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
rule.call(
|
|
136
|
+
urls: [page.url],
|
|
137
|
+
pages: [page],
|
|
138
|
+
issues: issues,
|
|
139
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
|
|
143
|
+
assert_includes issues.to_a.first.message, "description"
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def test_reports_missing_job_posting_for_career_detail_pages
|
|
147
|
+
issues = Crawlscope::IssueCollection.new
|
|
148
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
149
|
+
page = page(
|
|
150
|
+
url: "https://example.com/careers/sales-partner",
|
|
151
|
+
body: <<~HTML
|
|
152
|
+
<html>
|
|
153
|
+
<head>
|
|
154
|
+
<script type="application/ld+json">
|
|
155
|
+
{"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
|
|
156
|
+
</script>
|
|
157
|
+
</head>
|
|
158
|
+
<body><h1>Sales Partner</h1></body>
|
|
159
|
+
</html>
|
|
160
|
+
HTML
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
rule.call(
|
|
164
|
+
urls: [page.url],
|
|
165
|
+
pages: [page],
|
|
166
|
+
issues: issues,
|
|
167
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
assert_equal [:missing_job_posting], issues.to_a.map(&:code)
|
|
171
|
+
end
|
|
172
|
+
|
|
62
173
|
private
|
|
63
174
|
|
|
64
175
|
def page(url:, body:)
|