crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
@@ -38,19 +38,163 @@ class CrawlscopeLinksRuleTest < Minitest::Test
38
38
  urls: ["https://example.com/guide", "https://example.com/pricing"],
39
39
  pages: pages,
40
40
  issues: issues,
41
- context: {
42
- allowed_statuses: [200, 301, 302],
43
- base_url: "https://example.com",
44
- resolve_target: method(:resolve_target)
45
- }
41
+ context: context
46
42
  )
47
43
 
48
44
  assert_equal [:broken_internal_link], issues.to_a.map(&:code)
49
45
  assert_includes issues.to_a.first.message, "HTTP 404"
50
46
  end
51
47
 
48
+ def test_reports_unresolved_internal_links
49
+ issues = Crawlscope::IssueCollection.new
50
+
51
+ Crawlscope::Rules::Links.new.call(
52
+ urls: [],
53
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/unknown\">Unknown</a></main>")],
54
+ issues: issues,
55
+ context: context(resolver: ->(_target_url) {})
56
+ )
57
+
58
+ assert_includes issues.to_a.map(&:code), :unresolved_internal_link
59
+ assert_includes issues.to_a.find { |issue| issue.code == :unresolved_internal_link }.message, "unable to validate internal link"
60
+ end
61
+
62
+ def test_ignores_fetch_errors_for_urls_already_crawled
63
+ issues = Crawlscope::IssueCollection.new
64
+ resolver = lambda do |target_url|
65
+ {
66
+ crawled: true,
67
+ error: "Timeout::Error: timed out",
68
+ final_url: target_url,
69
+ status: nil
70
+ }
71
+ end
72
+
73
+ Crawlscope::Rules::Links.new.call(
74
+ urls: [],
75
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
76
+ issues: issues,
77
+ context: context(resolver: resolver)
78
+ )
79
+
80
+ assert_empty issues.to_a
81
+ end
82
+
83
+ def test_reports_fetch_errors_for_uncrawled_targets
84
+ issues = Crawlscope::IssueCollection.new
85
+ resolver = lambda do |target_url|
86
+ {
87
+ crawled: false,
88
+ error: "Timeout::Error: timed out",
89
+ final_url: target_url,
90
+ status: nil
91
+ }
92
+ end
93
+
94
+ Crawlscope::Rules::Links.new.call(
95
+ urls: [],
96
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
97
+ issues: issues,
98
+ context: context(resolver: resolver)
99
+ )
100
+
101
+ assert_equal [:unresolved_internal_link], issues.to_a.map(&:code)
102
+ end
103
+
104
+ def test_reports_low_inbound_anchor_links
105
+ issues = Crawlscope::IssueCollection.new
106
+
107
+ Crawlscope::Rules::Links.new.call(
108
+ urls: ["https://example.com/guide", "https://example.com/pricing"],
109
+ pages: [
110
+ page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
111
+ page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
112
+ ],
113
+ issues: issues,
114
+ context: context
115
+ )
116
+
117
+ assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
118
+ assert_equal "https://example.com/guide", issues.to_a.first.url
119
+ end
120
+
121
+ def test_counts_root_page_links_as_inbound_links
122
+ issues = Crawlscope::IssueCollection.new
123
+
124
+ Crawlscope::Rules::Links.new.call(
125
+ urls: ["https://example.com/", "https://example.com/about"],
126
+ pages: [
127
+ page(url: "https://example.com/", body: "<main><a href=\"/about\">About</a></main>"),
128
+ page(url: "https://example.com/about", body: "<main><p>About</p></main>")
129
+ ],
130
+ issues: issues,
131
+ context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
132
+ )
133
+
134
+ refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
135
+ end
136
+
137
+ def test_reports_internal_links_that_redirect
138
+ issues = Crawlscope::IssueCollection.new
139
+ resolver = lambda do |target_url|
140
+ {
141
+ crawled: false,
142
+ error: nil,
143
+ final_url: "https://example.com/pricing",
144
+ status: 200
145
+ }
146
+ end
147
+
148
+ Crawlscope::Rules::Links.new.call(
149
+ urls: ["https://example.com/guide"],
150
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/plans\">Plans</a></main>")],
151
+ issues: issues,
152
+ context: context(resolver: resolver)
153
+ )
154
+
155
+ redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
156
+ assert redirect_issue
157
+ assert_includes redirect_issue.message, "https://example.com/pricing"
158
+ end
159
+
160
+ def test_ignores_links_that_should_not_be_crawled
161
+ issues = Crawlscope::IssueCollection.new
162
+
163
+ Crawlscope::Rules::Links.new.call(
164
+ urls: ["https://example.com/guide"],
165
+ pages: [
166
+ page(
167
+ url: "https://example.com/guide",
168
+ body: <<~HTML
169
+ <html>
170
+ <body>
171
+ <a href="#section">Jump</a>
172
+ <a href="mailto:test@example.com">Email</a>
173
+ <a href="https://other.example.com/page">External</a>
174
+ <a href="/rails/info">Rails</a>
175
+ <a href="/empty"> </a>
176
+ </body>
177
+ </html>
178
+ HTML
179
+ )
180
+ ],
181
+ issues: issues,
182
+ context: context
183
+ )
184
+
185
+ assert_empty issues.to_a
186
+ end
187
+
52
188
  private
53
189
 
190
+ def context(resolver: method(:resolve_target))
191
+ {
192
+ allowed_statuses: [200, 301, 302],
193
+ base_url: "https://example.com",
194
+ resolve_target: resolver
195
+ }
196
+ end
197
+
54
198
  def page(url:, body:)
55
199
  doc = Nokogiri::HTML(body)
56
200
 
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeMetadataRuleTest < Minitest::Test
6
+ def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
7
+ issues = Crawlscope::IssueCollection.new
8
+
9
+ Crawlscope::Rules::Metadata.new.call(
10
+ urls: [page.url],
11
+ pages: [page],
12
+ issues: issues
13
+ )
14
+
15
+ codes = issues.to_a.map(&:code)
16
+ assert_includes codes, :meta_description_too_short
17
+ assert_includes codes, :multiple_h1
18
+ assert_includes codes, :incomplete_open_graph_tags
19
+ end
20
+
21
+ def test_allows_localhost_page_with_matching_production_canonical_path
22
+ issues = Crawlscope::IssueCollection.new
23
+ local_page = page(
24
+ url: "http://localhost:3000/about",
25
+ body: <<~HTML
26
+ <html>
27
+ <head>
28
+ <title>About</title>
29
+ <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
30
+ <link rel="canonical" href="https://www.example.com/about">
31
+ <meta property="og:title" content="About">
32
+ <meta property="og:description" content="About page">
33
+ <meta property="og:url" content="https://www.example.com/about">
34
+ <meta property="og:type" content="website">
35
+ <meta property="og:image" content="https://www.example.com/icon.png">
36
+ </head>
37
+ <body><main><h1>About</h1></main></body>
38
+ </html>
39
+ HTML
40
+ )
41
+
42
+ Crawlscope::Rules::Metadata.new.call(
43
+ urls: [local_page.url],
44
+ pages: [local_page],
45
+ issues: issues
46
+ )
47
+
48
+ refute_includes issues.to_a.map(&:code), :canonical_mismatch
49
+ end
50
+
51
+ private
52
+
53
+ def page(url: "https://example.com/about", body: nil)
54
+ body ||= <<~HTML
55
+ <html>
56
+ <head>
57
+ <title>About</title>
58
+ <meta name="description" content="Too short">
59
+ <link rel="canonical" href="https://example.com/about">
60
+ <meta property="og:title" content="About">
61
+ </head>
62
+ <body><main><h1>About</h1><h1>Team</h1></main></body>
63
+ </html>
64
+ HTML
65
+
66
+ Crawlscope::Page.new(
67
+ url: url,
68
+ normalized_url: Crawlscope::Url.normalize(url, base_url: url),
69
+ final_url: url,
70
+ normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
71
+ status: 200,
72
+ headers: {"content-type" => "text/html"},
73
+ body: body,
74
+ doc: Nokogiri::HTML(body)
75
+ )
76
+ end
77
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeRuleRegistryTest < Minitest::Test
6
+ Rule = Data.define(:code)
7
+
8
+ def test_rules_for_returns_defaults_when_names_are_blank
9
+ metadata = Rule.new(:metadata)
10
+ links = Rule.new(:links)
11
+ registry = Crawlscope::RuleRegistry.new(rules: [metadata, links], default_codes: %i[links])
12
+
13
+ assert_equal [links], registry.rules_for(nil)
14
+ assert_equal [links], registry.rules_for("")
15
+ end
16
+
17
+ def test_rules_for_accepts_csv_and_arrays
18
+ metadata = Rule.new(:metadata)
19
+ links = Rule.new(:links)
20
+ registry = Crawlscope::RuleRegistry.new(rules: [metadata, links])
21
+
22
+ assert_equal [metadata, links], registry.rules_for(["metadata, links"])
23
+ end
24
+
25
+ def test_rules_for_rejects_unknown_rules
26
+ registry = Crawlscope::RuleRegistry.new(rules: [Rule.new(:metadata)])
27
+
28
+ error = assert_raises(Crawlscope::ConfigurationError) { registry.rules_for("links") }
29
+
30
+ assert_equal "Unknown Crawlscope rules: links", error.message
31
+ end
32
+ end
@@ -2,7 +2,7 @@
2
2
 
3
3
  require "test_helper"
4
4
 
5
- class CrawlscopeTaskTest < Minitest::Test
5
+ class CrawlscopeRunTest < Minitest::Test
6
6
  FakeResult = Data.define(:reported) do
7
7
  def ok?
8
8
  true
@@ -47,16 +47,29 @@ class CrawlscopeTaskTest < Minitest::Test
47
47
  end
48
48
  end
49
49
 
50
- class LdjsonConfiguration
50
+ class JsonLdConfiguration
51
51
  attr_reader :output
52
52
 
53
- def initialize(output:, browser:)
53
+ def initialize(output:, page:)
54
54
  @output = output
55
- @browser = browser
55
+ @page = page
56
+ @closed = false
56
57
  end
57
58
 
58
59
  def browser_factory
59
- -> { @browser }
60
+ -> { self }
61
+ end
62
+
63
+ def close
64
+ @closed = true
65
+ end
66
+
67
+ def closed?
68
+ @closed
69
+ end
70
+
71
+ def fetch(_url)
72
+ @page
60
73
  end
61
74
 
62
75
  def network_idle_timeout_seconds
@@ -80,30 +93,13 @@ class CrawlscopeTaskTest < Minitest::Test
80
93
  end
81
94
  end
82
95
 
83
- class FakeBrowser
84
- attr_reader :closed
85
-
86
- def initialize(page:)
87
- @page = page
88
- @closed = false
89
- end
90
-
91
- def close
92
- @closed = true
93
- end
94
-
95
- def fetch(_url)
96
- @page
97
- end
98
- end
99
-
100
96
  def test_validate_passes_rule_names_to_configuration_audit
101
97
  result = FakeResult.new(reported: true)
102
98
  configuration = FakeConfiguration.new(result: result)
103
99
  reporter = FakeReporter.new
104
100
 
105
- task = Crawlscope::Task.new(configuration: configuration, reporter: reporter)
106
- returned_result = task.validate(rule_names: "links")
101
+ run = Crawlscope::Run.new(configuration: configuration, reporter: reporter)
102
+ returned_result = run.validate(rule_names: "links")
107
103
 
108
104
  assert_equal(
109
105
  {
@@ -122,7 +118,7 @@ class CrawlscopeTaskTest < Minitest::Test
122
118
  configuration = FakeConfiguration.new(result: result, base_url: "https://example.com", sitemap_path: nil)
123
119
  reporter = FakeReporter.new
124
120
 
125
- Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
121
+ Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
126
122
 
127
123
  assert_equal(
128
124
  {
@@ -144,7 +140,7 @@ class CrawlscopeTaskTest < Minitest::Test
144
140
  File.write(sitemap_path, "<urlset></urlset>")
145
141
 
146
142
  Dir.chdir(tmp_dir) do
147
- Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
143
+ Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
148
144
  end
149
145
 
150
146
  assert_equal(
@@ -159,7 +155,7 @@ class CrawlscopeTaskTest < Minitest::Test
159
155
  FileUtils.rm_rf(tmp_dir) if tmp_dir
160
156
  end
161
157
 
162
- def test_validate_ldjson_uses_real_audit_and_writes_report
158
+ def test_validate_json_ld_reports_valid_structured_data
163
159
  body = <<~HTML
164
160
  <html>
165
161
  <head>
@@ -177,15 +173,14 @@ class CrawlscopeTaskTest < Minitest::Test
177
173
  status: 200,
178
174
  headers: {"content-type" => "text/html"},
179
175
  body: body,
180
- doc: Nokogiri::HTML(body)
176
+ doc: nil
181
177
  )
182
- browser = FakeBrowser.new(page: page)
183
178
  output = StringIO.new
184
- configuration = LdjsonConfiguration.new(output: output, browser: browser)
179
+ configuration = JsonLdConfiguration.new(output: output, page: page)
185
180
  report_dir = Dir.mktmpdir
186
181
  report_path = File.join(report_dir, "structured-data.json")
187
182
 
188
- result = Crawlscope::Task.new(configuration: configuration).validate_ldjson(
183
+ result = Crawlscope::Run.new(configuration: configuration).validate_json_ld(
189
184
  urls: [page.url],
190
185
  debug: true,
191
186
  report_path: report_path,
@@ -193,9 +188,9 @@ class CrawlscopeTaskTest < Minitest::Test
193
188
  )
194
189
 
195
190
  assert result.ok?
196
- assert browser.closed
191
+ assert_predicate configuration, :closed?
197
192
  assert File.exist?(report_path)
198
- assert_includes File.read(report_path), "https://example.com"
193
+ assert_equal ["https://example.com"], JSON.parse(File.read(report_path)).fetch("results").keys
199
194
  assert_includes output.string, "JavaScript mode enabled (Ferrum)"
200
195
  assert_includes output.string, "Validating JSON-LD on 1 URL(s)"
201
196
  assert_includes output.string, "All valid!"
@@ -86,4 +86,23 @@ class CrawlscopeSchemaRegistryTest < Minitest::Test
86
86
 
87
87
  assert_equal "Unknown Crawlscope rules: unknown", error.message
88
88
  end
89
+
90
+ def test_validate_accepts_arrays_graphs_unknown_types_and_non_hashes
91
+ registry = Crawlscope::SchemaRegistry.default
92
+
93
+ errors = registry.validate(
94
+ [
95
+ "ignored",
96
+ {"@type" => "UnknownThing"},
97
+ {
98
+ "@graph" => [
99
+ {"@type" => "Article"},
100
+ {"@type" => "WebSite", "name" => "Example"}
101
+ ]
102
+ }
103
+ ]
104
+ )
105
+
106
+ assert_equal ["Article"], errors.map { |error| error[:type] }
107
+ end
89
108
  end
@@ -48,4 +48,59 @@ class CrawlscopeSitemapTest < Minitest::Test
48
48
 
49
49
  assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
50
50
  end
51
+
52
+ def test_rebases_remote_sitemap_index_children_to_base_url
53
+ stub_request(:get, "http://localhost:3000/sitemap.xml")
54
+ .to_return(
55
+ status: 200,
56
+ body: <<~XML
57
+ <?xml version="1.0" encoding="UTF-8"?>
58
+ <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
59
+ <sitemap><loc>https://www.example.com/sitemap-marketing.xml</loc></sitemap>
60
+ </sitemapindex>
61
+ XML
62
+ )
63
+
64
+ stub_request(:get, "http://localhost:3000/sitemap-marketing.xml")
65
+ .to_return(
66
+ status: 200,
67
+ body: <<~XML
68
+ <?xml version="1.0" encoding="UTF-8"?>
69
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
70
+ <url><loc>https://www.example.com/features/reviews</loc></url>
71
+ </urlset>
72
+ XML
73
+ )
74
+
75
+ parser = Crawlscope::Sitemap.new(path: "http://localhost:3000/sitemap.xml")
76
+
77
+ assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
78
+ end
79
+
80
+ def test_parses_local_sitemap_index_with_absolute_child_sitemap_loc
81
+ Dir.mktmpdir do |dir|
82
+ File.write(
83
+ File.join(dir, "sitemap.xml"),
84
+ <<~XML
85
+ <?xml version="1.0" encoding="UTF-8"?>
86
+ <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
87
+ <sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
88
+ </sitemapindex>
89
+ XML
90
+ )
91
+ File.write(
92
+ File.join(dir, "sitemap-pages.xml"),
93
+ <<~XML
94
+ <?xml version="1.0" encoding="UTF-8"?>
95
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
96
+ <url><loc>https://www.example.com/features/reviews</loc></url>
97
+ </urlset>
98
+ XML
99
+ )
100
+
101
+ parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"))
102
+
103
+ assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
104
+ end
105
+ end
51
106
  end
@@ -25,4 +25,40 @@ class CrawlscopeStructuredDataDocumentTest < Minitest::Test
25
25
  assert_equal ["json-ld", "microdata"], items.map(&:source)
26
26
  assert_equal "Hotel Test", document.json_ld_items.first["name"]
27
27
  end
28
+
29
+ def test_json_ld_handles_arrays_invalid_json_and_non_object_entries
30
+ html = <<~HTML
31
+ <script type="application/ld+json">
32
+ [{"@type":"WebSite","name":"Example"}, "ignored"]
33
+ </script>
34
+ <script type="application/ld+json">
35
+ {"@type":
36
+ </script>
37
+ HTML
38
+
39
+ document = Crawlscope::StructuredData::Document.new(html: html)
40
+
41
+ assert_equal 2, document.items.size
42
+ assert_equal ["WebSite"], document.json_ld_items.map { |item| item["@type"] }
43
+ assert_equal "Invalid JSON-LD", document.items.last.data[:error]
44
+ end
45
+
46
+ def test_microdata_extracts_common_value_attributes
47
+ html = <<~HTML
48
+ <div itemscope itemtype="https://schema.org/Event">
49
+ <meta itemprop="name" content="Launch">
50
+ <time itemprop="startDate" datetime="2026-04-24T10:00:00Z"></time>
51
+ <a itemprop="url" href="https://example.com/event">Event</a>
52
+ <data itemprop="position" value="1"></data>
53
+ </div>
54
+ HTML
55
+
56
+ item = Crawlscope::StructuredData::Document.new(html: html).items.first.data
57
+
58
+ assert_equal "Event", item["@type"]
59
+ assert_equal "Launch", item["name"]
60
+ assert_equal "2026-04-24T10:00:00Z", item["startDate"]
61
+ assert_equal "https://example.com/event", item["url"]
62
+ assert_equal "1", item["position"]
63
+ end
28
64
  end
@@ -4,9 +4,9 @@ require "test_helper"
4
4
 
5
5
  class CrawlscopeStructuredDataReportTest < Minitest::Test
6
6
  def test_results_maps_validation_errors_and_skips
7
- result = Crawlscope::StructuredData::Audit::Result.new(
7
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
8
8
  entries: [
9
- Crawlscope::StructuredData::Audit::Entry.new(
9
+ Crawlscope::StructuredData::Audit::Page.new(
10
10
  url: "https://example.com/article",
11
11
  status: 200,
12
12
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -15,7 +15,7 @@ class CrawlscopeStructuredDataReportTest < Minitest::Test
15
15
  content_type: "text/html",
16
16
  skipped_reason: nil
17
17
  ),
18
- Crawlscope::StructuredData::Audit::Entry.new(
18
+ Crawlscope::StructuredData::Audit::Page.new(
19
19
  url: "https://example.com/feed.xml",
20
20
  status: 200,
21
21
  structured_items: [],
@@ -5,9 +5,9 @@ require "test_helper"
5
5
 
6
6
  class CrawlscopeStructuredDataReporterTest < Minitest::Test
7
7
  def test_reports_failures_and_report_path
8
- result = Crawlscope::StructuredData::Audit::Result.new(
8
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
9
9
  entries: [
10
- Crawlscope::StructuredData::Audit::Entry.new(
10
+ Crawlscope::StructuredData::Audit::Page.new(
11
11
  url: "https://example.com/article",
12
12
  status: 200,
13
13
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -59,6 +59,117 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
59
59
  assert_equal [:structured_data_parse_error], issues.to_a.map(&:code)
60
60
  end
61
61
 
62
+ def test_reports_missing_structured_data_for_html_pages
63
+ issues = Crawlscope::IssueCollection.new
64
+ rule = Crawlscope::Rules::StructuredData.new
65
+ page = page(
66
+ url: "https://example.com/articles/test",
67
+ body: "<html><body><main><h1>Article</h1></main></body></html>"
68
+ )
69
+
70
+ rule.call(
71
+ urls: [page.url],
72
+ pages: [page],
73
+ issues: issues,
74
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
75
+ )
76
+
77
+ assert_equal [:missing_structured_data], issues.to_a.map(&:code)
78
+ assert_equal "no structured data found; add JSON-LD or microdata markup", issues.to_a.first.message
79
+ assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
80
+ end
81
+
82
+ def test_validates_job_posting_markup
83
+ issues = Crawlscope::IssueCollection.new
84
+ rule = Crawlscope::Rules::StructuredData.new
85
+ page = page(
86
+ url: "https://example.com/careers/sales-partner",
87
+ body: <<~HTML
88
+ <html>
89
+ <head>
90
+ <script type="application/ld+json">
91
+ {
92
+ "@context":"https://schema.org/",
93
+ "@type":"JobPosting",
94
+ "title":"Sales Partner",
95
+ "description":"A real role description.",
96
+ "datePosted":"2026-04-28",
97
+ "hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
98
+ "jobLocationType":"TELECOMMUTE",
99
+ "applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
100
+ }
101
+ </script>
102
+ </head>
103
+ <body><h1>Sales Partner</h1></body>
104
+ </html>
105
+ HTML
106
+ )
107
+
108
+ rule.call(
109
+ urls: [page.url],
110
+ pages: [page],
111
+ issues: issues,
112
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
113
+ )
114
+
115
+ assert_empty issues.to_a
116
+ end
117
+
118
+ def test_reports_schema_errors_for_invalid_job_posting_markup
119
+ issues = Crawlscope::IssueCollection.new
120
+ rule = Crawlscope::Rules::StructuredData.new
121
+ page = page(
122
+ url: "https://example.com/careers/sales-partner",
123
+ body: <<~HTML
124
+ <html>
125
+ <head>
126
+ <script type="application/ld+json">
127
+ {"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
128
+ </script>
129
+ </head>
130
+ <body><h1>Sales Partner</h1></body>
131
+ </html>
132
+ HTML
133
+ )
134
+
135
+ rule.call(
136
+ urls: [page.url],
137
+ pages: [page],
138
+ issues: issues,
139
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
140
+ )
141
+
142
+ assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
143
+ assert_includes issues.to_a.first.message, "description"
144
+ end
145
+
146
+ def test_reports_missing_job_posting_for_career_detail_pages
147
+ issues = Crawlscope::IssueCollection.new
148
+ rule = Crawlscope::Rules::StructuredData.new
149
+ page = page(
150
+ url: "https://example.com/careers/sales-partner",
151
+ body: <<~HTML
152
+ <html>
153
+ <head>
154
+ <script type="application/ld+json">
155
+ {"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
156
+ </script>
157
+ </head>
158
+ <body><h1>Sales Partner</h1></body>
159
+ </html>
160
+ HTML
161
+ )
162
+
163
+ rule.call(
164
+ urls: [page.url],
165
+ pages: [page],
166
+ issues: issues,
167
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
168
+ )
169
+
170
+ assert_equal [:missing_job_posting], issues.to_a.map(&:code)
171
+ end
172
+
62
173
  private
63
174
 
64
175
  def page(url:, body:)