crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Exercises Crawlscope::Cli.start using stand-in configuration and task
# objects, checking exit statuses, printed output and forwarded arguments.
class CrawlscopeCliTest < Minitest::Test
  # Configuration double exposing the accessors the tests read back.
  class FakeConfiguration
    attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds

    def initialize
      @timeout_seconds = 20
      @renderer = :http
      @network_idle_timeout_seconds = 5
      @concurrency = 10
    end

    def browser_concurrency
      4
    end
  end

  # Task double that records the keyword arguments of each call and always
  # answers with a result whose `ok?` is true.
  class FakeTask
    attr_reader :validate_arguments, :ldjson_arguments

    def validate(base_url:, sitemap_path:, rule_names:)
      @validate_arguments = {base_url: base_url, sitemap_path: sitemap_path, rule_names: rule_names}
      passing_result
    end

    def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
      @ldjson_arguments = {
        urls: urls,
        debug: debug,
        renderer: renderer,
        report_path: report_path,
        summary: summary,
        timeout_seconds: timeout_seconds
      }
      passing_result
    end

    private

    def passing_result
      Struct.new(:ok?).new(true)
    end
  end

  # Fresh output/error streams for every test.
  def setup
    @out = StringIO.new
    @err = StringIO.new
  end

  def test_version_prints_current_version
    exit_code = run_cli(["version"])

    assert_equal 0, exit_code
    assert_equal "#{Crawlscope::VERSION}\n", @out.string
    assert_empty @err.string
  end

  def test_unknown_command_returns_error
    exit_code = run_cli(["unknown"])

    assert_equal 1, exit_code
    assert_includes @err.string, "Unknown command: unknown"
    assert_includes @err.string, "crawlscope validate --base-url"
  end

  def test_validate_passes_arguments_to_task
    config = FakeConfiguration.new
    recorder = FakeTask.new
    argv = [
      "validate",
      "--base-url", "https://example.com",
      "--sitemap", "https://example.com/sitemap-pages.xml",
      "--rules", "metadata,links",
      "--renderer", "browser",
      "--timeout", "30",
      "--network-idle-timeout", "9",
      "--concurrency", "3"
    ]

    exit_code = run_cli(argv, configuration: config, task: recorder)

    forwarded = {
      base_url: "https://example.com",
      sitemap_path: "https://example.com/sitemap-pages.xml",
      rule_names: "metadata,links"
    }
    assert_equal 0, exit_code
    assert_equal(forwarded, recorder.validate_arguments)
    assert_equal :browser, config.renderer
    assert_equal 30, config.timeout_seconds
    assert_equal 9, config.network_idle_timeout_seconds
    assert_equal 3, config.concurrency
    assert_same @out, config.output
    assert_empty @err.string
  end

  def test_ldjson_reads_urls_from_environment
    config = FakeConfiguration.new
    recorder = FakeTask.new
    environment = {"URL" => "https://example.com/a; https://example.com/b", "SUMMARY" => "1", "DEBUG" => "1"}

    exit_code = with_env(environment) do
      run_cli(["ldjson"], configuration: config, task: recorder)
    end

    forwarded = {
      urls: ["https://example.com/a", "https://example.com/b"],
      debug: true,
      renderer: :http,
      report_path: nil,
      summary: true,
      timeout_seconds: 20
    }
    assert_equal 0, exit_code
    assert_equal(forwarded, recorder.ldjson_arguments)
    assert_same @out, config.output
    assert_empty @err.string
  end

  private

  # Invokes the CLI against this test's streams; extra keywords are passed on.
  def run_cli(argv, **extras)
    Crawlscope::Cli.start(argv, out: @out, err: @err, **extras)
  end

  # Applies +overrides+ to ENV for the duration of the block, then puts the
  # previous values back (removing keys that were absent before).
  def with_env(overrides)
    previous = overrides.keys.to_h { |name| [name, ENV[name]] }
    write_env(overrides)

    yield
  ensure
    write_env(previous)
  end

  # Writes each pair into ENV; a nil value removes the variable.
  def write_env(pairs)
    pairs.each do |name, value|
      if value.nil?
        ENV.delete(name)
      else
        ENV[name] = value
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Checks how Crawlscope.configuration.audit is assembled from configured
# values and which errors it raises when required settings are absent.
class CrawlscopeConfigurationTest < Minitest::Test
  # Reset the configuration after each test so settings do not carry over.
  def teardown
    Crawlscope.reset!
  end

  def test_audit_builds_from_configured_callables
    Crawlscope.configure do |settings|
      settings.base_url = -> { "https://example.com" }
      settings.sitemap_path = -> { "/tmp/sitemap.xml" }
      settings.site_name = -> { "Example" }
      settings.concurrency = -> { 4 }
    end

    built = Crawlscope.configuration.audit
    rule_codes = built.instance_variable_get(:@rules).map(&:code)

    assert_equal "https://example.com", built.instance_variable_get(:@base_url)
    assert_equal "/tmp/sitemap.xml", built.instance_variable_get(:@sitemap_path)
    assert_equal 4, built.instance_variable_get(:@concurrency)
    assert_equal %i[metadata structured_data uniqueness links], rule_codes
  end

  def test_audit_raises_without_base_url
    assert_audit_fails_with("Crawlscope base_url is not configured") do |settings|
      settings.sitemap_path = "/tmp/sitemap.xml"
    end
  end

  def test_audit_raises_without_sitemap_path
    assert_audit_fails_with("Crawlscope sitemap_path is not configured") do |settings|
      settings.base_url = "https://example.com"
    end
  end

  private

  # Configures Crawlscope through the given block, then expects building the
  # audit to raise ConfigurationError carrying exactly +expected_message+.
  def assert_audit_fails_with(expected_message)
    Crawlscope.configure { |settings| yield settings }

    raised = assert_raises(Crawlscope::ConfigurationError) { Crawlscope.configuration.audit }

    assert_equal expected_message, raised.message
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Runs the Links rule over two in-memory pages and checks that the one link
# whose target resolves to HTTP 404 is the only issue reported.
class CrawlscopeLinksRuleTest < Minitest::Test
  GUIDE_URL = "https://example.com/guide"
  PRICING_URL = "https://example.com/pricing"
  MISSING_URL = "https://example.com/missing"

  # Canned resolver outcomes keyed by target URL.
  TARGET_OUTCOMES = {
    GUIDE_URL => {crawled: true, status: 200},
    PRICING_URL => {crawled: true, status: 200},
    MISSING_URL => {crawled: false, status: 404}
  }.freeze

  def test_reports_broken_internal_links
    collected = Crawlscope::IssueCollection.new
    guide_html = <<~HTML
      <html>
      <body>
      <main>
      <a href="/pricing">Pricing</a>
      <a href="/missing">Missing</a>
      </main>
      </body>
      </html>
    HTML
    pricing_html = <<~HTML
      <html>
      <body>
      <main>
      <a href="/guide">Guide</a>
      </main>
      </body>
      </html>
    HTML
    crawled_pages = [
      page(url: "https://example.com/guide", body: guide_html),
      page(url: "https://example.com/pricing", body: pricing_html)
    ]
    rule_context = {
      allowed_statuses: [200, 301, 302],
      base_url: "https://example.com",
      resolve_target: method(:resolve_target)
    }

    Crawlscope::Rules::Links.new.call(
      urls: ["https://example.com/guide", "https://example.com/pricing"],
      pages: crawled_pages,
      issues: collected,
      context: rule_context
    )

    reported = collected.to_a
    assert_equal [:broken_internal_link], reported.map(&:code)
    assert_includes reported.first.message, "HTTP 404"
  end

  private

  # Builds a 200 text/html Page whose url fields all equal +url+.
  def page(url:, body:)
    Crawlscope::Page.new(
      url: url,
      normalized_url: url,
      final_url: url,
      normalized_final_url: url,
      status: 200,
      headers: {"content-type" => "text/html"},
      body: body,
      doc: Nokogiri::HTML(body)
    )
  end

  # Stand-in for the audit's target resolver; answers nil for unknown URLs.
  def resolve_target(target_url)
    outcome = TARGET_OUTCOMES[target_url]
    return unless outcome

    {
      crawled: outcome.fetch(:crawled),
      error: nil,
      final_url: target_url,
      status: outcome.fetch(:status)
    }
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Guards against load-time noise: eager loading must print nothing to
# stdout or stderr.
class CrawlscopeLoaderTest < Minitest::Test
  def test_eager_loads_cleanly
    assert_silent { Crawlscope.loader.eager_load }
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "stringio"
4
+ require "test_helper"
5
+
6
# Verifies the text Crawlscope::Reporter writes for passing and failing results.
class CrawlscopeReporterTest < Minitest::Test
  def test_reports_ok_result
    text = rendered(
      result_for(urls: ["https://example.com"], issues: Crawlscope::IssueCollection.new)
    )

    assert_includes text, "Crawlscope validation"
    assert_includes text, "Status: OK"
    refute_includes text, "Status: FAILED"
  end

  def test_reports_failed_result_with_severity_counts
    found = Crawlscope::IssueCollection.new
    found.add(
      code: :missing_title,
      severity: :warning,
      category: :metadata,
      url: "https://example.com/a",
      message: "missing <title>",
      details: {}
    )
    found.add(
      code: :broken_internal_link,
      severity: :notice,
      category: :links,
      url: "https://example.com/b",
      message: "broken internal link",
      details: {}
    )

    text = rendered(
      result_for(urls: ["https://example.com/a", "https://example.com/b"], issues: found)
    )

    [
      "Status: FAILED",
      "Issues: 2",
      "notice: 1",
      "warning: 1",
      "- [warning] https://example.com/a missing <title>",
      "- [notice] https://example.com/b broken internal link"
    ].each { |fragment| assert_includes text, fragment }
  end

  private

  # Builds a Result with one placeholder page object per URL.
  def result_for(urls:, issues:)
    Crawlscope::Result.new(
      base_url: "https://example.com",
      sitemap_path: "/tmp/sitemap.xml",
      urls: urls,
      pages: Array.new(urls.size) { Object.new },
      issues: issues
    )
  end

  # Reports +result+ into a StringIO and returns what was written.
  def rendered(result)
    sink = StringIO.new
    Crawlscope::Reporter.new(io: sink).report(result)
    sink.string
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Covers registration, copying and validation behaviour of
# Crawlscope::SchemaRegistry, plus the unknown-rule error from RuleRegistry.
class CrawlscopeSchemaRegistryTest < Minitest::Test
  def test_registers_and_fetches_schema_by_type
    # Register on a copy rather than on SchemaRegistry.default itself: this
    # test replaces the "Article" schema with a permissive one, and other
    # tests in this class rely on the default Article schema requiring
    # `headline`. NOTE(review): whether `default` returns a shared instance is
    # not visible here; working on a dup keeps the test hermetic either way.
    registry = Crawlscope::SchemaRegistry.default.dup
    schema = {"type" => "object"}

    registry.register("Article", schema)

    assert registry.registered?("Article")
    assert_equal schema, registry.fetch("Article")
  end

  def test_dup_copies_registered_schemas
    registry = Crawlscope::SchemaRegistry.new(schemas: {"ThingOne" => {"type" => "object"}})

    copy = registry.dup
    copy.register("ThingTwo", {"type" => "object"})

    # The copy keeps what was registered before and changes stay local to it.
    assert registry.registered?("ThingOne")
    refute registry.registered?("ThingTwo")
    assert copy.registered?("ThingTwo")
  end

  def test_validate_reports_default_schema_errors
    errors = Crawlscope::SchemaRegistry.default.validate(
      {
        "@context" => "https://schema.org",
        "@type" => "Article"
      }
    )

    assert_predicate errors, :any?
    assert_equal "Article", errors.first[:type]
    assert_includes errors.first[:issue], "headline"
  end

  def test_default_registry_includes_extended_schema_types
    registry = Crawlscope::SchemaRegistry.default

    %w[HowTo Recipe Event VideoObject].each do |type|
      assert registry.registered?(type)
    end
  end

  def test_web_application_review_requires_review_rating
    errors = Crawlscope::SchemaRegistry.default.validate(
      {
        "@context" => "https://schema.org",
        "@type" => "WebApplication",
        "name" => "ROI Calculator",
        "url" => "https://example.com/tools/uplift",
        "review" => {
          "@type" => "Review",
          "reviewBody" => "Helpful tool."
        }
      }
    )

    assert errors.any? { |error| error[:issue].include?("did not contain a required property of 'reviewRating'") }
  end

  def test_product_allows_image_object_variants
    errors = Crawlscope::SchemaRegistry.default.validate(
      {
        "@context" => "https://schema.org",
        "@type" => "Product",
        "name" => "Example Product",
        "image" => {
          "@type" => "ImageObject",
          "url" => "https://example.com/image.png"
        }
      }
    )

    assert_empty errors
  end

  def test_rule_registry_raises_for_unknown_rules
    error = assert_raises(Crawlscope::ConfigurationError) do
      Crawlscope::RuleRegistry.default.rules_for("metadata,unknown")
    end

    assert_equal "Unknown Crawlscope rules: unknown", error.message
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Feeds stubbed remote sitemap documents to Crawlscope::Sitemap and checks the
# URL list it returns for a plain urlset and for a sitemap index.
class CrawlscopeSitemapTest < Minitest::Test
  def test_parses_remote_sitemap_urlset
    stub_sitemap("https://www.example.com/sitemap.xml", <<~XML)
      <?xml version="1.0" encoding="UTF-8"?>
      <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>https://www.example.com/</loc></url>
      <url><loc>/pricing</loc></url>
      </urlset>
    XML

    sitemap = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
    listed = sitemap.urls(base_url: "https://www.example.com")

    # The relative <loc> comes back joined onto the base URL.
    assert_equal ["https://www.example.com/", "https://www.example.com/pricing"], listed
  end

  def test_parses_remote_sitemap_index_with_child_sitemap
    stub_sitemap("https://www.example.com/sitemap.xml", <<~XML)
      <?xml version="1.0" encoding="UTF-8"?>
      <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <sitemap><loc>/sitemaps/content.xml</loc></sitemap>
      </sitemapindex>
    XML
    stub_sitemap("https://www.example.com/sitemaps/content.xml", <<~XML)
      <?xml version="1.0" encoding="UTF-8"?>
      <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>https://www.example.com/features/reviews</loc></url>
      </urlset>
    XML

    sitemap = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
    listed = sitemap.urls(base_url: "https://www.example.com")

    assert_equal ["https://www.example.com/features/reviews"], listed
  end

  private

  # Stubs a GET of +url+ to answer 200 with +xml+ as the body.
  def stub_sitemap(url, xml)
    stub_request(:get, url).to_return(status: 200, body: xml)
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Drives Crawlscope::StructuredData::Audit through a fake browser that hands
# back a pre-built page, covering schema errors, non-success statuses and
# non-HTML responses.
class CrawlscopeStructuredDataAuditTest < Minitest::Test
  # Browser double: returns the same page for any URL and remembers whether
  # `close` was called.
  class FakeBrowser
    attr_reader :closed

    def initialize(page:)
      @closed = false
      @page = page
    end

    def fetch(_url)
      @page
    end

    def close
      @closed = true
    end
  end

  def test_reports_schema_errors_for_invalid_article_markup
    markup = <<~HTML
      <html>
      <head>
      <script type="application/ld+json">
      {"@context":"https://schema.org","@type":"Article"}
      </script>
      </head>
      </html>
    HTML
    page = build_page(
      url: "https://example.com/articles/test",
      status: 200,
      content_type: "text/html",
      body: markup,
      doc: Nokogiri::HTML(markup)
    )

    result, browser = audit_page(page)

    refute result.ok?
    assert_equal 1, result.entries.size
    assert_equal "Article", result.entries.first.errors.first[:type]
    assert browser.closed
  end

  def test_reports_fetch_errors_for_non_success_statuses
    page = build_page(
      url: "https://example.com/missing",
      status: 404,
      content_type: "text/html",
      body: "",
      doc: Nokogiri::HTML("")
    )

    result, _browser = audit_page(page)

    refute result.ok?
    assert_equal "Non-success status", result.entries.first.fetch_error
  end

  def test_skips_non_html_responses_without_treating_them_as_missing_data
    page = build_page(
      url: "https://example.com/feed.xml",
      status: 200,
      content_type: "application/xml",
      body: "<feed></feed>",
      doc: nil
    )

    result, _browser = audit_page(page)
    entry = result.entries.first

    assert result.ok?
    assert_equal "application/xml", entry.content_type
    assert_equal "non-html", entry.skipped_reason
    assert_predicate entry, :structured_data_found?
  end

  private

  # Builds a Page whose four url fields all equal +url+ and whose only header
  # is the given content type.
  def build_page(url:, status:, content_type:, body:, doc:)
    Crawlscope::Page.new(
      url: url,
      normalized_url: url,
      final_url: url,
      normalized_final_url: url,
      status: status,
      headers: {"content-type" => content_type},
      body: body,
      doc: doc
    )
  end

  # Audits +page+ via a FakeBrowser with the browser renderer and the default
  # schema registry; returns the audit result together with the browser.
  def audit_page(page)
    browser = FakeBrowser.new(page: page)
    audit = Crawlscope::StructuredData::Audit.new(
      browser_factory: -> { browser },
      renderer: :browser,
      schema_registry: Crawlscope::SchemaRegistry.default,
      timeout_seconds: 20
    )

    [audit.call(urls: [page.url]), browser]
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Checks that StructuredData::Document extracts both the JSON-LD script and
# the microdata element from one HTML document.
class CrawlscopeStructuredDataDocumentTest < Minitest::Test
  MARKUP = <<~HTML
    <html>
    <body>
    <script type="application/ld+json">
    {"@type":"Hotel","name":"Hotel Test"}
    </script>

    <div itemscope itemtype="https://schema.org/Organization">
    <span itemprop="name">Acme Hospitality</span>
    </div>
    </body>
    </html>
  HTML

  def test_items_returns_json_ld_and_microdata_entries
    document = Crawlscope::StructuredData::Document.new(html: MARKUP)
    extracted = document.items

    assert_equal 2, extracted.size
    assert_equal ["json-ld", "microdata"], extracted.map(&:source)
    assert_equal "Hotel Test", document.json_ld_items.first["name"]
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Checks how StructuredData::Report summarises an audit result containing one
# entry with validation errors and one skipped non-HTML entry.
class CrawlscopeStructuredDataReportTest < Minitest::Test
  def test_results_maps_validation_errors_and_skips
    headline_error = {field: "headline", issue: "is required"}
    article_entry = Crawlscope::StructuredData::Audit::Entry.new(
      url: "https://example.com/article",
      status: 200,
      structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
      errors: [{type: "Article", source: "json-ld", errors: [headline_error]}],
      fetch_error: nil,
      content_type: "text/html",
      skipped_reason: nil
    )
    feed_entry = Crawlscope::StructuredData::Audit::Entry.new(
      url: "https://example.com/feed.xml",
      status: 200,
      structured_items: [],
      errors: [],
      fetch_error: nil,
      content_type: "application/xml",
      skipped_reason: "non-html"
    )
    audit_result = Crawlscope::StructuredData::Audit::Result.new(entries: [article_entry, feed_entry])

    report = Crawlscope::StructuredData::Report.new(audit_result)
    by_url = report.results

    assert_equal [{field: "headline", issue: "is required"}], by_url["https://example.com/article"][:validation_errors]
    assert_equal "non-html", by_url["https://example.com/feed.xml"][:skipped_reason]
    assert_empty report.missing_data
    assert_equal 1, report.validation_errors.size
  end
end