crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "nokogiri"
5
+
6
module Crawlscope
  module StructuredData
    # Extracts structured data from an HTML string: JSON-LD <script> payloads
    # and schema.org microdata markup.
    class Document
      # One extracted item. +source+ is "json-ld" or "microdata"; +data+ is
      # the item's hash (or an error hash when a JSON-LD script fails to parse).
      Item = Data.define(:source, :data)

      # @param html [#to_s] markup to parse with Nokogiri
      def initialize(html:)
        @doc = Nokogiri::HTML(html.to_s)
      end

      # All extracted items, JSON-LD first and microdata second. Memoized.
      #
      # @return [Array<Item>]
      def items
        @items ||= [*extract_json_ld_items, *extract_microdata_items]
      end

      # Data hashes of the successfully parsed JSON-LD items only.
      #
      # @return [Array<Hash>]
      def json_ld_items
        parsed = items.select do |item|
          # Parse failures are stored under the Symbol key :error.
          item.source == "json-ld" && item.data.is_a?(Hash) && !item.data.key?(:error)
        end

        parsed.map(&:data)
      end

      private

      # Parses every <script type="application/ld+json"> node in the document.
      def extract_json_ld_items
        scripts = @doc.css('script[type="application/ld+json"]')
        scripts.flat_map { |script| parse_json_ld(script.content) }
      end

      # Turns one script body into items. A top-level array yields one item
      # per Hash element; non-Hash entries are dropped. Invalid JSON yields a
      # single item describing the parse error.
      def parse_json_ld(content)
        parsed = JSON.parse(content)
        candidates = parsed.is_a?(Array) ? parsed : [parsed]

        candidates.grep(Hash).map { |hash| Item.new(source: "json-ld", data: hash) }
      rescue JSON::ParserError => error
        [Item.new(source: "json-ld", data: {error: "Invalid JSON-LD", message: error.message})]
      end

      # Builds one microdata item per element whose itemtype starts with a
      # schema.org URL.
      def extract_microdata_items
        schema_nodes = @doc.css("[itemtype]").select do |node|
          node["itemtype"].to_s.start_with?("http://schema.org", "https://schema.org")
        end

        schema_nodes.map do |node|
          data = extract_microdata_item(node)
          # Keep only the last path segment of the itemtype URL as the type name.
          data["@type"] = node["itemtype"].to_s.sub(%r{.*/}, "")
          Item.new(source: "microdata", data: data)
        end
      end

      # Collects the itemprop values found under +node+, then appends nested
      # itemtype elements that carry no itemprop under a key named after
      # their type.
      def extract_microdata_item(node)
        properties = node.css("[itemprop]").each_with_object({}) do |prop_node, collected|
          collected[prop_node["itemprop"]] = extract_microdata_value(prop_node)
        end

        unnamed_scopes = node.css("[itemtype]").select { |scope| scope["itemprop"].nil? }
        unnamed_scopes.each do |scope|
          type_name = scope["itemtype"].to_s.sub(%r{.*/}, "")
          child = extract_microdata_item(scope)
          child["@type"] = type_name
          (properties[type_name] ||= []) << child
        end

        properties
      end

      # Resolves the value of an itemprop element: the first present attribute
      # among content, datetime, href, src and value, otherwise the stripped
      # text. Returns nil for nodes without itemprop, for <meta> elements with
      # none of those attributes, and for blank text.
      def extract_microdata_value(node)
        return if node["itemprop"].nil?

        attribute_value = node["content"] || node["datetime"] || node["href"] || node["src"] || node["value"]
        return attribute_value if attribute_value
        return if node.name == "meta"

        text = node.text.strip
        text unless text.empty?
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  module StructuredData
    # Classifies the entries of a structured-data audit result and builds the
    # per-URL summary hash.
    #
    # Each entry is expected to respond to: url, status, fetch_error, errors,
    # structured_data_found?, json_ld_count, skipped_reason and content_type.
    class Report
      # @param result [#entries] audit result whose entries are summarized
      def initialize(result)
        @result = result
      end

      # @return [Boolean] true when no entry falls into any failure category
      def all_valid?
        entries.none? { |entry| failed?(entry) }
      end

      # Number of entries (URLs) with at least one failure. An entry that is
      # both missing data and invalid is counted once, so the figure stays
      # meaningful next to #total.
      #
      # @return [Integer]
      def failure_count
        entries.count { |entry| failed?(entry) }
      end

      # Entries that carry a fetch error and whose status is not 200.
      def http_errors
        entries.select { |entry| http_error?(entry) }
      end

      # Loaded entries on which no structured data was found.
      def missing_data
        entries.select { |entry| missing_data?(entry) }
      end

      # @return [Hash] entry URL => summary hash (see #result_for)
      def results
        entries.to_h { |entry| [entry.url, result_for(entry)] }
      end

      # @return [Integer] number of audited entries
      def total
        entries.size
      end

      # Loaded entries that have at least one validation error.
      def validation_errors
        entries.select { |entry| invalid?(entry) }
      end

      private

      def entries
        @result.entries
      end

      def failed?(entry)
        http_error?(entry) || missing_data?(entry) || invalid?(entry)
      end

      def http_error?(entry)
        entry.fetch_error && entry.status != 200
      end

      def missing_data?(entry)
        loaded?(entry) && !entry.structured_data_found?
      end

      def invalid?(entry)
        loaded?(entry) && entry.errors.any?
      end

      # An entry counts as loaded when its status is 200, or when it has no
      # status and no fetch error. #result_for already reports the latter as
      # status 200; without this rule such entries could never be flagged as
      # missing data or invalid.
      def loaded?(entry)
        entry.status == 200 || (entry.status.nil? && !entry.fetch_error)
      end

      # Summary hash for one entry. Entries with a fetch error get a fixed
      # "nothing found" shape; a missing status is reported as "exception".
      # Other entries report their own data, with nil values dropped.
      def result_for(entry)
        if entry.fetch_error
          {
            status: entry.status || "exception",
            error: entry.fetch_error,
            structured_data_found: false,
            validation_errors: [],
            json_ld_count: 0
          }
        else
          {
            status: entry.status || 200,
            error: nil,
            structured_data_found: entry.structured_data_found?,
            validation_errors: entry.errors.flat_map { |error| error[:errors] },
            json_ld_count: entry.json_ld_count,
            skipped_reason: entry.skipped_reason,
            content_type: entry.content_type
          }.compact
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Crawlscope
  module StructuredData
    # Prints a plain-text summary of a structured-data audit result to an IO.
    class Reporter
      # @param io [#puts] destination for the summary lines
      # @param report_path [String, nil] when set, a pointer to this path is
      #   printed after the failure details
      def initialize(io:, report_path: nil)
        @io = io
        @report_path = report_path
      end

      # Prints a one-line success message when every entry is valid,
      # otherwise the categorized failure listing.
      #
      # @param result [Object] audit result accepted by Report.new
      def report(result)
        report = Report.new(result)

        if report.all_valid?
          @io.puts("")
          @io.puts("All #{report.total} URLs passed validation.")
        else
          report_failures(report)
        end
      end

      private

      # Path component of +url+ used as the bullet label. A URL without a
      # path (e.g. the bare site root) is shown as "/" rather than as an
      # empty string; an unparsable URL is shown verbatim.
      def extract_path(url)
        path = URI.parse(url).path.to_s
        path.empty? ? "/" : path
      rescue URI::InvalidURIError
        url
      end

      # Prints a "NAME (count):" heading, yields each item so the caller can
      # print it, then a blank separator. Prints nothing for an empty list.
      def print_category(name, items)
        return if items.empty?

        @io.puts("#{name} (#{items.size}):")
        items.each { |item| yield item }
        @io.puts("")
      end

      def report_failures(report)
        @io.puts("")
        @io.puts("VALIDATION FAILED (#{report.failure_count}/#{report.total} URLs)")
        @io.puts("")

        print_category("HTTP ERRORS", report.http_errors) do |entry|
          @io.puts("• #{extract_path(entry.url)} (#{entry.status}: #{entry.fetch_error})")
        end

        print_category("MISSING STRUCTURED DATA", report.missing_data) do |entry|
          @io.puts("• #{extract_path(entry.url)}")
        end

        print_category("VALIDATION ERRORS", report.validation_errors) do |entry|
          @io.puts("• #{extract_path(entry.url)}")

          entry.errors.each do |error|
            error[:errors].each do |validation_error|
              # Validation errors may use Symbol or String keys.
              field = validation_error[:field] || validation_error["field"] || "$"
              issue = validation_error[:issue] || validation_error["issue"] || "Unknown error"
              @io.puts(" - #{field}: #{issue}")
            end
          end
        end

        @io.puts("Full details available in: #{@report_path}") if @report_path

        @io.puts("#{report.failure_count} of #{report.total} URLs failed validation.")
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+ require "time"
6
+
7
module Crawlscope
  module StructuredData
    # Writes the per-URL results of a structured-data audit to a JSON file.
    class Writer
      # @param path [String] destination file; missing parent directories are
      #   created on write
      def initialize(path:)
        @path = path
      end

      # Serializes +result+ as pretty-printed JSON with two top-level keys:
      # generated_at (ISO 8601 timestamp) and results (from Report#results).
      #
      # @param result [Object] audit result accepted by Report.new
      def write(result)
        FileUtils.mkdir_p(File.dirname(@path))

        payload = {
          generated_at: Time.now.iso8601,
          results: Report.new(result).results
        }
        File.write(@path, JSON.pretty_generate(payload))
      end
    end
  end
end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Crawlscope
  # Runs sitemap audits and JSON-LD validations for a configuration and
  # prints their outcome to the configured output.
  class Task
    # @param configuration [Object] source of defaults, audit factory and output IO
    # @param reporter [#report] receives the result of #validate
    def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
      @configuration = configuration
      @reporter = reporter
    end

    # Audits the sitemap and hands the result to the reporter.
    #
    # @param base_url [String, nil] falls back to the configured base URL,
    #   then to "http://localhost:3000"
    # @param sitemap_path [String, nil] falls back to the configured sitemap,
    #   then to a default derived from the base URL
    # @param rule_names [Object, nil] passed through to the audit unchanged
    # @return [Object] the audit result
    def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
      site_url = base_url || default_base_url
      sitemap = sitemap_path || default_sitemap_path(base_url: site_url)
      audit = @configuration.audit(base_url: site_url, sitemap_path: sitemap, rule_names: rule_names)

      audit.call.tap { |result| @reporter.report(result) }
    end

    # Validates structured data on the given URLs and prints a per-URL report.
    # Optionally writes a JSON report (+report_path+) and prints a summary
    # (+summary+).
    #
    # @return [Object] the audit result
    def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
      audit = StructuredData::Audit.new(
        browser_factory: @configuration.browser_factory,
        network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
        renderer: renderer,
        schema_registry: @configuration.schema_registry,
        scroll_page: @configuration.scroll_page?,
        timeout_seconds: timeout_seconds
      )
      result = audit.call(urls: urls)

      report_ldjson_result(result, debug: debug, renderer: renderer)
      StructuredData::Writer.new(path: report_path).write(result) if report_path
      if summary
        StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result)
      end

      result
    end

    private

    # IO that all console output of this task goes to.
    def out
      @configuration.output
    end

    def default_base_url
      configured = @configuration.base_url
      configured.to_s.strip.empty? ? "http://localhost:3000" : configured
    end

    # Configured sitemap path if present; otherwise ./public/sitemap.xml when
    # the base URL is local and that file exists; otherwise
    # "<base_url>/sitemap.xml".
    def default_sitemap_path(base_url:)
      configured = @configuration.sitemap_path
      return configured unless configured.to_s.strip.empty?

      candidate = File.expand_path("public/sitemap.xml", Dir.pwd)
      return candidate if local_path_default?(base_url: base_url) && File.exist?(candidate)

      "#{base_url.to_s.chomp("/")}/sitemap.xml"
    end

    # True when the base URL's host is localhost or 127.0.0.1.
    def local_path_default?(base_url:)
      %w[localhost 127.0.0.1].include?(URI.parse(base_url.to_s).host.to_s)
    rescue URI::InvalidURIError
      false
    end

    def report_ldjson_result(result, debug:, renderer:)
      out.puts("JavaScript mode enabled (Ferrum)") if renderer == :browser

      out.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
      out.puts("")

      result.entries.each { |entry| print_ldjson_entry(entry, debug: debug) }

      out.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
    end

    # Prints the banner for one entry followed by either its fetch error or
    # its status, item counts, optional debug dump and validation results.
    def print_ldjson_entry(entry, debug:)
      banner = "=" * 80
      out.puts(banner)
      out.puts("URL: #{entry.url}")
      out.puts(banner)

      if entry.fetch_error
        out.puts("Error: #{entry.fetch_error}")
        out.puts("")
        return
      end

      # Entries without a status are labelled as a JS runtime fetch.
      out.puts(entry.status ? "Status: #{entry.status}" : "Status: JS runtime fetch")
      out.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")

      print_detected_items(entry) if debug && entry.structured_items.any?

      out.puts("")
      out.puts("Validation results:")
      print_validation_results(entry)
      out.puts("")
    end

    # Debug dump of every detected item as pretty-printed JSON.
    def print_detected_items(entry)
      out.puts("")
      out.puts("--- Detected Structured Data ---")

      entry.structured_items.each.with_index(1) do |item, number|
        out.puts("")
        out.puts("## Item #{number} [#{item[:source]}]")
        out.puts(JSON.pretty_generate(item[:data]))
      end

      out.puts("")
      out.puts("--- End ---")
    end

    def print_validation_results(entry)
      if entry.errors.empty?
        out.puts(" All valid!")
        return
      end

      entry.errors.each do |error|
        out.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
        error[:errors].each do |validation_error|
          out.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Crawlscope
  # Stateless helpers for normalizing and classifying URLs.
  module Url
    module_function

    # Builds a canonical "scheme://host[:port]/path" string for +url+.
    # Query string and fragment are dropped, a single trailing slash is
    # removed, and the port is kept only when it is not the scheme's default.
    #
    # @param url [#to_s] absolute, relative or protocol-relative URL
    # @param base_url [#to_s] base used to resolve non-absolute URLs
    # @return [String] the normalized URL, or +url.to_s+ unchanged when it
    #   cannot be parsed or resolved
    def normalize(url, base_url:)
      uri = URI.parse(url.to_s)
      # Protocol-relative references ("//host/path") have a host but no
      # scheme, so test both; otherwise the result would start with "://".
      uri = URI.join(base_url.to_s, url.to_s) if uri.scheme.nil? || uri.host.nil?

      host = uri.host.to_s
      host = "#{host}:#{uri.port}" if uri.port && uri.port != uri.default_port

      "#{uri.scheme}://#{host}#{clean_path(uri.path)}"
    rescue URI::InvalidURIError, URI::BadURIError
      # URI.join raises BadURIError when the base is blank or relative.
      url.to_s
    end

    # Path component of +url+ without a trailing slash ("/" when empty).
    #
    # @param url [#to_s]
    # @return [String, nil] nil when +url+ cannot be parsed
    def path(url)
      clean_path(URI.parse(url.to_s).path)
    rescue URI::InvalidURIError
      nil
    end

    # Whether +value+ is a URL with both a scheme and a host.
    #
    # @param value [#to_s]
    # @return [Boolean] false when +value+ cannot be parsed
    def remote?(value)
      uri = URI.parse(value.to_s)
      !uri.scheme.nil? && !uri.host.nil?
    rescue URI::InvalidURIError
      false
    end

    # Removes one trailing slash; an empty result becomes "/".
    def clean_path(raw_path)
      trimmed = raw_path.to_s.chomp("/")
      trimmed.empty? ? "/" : trimmed
    end
    private_class_method :clean_path
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Version string of the gem.
  VERSION = "0.1.0"
end
data/lib/crawlscope.rb ADDED
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+ require "zeitwerk"
5
+
6
module Crawlscope
  # Root of the gem's error hierarchy.
  class Error < StandardError
  end

  class ConfigurationError < Error
  end

  class ValidationError < Error
  end

  class << self
    # Loader stored in @loader; it is assigned after this module body.
    attr_reader :loader
  end

  # Process-wide configuration, created on first access.
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the shared configuration so callers can mutate it.
  def self.configure
    yield(configuration)
  end

  # Replaces the shared configuration with a fresh one and returns it.
  def self.reset!
    @configuration = Configuration.new
  end
end
28
+
29
# Create the Zeitwerk loader for this gem and expose it via Crawlscope.loader.
Crawlscope.instance_variable_set(:@loader, Zeitwerk::Loader.for_gem)

# The rake tasks directory and the railtie are excluded from autoloading;
# the railtie is required explicitly below.
Crawlscope.loader.tap do |gem_loader|
  gem_loader.ignore("#{__dir__}/tasks")
  gem_loader.ignore("#{__dir__}/crawlscope/railtie.rb")
  gem_loader.setup
end

require "crawlscope/railtie" if defined?(Rails::Railtie)
@@ -0,0 +1,44 @@
1
namespace :crawlscope do
  desc "Validate sitemap URLs with the default Crawlscope rules. ENV: BASE_URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
  task validate: :environment do
    exit_code = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
    exit(exit_code) if exit_code.nonzero?
  end

  namespace :validate do
    desc "Validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
    task ldjson: :environment do
      exit_code = Crawlscope::Cli.start(["ldjson"], out: $stdout, err: $stderr)
      exit(exit_code) if exit_code.nonzero?
    end

    # One task per rule; each runs the regular validate command restricted
    # to that single rule.
    %w[metadata structured_data uniqueness links].each do |rule|
      desc "Validate sitemap URLs with the #{rule} rule. ENV: BASE_URL, SITEMAP, JS=1"
      task(rule.to_sym => :environment) do
        crawlscope_task_with_rules(rule)
      end
    end
  end

  # Runs the validate command with ENV["RULES"] temporarily set to +rules+.
  # The previous value is restored afterwards, including when the command
  # exits with a non-zero status.
  def crawlscope_task_with_rules(rules)
    previous_rules = ENV["RULES"]
    ENV["RULES"] = rules
    exit_code = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
    exit(exit_code) if exit_code.nonzero?
  ensure
    ENV["RULES"] = previous_rules
  end
end
@@ -0,0 +1,165 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# Exercises Crawlscope::Audit end to end against a sitemap file on disk,
# with pages served either by stubbed HTTP requests or by a fake browser.
class CrawlscopeAuditTest < Minitest::Test
  def setup
    @tmp_dir = Dir.mktmpdir
    @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
  end

  def teardown
    FileUtils.rm_rf(@tmp_dir)
  end

  def test_returns_ok_when_metadata_is_valid
    write_sitemap("https://example.com/pricing")
    stub_html_page("https://example.com/pricing", valid_pricing_html)

    result = build_audit.call

    assert result.ok?
    assert_empty result.issues.to_a
  end

  def test_collects_metadata_issues_for_invalid_page
    write_sitemap("https://example.com/about")
    stub_html_page("https://example.com/about", <<~HTML)
      <html>
      <head>
      <title>Example About Example</title>
      <meta name="description" content="#{"a" * 161}">
      </head>
      <body>
      <main>
      <p>About</p>
      </main>
      </body>
      </html>
    HTML

    result = build_audit.call
    reported_codes = result.issues.to_a.map(&:code).uniq.sort

    refute result.ok?
    assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, reported_codes
  end

  def test_uses_browser_when_renderer_is_browser
    write_sitemap("https://example.com/pricing")

    # Records every fetched URL and whether close was called.
    fake_browser = Class.new do
      attr_reader :closed, :urls

      def initialize(body)
        @body = body
        @closed = false
        @urls = []
      end

      def close
        @closed = true
      end

      def fetch(url)
        @urls << url

        Crawlscope::Page.new(
          url: url,
          normalized_url: url,
          final_url: url,
          normalized_final_url: url,
          status: 200,
          headers: {"content-type" => "text/html"},
          body: @body,
          doc: Nokogiri::HTML(@body)
        )
      end
    end.new(valid_pricing_html)

    result = build_audit(renderer: :browser, browser_factory: -> { fake_browser }).call

    assert result.ok?
    assert_equal ["https://example.com/pricing"], fake_browser.urls
    assert fake_browser.closed
  end

  private

  # Writes a sitemap containing a single <url> entry for +url+.
  def write_sitemap(url)
    File.write(@sitemap_path, <<~XML)
      <?xml version="1.0" encoding="UTF-8"?>
      <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>#{url}</loc></url>
      </urlset>
    XML
  end

  # Stubs a GET for +url+ that answers 200 text/html with +body+.
  def stub_html_page(url, body)
    stub_request(:get, url)
      .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: body)
  end

  # Page with a title, meta description, canonical link and an <h1>.
  def valid_pricing_html
    <<~HTML
      <html>
      <head>
      <title>Pricing</title>
      <meta name="description" content="Plans for hotels and restaurants">
      <link rel="canonical" href="https://example.com/pricing">
      </head>
      <body>
      <main>
      <h1>Pricing</h1>
      </main>
      </body>
      </html>
    HTML
  end

  # Audit over the test sitemap with the default rules; +overrides+ are
  # passed through as extra keyword arguments.
  def build_audit(**overrides)
    Crawlscope::Audit.new(
      base_url: "https://example.com",
      sitemap_path: @sitemap_path,
      rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
      schema_registry: Crawlscope::SchemaRegistry.default,
      **overrides
    )
  end
end