crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -11
  3. data/README.md +20 -13
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +10 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +76 -43
  13. data/lib/crawlscope/rules/structured_data.rb +14 -1
  14. data/lib/crawlscope/run.rb +60 -0
  15. data/lib/crawlscope/schema_registry.rb +3 -349
  16. data/lib/crawlscope/schemas.rb +355 -0
  17. data/lib/crawlscope/sitemap.rb +18 -6
  18. data/lib/crawlscope/structured_data/audit.rb +7 -7
  19. data/lib/crawlscope/structured_data/check.rb +35 -0
  20. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  21. data/lib/crawlscope/url.rb +14 -0
  22. data/lib/crawlscope/version.rb +1 -1
  23. data/lib/tasks/crawlscope_tasks.rake +12 -23
  24. data/test/crawlscope/browser_test.rb +155 -0
  25. data/test/crawlscope/cli_test.rb +128 -6
  26. data/test/crawlscope/configuration_test.rb +49 -0
  27. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
  28. data/test/crawlscope/crawler_test.rb +34 -0
  29. data/test/crawlscope/http_test.rb +56 -0
  30. data/test/crawlscope/links_rule_test.rb +110 -5
  31. data/test/crawlscope/rule_registry_test.rb +32 -0
  32. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  33. data/test/crawlscope/schema_registry_test.rb +19 -0
  34. data/test/crawlscope/sitemap_test.rb +55 -0
  35. data/test/crawlscope/structured_data_document_test.rb +36 -0
  36. data/test/crawlscope/structured_data_report_test.rb +3 -3
  37. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  38. data/test/crawlscope/structured_data_rule_test.rb +20 -0
  39. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  40. data/test/crawlscope/url_test.rb +31 -0
  41. metadata +14 -5
  42. data/lib/crawlscope/task.rb +0 -131
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeRuleRegistryTest < Minitest::Test
6
+ Rule = Data.define(:code)
7
+
8
+ def test_rules_for_returns_defaults_when_names_are_blank
9
+ metadata = Rule.new(:metadata)
10
+ links = Rule.new(:links)
11
+ registry = Crawlscope::RuleRegistry.new(rules: [metadata, links], default_codes: %i[links])
12
+
13
+ assert_equal [links], registry.rules_for(nil)
14
+ assert_equal [links], registry.rules_for("")
15
+ end
16
+
17
+ def test_rules_for_accepts_csv_and_arrays
18
+ metadata = Rule.new(:metadata)
19
+ links = Rule.new(:links)
20
+ registry = Crawlscope::RuleRegistry.new(rules: [metadata, links])
21
+
22
+ assert_equal [metadata, links], registry.rules_for(["metadata, links"])
23
+ end
24
+
25
+ def test_rules_for_rejects_unknown_rules
26
+ registry = Crawlscope::RuleRegistry.new(rules: [Rule.new(:metadata)])
27
+
28
+ error = assert_raises(Crawlscope::ConfigurationError) { registry.rules_for("links") }
29
+
30
+ assert_equal "Unknown Crawlscope rules: links", error.message
31
+ end
32
+ end
@@ -2,7 +2,7 @@
2
2
 
3
3
  require "test_helper"
4
4
 
5
- class CrawlscopeTaskTest < Minitest::Test
5
+ class CrawlscopeRunTest < Minitest::Test
6
6
  FakeResult = Data.define(:reported) do
7
7
  def ok?
8
8
  true
@@ -47,16 +47,29 @@ class CrawlscopeTaskTest < Minitest::Test
47
47
  end
48
48
  end
49
49
 
50
- class LdjsonConfiguration
50
+ class JsonLdConfiguration
51
51
  attr_reader :output
52
52
 
53
- def initialize(output:, browser:)
53
+ def initialize(output:, page:)
54
54
  @output = output
55
- @browser = browser
55
+ @page = page
56
+ @closed = false
56
57
  end
57
58
 
58
59
  def browser_factory
59
- -> { @browser }
60
+ -> { self }
61
+ end
62
+
63
+ def close
64
+ @closed = true
65
+ end
66
+
67
+ def closed?
68
+ @closed
69
+ end
70
+
71
+ def fetch(_url)
72
+ @page
60
73
  end
61
74
 
62
75
  def network_idle_timeout_seconds
@@ -80,30 +93,13 @@ class CrawlscopeTaskTest < Minitest::Test
80
93
  end
81
94
  end
82
95
 
83
- class FakeBrowser
84
- attr_reader :closed
85
-
86
- def initialize(page:)
87
- @page = page
88
- @closed = false
89
- end
90
-
91
- def close
92
- @closed = true
93
- end
94
-
95
- def fetch(_url)
96
- @page
97
- end
98
- end
99
-
100
96
  def test_validate_passes_rule_names_to_configuration_audit
101
97
  result = FakeResult.new(reported: true)
102
98
  configuration = FakeConfiguration.new(result: result)
103
99
  reporter = FakeReporter.new
104
100
 
105
- task = Crawlscope::Task.new(configuration: configuration, reporter: reporter)
106
- returned_result = task.validate(rule_names: "links")
101
+ run = Crawlscope::Run.new(configuration: configuration, reporter: reporter)
102
+ returned_result = run.validate(rule_names: "links")
107
103
 
108
104
  assert_equal(
109
105
  {
@@ -122,7 +118,7 @@ class CrawlscopeTaskTest < Minitest::Test
122
118
  configuration = FakeConfiguration.new(result: result, base_url: "https://example.com", sitemap_path: nil)
123
119
  reporter = FakeReporter.new
124
120
 
125
- Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
121
+ Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
126
122
 
127
123
  assert_equal(
128
124
  {
@@ -144,7 +140,7 @@ class CrawlscopeTaskTest < Minitest::Test
144
140
  File.write(sitemap_path, "<urlset></urlset>")
145
141
 
146
142
  Dir.chdir(tmp_dir) do
147
- Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
143
+ Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
148
144
  end
149
145
 
150
146
  assert_equal(
@@ -159,7 +155,7 @@ class CrawlscopeTaskTest < Minitest::Test
159
155
  FileUtils.rm_rf(tmp_dir) if tmp_dir
160
156
  end
161
157
 
162
- def test_validate_ldjson_uses_real_audit_and_writes_report
158
+ def test_validate_json_ld_reports_valid_structured_data
163
159
  body = <<~HTML
164
160
  <html>
165
161
  <head>
@@ -177,15 +173,14 @@ class CrawlscopeTaskTest < Minitest::Test
177
173
  status: 200,
178
174
  headers: {"content-type" => "text/html"},
179
175
  body: body,
180
- doc: Nokogiri::HTML(body)
176
+ doc: nil
181
177
  )
182
- browser = FakeBrowser.new(page: page)
183
178
  output = StringIO.new
184
- configuration = LdjsonConfiguration.new(output: output, browser: browser)
179
+ configuration = JsonLdConfiguration.new(output: output, page: page)
185
180
  report_dir = Dir.mktmpdir
186
181
  report_path = File.join(report_dir, "structured-data.json")
187
182
 
188
- result = Crawlscope::Task.new(configuration: configuration).validate_ldjson(
183
+ result = Crawlscope::Run.new(configuration: configuration).validate_json_ld(
189
184
  urls: [page.url],
190
185
  debug: true,
191
186
  report_path: report_path,
@@ -193,9 +188,9 @@ class CrawlscopeTaskTest < Minitest::Test
193
188
  )
194
189
 
195
190
  assert result.ok?
196
- assert browser.closed
191
+ assert_predicate configuration, :closed?
197
192
  assert File.exist?(report_path)
198
- assert_includes File.read(report_path), "https://example.com"
193
+ assert_equal ["https://example.com"], JSON.parse(File.read(report_path)).fetch("results").keys
199
194
  assert_includes output.string, "JavaScript mode enabled (Ferrum)"
200
195
  assert_includes output.string, "Validating JSON-LD on 1 URL(s)"
201
196
  assert_includes output.string, "All valid!"
@@ -86,4 +86,23 @@ class CrawlscopeSchemaRegistryTest < Minitest::Test
86
86
 
87
87
  assert_equal "Unknown Crawlscope rules: unknown", error.message
88
88
  end
89
+
90
+ def test_validate_accepts_arrays_graphs_unknown_types_and_non_hashes
91
+ registry = Crawlscope::SchemaRegistry.default
92
+
93
+ errors = registry.validate(
94
+ [
95
+ "ignored",
96
+ {"@type" => "UnknownThing"},
97
+ {
98
+ "@graph" => [
99
+ {"@type" => "Article"},
100
+ {"@type" => "WebSite", "name" => "Example"}
101
+ ]
102
+ }
103
+ ]
104
+ )
105
+
106
+ assert_equal ["Article"], errors.map { |error| error[:type] }
107
+ end
89
108
  end
@@ -48,4 +48,59 @@ class CrawlscopeSitemapTest < Minitest::Test
48
48
 
49
49
  assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
50
50
  end
51
+
52
+ def test_rebases_remote_sitemap_index_children_to_base_url
53
+ stub_request(:get, "http://localhost:3000/sitemap.xml")
54
+ .to_return(
55
+ status: 200,
56
+ body: <<~XML
57
+ <?xml version="1.0" encoding="UTF-8"?>
58
+ <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
59
+ <sitemap><loc>https://www.example.com/sitemap-marketing.xml</loc></sitemap>
60
+ </sitemapindex>
61
+ XML
62
+ )
63
+
64
+ stub_request(:get, "http://localhost:3000/sitemap-marketing.xml")
65
+ .to_return(
66
+ status: 200,
67
+ body: <<~XML
68
+ <?xml version="1.0" encoding="UTF-8"?>
69
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
70
+ <url><loc>https://www.example.com/features/reviews</loc></url>
71
+ </urlset>
72
+ XML
73
+ )
74
+
75
+ parser = Crawlscope::Sitemap.new(path: "http://localhost:3000/sitemap.xml")
76
+
77
+ assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
78
+ end
79
+
80
+ def test_parses_local_sitemap_index_with_absolute_child_sitemap_loc
81
+ Dir.mktmpdir do |dir|
82
+ File.write(
83
+ File.join(dir, "sitemap.xml"),
84
+ <<~XML
85
+ <?xml version="1.0" encoding="UTF-8"?>
86
+ <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
87
+ <sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
88
+ </sitemapindex>
89
+ XML
90
+ )
91
+ File.write(
92
+ File.join(dir, "sitemap-pages.xml"),
93
+ <<~XML
94
+ <?xml version="1.0" encoding="UTF-8"?>
95
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
96
+ <url><loc>https://www.example.com/features/reviews</loc></url>
97
+ </urlset>
98
+ XML
99
+ )
100
+
101
+ parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"))
102
+
103
+ assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
104
+ end
105
+ end
51
106
  end
@@ -25,4 +25,40 @@ class CrawlscopeStructuredDataDocumentTest < Minitest::Test
25
25
  assert_equal ["json-ld", "microdata"], items.map(&:source)
26
26
  assert_equal "Hotel Test", document.json_ld_items.first["name"]
27
27
  end
28
+
29
+ def test_json_ld_handles_arrays_invalid_json_and_non_object_entries
30
+ html = <<~HTML
31
+ <script type="application/ld+json">
32
+ [{"@type":"WebSite","name":"Example"}, "ignored"]
33
+ </script>
34
+ <script type="application/ld+json">
35
+ {"@type":
36
+ </script>
37
+ HTML
38
+
39
+ document = Crawlscope::StructuredData::Document.new(html: html)
40
+
41
+ assert_equal 2, document.items.size
42
+ assert_equal ["WebSite"], document.json_ld_items.map { |item| item["@type"] }
43
+ assert_equal "Invalid JSON-LD", document.items.last.data[:error]
44
+ end
45
+
46
+ def test_microdata_extracts_common_value_attributes
47
+ html = <<~HTML
48
+ <div itemscope itemtype="https://schema.org/Event">
49
+ <meta itemprop="name" content="Launch">
50
+ <time itemprop="startDate" datetime="2026-04-24T10:00:00Z"></time>
51
+ <a itemprop="url" href="https://example.com/event">Event</a>
52
+ <data itemprop="position" value="1"></data>
53
+ </div>
54
+ HTML
55
+
56
+ item = Crawlscope::StructuredData::Document.new(html: html).items.first.data
57
+
58
+ assert_equal "Event", item["@type"]
59
+ assert_equal "Launch", item["name"]
60
+ assert_equal "2026-04-24T10:00:00Z", item["startDate"]
61
+ assert_equal "https://example.com/event", item["url"]
62
+ assert_equal "1", item["position"]
63
+ end
28
64
  end
@@ -4,9 +4,9 @@ require "test_helper"
4
4
 
5
5
  class CrawlscopeStructuredDataReportTest < Minitest::Test
6
6
  def test_results_maps_validation_errors_and_skips
7
- result = Crawlscope::StructuredData::Audit::Result.new(
7
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
8
8
  entries: [
9
- Crawlscope::StructuredData::Audit::Entry.new(
9
+ Crawlscope::StructuredData::Audit::Page.new(
10
10
  url: "https://example.com/article",
11
11
  status: 200,
12
12
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -15,7 +15,7 @@ class CrawlscopeStructuredDataReportTest < Minitest::Test
15
15
  content_type: "text/html",
16
16
  skipped_reason: nil
17
17
  ),
18
- Crawlscope::StructuredData::Audit::Entry.new(
18
+ Crawlscope::StructuredData::Audit::Page.new(
19
19
  url: "https://example.com/feed.xml",
20
20
  status: 200,
21
21
  structured_items: [],
@@ -5,9 +5,9 @@ require "test_helper"
5
5
 
6
6
  class CrawlscopeStructuredDataReporterTest < Minitest::Test
7
7
  def test_reports_failures_and_report_path
8
- result = Crawlscope::StructuredData::Audit::Result.new(
8
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
9
9
  entries: [
10
- Crawlscope::StructuredData::Audit::Entry.new(
10
+ Crawlscope::StructuredData::Audit::Page.new(
11
11
  url: "https://example.com/article",
12
12
  status: 200,
13
13
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -59,6 +59,26 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
59
59
  assert_equal [:structured_data_parse_error], issues.to_a.map(&:code)
60
60
  end
61
61
 
62
+ def test_reports_missing_structured_data_for_html_pages
63
+ issues = Crawlscope::IssueCollection.new
64
+ rule = Crawlscope::Rules::StructuredData.new
65
+ page = page(
66
+ url: "https://example.com/articles/test",
67
+ body: "<html><body><main><h1>Article</h1></main></body></html>"
68
+ )
69
+
70
+ rule.call(
71
+ urls: [page.url],
72
+ pages: [page],
73
+ issues: issues,
74
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
75
+ )
76
+
77
+ assert_equal [:missing_structured_data], issues.to_a.map(&:code)
78
+ assert_equal "no structured data found; add JSON-LD or microdata markup", issues.to_a.first.message
79
+ assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
80
+ end
81
+
62
82
  private
63
83
 
64
84
  def page(url:, body:)
@@ -5,9 +5,9 @@ require "test_helper"
5
5
 
6
6
  class CrawlscopeStructuredDataWriterTest < Minitest::Test
7
7
  def test_writes_json_report
8
- result = Crawlscope::StructuredData::Audit::Result.new(
8
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
9
9
  entries: [
10
- Crawlscope::StructuredData::Audit::Entry.new(
10
+ Crawlscope::StructuredData::Audit::Page.new(
11
11
  url: "https://example.com/article",
12
12
  status: 200,
13
13
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeUrlTest < Minitest::Test
6
+ def test_normalize_resolves_relative_urls_and_removes_trailing_slash
7
+ assert_equal "https://example.com/pricing", Crawlscope::Url.normalize("/pricing/", base_url: "https://example.com")
8
+ end
9
+
10
+ def test_normalize_preserves_non_default_port
11
+ assert_equal "http://localhost:3000/pricing", Crawlscope::Url.normalize("/pricing", base_url: "http://localhost:3000")
12
+ end
13
+
14
+ def test_normalize_for_base_rebases_absolute_urls
15
+ assert_equal(
16
+ "http://localhost:3000/features",
17
+ Crawlscope::Url.normalize_for_base("https://www.example.com/features", base_url: "http://localhost:3000")
18
+ )
19
+ end
20
+
21
+ def test_path_normalizes_blank_and_trailing_slash
22
+ assert_equal "/", Crawlscope::Url.path("https://example.com")
23
+ assert_equal "/features", Crawlscope::Url.path("https://example.com/features/")
24
+ end
25
+
26
+ def test_invalid_urls_are_returned_or_ignored
27
+ assert_equal "http:// bad", Crawlscope::Url.normalize("http:// bad", base_url: "https://example.com")
28
+ assert_nil Crawlscope::Url.path("http:// bad")
29
+ refute Crawlscope::Url.remote?("http:// bad")
30
+ end
31
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlscope
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paulo Fidalgo
@@ -193,16 +193,18 @@ files:
193
193
  - README.md
194
194
  - exe/crawlscope
195
195
  - lib/crawlscope.rb
196
- - lib/crawlscope/audit.rb
197
196
  - lib/crawlscope/browser.rb
198
197
  - lib/crawlscope/cli.rb
199
198
  - lib/crawlscope/configuration.rb
199
+ - lib/crawlscope/context.rb
200
+ - lib/crawlscope/crawl.rb
200
201
  - lib/crawlscope/crawler.rb
201
202
  - lib/crawlscope/http.rb
202
203
  - lib/crawlscope/issue.rb
203
204
  - lib/crawlscope/issue_collection.rb
204
205
  - lib/crawlscope/page.rb
205
206
  - lib/crawlscope/railtie.rb
207
+ - lib/crawlscope/rake_tasks.rb
206
208
  - lib/crawlscope/reporter.rb
207
209
  - lib/crawlscope/result.rb
208
210
  - lib/crawlscope/rule_registry.rb
@@ -210,23 +212,30 @@ files:
210
212
  - lib/crawlscope/rules/metadata.rb
211
213
  - lib/crawlscope/rules/structured_data.rb
212
214
  - lib/crawlscope/rules/uniqueness.rb
215
+ - lib/crawlscope/run.rb
213
216
  - lib/crawlscope/schema_registry.rb
217
+ - lib/crawlscope/schemas.rb
214
218
  - lib/crawlscope/sitemap.rb
215
219
  - lib/crawlscope/structured_data/audit.rb
220
+ - lib/crawlscope/structured_data/check.rb
216
221
  - lib/crawlscope/structured_data/document.rb
217
222
  - lib/crawlscope/structured_data/report.rb
218
223
  - lib/crawlscope/structured_data/reporter.rb
219
224
  - lib/crawlscope/structured_data/writer.rb
220
- - lib/crawlscope/task.rb
221
225
  - lib/crawlscope/url.rb
222
226
  - lib/crawlscope/version.rb
223
227
  - lib/tasks/crawlscope_tasks.rake
224
- - test/crawlscope/audit_test.rb
228
+ - test/crawlscope/browser_test.rb
225
229
  - test/crawlscope/cli_test.rb
226
230
  - test/crawlscope/configuration_test.rb
231
+ - test/crawlscope/crawl_test.rb
232
+ - test/crawlscope/crawler_test.rb
233
+ - test/crawlscope/http_test.rb
227
234
  - test/crawlscope/links_rule_test.rb
228
235
  - test/crawlscope/loader_test.rb
229
236
  - test/crawlscope/reporter_test.rb
237
+ - test/crawlscope/rule_registry_test.rb
238
+ - test/crawlscope/run_test.rb
230
239
  - test/crawlscope/schema_registry_test.rb
231
240
  - test/crawlscope/sitemap_test.rb
232
241
  - test/crawlscope/structured_data_audit_test.rb
@@ -235,8 +244,8 @@ files:
235
244
  - test/crawlscope/structured_data_reporter_test.rb
236
245
  - test/crawlscope/structured_data_rule_test.rb
237
246
  - test/crawlscope/structured_data_writer_test.rb
238
- - test/crawlscope/task_test.rb
239
247
  - test/crawlscope/uniqueness_rule_test.rb
248
+ - test/crawlscope/url_test.rb
240
249
  - test/test_helper.rb
241
250
  homepage: https://www.ethos-link.com/opensource/crawlscope
242
251
  licenses:
@@ -1,131 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "json"
4
-
5
- module Crawlscope
6
- class Task
7
- def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
8
- @configuration = configuration
9
- @reporter = reporter
10
- end
11
-
12
- def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
13
- resolved_base_url = base_url || default_base_url
14
- audit = @configuration.audit(
15
- base_url: resolved_base_url,
16
- sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
17
- rule_names: rule_names
18
- )
19
-
20
- result = audit.call
21
- @reporter.report(result)
22
- result
23
- end
24
-
25
- def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
26
- audit = StructuredData::Audit.new(
27
- browser_factory: @configuration.browser_factory,
28
- network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
29
- renderer: renderer,
30
- schema_registry: @configuration.schema_registry,
31
- scroll_page: @configuration.scroll_page?,
32
- timeout_seconds: timeout_seconds
33
- )
34
- result = audit.call(urls: urls)
35
-
36
- report_ldjson_result(result, debug: debug, renderer: renderer)
37
- StructuredData::Writer.new(path: report_path).write(result) if report_path
38
- StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result) if summary
39
- result
40
- end
41
-
42
- private
43
-
44
- def default_base_url
45
- value = @configuration.base_url
46
- return value unless value.to_s.strip.empty?
47
-
48
- "http://localhost:3000"
49
- end
50
-
51
- def default_sitemap_path(base_url:)
52
- value = @configuration.sitemap_path
53
- return value unless value.to_s.strip.empty?
54
-
55
- local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
56
- if local_path_default?(base_url: base_url) && File.exist?(local_path)
57
- return local_path
58
- end
59
-
60
- "#{base_url.to_s.chomp("/")}/sitemap.xml"
61
- end
62
-
63
- def local_path_default?(base_url:)
64
- host = URI.parse(base_url.to_s).host.to_s
65
- ["localhost", "127.0.0.1"].include?(host)
66
- rescue URI::InvalidURIError
67
- false
68
- end
69
-
70
- def report_ldjson_result(result, debug:, renderer:)
71
- if renderer == :browser
72
- @configuration.output.puts("JavaScript mode enabled (Ferrum)")
73
- end
74
-
75
- @configuration.output.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
76
- @configuration.output.puts("")
77
-
78
- result.entries.each do |entry|
79
- @configuration.output.puts("=" * 80)
80
- @configuration.output.puts("URL: #{entry.url}")
81
- @configuration.output.puts("=" * 80)
82
-
83
- if entry.fetch_error
84
- @configuration.output.puts("Error: #{entry.fetch_error}")
85
- @configuration.output.puts("")
86
- next
87
- end
88
-
89
- if entry.status
90
- @configuration.output.puts("Status: #{entry.status}")
91
- else
92
- @configuration.output.puts("Status: JS runtime fetch")
93
- end
94
-
95
- @configuration.output.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
96
-
97
- if debug && entry.structured_items.any?
98
- @configuration.output.puts("")
99
- @configuration.output.puts("--- Detected Structured Data ---")
100
-
101
- entry.structured_items.each_with_index do |item, index|
102
- @configuration.output.puts("")
103
- @configuration.output.puts("## Item #{index + 1} [#{item[:source]}]")
104
- @configuration.output.puts(JSON.pretty_generate(item[:data]))
105
- end
106
-
107
- @configuration.output.puts("")
108
- @configuration.output.puts("--- End ---")
109
- end
110
-
111
- @configuration.output.puts("")
112
- @configuration.output.puts("Validation results:")
113
-
114
- if entry.errors.empty?
115
- @configuration.output.puts(" All valid!")
116
- else
117
- entry.errors.each do |error|
118
- @configuration.output.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
119
- error[:errors].each do |validation_error|
120
- @configuration.output.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
121
- end
122
- end
123
- end
124
-
125
- @configuration.output.puts("")
126
- end
127
-
128
- @configuration.output.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
129
- end
130
- end
131
- end