crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/README.md +32 -0
  4. data/lib/crawlscope/cli.rb +16 -0
  5. data/lib/crawlscope/configuration.rb +10 -1
  6. data/lib/crawlscope/context.rb +1 -1
  7. data/lib/crawlscope/crawl.rb +72 -14
  8. data/lib/crawlscope/crawler.rb +3 -17
  9. data/lib/crawlscope/document_text.rb +7 -2
  10. data/lib/crawlscope/fetch_executor/async.rb +32 -0
  11. data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
  12. data/lib/crawlscope/fetch_executor.rb +43 -0
  13. data/lib/crawlscope/http.rb +7 -1
  14. data/lib/crawlscope/reporter.rb +123 -14
  15. data/lib/crawlscope/result.rb +1 -1
  16. data/lib/crawlscope/rules/content_quality.rb +1 -1
  17. data/lib/crawlscope/rules/indexability.rb +28 -6
  18. data/lib/crawlscope/rules/links.rb +80 -16
  19. data/lib/crawlscope/rules/uniqueness.rb +23 -4
  20. data/lib/crawlscope/sitemap.rb +30 -11
  21. data/lib/crawlscope/version.rb +1 -1
  22. data/lib/tasks/crawlscope_tasks.rake +1 -1
  23. data/test/crawlscope/cli_test.rb +28 -2
  24. data/test/crawlscope/configuration_test.rb +21 -0
  25. data/test/crawlscope/content_quality_rule_test.rb +18 -0
  26. data/test/crawlscope/crawl_test.rb +142 -4
  27. data/test/crawlscope/crawler_test.rb +61 -0
  28. data/test/crawlscope/fetch_executor_test.rb +44 -0
  29. data/test/crawlscope/links_rule_test.rb +101 -0
  30. data/test/crawlscope/reporter_test.rb +136 -11
  31. data/test/crawlscope/result_test.rb +35 -0
  32. data/test/crawlscope/sitemap_test.rb +52 -0
  33. data/test/performance/async_fetch_benchmark.rb +127 -0
  34. data/test/performance/fetch_executor_matrix.rb +162 -0
  35. data/test/performance/sitemap_expansion_benchmark.rb +121 -0
  36. metadata +38 -2
@@ -27,6 +27,24 @@ class CrawlscopeContentQualityRuleTest < Minitest::Test
27
27
  refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
28
28
  end
29
29
 
30
+ def test_visible_text_ratio_ignores_form_payload_markup
31
+ issues = Crawlscope::IssueCollection.new
32
+ page = page_with(
33
+ main: <<~HTML
34
+ <p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
35
+ <form>
36
+ <div data-select-autocomplete-options-value="#{"x" * 50_000}">
37
+ <input type="text" name="country">
38
+ </div>
39
+ </form>
40
+ HTML
41
+ )
42
+
43
+ Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
44
+
45
+ refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
46
+ end
47
+
30
48
  def test_reports_low_unique_token_ratio_for_repetitive_content
31
49
  issues = Crawlscope::IssueCollection.new
32
50
  page = page_with(main: ("hotel location service " * 100).strip)
@@ -3,6 +3,36 @@
3
3
  require "test_helper"
4
4
 
5
5
  class CrawlscopeCrawlTest < Minitest::Test
6
+ class RecordingExecutor
7
+ attr_reader :batches
8
+
9
+ def initialize
10
+ @batches = []
11
+ end
12
+
13
+ def call(urls)
14
+ @batches << urls
15
+ urls.map { |url| yield(url) }
16
+ end
17
+ end
18
+
19
+ class PageMapFetcher
20
+ attr_reader :closed
21
+
22
+ def initialize(pages)
23
+ @pages = pages
24
+ @closed = false
25
+ end
26
+
27
+ def close
28
+ @closed = true
29
+ end
30
+
31
+ def fetch(url)
32
+ @pages.fetch(url)
33
+ end
34
+ end
35
+
6
36
  def setup
7
37
  @tmp_dir = Dir.mktmpdir
8
38
  @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -12,6 +42,29 @@ class CrawlscopeCrawlTest < Minitest::Test
12
42
  FileUtils.rm_rf(@tmp_dir)
13
43
  end
14
44
 
45
+ def test_http_renderer_defaults_to_async_executor
46
+ crawl = Crawlscope::Crawl.new(
47
+ base_url: "https://example.com",
48
+ sitemap_path: @sitemap_path,
49
+ rules: [],
50
+ schema_registry: Crawlscope::SchemaRegistry.default
51
+ )
52
+
53
+ assert_equal :async, crawl.instance_variable_get(:@fetch_executor)
54
+ end
55
+
56
+ def test_browser_renderer_defaults_to_threaded_executor
57
+ crawl = Crawlscope::Crawl.new(
58
+ base_url: "https://example.com",
59
+ sitemap_path: @sitemap_path,
60
+ rules: [],
61
+ schema_registry: Crawlscope::SchemaRegistry.default,
62
+ renderer: :browser
63
+ )
64
+
65
+ assert_equal :threaded, crawl.instance_variable_get(:@fetch_executor)
66
+ end
67
+
15
68
  def test_returns_ok_when_metadata_is_valid
16
69
  File.write(
17
70
  @sitemap_path,
@@ -56,7 +109,8 @@ class CrawlscopeCrawlTest < Minitest::Test
56
109
  base_url: "https://example.com",
57
110
  sitemap_path: @sitemap_path,
58
111
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
59
- schema_registry: Crawlscope::SchemaRegistry.default
112
+ schema_registry: Crawlscope::SchemaRegistry.default,
113
+ fetch_executor: :threaded
60
114
  ).call
61
115
 
62
116
  assert result.ok?
@@ -97,10 +151,11 @@ class CrawlscopeCrawlTest < Minitest::Test
97
151
  base_url: "https://example.com",
98
152
  sitemap_path: @sitemap_path,
99
153
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
100
- schema_registry: Crawlscope::SchemaRegistry.default
154
+ schema_registry: Crawlscope::SchemaRegistry.default,
155
+ fetch_executor: :threaded
101
156
  ).call
102
157
 
103
- refute result.ok?
158
+ assert result.ok?
104
159
  assert_equal %i[
105
160
  incomplete_open_graph_tags
106
161
  meta_description_too_long
@@ -189,6 +244,31 @@ class CrawlscopeCrawlTest < Minitest::Test
189
244
  assert fake_browser.closed
190
245
  end
191
246
 
247
+ def test_async_executor_requires_http_renderer
248
+ File.write(
249
+ @sitemap_path,
250
+ <<~XML
251
+ <?xml version="1.0" encoding="UTF-8"?>
252
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
253
+ <url><loc>https://example.com/pricing</loc></url>
254
+ </urlset>
255
+ XML
256
+ )
257
+
258
+ error = assert_raises(Crawlscope::ConfigurationError) do
259
+ Crawlscope::Crawl.new(
260
+ base_url: "https://example.com",
261
+ sitemap_path: @sitemap_path,
262
+ rules: [],
263
+ schema_registry: Crawlscope::SchemaRegistry.default,
264
+ renderer: :browser,
265
+ fetch_executor: :async
266
+ ).call
267
+ end
268
+
269
+ assert_equal "Async fetch execution is only supported with http rendering", error.message
270
+ end
271
+
192
272
  def test_reports_sitemap_redirect_url
193
273
  File.write(
194
274
  @sitemap_path,
@@ -209,9 +289,67 @@ class CrawlscopeCrawlTest < Minitest::Test
209
289
  base_url: "https://example.com",
210
290
  sitemap_path: @sitemap_path,
211
291
  rules: [],
212
- schema_registry: Crawlscope::SchemaRegistry.default
292
+ schema_registry: Crawlscope::SchemaRegistry.default,
293
+ fetch_executor: :threaded
213
294
  ).call
214
295
 
215
296
  assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
216
297
  end
298
+
299
+ def test_resolves_uncrawled_link_targets_as_a_bounded_batch
300
+ File.write(
301
+ @sitemap_path,
302
+ <<~XML
303
+ <?xml version="1.0" encoding="UTF-8"?>
304
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
305
+ <url><loc>https://example.com/guide</loc></url>
306
+ </urlset>
307
+ XML
308
+ )
309
+
310
+ executor = RecordingExecutor.new
311
+ fetcher = PageMapFetcher.new(
312
+ "https://example.com/guide" => page(
313
+ "https://example.com/guide",
314
+ "<main><a href=\"/one\">One</a><a href=\"/two\">Two</a></main>"
315
+ ),
316
+ "https://example.com/one" => page("https://example.com/one", "<main>One</main>"),
317
+ "https://example.com/two" => page("https://example.com/two", "<main>Two</main>")
318
+ )
319
+
320
+ Crawlscope::Crawl.new(
321
+ base_url: "https://example.com",
322
+ sitemap_path: @sitemap_path,
323
+ rules: [Crawlscope::Rules::Links.new],
324
+ schema_registry: Crawlscope::SchemaRegistry.default,
325
+ renderer: :browser,
326
+ browser_factory: -> { fetcher },
327
+ fetch_executor: executor,
328
+ concurrency: 2
329
+ ).call
330
+
331
+ assert_equal(
332
+ [
333
+ ["https://example.com/guide"],
334
+ ["https://example.com/one", "https://example.com/two"]
335
+ ],
336
+ executor.batches
337
+ )
338
+ assert fetcher.closed
339
+ end
340
+
341
+ private
342
+
343
+ def page(url, body)
344
+ Crawlscope::Page.new(
345
+ url: url,
346
+ normalized_url: url,
347
+ final_url: url,
348
+ normalized_final_url: url,
349
+ status: 200,
350
+ headers: {"content-type" => "text/html"},
351
+ body: body,
352
+ doc: Nokogiri::HTML(body)
353
+ )
354
+ end
217
355
  end
@@ -31,4 +31,65 @@ class CrawlscopeCrawlerTest < Minitest::Test
31
31
  assert_nil error_page.status
32
32
  assert_equal "Timeout::Error: fetch timed out", error_page.error
33
33
  end
34
+
35
+ def test_preserves_input_order
36
+ pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
37
+ ["https://example.com/one", "https://example.com/two", "https://example.com/three"]
38
+ )
39
+
40
+ assert_equal(
41
+ ["https://example.com/one", "https://example.com/two", "https://example.com/three"],
42
+ pages.map(&:url)
43
+ )
44
+ end
45
+
46
+ class AsyncFetcher
47
+ attr_reader :active_fetches
48
+
49
+ def initialize
50
+ @active_fetches = 0
51
+ @max_active_fetches = 0
52
+ @mutex = Mutex.new
53
+ end
54
+
55
+ def fetch(url)
56
+ @mutex.synchronize do
57
+ @active_fetches += 1
58
+ @max_active_fetches = [@max_active_fetches, @active_fetches].max
59
+ end
60
+
61
+ Async::Task.current.sleep(0.01)
62
+
63
+ Crawlscope::Page.new(
64
+ url: url,
65
+ normalized_url: url,
66
+ final_url: url,
67
+ normalized_final_url: url,
68
+ status: 200,
69
+ headers: {},
70
+ body: "<html></html>",
71
+ doc: Nokogiri::HTML("<html></html>")
72
+ )
73
+ ensure
74
+ @mutex.synchronize { @active_fetches -= 1 }
75
+ end
76
+
77
+ def max_active_fetches
78
+ @mutex.synchronize { @max_active_fetches }
79
+ end
80
+ end
81
+
82
+ def test_async_executor_respects_concurrency_and_preserves_order
83
+ fetcher = AsyncFetcher.new
84
+
85
+ pages = Crawlscope::Crawler.new(page_fetcher: fetcher, concurrency: 2, fetch_executor: :async).call(
86
+ ["https://example.com/one", "https://example.com/two", "https://example.com/three"]
87
+ )
88
+
89
+ assert_equal(
90
+ ["https://example.com/one", "https://example.com/two", "https://example.com/three"],
91
+ pages.map(&:url)
92
+ )
93
+ assert_operator fetcher.max_active_fetches, :<=, 2
94
+ end
34
95
  end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeFetchExecutorTest < Minitest::Test
6
+ class RecordingExecutor
7
+ attr_reader :items
8
+
9
+ def call(items)
10
+ @items = items
11
+ items.map { |item| yield(item) }
12
+ end
13
+ end
14
+
15
+ def test_map_preserves_input_order
16
+ results = Crawlscope::FetchExecutor.map(name: :threaded, concurrency: 2, items: [3, 1, 2]) do |item|
17
+ item * 10
18
+ end
19
+
20
+ assert_equal [30, 10, 20], results
21
+ end
22
+
23
+ def test_map_uses_sequential_fallback_for_single_item
24
+ executor = RecordingExecutor.new
25
+
26
+ results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: ["one"]) do |item|
27
+ item.upcase
28
+ end
29
+
30
+ assert_equal ["ONE"], results
31
+ assert_nil executor.items
32
+ end
33
+
34
+ def test_map_uses_injected_executor_for_parallel_work
35
+ executor = RecordingExecutor.new
36
+
37
+ results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: %w[a b]) do |item|
38
+ item.upcase
39
+ end
40
+
41
+ assert_equal %w[A B], results
42
+ assert_equal %w[a b], executor.items
43
+ end
44
+ end
@@ -218,6 +218,33 @@ class CrawlscopeLinksRuleTest < Minitest::Test
218
218
  assert_includes codes, :canonical_points_to_redirect
219
219
  end
220
220
 
221
+ def test_does_not_report_missing_inlinks_for_root_canonical
222
+ issues = Crawlscope::IssueCollection.new
223
+ resolver = lambda do |target_url|
224
+ {crawled: true, error: nil, final_url: target_url, html: true, status: 200}
225
+ end
226
+
227
+ Crawlscope::Rules::Links.new.call(
228
+ urls: ["https://example.com/", "https://example.com/about"],
229
+ pages: [
230
+ page(
231
+ url: "https://example.com/",
232
+ body: <<~HTML
233
+ <html>
234
+ <head><link rel="canonical" href="https://example.com/"></head>
235
+ <body><main><a href="/about">About</a></main></body>
236
+ </html>
237
+ HTML
238
+ ),
239
+ page(url: "https://example.com/about", body: "<main><a href=\"/\">Home</a></main>")
240
+ ],
241
+ issues: issues,
242
+ context: context(resolver: resolver)
243
+ )
244
+
245
+ refute_includes issues.to_a.map(&:code), :canonical_no_internal_inlinks
246
+ end
247
+
221
248
  def test_reports_indexable_internal_pages_missing_from_sitemap
222
249
  issues = Crawlscope::IssueCollection.new
223
250
  resolver = lambda do |target_url|
@@ -242,6 +269,54 @@ class CrawlscopeLinksRuleTest < Minitest::Test
242
269
  assert_equal "https://example.com/hidden", issue.url
243
270
  end
244
271
 
272
+ def test_does_not_report_noindex_internal_pages_missing_from_sitemap
273
+ issues = Crawlscope::IssueCollection.new
274
+ resolver = lambda do |target_url|
275
+ {
276
+ crawled: false,
277
+ doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
278
+ error: nil,
279
+ final_url: target_url,
280
+ headers: {},
281
+ html: true,
282
+ status: 200
283
+ }
284
+ end
285
+
286
+ Crawlscope::Rules::Links.new.call(
287
+ urls: ["https://example.com/guide"],
288
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
289
+ issues: issues,
290
+ context: context(resolver: resolver)
291
+ )
292
+
293
+ refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
294
+ end
295
+
296
+ def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
297
+ issues = Crawlscope::IssueCollection.new
298
+ resolver = lambda do |target_url|
299
+ {
300
+ crawled: false,
301
+ doc: Nokogiri::HTML("<main>Hidden</main>"),
302
+ error: nil,
303
+ final_url: target_url,
304
+ headers: {"X-Robots-Tag" => "noindex"},
305
+ html: true,
306
+ status: 200
307
+ }
308
+ end
309
+
310
+ Crawlscope::Rules::Links.new.call(
311
+ urls: ["https://example.com/guide"],
312
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
313
+ issues: issues,
314
+ context: context(resolver: resolver)
315
+ )
316
+
317
+ refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
318
+ end
319
+
245
320
  def test_reports_url_hygiene_issues
246
321
  issues = Crawlscope::IssueCollection.new
247
322
  long_path = "a" * 2_050
@@ -300,6 +375,32 @@ class CrawlscopeLinksRuleTest < Minitest::Test
300
375
  assert_includes redirect_issue.message, "https://example.com/pricing"
301
376
  end
302
377
 
378
+ def test_reuses_link_target_resolution_for_later_link_checks
379
+ issues = Crawlscope::IssueCollection.new
380
+ resolution_counts = Hash.new(0)
381
+ resolver = lambda do |target_url|
382
+ resolution_counts[target_url] += 1
383
+ {
384
+ crawled: false,
385
+ doc: Nokogiri::HTML("<main>Hidden</main>"),
386
+ error: nil,
387
+ final_url: target_url,
388
+ headers: {},
389
+ html: true,
390
+ status: 200
391
+ }
392
+ end
393
+
394
+ Crawlscope::Rules::Links.new.call(
395
+ urls: ["https://example.com/guide"],
396
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
397
+ issues: issues,
398
+ context: context(resolver: resolver)
399
+ )
400
+
401
+ assert_equal 1, resolution_counts.fetch("https://example.com/hidden")
402
+ end
403
+
303
404
  def test_ignores_links_that_should_not_be_crawled
304
405
  issues = Crawlscope::IssueCollection.new
305
406
 
@@ -23,11 +23,25 @@ class CrawlscopeReporterTest < Minitest::Test
23
23
  refute_includes output, "Status: FAILED"
24
24
  end
25
25
 
26
- def test_reports_failed_result_with_grouped_counts_and_offenses
26
+ def test_reports_warning_result_with_grouped_one_line_issues
27
27
  io = StringIO.new
28
28
  issues = Crawlscope::IssueCollection.new
29
+ 4.times do |index|
30
+ issues.add(
31
+ code: :low_dofollow_inlinks,
32
+ severity: :warning,
33
+ category: :links,
34
+ url: "https://example.com/page-#{index + 1}",
35
+ message: "dofollow inbound links 1 below 2",
36
+ details: {
37
+ dofollow_inbound_count: 1,
38
+ minimum: 2,
39
+ source_urls: ["https://example.com/source-#{index + 1}"]
40
+ }
41
+ )
42
+ end
29
43
  issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
30
- issues.add(code: :broken_internal_link, severity: :notice, category: :links, url: "https://example.com/b", message: "broken internal link", details: {})
44
+
31
45
  result = Crawlscope::Result.new(
32
46
  base_url: "https://example.com",
33
47
  sitemap_path: "/tmp/sitemap.xml",
@@ -40,15 +54,126 @@ class CrawlscopeReporterTest < Minitest::Test
40
54
 
41
55
  output = io.string
42
56
 
57
+ assert_includes output, "Status: WARNINGS"
58
+ refute_includes output, "Status: FAILED"
59
+ assert_includes output, "Issues: 5 total (5 warnings)"
60
+ assert_includes output, "Summary:"
61
+ assert_includes output, "links / low_dofollow_inlinks: 4"
62
+ assert_includes output, " - /page-1 inbound 1/2 sources: /source-1"
63
+ assert_includes output, " - /page-4 inbound 1/2 sources: /source-4"
64
+ assert_includes output, "metadata / missing_title: 1"
65
+ refute_includes output, "Severity:"
66
+ refute_includes output, "Category:"
67
+ refute_includes output, "... 1 more"
68
+ end
69
+
70
+ def test_reports_failed_status_when_errors_are_present
71
+ io = StringIO.new
72
+ issues = Crawlscope::IssueCollection.new
73
+ issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com/a", message: "timeout", details: {})
74
+ issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
75
+
76
+ result = Crawlscope::Result.new(
77
+ base_url: "https://example.com",
78
+ sitemap_path: "/tmp/sitemap.xml",
79
+ urls: ["https://example.com/a"],
80
+ pages: [Object.new],
81
+ issues: issues
82
+ )
83
+
84
+ Crawlscope::Reporter.new(io: io).report(result)
85
+
86
+ output = io.string
87
+
43
88
  assert_includes output, "Status: FAILED"
44
- assert_includes output, "Issues: 2"
45
- assert_includes output, "Severity:"
46
- assert_includes output, "notice: 1"
47
- assert_includes output, "warning: 1"
48
- assert_includes output, "Category:"
49
- assert_includes output, "links: 1"
50
- assert_includes output, "metadata: 1"
51
- assert_includes output, " - [warning] missing_title https://example.com/a missing <title>"
52
- assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link"
89
+ assert_includes output, "Issues: 2 total (1 error, 1 warning)"
90
+ end
91
+
92
+ def test_limits_large_issue_groups
93
+ io = StringIO.new
94
+ issues = Crawlscope::IssueCollection.new
95
+ 21.times do |index|
96
+ issues.add(
97
+ code: :low_dofollow_inlinks,
98
+ severity: :warning,
99
+ category: :links,
100
+ url: "https://example.com/page-#{index + 1}",
101
+ message: "dofollow inbound links 1 below 2",
102
+ details: {dofollow_inbound_count: 1, minimum: 2}
103
+ )
104
+ end
105
+
106
+ result = Crawlscope::Result.new(
107
+ base_url: "https://example.com",
108
+ sitemap_path: "/tmp/sitemap.xml",
109
+ urls: ["https://example.com"],
110
+ pages: [Object.new],
111
+ issues: issues
112
+ )
113
+
114
+ Crawlscope::Reporter.new(io: io).report(result)
115
+
116
+ output = io.string
117
+
118
+ assert_includes output, "links / low_dofollow_inlinks: 21"
119
+ assert_includes output, " - /page-20 inbound 1/2"
120
+ refute_includes output, " - /page-21"
121
+ assert_includes output, " ... 1 more"
122
+ end
123
+
124
+ def test_reports_ratio_with_enough_precision_to_show_threshold_difference
125
+ io = StringIO.new
126
+ issues = Crawlscope::IssueCollection.new
127
+ issues.add(
128
+ code: :low_unique_token_ratio,
129
+ severity: :warning,
130
+ category: :content_quality,
131
+ url: "https://example.com/a",
132
+ message: "visible text has low token variety",
133
+ details: {ratio: 0.249, threshold: 0.25}
134
+ )
135
+
136
+ result = Crawlscope::Result.new(
137
+ base_url: "https://example.com",
138
+ sitemap_path: "/tmp/sitemap.xml",
139
+ urls: ["https://example.com/a"],
140
+ pages: [Object.new],
141
+ issues: issues
142
+ )
143
+
144
+ Crawlscope::Reporter.new(io: io).report(result)
145
+
146
+ assert_includes io.string, "ratio 0.249/0.250"
147
+ end
148
+
149
+ def test_reports_source_details_on_one_line
150
+ io = StringIO.new
151
+ issues = Crawlscope::IssueCollection.new
152
+ 4.times do |index|
153
+ issues.add(
154
+ code: :indexable_page_missing_from_sitemap,
155
+ severity: :warning,
156
+ category: :sitemaps,
157
+ url: "https://example.com/overview-#{index + 1}",
158
+ message: "indexable internal page is missing from sitemap",
159
+ details: {source_url: "https://example.com/source-#{index + 1}"}
160
+ )
161
+ end
162
+
163
+ result = Crawlscope::Result.new(
164
+ base_url: "https://example.com",
165
+ sitemap_path: "/tmp/sitemap.xml",
166
+ urls: ["https://example.com"],
167
+ pages: [Object.new],
168
+ issues: issues
169
+ )
170
+
171
+ Crawlscope::Reporter.new(io: io).report(result)
172
+
173
+ output = io.string
174
+
175
+ assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4"
176
+ assert_includes output, " - /overview-1 indexable internal page is missing from sitemap source: /source-1"
177
+ assert_includes output, " - /overview-4 indexable internal page is missing from sitemap source: /source-4"
53
178
  end
54
179
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeResultTest < Minitest::Test
6
+ def test_ok_when_result_has_warnings_only
7
+ issues = Crawlscope::IssueCollection.new
8
+ issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com", message: "missing <title>", details: {})
9
+
10
+ result = result_with(issues)
11
+
12
+ assert result.ok?
13
+ end
14
+
15
+ def test_not_ok_when_result_has_errors
16
+ issues = Crawlscope::IssueCollection.new
17
+ issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com", message: "timeout", details: {})
18
+
19
+ result = result_with(issues)
20
+
21
+ refute result.ok?
22
+ end
23
+
24
+ private
25
+
26
+ def result_with(issues)
27
+ Crawlscope::Result.new(
28
+ base_url: "https://example.com",
29
+ sitemap_path: "/tmp/sitemap.xml",
30
+ urls: ["https://example.com"],
31
+ pages: [Object.new],
32
+ issues: issues
33
+ )
34
+ end
35
+ end