crawlscope 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/README.md +32 -0
- data/lib/crawlscope/cli.rb +16 -0
- data/lib/crawlscope/configuration.rb +10 -1
- data/lib/crawlscope/context.rb +1 -1
- data/lib/crawlscope/crawl.rb +72 -14
- data/lib/crawlscope/crawler.rb +3 -17
- data/lib/crawlscope/document_text.rb +7 -2
- data/lib/crawlscope/fetch_executor/async.rb +32 -0
- data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
- data/lib/crawlscope/fetch_executor.rb +43 -0
- data/lib/crawlscope/http.rb +7 -1
- data/lib/crawlscope/reporter.rb +123 -14
- data/lib/crawlscope/result.rb +1 -1
- data/lib/crawlscope/rules/content_quality.rb +1 -1
- data/lib/crawlscope/rules/indexability.rb +28 -6
- data/lib/crawlscope/rules/links.rb +80 -16
- data/lib/crawlscope/rules/uniqueness.rb +23 -4
- data/lib/crawlscope/sitemap.rb +30 -11
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +1 -1
- data/test/crawlscope/cli_test.rb +28 -2
- data/test/crawlscope/configuration_test.rb +21 -0
- data/test/crawlscope/content_quality_rule_test.rb +18 -0
- data/test/crawlscope/crawl_test.rb +142 -4
- data/test/crawlscope/crawler_test.rb +61 -0
- data/test/crawlscope/fetch_executor_test.rb +44 -0
- data/test/crawlscope/links_rule_test.rb +101 -0
- data/test/crawlscope/reporter_test.rb +136 -11
- data/test/crawlscope/result_test.rb +35 -0
- data/test/crawlscope/sitemap_test.rb +52 -0
- data/test/performance/async_fetch_benchmark.rb +127 -0
- data/test/performance/fetch_executor_matrix.rb +162 -0
- data/test/performance/sitemap_expansion_benchmark.rb +121 -0
- metadata +38 -2
|
@@ -27,6 +27,24 @@ class CrawlscopeContentQualityRuleTest < Minitest::Test
|
|
|
27
27
|
refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
+
def test_visible_text_ratio_ignores_form_payload_markup
|
|
31
|
+
issues = Crawlscope::IssueCollection.new
|
|
32
|
+
page = page_with(
|
|
33
|
+
main: <<~HTML
|
|
34
|
+
<p>#{Array.new(260) { |index| "word#{index}" }.join(" ")}</p>
|
|
35
|
+
<form>
|
|
36
|
+
<div data-select-autocomplete-options-value="#{"x" * 50_000}">
|
|
37
|
+
<input type="text" name="country">
|
|
38
|
+
</div>
|
|
39
|
+
</form>
|
|
40
|
+
HTML
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
Crawlscope::Rules::ContentQuality.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
44
|
+
|
|
45
|
+
refute_includes issues.to_a.map(&:code), :low_visible_text_ratio
|
|
46
|
+
end
|
|
47
|
+
|
|
30
48
|
def test_reports_low_unique_token_ratio_for_repetitive_content
|
|
31
49
|
issues = Crawlscope::IssueCollection.new
|
|
32
50
|
page = page_with(main: ("hotel location service " * 100).strip)
|
|
@@ -3,6 +3,36 @@
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
5
|
class CrawlscopeCrawlTest < Minitest::Test
|
|
6
|
+
class RecordingExecutor
|
|
7
|
+
attr_reader :batches
|
|
8
|
+
|
|
9
|
+
def initialize
|
|
10
|
+
@batches = []
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def call(urls)
|
|
14
|
+
@batches << urls
|
|
15
|
+
urls.map { |url| yield(url) }
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
class PageMapFetcher
|
|
20
|
+
attr_reader :closed
|
|
21
|
+
|
|
22
|
+
def initialize(pages)
|
|
23
|
+
@pages = pages
|
|
24
|
+
@closed = false
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def close
|
|
28
|
+
@closed = true
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def fetch(url)
|
|
32
|
+
@pages.fetch(url)
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
6
36
|
def setup
|
|
7
37
|
@tmp_dir = Dir.mktmpdir
|
|
8
38
|
@sitemap_path = File.join(@tmp_dir, "sitemap.xml")
|
|
@@ -12,6 +42,29 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
12
42
|
FileUtils.rm_rf(@tmp_dir)
|
|
13
43
|
end
|
|
14
44
|
|
|
45
|
+
def test_http_renderer_defaults_to_async_executor
|
|
46
|
+
crawl = Crawlscope::Crawl.new(
|
|
47
|
+
base_url: "https://example.com",
|
|
48
|
+
sitemap_path: @sitemap_path,
|
|
49
|
+
rules: [],
|
|
50
|
+
schema_registry: Crawlscope::SchemaRegistry.default
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
assert_equal :async, crawl.instance_variable_get(:@fetch_executor)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def test_browser_renderer_defaults_to_threaded_executor
|
|
57
|
+
crawl = Crawlscope::Crawl.new(
|
|
58
|
+
base_url: "https://example.com",
|
|
59
|
+
sitemap_path: @sitemap_path,
|
|
60
|
+
rules: [],
|
|
61
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
62
|
+
renderer: :browser
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
assert_equal :threaded, crawl.instance_variable_get(:@fetch_executor)
|
|
66
|
+
end
|
|
67
|
+
|
|
15
68
|
def test_returns_ok_when_metadata_is_valid
|
|
16
69
|
File.write(
|
|
17
70
|
@sitemap_path,
|
|
@@ -56,7 +109,8 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
56
109
|
base_url: "https://example.com",
|
|
57
110
|
sitemap_path: @sitemap_path,
|
|
58
111
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
59
|
-
schema_registry: Crawlscope::SchemaRegistry.default
|
|
112
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
113
|
+
fetch_executor: :threaded
|
|
60
114
|
).call
|
|
61
115
|
|
|
62
116
|
assert result.ok?
|
|
@@ -97,10 +151,11 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
97
151
|
base_url: "https://example.com",
|
|
98
152
|
sitemap_path: @sitemap_path,
|
|
99
153
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
100
|
-
schema_registry: Crawlscope::SchemaRegistry.default
|
|
154
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
155
|
+
fetch_executor: :threaded
|
|
101
156
|
).call
|
|
102
157
|
|
|
103
|
-
|
|
158
|
+
assert result.ok?
|
|
104
159
|
assert_equal %i[
|
|
105
160
|
incomplete_open_graph_tags
|
|
106
161
|
meta_description_too_long
|
|
@@ -189,6 +244,31 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
189
244
|
assert fake_browser.closed
|
|
190
245
|
end
|
|
191
246
|
|
|
247
|
+
def test_async_executor_requires_http_renderer
|
|
248
|
+
File.write(
|
|
249
|
+
@sitemap_path,
|
|
250
|
+
<<~XML
|
|
251
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
252
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
253
|
+
<url><loc>https://example.com/pricing</loc></url>
|
|
254
|
+
</urlset>
|
|
255
|
+
XML
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
error = assert_raises(Crawlscope::ConfigurationError) do
|
|
259
|
+
Crawlscope::Crawl.new(
|
|
260
|
+
base_url: "https://example.com",
|
|
261
|
+
sitemap_path: @sitemap_path,
|
|
262
|
+
rules: [],
|
|
263
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
264
|
+
renderer: :browser,
|
|
265
|
+
fetch_executor: :async
|
|
266
|
+
).call
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
assert_equal "Async fetch execution is only supported with http rendering", error.message
|
|
270
|
+
end
|
|
271
|
+
|
|
192
272
|
def test_reports_sitemap_redirect_url
|
|
193
273
|
File.write(
|
|
194
274
|
@sitemap_path,
|
|
@@ -209,9 +289,67 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
209
289
|
base_url: "https://example.com",
|
|
210
290
|
sitemap_path: @sitemap_path,
|
|
211
291
|
rules: [],
|
|
212
|
-
schema_registry: Crawlscope::SchemaRegistry.default
|
|
292
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
293
|
+
fetch_executor: :threaded
|
|
213
294
|
).call
|
|
214
295
|
|
|
215
296
|
assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
|
|
216
297
|
end
|
|
298
|
+
|
|
299
|
+
def test_resolves_uncrawled_link_targets_as_a_bounded_batch
|
|
300
|
+
File.write(
|
|
301
|
+
@sitemap_path,
|
|
302
|
+
<<~XML
|
|
303
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
304
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
305
|
+
<url><loc>https://example.com/guide</loc></url>
|
|
306
|
+
</urlset>
|
|
307
|
+
XML
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
executor = RecordingExecutor.new
|
|
311
|
+
fetcher = PageMapFetcher.new(
|
|
312
|
+
"https://example.com/guide" => page(
|
|
313
|
+
"https://example.com/guide",
|
|
314
|
+
"<main><a href=\"/one\">One</a><a href=\"/two\">Two</a></main>"
|
|
315
|
+
),
|
|
316
|
+
"https://example.com/one" => page("https://example.com/one", "<main>One</main>"),
|
|
317
|
+
"https://example.com/two" => page("https://example.com/two", "<main>Two</main>")
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
Crawlscope::Crawl.new(
|
|
321
|
+
base_url: "https://example.com",
|
|
322
|
+
sitemap_path: @sitemap_path,
|
|
323
|
+
rules: [Crawlscope::Rules::Links.new],
|
|
324
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
325
|
+
renderer: :browser,
|
|
326
|
+
browser_factory: -> { fetcher },
|
|
327
|
+
fetch_executor: executor,
|
|
328
|
+
concurrency: 2
|
|
329
|
+
).call
|
|
330
|
+
|
|
331
|
+
assert_equal(
|
|
332
|
+
[
|
|
333
|
+
["https://example.com/guide"],
|
|
334
|
+
["https://example.com/one", "https://example.com/two"]
|
|
335
|
+
],
|
|
336
|
+
executor.batches
|
|
337
|
+
)
|
|
338
|
+
assert fetcher.closed
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
private
|
|
342
|
+
|
|
343
|
+
def page(url, body)
|
|
344
|
+
Crawlscope::Page.new(
|
|
345
|
+
url: url,
|
|
346
|
+
normalized_url: url,
|
|
347
|
+
final_url: url,
|
|
348
|
+
normalized_final_url: url,
|
|
349
|
+
status: 200,
|
|
350
|
+
headers: {"content-type" => "text/html"},
|
|
351
|
+
body: body,
|
|
352
|
+
doc: Nokogiri::HTML(body)
|
|
353
|
+
)
|
|
354
|
+
end
|
|
217
355
|
end
|
|
@@ -31,4 +31,65 @@ class CrawlscopeCrawlerTest < Minitest::Test
|
|
|
31
31
|
assert_nil error_page.status
|
|
32
32
|
assert_equal "Timeout::Error: fetch timed out", error_page.error
|
|
33
33
|
end
|
|
34
|
+
|
|
35
|
+
def test_preserves_input_order
|
|
36
|
+
pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
|
|
37
|
+
["https://example.com/one", "https://example.com/two", "https://example.com/three"]
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
assert_equal(
|
|
41
|
+
["https://example.com/one", "https://example.com/two", "https://example.com/three"],
|
|
42
|
+
pages.map(&:url)
|
|
43
|
+
)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
class AsyncFetcher
|
|
47
|
+
attr_reader :active_fetches
|
|
48
|
+
|
|
49
|
+
def initialize
|
|
50
|
+
@active_fetches = 0
|
|
51
|
+
@max_active_fetches = 0
|
|
52
|
+
@mutex = Mutex.new
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def fetch(url)
|
|
56
|
+
@mutex.synchronize do
|
|
57
|
+
@active_fetches += 1
|
|
58
|
+
@max_active_fetches = [@max_active_fetches, @active_fetches].max
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
Async::Task.current.sleep(0.01)
|
|
62
|
+
|
|
63
|
+
Crawlscope::Page.new(
|
|
64
|
+
url: url,
|
|
65
|
+
normalized_url: url,
|
|
66
|
+
final_url: url,
|
|
67
|
+
normalized_final_url: url,
|
|
68
|
+
status: 200,
|
|
69
|
+
headers: {},
|
|
70
|
+
body: "<html></html>",
|
|
71
|
+
doc: Nokogiri::HTML("<html></html>")
|
|
72
|
+
)
|
|
73
|
+
ensure
|
|
74
|
+
@mutex.synchronize { @active_fetches -= 1 }
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def max_active_fetches
|
|
78
|
+
@mutex.synchronize { @max_active_fetches }
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def test_async_executor_respects_concurrency_and_preserves_order
|
|
83
|
+
fetcher = AsyncFetcher.new
|
|
84
|
+
|
|
85
|
+
pages = Crawlscope::Crawler.new(page_fetcher: fetcher, concurrency: 2, fetch_executor: :async).call(
|
|
86
|
+
["https://example.com/one", "https://example.com/two", "https://example.com/three"]
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
assert_equal(
|
|
90
|
+
["https://example.com/one", "https://example.com/two", "https://example.com/three"],
|
|
91
|
+
pages.map(&:url)
|
|
92
|
+
)
|
|
93
|
+
assert_operator fetcher.max_active_fetches, :<=, 2
|
|
94
|
+
end
|
|
34
95
|
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeFetchExecutorTest < Minitest::Test
|
|
6
|
+
class RecordingExecutor
|
|
7
|
+
attr_reader :items
|
|
8
|
+
|
|
9
|
+
def call(items)
|
|
10
|
+
@items = items
|
|
11
|
+
items.map { |item| yield(item) }
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def test_map_preserves_input_order
|
|
16
|
+
results = Crawlscope::FetchExecutor.map(name: :threaded, concurrency: 2, items: [3, 1, 2]) do |item|
|
|
17
|
+
item * 10
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
assert_equal [30, 10, 20], results
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def test_map_uses_sequential_fallback_for_single_item
|
|
24
|
+
executor = RecordingExecutor.new
|
|
25
|
+
|
|
26
|
+
results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: ["one"]) do |item|
|
|
27
|
+
item.upcase
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
assert_equal ["ONE"], results
|
|
31
|
+
assert_nil executor.items
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def test_map_uses_injected_executor_for_parallel_work
|
|
35
|
+
executor = RecordingExecutor.new
|
|
36
|
+
|
|
37
|
+
results = Crawlscope::FetchExecutor.map(name: executor, concurrency: 4, items: %w[a b]) do |item|
|
|
38
|
+
item.upcase
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
assert_equal %w[A B], results
|
|
42
|
+
assert_equal %w[a b], executor.items
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -218,6 +218,33 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
218
218
|
assert_includes codes, :canonical_points_to_redirect
|
|
219
219
|
end
|
|
220
220
|
|
|
221
|
+
def test_does_not_report_missing_inlinks_for_root_canonical
|
|
222
|
+
issues = Crawlscope::IssueCollection.new
|
|
223
|
+
resolver = lambda do |target_url|
|
|
224
|
+
{crawled: true, error: nil, final_url: target_url, html: true, status: 200}
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
Crawlscope::Rules::Links.new.call(
|
|
228
|
+
urls: ["https://example.com/", "https://example.com/about"],
|
|
229
|
+
pages: [
|
|
230
|
+
page(
|
|
231
|
+
url: "https://example.com/",
|
|
232
|
+
body: <<~HTML
|
|
233
|
+
<html>
|
|
234
|
+
<head><link rel="canonical" href="https://example.com/"></head>
|
|
235
|
+
<body><main><a href="/about">About</a></main></body>
|
|
236
|
+
</html>
|
|
237
|
+
HTML
|
|
238
|
+
),
|
|
239
|
+
page(url: "https://example.com/about", body: "<main><a href=\"/\">Home</a></main>")
|
|
240
|
+
],
|
|
241
|
+
issues: issues,
|
|
242
|
+
context: context(resolver: resolver)
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
refute_includes issues.to_a.map(&:code), :canonical_no_internal_inlinks
|
|
246
|
+
end
|
|
247
|
+
|
|
221
248
|
def test_reports_indexable_internal_pages_missing_from_sitemap
|
|
222
249
|
issues = Crawlscope::IssueCollection.new
|
|
223
250
|
resolver = lambda do |target_url|
|
|
@@ -242,6 +269,54 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
242
269
|
assert_equal "https://example.com/hidden", issue.url
|
|
243
270
|
end
|
|
244
271
|
|
|
272
|
+
def test_does_not_report_noindex_internal_pages_missing_from_sitemap
|
|
273
|
+
issues = Crawlscope::IssueCollection.new
|
|
274
|
+
resolver = lambda do |target_url|
|
|
275
|
+
{
|
|
276
|
+
crawled: false,
|
|
277
|
+
doc: Nokogiri::HTML("<head><meta name=\"robots\" content=\"noindex, follow\"></head>"),
|
|
278
|
+
error: nil,
|
|
279
|
+
final_url: target_url,
|
|
280
|
+
headers: {},
|
|
281
|
+
html: true,
|
|
282
|
+
status: 200
|
|
283
|
+
}
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
Crawlscope::Rules::Links.new.call(
|
|
287
|
+
urls: ["https://example.com/guide"],
|
|
288
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
|
|
289
|
+
issues: issues,
|
|
290
|
+
context: context(resolver: resolver)
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
def test_does_not_report_x_robots_noindex_internal_pages_missing_from_sitemap
|
|
297
|
+
issues = Crawlscope::IssueCollection.new
|
|
298
|
+
resolver = lambda do |target_url|
|
|
299
|
+
{
|
|
300
|
+
crawled: false,
|
|
301
|
+
doc: Nokogiri::HTML("<main>Hidden</main>"),
|
|
302
|
+
error: nil,
|
|
303
|
+
final_url: target_url,
|
|
304
|
+
headers: {"X-Robots-Tag" => "noindex"},
|
|
305
|
+
html: true,
|
|
306
|
+
status: 200
|
|
307
|
+
}
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
Crawlscope::Rules::Links.new.call(
|
|
311
|
+
urls: ["https://example.com/guide"],
|
|
312
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
|
|
313
|
+
issues: issues,
|
|
314
|
+
context: context(resolver: resolver)
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
refute_includes issues.to_a.map(&:code), :indexable_page_missing_from_sitemap
|
|
318
|
+
end
|
|
319
|
+
|
|
245
320
|
def test_reports_url_hygiene_issues
|
|
246
321
|
issues = Crawlscope::IssueCollection.new
|
|
247
322
|
long_path = "a" * 2_050
|
|
@@ -300,6 +375,32 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
300
375
|
assert_includes redirect_issue.message, "https://example.com/pricing"
|
|
301
376
|
end
|
|
302
377
|
|
|
378
|
+
def test_reuses_link_target_resolution_for_later_link_checks
|
|
379
|
+
issues = Crawlscope::IssueCollection.new
|
|
380
|
+
resolution_counts = Hash.new(0)
|
|
381
|
+
resolver = lambda do |target_url|
|
|
382
|
+
resolution_counts[target_url] += 1
|
|
383
|
+
{
|
|
384
|
+
crawled: false,
|
|
385
|
+
doc: Nokogiri::HTML("<main>Hidden</main>"),
|
|
386
|
+
error: nil,
|
|
387
|
+
final_url: target_url,
|
|
388
|
+
headers: {},
|
|
389
|
+
html: true,
|
|
390
|
+
status: 200
|
|
391
|
+
}
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
Crawlscope::Rules::Links.new.call(
|
|
395
|
+
urls: ["https://example.com/guide"],
|
|
396
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
|
|
397
|
+
issues: issues,
|
|
398
|
+
context: context(resolver: resolver)
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
assert_equal 1, resolution_counts.fetch("https://example.com/hidden")
|
|
402
|
+
end
|
|
403
|
+
|
|
303
404
|
def test_ignores_links_that_should_not_be_crawled
|
|
304
405
|
issues = Crawlscope::IssueCollection.new
|
|
305
406
|
|
|
@@ -23,11 +23,25 @@ class CrawlscopeReporterTest < Minitest::Test
|
|
|
23
23
|
refute_includes output, "Status: FAILED"
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
def
|
|
26
|
+
def test_reports_warning_result_with_grouped_one_line_issues
|
|
27
27
|
io = StringIO.new
|
|
28
28
|
issues = Crawlscope::IssueCollection.new
|
|
29
|
+
4.times do |index|
|
|
30
|
+
issues.add(
|
|
31
|
+
code: :low_dofollow_inlinks,
|
|
32
|
+
severity: :warning,
|
|
33
|
+
category: :links,
|
|
34
|
+
url: "https://example.com/page-#{index + 1}",
|
|
35
|
+
message: "dofollow inbound links 1 below 2",
|
|
36
|
+
details: {
|
|
37
|
+
dofollow_inbound_count: 1,
|
|
38
|
+
minimum: 2,
|
|
39
|
+
source_urls: ["https://example.com/source-#{index + 1}"]
|
|
40
|
+
}
|
|
41
|
+
)
|
|
42
|
+
end
|
|
29
43
|
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
|
|
30
|
-
|
|
44
|
+
|
|
31
45
|
result = Crawlscope::Result.new(
|
|
32
46
|
base_url: "https://example.com",
|
|
33
47
|
sitemap_path: "/tmp/sitemap.xml",
|
|
@@ -40,15 +54,126 @@ class CrawlscopeReporterTest < Minitest::Test
|
|
|
40
54
|
|
|
41
55
|
output = io.string
|
|
42
56
|
|
|
57
|
+
assert_includes output, "Status: WARNINGS"
|
|
58
|
+
refute_includes output, "Status: FAILED"
|
|
59
|
+
assert_includes output, "Issues: 5 total (5 warnings)"
|
|
60
|
+
assert_includes output, "Summary:"
|
|
61
|
+
assert_includes output, "links / low_dofollow_inlinks: 4"
|
|
62
|
+
assert_includes output, " - /page-1 inbound 1/2 sources: /source-1"
|
|
63
|
+
assert_includes output, " - /page-4 inbound 1/2 sources: /source-4"
|
|
64
|
+
assert_includes output, "metadata / missing_title: 1"
|
|
65
|
+
refute_includes output, "Severity:"
|
|
66
|
+
refute_includes output, "Category:"
|
|
67
|
+
refute_includes output, "... 1 more"
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def test_reports_failed_status_when_errors_are_present
|
|
71
|
+
io = StringIO.new
|
|
72
|
+
issues = Crawlscope::IssueCollection.new
|
|
73
|
+
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com/a", message: "timeout", details: {})
|
|
74
|
+
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
|
|
75
|
+
|
|
76
|
+
result = Crawlscope::Result.new(
|
|
77
|
+
base_url: "https://example.com",
|
|
78
|
+
sitemap_path: "/tmp/sitemap.xml",
|
|
79
|
+
urls: ["https://example.com/a"],
|
|
80
|
+
pages: [Object.new],
|
|
81
|
+
issues: issues
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
Crawlscope::Reporter.new(io: io).report(result)
|
|
85
|
+
|
|
86
|
+
output = io.string
|
|
87
|
+
|
|
43
88
|
assert_includes output, "Status: FAILED"
|
|
44
|
-
assert_includes output, "Issues: 2"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
89
|
+
assert_includes output, "Issues: 2 total (1 error, 1 warning)"
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def test_limits_large_issue_groups
|
|
93
|
+
io = StringIO.new
|
|
94
|
+
issues = Crawlscope::IssueCollection.new
|
|
95
|
+
21.times do |index|
|
|
96
|
+
issues.add(
|
|
97
|
+
code: :low_dofollow_inlinks,
|
|
98
|
+
severity: :warning,
|
|
99
|
+
category: :links,
|
|
100
|
+
url: "https://example.com/page-#{index + 1}",
|
|
101
|
+
message: "dofollow inbound links 1 below 2",
|
|
102
|
+
details: {dofollow_inbound_count: 1, minimum: 2}
|
|
103
|
+
)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
result = Crawlscope::Result.new(
|
|
107
|
+
base_url: "https://example.com",
|
|
108
|
+
sitemap_path: "/tmp/sitemap.xml",
|
|
109
|
+
urls: ["https://example.com"],
|
|
110
|
+
pages: [Object.new],
|
|
111
|
+
issues: issues
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
Crawlscope::Reporter.new(io: io).report(result)
|
|
115
|
+
|
|
116
|
+
output = io.string
|
|
117
|
+
|
|
118
|
+
assert_includes output, "links / low_dofollow_inlinks: 21"
|
|
119
|
+
assert_includes output, " - /page-20 inbound 1/2"
|
|
120
|
+
refute_includes output, " - /page-21"
|
|
121
|
+
assert_includes output, " ... 1 more"
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def test_reports_ratio_with_enough_precision_to_show_threshold_difference
|
|
125
|
+
io = StringIO.new
|
|
126
|
+
issues = Crawlscope::IssueCollection.new
|
|
127
|
+
issues.add(
|
|
128
|
+
code: :low_unique_token_ratio,
|
|
129
|
+
severity: :warning,
|
|
130
|
+
category: :content_quality,
|
|
131
|
+
url: "https://example.com/a",
|
|
132
|
+
message: "visible text has low token variety",
|
|
133
|
+
details: {ratio: 0.249, threshold: 0.25}
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
result = Crawlscope::Result.new(
|
|
137
|
+
base_url: "https://example.com",
|
|
138
|
+
sitemap_path: "/tmp/sitemap.xml",
|
|
139
|
+
urls: ["https://example.com/a"],
|
|
140
|
+
pages: [Object.new],
|
|
141
|
+
issues: issues
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
Crawlscope::Reporter.new(io: io).report(result)
|
|
145
|
+
|
|
146
|
+
assert_includes io.string, "ratio 0.249/0.250"
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
def test_reports_source_details_on_one_line
|
|
150
|
+
io = StringIO.new
|
|
151
|
+
issues = Crawlscope::IssueCollection.new
|
|
152
|
+
4.times do |index|
|
|
153
|
+
issues.add(
|
|
154
|
+
code: :indexable_page_missing_from_sitemap,
|
|
155
|
+
severity: :warning,
|
|
156
|
+
category: :sitemaps,
|
|
157
|
+
url: "https://example.com/overview-#{index + 1}",
|
|
158
|
+
message: "indexable internal page is missing from sitemap",
|
|
159
|
+
details: {source_url: "https://example.com/source-#{index + 1}"}
|
|
160
|
+
)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
result = Crawlscope::Result.new(
|
|
164
|
+
base_url: "https://example.com",
|
|
165
|
+
sitemap_path: "/tmp/sitemap.xml",
|
|
166
|
+
urls: ["https://example.com"],
|
|
167
|
+
pages: [Object.new],
|
|
168
|
+
issues: issues
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
Crawlscope::Reporter.new(io: io).report(result)
|
|
172
|
+
|
|
173
|
+
output = io.string
|
|
174
|
+
|
|
175
|
+
assert_includes output, "sitemaps / indexable_page_missing_from_sitemap: 4"
|
|
176
|
+
assert_includes output, " - /overview-1 indexable internal page is missing from sitemap source: /source-1"
|
|
177
|
+
assert_includes output, " - /overview-4 indexable internal page is missing from sitemap source: /source-4"
|
|
53
178
|
end
|
|
54
179
|
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeResultTest < Minitest::Test
|
|
6
|
+
def test_ok_when_result_has_warnings_only
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com", message: "missing <title>", details: {})
|
|
9
|
+
|
|
10
|
+
result = result_with(issues)
|
|
11
|
+
|
|
12
|
+
assert result.ok?
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def test_not_ok_when_result_has_errors
|
|
16
|
+
issues = Crawlscope::IssueCollection.new
|
|
17
|
+
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: "https://example.com", message: "timeout", details: {})
|
|
18
|
+
|
|
19
|
+
result = result_with(issues)
|
|
20
|
+
|
|
21
|
+
refute result.ok?
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
private
|
|
25
|
+
|
|
26
|
+
def result_with(issues)
|
|
27
|
+
Crawlscope::Result.new(
|
|
28
|
+
base_url: "https://example.com",
|
|
29
|
+
sitemap_path: "/tmp/sitemap.xml",
|
|
30
|
+
urls: ["https://example.com"],
|
|
31
|
+
pages: [Object.new],
|
|
32
|
+
issues: issues
|
|
33
|
+
)
|
|
34
|
+
end
|
|
35
|
+
end
|