crawlscope 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -0
- data/lib/crawlscope/cli.rb +4 -1
- data/lib/crawlscope/crawl.rb +2 -0
- data/lib/crawlscope/rake_tasks.rb +27 -12
- data/lib/crawlscope/reporter.rb +20 -5
- data/lib/crawlscope/rules/indexability.rb +130 -17
- data/lib/crawlscope/rules/links.rb +312 -9
- data/lib/crawlscope/rules/metadata.rb +61 -6
- data/lib/crawlscope/rules/structured_data.rb +31 -0
- data/lib/crawlscope/rules/uniqueness.rb +22 -0
- data/lib/crawlscope/sitemap.rb +9 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +24 -24
- data/test/crawlscope/cli_test.rb +1 -0
- data/test/crawlscope/crawl_test.rb +26 -0
- data/test/crawlscope/indexability_rule_test.rb +33 -0
- data/test/crawlscope/links_rule_test.rb +148 -3
- data/test/crawlscope/metadata_rule_test.rb +36 -0
- data/test/crawlscope/rake_tasks_test.rb +70 -0
- data/test/crawlscope/reporter_test.rb +7 -3
- data/test/crawlscope/sitemap_test.rb +24 -0
- data/test/crawlscope/structured_data_rule_test.rb +56 -0
- data/test/crawlscope/uniqueness_rule_test.rb +17 -2
- metadata +2 -1
|
@@ -10,6 +10,7 @@ module Crawlscope
|
|
|
10
10
|
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
|
|
11
11
|
MAX_SOURCES_IN_ERROR = 3
|
|
12
12
|
MIN_INBOUND_ANCHOR_LINKS = 1
|
|
13
|
+
MIN_DOFOLLOW_INBOUND_LINKS = 2
|
|
13
14
|
|
|
14
15
|
attr_reader :code
|
|
15
16
|
|
|
@@ -24,10 +25,14 @@ module Crawlscope
|
|
|
24
25
|
@base_host = URI.parse(@base_url).host
|
|
25
26
|
|
|
26
27
|
links = extract_links(pages)
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
validate_url_hygiene(urls, links, issues)
|
|
29
29
|
resolved_links = resolve_links(links, issues)
|
|
30
|
+
validate_nofollow_outgoing_links(links, issues)
|
|
31
|
+
validate_http_internal_links(links, issues)
|
|
32
|
+
validate_pages_with_no_outgoing_links(urls, pages, links, issues)
|
|
33
|
+
validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
|
|
30
34
|
validate_inbound_counts(urls, pages, resolved_links, issues)
|
|
35
|
+
validate_canonical_targets(urls, pages, resolved_links, issues)
|
|
31
36
|
end
|
|
32
37
|
|
|
33
38
|
private
|
|
@@ -64,6 +69,8 @@ module Crawlscope
|
|
|
64
69
|
|
|
65
70
|
{
|
|
66
71
|
anchor_text: anchor_text,
|
|
72
|
+
http_internal_link: http_internal_link?(page.normalized_url, href),
|
|
73
|
+
nofollow: nofollow_link?(node),
|
|
67
74
|
source_path: source_path,
|
|
68
75
|
source_url: page.normalized_url,
|
|
69
76
|
target_path: target_path,
|
|
@@ -86,6 +93,19 @@ module Crawlscope
|
|
|
86
93
|
text.to_s.gsub(/\s+/, " ").strip
|
|
87
94
|
end
|
|
88
95
|
|
|
96
|
+
def nofollow_link?(node)
|
|
97
|
+
node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") }
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
def http_internal_link?(source_url, href)
|
|
101
|
+
source_uri = URI.parse(source_url.to_s)
|
|
102
|
+
target_uri = URI.parse(URI.join(source_url, href).to_s)
|
|
103
|
+
|
|
104
|
+
source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host
|
|
105
|
+
rescue URI::InvalidURIError
|
|
106
|
+
false
|
|
107
|
+
end
|
|
108
|
+
|
|
89
109
|
def normalize_internal_link(source_url, href)
|
|
90
110
|
absolute_url = URI.join(source_url, href).to_s
|
|
91
111
|
uri = URI.parse(absolute_url)
|
|
@@ -109,6 +129,36 @@ module Crawlscope
|
|
|
109
129
|
)
|
|
110
130
|
end
|
|
111
131
|
|
|
132
|
+
def validate_nofollow_outgoing_links(links, issues)
|
|
133
|
+
links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
|
|
134
|
+
target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
|
|
135
|
+
|
|
136
|
+
issues.add(
|
|
137
|
+
code: :nofollow_internal_outlinks,
|
|
138
|
+
severity: :warning,
|
|
139
|
+
category: :links,
|
|
140
|
+
url: source_url,
|
|
141
|
+
message: "page has nofollow outgoing internal links",
|
|
142
|
+
details: {target_urls: target_urls}
|
|
143
|
+
)
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def validate_http_internal_links(links, issues)
|
|
148
|
+
links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
|
|
149
|
+
target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
|
|
150
|
+
|
|
151
|
+
issues.add(
|
|
152
|
+
code: :http_internal_link,
|
|
153
|
+
severity: :warning,
|
|
154
|
+
category: :links,
|
|
155
|
+
url: source_url,
|
|
156
|
+
message: "HTTPS page links to internal HTTP URL",
|
|
157
|
+
details: {target_urls: target_urls}
|
|
158
|
+
)
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
112
162
|
def report_unresolved_target(target_url, grouped_links, issues, resolution)
|
|
113
163
|
source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
|
|
114
164
|
suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : ""
|
|
@@ -189,6 +239,10 @@ module Crawlscope
|
|
|
189
239
|
resolution && status.nil? && resolution[:crawled] && resolution[:error]
|
|
190
240
|
end
|
|
191
241
|
|
|
242
|
+
def html?
|
|
243
|
+
resolution && resolution[:html]
|
|
244
|
+
end
|
|
245
|
+
|
|
192
246
|
def status
|
|
193
247
|
resolution && resolution[:status]
|
|
194
248
|
end
|
|
@@ -221,6 +275,7 @@ module Crawlscope
|
|
|
221
275
|
|
|
222
276
|
memo[path] = normalized_url
|
|
223
277
|
end
|
|
278
|
+
return if sitemap_paths.size < 2
|
|
224
279
|
|
|
225
280
|
html_paths = pages.each_with_object(Set.new) do |page, result|
|
|
226
281
|
next unless page.html?
|
|
@@ -235,7 +290,11 @@ module Crawlscope
|
|
|
235
290
|
end
|
|
236
291
|
|
|
237
292
|
inbound_anchor_counts = Hash.new(0)
|
|
293
|
+
dofollow_inbound_counts = Hash.new(0)
|
|
294
|
+
nofollow_inbound_counts = Hash.new(0)
|
|
238
295
|
sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
|
|
296
|
+
dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
|
|
297
|
+
nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
|
|
239
298
|
|
|
240
299
|
resolved_links.each do |link|
|
|
241
300
|
target_path = link[:final_path]
|
|
@@ -245,24 +304,268 @@ module Crawlscope
|
|
|
245
304
|
inbound_anchor_counts[target_path] += 1
|
|
246
305
|
source_samples = sample_sources_by_target[target_path]
|
|
247
306
|
source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
|
|
307
|
+
|
|
308
|
+
if link[:nofollow]
|
|
309
|
+
nofollow_inbound_counts[target_path] += 1
|
|
310
|
+
nofollow_sources = nofollow_sources_by_target[target_path]
|
|
311
|
+
nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url])
|
|
312
|
+
else
|
|
313
|
+
dofollow_inbound_counts[target_path] += 1
|
|
314
|
+
dofollow_sources = dofollow_sources_by_target[target_path]
|
|
315
|
+
dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url])
|
|
316
|
+
end
|
|
248
317
|
end
|
|
249
318
|
|
|
250
319
|
sitemap_paths.each do |path, target_url|
|
|
251
320
|
next unless html_paths.include?(path)
|
|
252
321
|
|
|
253
322
|
inbound_count = inbound_anchor_counts[path]
|
|
254
|
-
|
|
323
|
+
dofollow_count = dofollow_inbound_counts[path]
|
|
324
|
+
nofollow_count = nofollow_inbound_counts[path]
|
|
325
|
+
|
|
326
|
+
report_orphan_page(target_url, issues) if inbound_count.zero?
|
|
327
|
+
|
|
328
|
+
if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS
|
|
329
|
+
source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
|
|
330
|
+
source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
|
|
331
|
+
|
|
332
|
+
issues.add(
|
|
333
|
+
code: :low_inbound_anchor_links,
|
|
334
|
+
severity: :warning,
|
|
335
|
+
category: :links,
|
|
336
|
+
url: target_url,
|
|
337
|
+
message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
|
|
338
|
+
details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
|
|
339
|
+
)
|
|
340
|
+
end
|
|
255
341
|
|
|
256
|
-
|
|
257
|
-
|
|
342
|
+
report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues)
|
|
343
|
+
report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues)
|
|
344
|
+
report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues)
|
|
345
|
+
end
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
def validate_url_hygiene(urls, links, issues)
|
|
349
|
+
checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }
|
|
350
|
+
checked_urls.concat(links.map { |link| link[:target_url] })
|
|
351
|
+
|
|
352
|
+
checked_urls.compact.uniq.each do |url|
|
|
353
|
+
report_url_double_slash(url, issues)
|
|
354
|
+
report_url_too_long(url, issues)
|
|
355
|
+
end
|
|
356
|
+
end
|
|
357
|
+
|
|
358
|
+
def report_url_double_slash(url, issues)
|
|
359
|
+
path = URI.parse(url).path.to_s
|
|
360
|
+
return unless path.match?(%r{//+})
|
|
361
|
+
|
|
362
|
+
issues.add(
|
|
363
|
+
code: :url_double_slash,
|
|
364
|
+
severity: :notice,
|
|
365
|
+
category: :url,
|
|
366
|
+
url: url,
|
|
367
|
+
message: "URL path contains duplicate slashes",
|
|
368
|
+
details: {path: path}
|
|
369
|
+
)
|
|
370
|
+
rescue URI::InvalidURIError
|
|
371
|
+
nil
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
def report_url_too_long(url, issues)
|
|
375
|
+
return unless url.length > 2_048
|
|
376
|
+
|
|
377
|
+
issues.add(
|
|
378
|
+
code: :url_too_long,
|
|
379
|
+
severity: :notice,
|
|
380
|
+
category: :url,
|
|
381
|
+
url: url,
|
|
382
|
+
message: "URL too long (#{url.length})",
|
|
383
|
+
details: {length: url.length, maximum: 2_048}
|
|
384
|
+
)
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
def validate_pages_with_no_outgoing_links(urls, pages, links, issues)
|
|
388
|
+
sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
|
|
389
|
+
return if sitemap_urls.size < 2
|
|
390
|
+
|
|
391
|
+
source_paths_with_links = links.map { |link| link[:source_path] }.to_set
|
|
392
|
+
|
|
393
|
+
pages.each do |page|
|
|
394
|
+
next unless page.html?
|
|
395
|
+
next unless sitemap_urls.include?(page.normalized_url)
|
|
396
|
+
|
|
397
|
+
source_path = Url.path(page.normalized_url)
|
|
398
|
+
next unless crawlable_source_path?(source_path)
|
|
399
|
+
next if source_paths_with_links.include?(source_path)
|
|
258
400
|
|
|
259
401
|
issues.add(
|
|
260
|
-
code: :
|
|
402
|
+
code: :page_has_no_outgoing_links,
|
|
261
403
|
severity: :warning,
|
|
262
404
|
category: :links,
|
|
263
|
-
url:
|
|
264
|
-
message: "
|
|
265
|
-
details: {
|
|
405
|
+
url: page.url,
|
|
406
|
+
message: "page has no outgoing internal links",
|
|
407
|
+
details: {}
|
|
408
|
+
)
|
|
409
|
+
end
|
|
410
|
+
end
|
|
411
|
+
|
|
412
|
+
def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
|
|
413
|
+
sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
|
|
414
|
+
reported_urls = Set.new
|
|
415
|
+
|
|
416
|
+
resolved_links.each do |link|
|
|
417
|
+
final_url = link[:final_url]
|
|
418
|
+
next if sitemap_urls.include?(final_url)
|
|
419
|
+
next if reported_urls.include?(final_url)
|
|
420
|
+
next unless crawlable_path?(link[:final_path])
|
|
421
|
+
|
|
422
|
+
target = resolve_target(final_url)
|
|
423
|
+
next unless target.allowed?(@allowed_statuses) && target.html?
|
|
424
|
+
|
|
425
|
+
reported_urls << final_url
|
|
426
|
+
|
|
427
|
+
issues.add(
|
|
428
|
+
code: :indexable_page_missing_from_sitemap,
|
|
429
|
+
severity: :warning,
|
|
430
|
+
category: :sitemaps,
|
|
431
|
+
url: final_url,
|
|
432
|
+
message: "indexable internal page is missing from sitemap",
|
|
433
|
+
details: {source_url: link[:source_url]}
|
|
434
|
+
)
|
|
435
|
+
end
|
|
436
|
+
end
|
|
437
|
+
|
|
438
|
+
def report_orphan_page(target_url, issues)
|
|
439
|
+
issues.add(
|
|
440
|
+
code: :orphan_page,
|
|
441
|
+
severity: :warning,
|
|
442
|
+
category: :links,
|
|
443
|
+
url: target_url,
|
|
444
|
+
message: "page has no incoming internal links",
|
|
445
|
+
details: {}
|
|
446
|
+
)
|
|
447
|
+
end
|
|
448
|
+
|
|
449
|
+
def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues)
|
|
450
|
+
return if dofollow_count.zero?
|
|
451
|
+
return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS
|
|
452
|
+
|
|
453
|
+
source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
|
|
454
|
+
source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
|
|
455
|
+
|
|
456
|
+
issues.add(
|
|
457
|
+
code: :low_dofollow_inlinks,
|
|
458
|
+
severity: :warning,
|
|
459
|
+
category: :links,
|
|
460
|
+
url: target_url,
|
|
461
|
+
message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}",
|
|
462
|
+
details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples}
|
|
463
|
+
)
|
|
464
|
+
end
|
|
465
|
+
|
|
466
|
+
def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues)
|
|
467
|
+
return unless nofollow_count.positive? && dofollow_count.zero?
|
|
468
|
+
|
|
469
|
+
issues.add(
|
|
470
|
+
code: :only_nofollow_internal_inlinks,
|
|
471
|
+
severity: :warning,
|
|
472
|
+
category: :links,
|
|
473
|
+
url: target_url,
|
|
474
|
+
message: "page has nofollow incoming internal links only",
|
|
475
|
+
details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)}
|
|
476
|
+
)
|
|
477
|
+
end
|
|
478
|
+
|
|
479
|
+
def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues)
|
|
480
|
+
return unless nofollow_count.positive? && dofollow_count.positive?
|
|
481
|
+
|
|
482
|
+
issues.add(
|
|
483
|
+
code: :mixed_follow_internal_inlinks,
|
|
484
|
+
severity: :notice,
|
|
485
|
+
category: :links,
|
|
486
|
+
url: target_url,
|
|
487
|
+
message: "page has nofollow and dofollow incoming internal links",
|
|
488
|
+
details: {
|
|
489
|
+
dofollow_inbound_count: dofollow_count,
|
|
490
|
+
nofollow_inbound_count: nofollow_count,
|
|
491
|
+
dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR),
|
|
492
|
+
nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)
|
|
493
|
+
}
|
|
494
|
+
)
|
|
495
|
+
end
|
|
496
|
+
|
|
497
|
+
def validate_canonical_targets(urls, pages, resolved_links, issues)
|
|
498
|
+
sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact
|
|
499
|
+
sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) }
|
|
500
|
+
return if sitemap_pages.size < 2
|
|
501
|
+
|
|
502
|
+
dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
|
|
503
|
+
|
|
504
|
+
sitemap_pages.each do |page|
|
|
505
|
+
canonical_url = canonical_url_for(page)
|
|
506
|
+
next if canonical_url.nil?
|
|
507
|
+
|
|
508
|
+
target_uri = URI.parse(canonical_url)
|
|
509
|
+
next if target_uri.host != @base_host
|
|
510
|
+
|
|
511
|
+
canonical_path = Url.path(canonical_url)
|
|
512
|
+
if canonical_path && dofollow_counts_by_path[canonical_path].zero?
|
|
513
|
+
issues.add(
|
|
514
|
+
code: :canonical_no_internal_inlinks,
|
|
515
|
+
severity: :warning,
|
|
516
|
+
category: :links,
|
|
517
|
+
url: canonical_url,
|
|
518
|
+
message: "canonical URL has no incoming internal links",
|
|
519
|
+
details: {source_url: page.url}
|
|
520
|
+
)
|
|
521
|
+
end
|
|
522
|
+
|
|
523
|
+
validate_canonical_target_status(page, canonical_url, issues)
|
|
524
|
+
rescue URI::InvalidURIError
|
|
525
|
+
next
|
|
526
|
+
end
|
|
527
|
+
end
|
|
528
|
+
|
|
529
|
+
def dofollow_counts_by_final_path(resolved_links)
|
|
530
|
+
resolved_links.each_with_object(Hash.new(0)) do |link, counts|
|
|
531
|
+
next if link[:nofollow]
|
|
532
|
+
next if link[:source_path] == link[:final_path]
|
|
533
|
+
|
|
534
|
+
counts[link[:final_path]] += 1
|
|
535
|
+
end
|
|
536
|
+
end
|
|
537
|
+
|
|
538
|
+
def canonical_url_for(page)
|
|
539
|
+
canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
|
|
540
|
+
return if canonical.empty?
|
|
541
|
+
|
|
542
|
+
Url.normalize(canonical, base_url: page.url)
|
|
543
|
+
end
|
|
544
|
+
|
|
545
|
+
def validate_canonical_target_status(page, canonical_url, issues)
|
|
546
|
+
target = resolve_target(canonical_url)
|
|
547
|
+
|
|
548
|
+
if target.unresolved? || target.ignored_error?
|
|
549
|
+
return
|
|
550
|
+
end
|
|
551
|
+
|
|
552
|
+
if target.redirect?
|
|
553
|
+
issues.add(
|
|
554
|
+
code: :canonical_points_to_redirect,
|
|
555
|
+
severity: :warning,
|
|
556
|
+
category: :metadata,
|
|
557
|
+
url: page.url,
|
|
558
|
+
message: "canonical points to redirect",
|
|
559
|
+
details: {canonical: canonical_url, final_url: target.final_url, status: target.status}
|
|
560
|
+
)
|
|
561
|
+
elsif !target.allowed?(@allowed_statuses)
|
|
562
|
+
issues.add(
|
|
563
|
+
code: :canonical_points_to_error,
|
|
564
|
+
severity: :warning,
|
|
565
|
+
category: :metadata,
|
|
566
|
+
url: page.url,
|
|
567
|
+
message: "canonical points to HTTP #{target.status}",
|
|
568
|
+
details: {canonical: canonical_url, status: target.status}
|
|
266
569
|
)
|
|
267
570
|
end
|
|
268
571
|
end
|
|
@@ -18,22 +18,41 @@ module Crawlscope
|
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
def call(urls:, pages:, issues:, context: nil)
|
|
21
|
+
sitemap_urls = normalized_sitemap_urls(urls)
|
|
22
|
+
|
|
21
23
|
pages.each do |page|
|
|
22
24
|
next unless page.html?
|
|
23
25
|
|
|
24
26
|
validate_h1(page, issues)
|
|
25
27
|
validate_title(page, issues)
|
|
26
28
|
validate_description(page, issues)
|
|
27
|
-
validate_canonical(page, issues)
|
|
29
|
+
validate_canonical(page, issues, sitemap_urls)
|
|
28
30
|
validate_open_graph(page, issues)
|
|
29
31
|
end
|
|
30
32
|
end
|
|
31
33
|
|
|
32
34
|
private
|
|
33
35
|
|
|
36
|
+
def normalized_sitemap_urls(urls)
|
|
37
|
+
urls.map { |url| Url.normalize(url, base_url: url) }.compact
|
|
38
|
+
end
|
|
39
|
+
|
|
34
40
|
def validate_h1(page, issues)
|
|
35
41
|
h1s = page.doc.css("h1")
|
|
36
|
-
|
|
42
|
+
empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? }
|
|
43
|
+
|
|
44
|
+
if empty_h1s.any?
|
|
45
|
+
issues.add(
|
|
46
|
+
code: :empty_h1,
|
|
47
|
+
severity: :warning,
|
|
48
|
+
category: :metadata,
|
|
49
|
+
url: page.url,
|
|
50
|
+
message: "empty <h1>",
|
|
51
|
+
details: {count: empty_h1s.size}
|
|
52
|
+
)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
return if h1s.one? && empty_h1s.empty?
|
|
37
56
|
|
|
38
57
|
if h1s.empty?
|
|
39
58
|
issues.add(
|
|
@@ -57,7 +76,19 @@ module Crawlscope
|
|
|
57
76
|
end
|
|
58
77
|
|
|
59
78
|
def validate_title(page, issues)
|
|
60
|
-
|
|
79
|
+
titles = page.doc.css("head > title")
|
|
80
|
+
title = titles.first&.text.to_s.strip
|
|
81
|
+
|
|
82
|
+
if titles.size > 1
|
|
83
|
+
issues.add(
|
|
84
|
+
code: :multiple_title_tags,
|
|
85
|
+
severity: :warning,
|
|
86
|
+
category: :metadata,
|
|
87
|
+
url: page.url,
|
|
88
|
+
message: "multiple <title> tags (#{titles.size})",
|
|
89
|
+
details: {count: titles.size}
|
|
90
|
+
)
|
|
91
|
+
end
|
|
61
92
|
|
|
62
93
|
if title.empty?
|
|
63
94
|
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: page.url, message: "missing <title>", details: {})
|
|
@@ -69,7 +100,19 @@ module Crawlscope
|
|
|
69
100
|
end
|
|
70
101
|
|
|
71
102
|
def validate_description(page, issues)
|
|
72
|
-
|
|
103
|
+
descriptions = page.doc.css('head > meta[name="description"]')
|
|
104
|
+
description = descriptions.first&.[]("content").to_s.strip
|
|
105
|
+
|
|
106
|
+
if descriptions.size > 1
|
|
107
|
+
issues.add(
|
|
108
|
+
code: :multiple_meta_descriptions,
|
|
109
|
+
severity: :warning,
|
|
110
|
+
category: :metadata,
|
|
111
|
+
url: page.url,
|
|
112
|
+
message: "multiple meta description tags (#{descriptions.size})",
|
|
113
|
+
details: {count: descriptions.size}
|
|
114
|
+
)
|
|
115
|
+
end
|
|
73
116
|
|
|
74
117
|
if description.empty?
|
|
75
118
|
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
|
|
@@ -80,7 +123,7 @@ module Crawlscope
|
|
|
80
123
|
end
|
|
81
124
|
end
|
|
82
125
|
|
|
83
|
-
def validate_canonical(page, issues)
|
|
126
|
+
def validate_canonical(page, issues, sitemap_urls)
|
|
84
127
|
canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
|
|
85
128
|
|
|
86
129
|
if canonical.empty?
|
|
@@ -92,13 +135,25 @@ module Crawlscope
|
|
|
92
135
|
normalized_page_url = Url.normalize(page.url, base_url: page.url)
|
|
93
136
|
return if canonical_matches_page?(normalized_canonical, normalized_page_url)
|
|
94
137
|
|
|
138
|
+
details = {canonical: canonical}
|
|
95
139
|
issues.add(
|
|
96
140
|
code: :canonical_mismatch,
|
|
97
141
|
severity: :warning,
|
|
98
142
|
category: :metadata,
|
|
99
143
|
url: page.url,
|
|
100
144
|
message: "canonical mismatch (#{canonical})",
|
|
101
|
-
details:
|
|
145
|
+
details: details
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
return unless sitemap_urls.include?(normalized_page_url)
|
|
149
|
+
|
|
150
|
+
issues.add(
|
|
151
|
+
code: :non_canonical_page_in_sitemap,
|
|
152
|
+
severity: :warning,
|
|
153
|
+
category: :sitemaps,
|
|
154
|
+
url: page.url,
|
|
155
|
+
message: "non-canonical page is included in sitemap",
|
|
156
|
+
details: details
|
|
102
157
|
)
|
|
103
158
|
end
|
|
104
159
|
|
|
@@ -55,6 +55,8 @@ module Crawlscope
|
|
|
55
55
|
next
|
|
56
56
|
end
|
|
57
57
|
|
|
58
|
+
validate_type_presence(page, source, data, issues)
|
|
59
|
+
|
|
58
60
|
errors = schema_registry.validate(data)
|
|
59
61
|
next if errors.empty?
|
|
60
62
|
|
|
@@ -96,6 +98,35 @@ module Crawlscope
|
|
|
96
98
|
end
|
|
97
99
|
end
|
|
98
100
|
|
|
101
|
+
def validate_type_presence(page, source, data, issues)
|
|
102
|
+
missing_paths = missing_type_paths(data)
|
|
103
|
+
return if missing_paths.empty?
|
|
104
|
+
|
|
105
|
+
issues.add(
|
|
106
|
+
code: :structured_data_missing_type,
|
|
107
|
+
severity: :warning,
|
|
108
|
+
category: :structured_data,
|
|
109
|
+
url: page.url,
|
|
110
|
+
message: "#{source} structured data missing @type",
|
|
111
|
+
details: {paths: missing_paths, source: source}
|
|
112
|
+
)
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def missing_type_paths(data, path = "$")
|
|
116
|
+
return [] unless data.is_a?(Hash)
|
|
117
|
+
|
|
118
|
+
paths = []
|
|
119
|
+
paths << path if data["@type"].to_s.strip.empty?
|
|
120
|
+
|
|
121
|
+
if data["@graph"].is_a?(Array)
|
|
122
|
+
data["@graph"].each_with_index do |entry, index|
|
|
123
|
+
paths.concat(missing_type_paths(entry, "#{path}.@graph[#{index}]"))
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
paths
|
|
128
|
+
end
|
|
129
|
+
|
|
99
130
|
def structured_data_types(data)
|
|
100
131
|
return [] unless data.is_a?(Hash)
|
|
101
132
|
|
|
@@ -58,6 +58,7 @@ module Crawlscope
|
|
|
58
58
|
|
|
59
59
|
{
|
|
60
60
|
content_fingerprint_digest: content_fingerprint_digest(page.doc),
|
|
61
|
+
canonical: page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip,
|
|
61
62
|
description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
|
|
62
63
|
shingles: shingles_for(tokens),
|
|
63
64
|
title: page.doc.at_css("title")&.text.to_s.strip,
|
|
@@ -98,6 +99,27 @@ module Crawlscope
|
|
|
98
99
|
details: {urls: urls}
|
|
99
100
|
)
|
|
100
101
|
end
|
|
102
|
+
|
|
103
|
+
duplicate_content_clusters_without_canonical(page_summaries).each do |urls|
|
|
104
|
+
issues.add(
|
|
105
|
+
code: :duplicate_pages_without_canonical,
|
|
106
|
+
severity: :warning,
|
|
107
|
+
category: :uniqueness,
|
|
108
|
+
url: nil,
|
|
109
|
+
message: "duplicate pages without canonical => #{urls.join(", ")}",
|
|
110
|
+
details: {urls: urls}
|
|
111
|
+
)
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def duplicate_content_clusters_without_canonical(page_summaries)
|
|
116
|
+
page_summaries
|
|
117
|
+
.select { |page| !page[:content_fingerprint_digest].nil? }
|
|
118
|
+
.group_by { |page| page[:content_fingerprint_digest] }
|
|
119
|
+
.values
|
|
120
|
+
.select { |pages| pages.size > 1 }
|
|
121
|
+
.select { |pages| pages.any? { |page| page[:canonical].to_s.empty? } }
|
|
122
|
+
.map { |pages| pages.map { |page| page[:url] } }
|
|
101
123
|
end
|
|
102
124
|
|
|
103
125
|
def shingles_for(tokens)
|
data/lib/crawlscope/sitemap.rb
CHANGED
|
@@ -25,6 +25,9 @@ module Crawlscope
|
|
|
25
25
|
visited.add(source)
|
|
26
26
|
document = Nokogiri::XML(read(source))
|
|
27
27
|
root_name = document.root&.name
|
|
28
|
+
unless %w[sitemapindex urlset].include?(root_name)
|
|
29
|
+
raise ValidationError, "Sitemap #{source} has unexpected root #{root_name.inspect}"
|
|
30
|
+
end
|
|
28
31
|
|
|
29
32
|
if root_name == "sitemapindex"
|
|
30
33
|
document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
|
|
@@ -40,7 +43,12 @@ module Crawlscope
|
|
|
40
43
|
|
|
41
44
|
def read(source)
|
|
42
45
|
if Url.remote?(source)
|
|
43
|
-
connection.get(source)
|
|
46
|
+
response = connection.get(source)
|
|
47
|
+
unless response.status.to_i.between?(200, 299)
|
|
48
|
+
raise ValidationError, "Sitemap #{source} returned HTTP #{response.status}"
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
response.body
|
|
44
52
|
else
|
|
45
53
|
File.read(source)
|
|
46
54
|
end
|
data/lib/crawlscope/version.rb
CHANGED