crawlscope 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ module Crawlscope
10
10
  LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
11
11
  MAX_SOURCES_IN_ERROR = 3
12
12
  MIN_INBOUND_ANCHOR_LINKS = 1
13
+ MIN_DOFOLLOW_INBOUND_LINKS = 2
13
14
 
14
15
  attr_reader :code
15
16
 
@@ -24,10 +25,14 @@ module Crawlscope
24
25
  @base_host = URI.parse(@base_url).host
25
26
 
26
27
  links = extract_links(pages)
27
- return if links.empty?
28
-
28
+ validate_url_hygiene(urls, links, issues)
29
29
  resolved_links = resolve_links(links, issues)
30
+ validate_nofollow_outgoing_links(links, issues)
31
+ validate_http_internal_links(links, issues)
32
+ validate_pages_with_no_outgoing_links(urls, pages, links, issues)
33
+ validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
30
34
  validate_inbound_counts(urls, pages, resolved_links, issues)
35
+ validate_canonical_targets(urls, pages, resolved_links, issues)
31
36
  end
32
37
 
33
38
  private
@@ -64,6 +69,8 @@ module Crawlscope
64
69
 
65
70
  {
66
71
  anchor_text: anchor_text,
72
+ http_internal_link: http_internal_link?(page.normalized_url, href),
73
+ nofollow: nofollow_link?(node),
67
74
  source_path: source_path,
68
75
  source_url: page.normalized_url,
69
76
  target_path: target_path,
@@ -86,6 +93,19 @@ module Crawlscope
86
93
  text.to_s.gsub(/\s+/, " ").strip
87
94
  end
88
95
 
96
+ def nofollow_link?(node)
97
+ node["rel"].to_s.split(/\s+/).any? { |value| value.casecmp?("nofollow") }
98
+ end
99
+
100
+ def http_internal_link?(source_url, href)
101
+ source_uri = URI.parse(source_url.to_s)
102
+ target_uri = URI.parse(URI.join(source_url, href).to_s)
103
+
104
+ source_uri.scheme == "https" && target_uri.scheme == "http" && target_uri.host == @base_host
105
+ rescue URI::InvalidURIError
106
+ false
107
+ end
108
+
89
109
  def normalize_internal_link(source_url, href)
90
110
  absolute_url = URI.join(source_url, href).to_s
91
111
  uri = URI.parse(absolute_url)
@@ -109,6 +129,36 @@ module Crawlscope
109
129
  )
110
130
  end
111
131
 
132
+ def validate_nofollow_outgoing_links(links, issues)
133
+ links.select { |link| link[:nofollow] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
134
+ target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
135
+
136
+ issues.add(
137
+ code: :nofollow_internal_outlinks,
138
+ severity: :warning,
139
+ category: :links,
140
+ url: source_url,
141
+ message: "page has nofollow outgoing internal links",
142
+ details: {target_urls: target_urls}
143
+ )
144
+ end
145
+ end
146
+
147
+ def validate_http_internal_links(links, issues)
148
+ links.select { |link| link[:http_internal_link] }.group_by { |link| link[:source_url] }.each do |source_url, grouped_links|
149
+ target_urls = grouped_links.map { |link| link[:target_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
150
+
151
+ issues.add(
152
+ code: :http_internal_link,
153
+ severity: :warning,
154
+ category: :links,
155
+ url: source_url,
156
+ message: "HTTPS page links to internal HTTP URL",
157
+ details: {target_urls: target_urls}
158
+ )
159
+ end
160
+ end
161
+
112
162
  def report_unresolved_target(target_url, grouped_links, issues, resolution)
113
163
  source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
114
164
  suffix = (resolution && resolution[:error]) ? " (#{resolution[:error]})" : ""
@@ -189,6 +239,10 @@ module Crawlscope
189
239
  resolution && status.nil? && resolution[:crawled] && resolution[:error]
190
240
  end
191
241
 
242
+ def html?
243
+ resolution && resolution[:html]
244
+ end
245
+
192
246
  def status
193
247
  resolution && resolution[:status]
194
248
  end
@@ -221,6 +275,7 @@ module Crawlscope
221
275
 
222
276
  memo[path] = normalized_url
223
277
  end
278
+ return if sitemap_paths.size < 2
224
279
 
225
280
  html_paths = pages.each_with_object(Set.new) do |page, result|
226
281
  next unless page.html?
@@ -235,7 +290,11 @@ module Crawlscope
235
290
  end
236
291
 
237
292
  inbound_anchor_counts = Hash.new(0)
293
+ dofollow_inbound_counts = Hash.new(0)
294
+ nofollow_inbound_counts = Hash.new(0)
238
295
  sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
296
+ dofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
297
+ nofollow_sources_by_target = Hash.new { |hash, key| hash[key] = [] }
239
298
 
240
299
  resolved_links.each do |link|
241
300
  target_path = link[:final_path]
@@ -245,24 +304,268 @@ module Crawlscope
245
304
  inbound_anchor_counts[target_path] += 1
246
305
  source_samples = sample_sources_by_target[target_path]
247
306
  source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
307
+
308
+ if link[:nofollow]
309
+ nofollow_inbound_counts[target_path] += 1
310
+ nofollow_sources = nofollow_sources_by_target[target_path]
311
+ nofollow_sources << link[:source_url] unless nofollow_sources.include?(link[:source_url])
312
+ else
313
+ dofollow_inbound_counts[target_path] += 1
314
+ dofollow_sources = dofollow_sources_by_target[target_path]
315
+ dofollow_sources << link[:source_url] unless dofollow_sources.include?(link[:source_url])
316
+ end
248
317
  end
249
318
 
250
319
  sitemap_paths.each do |path, target_url|
251
320
  next unless html_paths.include?(path)
252
321
 
253
322
  inbound_count = inbound_anchor_counts[path]
254
- next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS
323
+ dofollow_count = dofollow_inbound_counts[path]
324
+ nofollow_count = nofollow_inbound_counts[path]
325
+
326
+ report_orphan_page(target_url, issues) if inbound_count.zero?
327
+
328
+ if inbound_count.positive? && inbound_count < MIN_INBOUND_ANCHOR_LINKS
329
+ source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
330
+ source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
331
+
332
+ issues.add(
333
+ code: :low_inbound_anchor_links,
334
+ severity: :warning,
335
+ category: :links,
336
+ url: target_url,
337
+ message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
338
+ details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
339
+ )
340
+ end
255
341
 
256
- source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
257
- source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
342
+ report_low_dofollow_inlinks(target_url, path, dofollow_count, dofollow_sources_by_target, issues)
343
+ report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], issues)
344
+ report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources_by_target[path], dofollow_sources_by_target[path], issues)
345
+ end
346
+ end
347
+
348
+ def validate_url_hygiene(urls, links, issues)
349
+ checked_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }
350
+ checked_urls.concat(links.map { |link| link[:target_url] })
351
+
352
+ checked_urls.compact.uniq.each do |url|
353
+ report_url_double_slash(url, issues)
354
+ report_url_too_long(url, issues)
355
+ end
356
+ end
357
+
358
+ def report_url_double_slash(url, issues)
359
+ path = URI.parse(url).path.to_s
360
+ return unless path.match?(%r{//+})
361
+
362
+ issues.add(
363
+ code: :url_double_slash,
364
+ severity: :notice,
365
+ category: :url,
366
+ url: url,
367
+ message: "URL path contains duplicate slashes",
368
+ details: {path: path}
369
+ )
370
+ rescue URI::InvalidURIError
371
+ nil
372
+ end
373
+
374
+ def report_url_too_long(url, issues)
375
+ return unless url.length > 2_048
376
+
377
+ issues.add(
378
+ code: :url_too_long,
379
+ severity: :notice,
380
+ category: :url,
381
+ url: url,
382
+ message: "URL too long (#{url.length})",
383
+ details: {length: url.length, maximum: 2_048}
384
+ )
385
+ end
386
+
387
+ def validate_pages_with_no_outgoing_links(urls, pages, links, issues)
388
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
389
+ return if sitemap_urls.size < 2
390
+
391
+ source_paths_with_links = links.map { |link| link[:source_path] }.to_set
392
+
393
+ pages.each do |page|
394
+ next unless page.html?
395
+ next unless sitemap_urls.include?(page.normalized_url)
396
+
397
+ source_path = Url.path(page.normalized_url)
398
+ next unless crawlable_source_path?(source_path)
399
+ next if source_paths_with_links.include?(source_path)
258
400
 
259
401
  issues.add(
260
- code: :low_inbound_anchor_links,
402
+ code: :page_has_no_outgoing_links,
261
403
  severity: :warning,
262
404
  category: :links,
263
- url: target_url,
264
- message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
265
- details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
405
+ url: page.url,
406
+ message: "page has no outgoing internal links",
407
+ details: {}
408
+ )
409
+ end
410
+ end
411
+
412
+ def validate_indexable_pages_missing_from_sitemap(urls, resolved_links, issues)
413
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact.to_set
414
+ reported_urls = Set.new
415
+
416
+ resolved_links.each do |link|
417
+ final_url = link[:final_url]
418
+ next if sitemap_urls.include?(final_url)
419
+ next if reported_urls.include?(final_url)
420
+ next unless crawlable_path?(link[:final_path])
421
+
422
+ target = resolve_target(final_url)
423
+ next unless target.allowed?(@allowed_statuses) && target.html?
424
+
425
+ reported_urls << final_url
426
+
427
+ issues.add(
428
+ code: :indexable_page_missing_from_sitemap,
429
+ severity: :warning,
430
+ category: :sitemaps,
431
+ url: final_url,
432
+ message: "indexable internal page is missing from sitemap",
433
+ details: {source_url: link[:source_url]}
434
+ )
435
+ end
436
+ end
437
+
438
+ def report_orphan_page(target_url, issues)
439
+ issues.add(
440
+ code: :orphan_page,
441
+ severity: :warning,
442
+ category: :links,
443
+ url: target_url,
444
+ message: "page has no incoming internal links",
445
+ details: {}
446
+ )
447
+ end
448
+
449
+ def report_low_dofollow_inlinks(target_url, path, dofollow_count, sources_by_target, issues)
450
+ return if dofollow_count.zero?
451
+ return if dofollow_count >= MIN_DOFOLLOW_INBOUND_LINKS
452
+
453
+ source_samples = sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
454
+ source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""
455
+
456
+ issues.add(
457
+ code: :low_dofollow_inlinks,
458
+ severity: :warning,
459
+ category: :links,
460
+ url: target_url,
461
+ message: "dofollow inbound links #{dofollow_count} below #{MIN_DOFOLLOW_INBOUND_LINKS}#{source_info}",
462
+ details: {dofollow_inbound_count: dofollow_count, minimum: MIN_DOFOLLOW_INBOUND_LINKS, source_urls: source_samples}
463
+ )
464
+ end
465
+
466
+ def report_only_nofollow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, issues)
467
+ return unless nofollow_count.positive? && dofollow_count.zero?
468
+
469
+ issues.add(
470
+ code: :only_nofollow_internal_inlinks,
471
+ severity: :warning,
472
+ category: :links,
473
+ url: target_url,
474
+ message: "page has nofollow incoming internal links only",
475
+ details: {nofollow_inbound_count: nofollow_count, source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)}
476
+ )
477
+ end
478
+
479
+ def report_mixed_follow_internal_inlinks(target_url, nofollow_count, dofollow_count, nofollow_sources, dofollow_sources, issues)
480
+ return unless nofollow_count.positive? && dofollow_count.positive?
481
+
482
+ issues.add(
483
+ code: :mixed_follow_internal_inlinks,
484
+ severity: :notice,
485
+ category: :links,
486
+ url: target_url,
487
+ message: "page has nofollow and dofollow incoming internal links",
488
+ details: {
489
+ dofollow_inbound_count: dofollow_count,
490
+ nofollow_inbound_count: nofollow_count,
491
+ dofollow_source_urls: dofollow_sources.first(MAX_SOURCES_IN_ERROR),
492
+ nofollow_source_urls: nofollow_sources.first(MAX_SOURCES_IN_ERROR)
493
+ }
494
+ )
495
+ end
496
+
497
+ def validate_canonical_targets(urls, pages, resolved_links, issues)
498
+ sitemap_urls = urls.map { |url| Url.normalize(url, base_url: @base_url) }.compact
499
+ sitemap_pages = pages.select { |page| page.html? && sitemap_urls.include?(page.normalized_url) }
500
+ return if sitemap_pages.size < 2
501
+
502
+ dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
503
+
504
+ sitemap_pages.each do |page|
505
+ canonical_url = canonical_url_for(page)
506
+ next if canonical_url.nil?
507
+
508
+ target_uri = URI.parse(canonical_url)
509
+ next if target_uri.host != @base_host
510
+
511
+ canonical_path = Url.path(canonical_url)
512
+ if canonical_path && dofollow_counts_by_path[canonical_path].zero?
513
+ issues.add(
514
+ code: :canonical_no_internal_inlinks,
515
+ severity: :warning,
516
+ category: :links,
517
+ url: canonical_url,
518
+ message: "canonical URL has no incoming internal links",
519
+ details: {source_url: page.url}
520
+ )
521
+ end
522
+
523
+ validate_canonical_target_status(page, canonical_url, issues)
524
+ rescue URI::InvalidURIError
525
+ next
526
+ end
527
+ end
528
+
529
+ def dofollow_counts_by_final_path(resolved_links)
530
+ resolved_links.each_with_object(Hash.new(0)) do |link, counts|
531
+ next if link[:nofollow]
532
+ next if link[:source_path] == link[:final_path]
533
+
534
+ counts[link[:final_path]] += 1
535
+ end
536
+ end
537
+
538
+ def canonical_url_for(page)
539
+ canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
540
+ return if canonical.empty?
541
+
542
+ Url.normalize(canonical, base_url: page.url)
543
+ end
544
+
545
+ def validate_canonical_target_status(page, canonical_url, issues)
546
+ target = resolve_target(canonical_url)
547
+
548
+ if target.unresolved? || target.ignored_error?
549
+ return
550
+ end
551
+
552
+ if target.redirect?
553
+ issues.add(
554
+ code: :canonical_points_to_redirect,
555
+ severity: :warning,
556
+ category: :metadata,
557
+ url: page.url,
558
+ message: "canonical points to redirect",
559
+ details: {canonical: canonical_url, final_url: target.final_url, status: target.status}
560
+ )
561
+ elsif !target.allowed?(@allowed_statuses)
562
+ issues.add(
563
+ code: :canonical_points_to_error,
564
+ severity: :warning,
565
+ category: :metadata,
566
+ url: page.url,
567
+ message: "canonical points to HTTP #{target.status}",
568
+ details: {canonical: canonical_url, status: target.status}
266
569
  )
267
570
  end
268
571
  end
@@ -18,22 +18,41 @@ module Crawlscope
18
18
  end
19
19
 
20
20
  def call(urls:, pages:, issues:, context: nil)
21
+ sitemap_urls = normalized_sitemap_urls(urls)
22
+
21
23
  pages.each do |page|
22
24
  next unless page.html?
23
25
 
24
26
  validate_h1(page, issues)
25
27
  validate_title(page, issues)
26
28
  validate_description(page, issues)
27
- validate_canonical(page, issues)
29
+ validate_canonical(page, issues, sitemap_urls)
28
30
  validate_open_graph(page, issues)
29
31
  end
30
32
  end
31
33
 
32
34
  private
33
35
 
36
+ def normalized_sitemap_urls(urls)
37
+ urls.map { |url| Url.normalize(url, base_url: url) }.compact
38
+ end
39
+
34
40
  def validate_h1(page, issues)
35
41
  h1s = page.doc.css("h1")
36
- return if h1s.one?
42
+ empty_h1s = h1s.select { |node| node.text.to_s.strip.empty? }
43
+
44
+ if empty_h1s.any?
45
+ issues.add(
46
+ code: :empty_h1,
47
+ severity: :warning,
48
+ category: :metadata,
49
+ url: page.url,
50
+ message: "empty <h1>",
51
+ details: {count: empty_h1s.size}
52
+ )
53
+ end
54
+
55
+ return if h1s.one? && empty_h1s.empty?
37
56
 
38
57
  if h1s.empty?
39
58
  issues.add(
@@ -57,7 +76,19 @@ module Crawlscope
57
76
  end
58
77
 
59
78
  def validate_title(page, issues)
60
- title = page.doc.at_css("title")&.text.to_s.strip
79
+ titles = page.doc.css("head > title")
80
+ title = titles.first&.text.to_s.strip
81
+
82
+ if titles.size > 1
83
+ issues.add(
84
+ code: :multiple_title_tags,
85
+ severity: :warning,
86
+ category: :metadata,
87
+ url: page.url,
88
+ message: "multiple <title> tags (#{titles.size})",
89
+ details: {count: titles.size}
90
+ )
91
+ end
61
92
 
62
93
  if title.empty?
63
94
  issues.add(code: :missing_title, severity: :warning, category: :metadata, url: page.url, message: "missing <title>", details: {})
@@ -69,7 +100,19 @@ module Crawlscope
69
100
  end
70
101
 
71
102
  def validate_description(page, issues)
72
- description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip
103
+ descriptions = page.doc.css('head > meta[name="description"]')
104
+ description = descriptions.first&.[]("content").to_s.strip
105
+
106
+ if descriptions.size > 1
107
+ issues.add(
108
+ code: :multiple_meta_descriptions,
109
+ severity: :warning,
110
+ category: :metadata,
111
+ url: page.url,
112
+ message: "multiple meta description tags (#{descriptions.size})",
113
+ details: {count: descriptions.size}
114
+ )
115
+ end
73
116
 
74
117
  if description.empty?
75
118
  issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
@@ -80,7 +123,7 @@ module Crawlscope
80
123
  end
81
124
  end
82
125
 
83
- def validate_canonical(page, issues)
126
+ def validate_canonical(page, issues, sitemap_urls)
84
127
  canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip
85
128
 
86
129
  if canonical.empty?
@@ -92,13 +135,25 @@ module Crawlscope
92
135
  normalized_page_url = Url.normalize(page.url, base_url: page.url)
93
136
  return if canonical_matches_page?(normalized_canonical, normalized_page_url)
94
137
 
138
+ details = {canonical: canonical}
95
139
  issues.add(
96
140
  code: :canonical_mismatch,
97
141
  severity: :warning,
98
142
  category: :metadata,
99
143
  url: page.url,
100
144
  message: "canonical mismatch (#{canonical})",
101
- details: {canonical: canonical}
145
+ details: details
146
+ )
147
+
148
+ return unless sitemap_urls.include?(normalized_page_url)
149
+
150
+ issues.add(
151
+ code: :non_canonical_page_in_sitemap,
152
+ severity: :warning,
153
+ category: :sitemaps,
154
+ url: page.url,
155
+ message: "non-canonical page is included in sitemap",
156
+ details: details
102
157
  )
103
158
  end
104
159
 
@@ -55,6 +55,8 @@ module Crawlscope
55
55
  next
56
56
  end
57
57
 
58
+ validate_type_presence(page, source, data, issues)
59
+
58
60
  errors = schema_registry.validate(data)
59
61
  next if errors.empty?
60
62
 
@@ -96,6 +98,35 @@ module Crawlscope
96
98
  end
97
99
  end
98
100
 
101
+ def validate_type_presence(page, source, data, issues)
102
+ missing_paths = missing_type_paths(data)
103
+ return if missing_paths.empty?
104
+
105
+ issues.add(
106
+ code: :structured_data_missing_type,
107
+ severity: :warning,
108
+ category: :structured_data,
109
+ url: page.url,
110
+ message: "#{source} structured data missing @type",
111
+ details: {paths: missing_paths, source: source}
112
+ )
113
+ end
114
+
115
+ def missing_type_paths(data, path = "$")
116
+ return [] unless data.is_a?(Hash)
117
+
118
+ paths = []
119
+ paths << path if data["@type"].to_s.strip.empty?
120
+
121
+ if data["@graph"].is_a?(Array)
122
+ data["@graph"].each_with_index do |entry, index|
123
+ paths.concat(missing_type_paths(entry, "#{path}.@graph[#{index}]"))
124
+ end
125
+ end
126
+
127
+ paths
128
+ end
129
+
99
130
  def structured_data_types(data)
100
131
  return [] unless data.is_a?(Hash)
101
132
 
@@ -58,6 +58,7 @@ module Crawlscope
58
58
 
59
59
  {
60
60
  content_fingerprint_digest: content_fingerprint_digest(page.doc),
61
+ canonical: page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip,
61
62
  description: page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
62
63
  shingles: shingles_for(tokens),
63
64
  title: page.doc.at_css("title")&.text.to_s.strip,
@@ -98,6 +99,27 @@ module Crawlscope
98
99
  details: {urls: urls}
99
100
  )
100
101
  end
102
+
103
+ duplicate_content_clusters_without_canonical(page_summaries).each do |urls|
104
+ issues.add(
105
+ code: :duplicate_pages_without_canonical,
106
+ severity: :warning,
107
+ category: :uniqueness,
108
+ url: nil,
109
+ message: "duplicate pages without canonical => #{urls.join(", ")}",
110
+ details: {urls: urls}
111
+ )
112
+ end
113
+ end
114
+
115
+ def duplicate_content_clusters_without_canonical(page_summaries)
116
+ page_summaries
117
+ .select { |page| !page[:content_fingerprint_digest].nil? }
118
+ .group_by { |page| page[:content_fingerprint_digest] }
119
+ .values
120
+ .select { |pages| pages.size > 1 }
121
+ .select { |pages| pages.any? { |page| page[:canonical].to_s.empty? } }
122
+ .map { |pages| pages.map { |page| page[:url] } }
101
123
  end
102
124
 
103
125
  def shingles_for(tokens)
@@ -25,6 +25,9 @@ module Crawlscope
25
25
  visited.add(source)
26
26
  document = Nokogiri::XML(read(source))
27
27
  root_name = document.root&.name
28
+ unless %w[sitemapindex urlset].include?(root_name)
29
+ raise ValidationError, "Sitemap #{source} has unexpected root #{root_name.inspect}"
30
+ end
28
31
 
29
32
  if root_name == "sitemapindex"
30
33
  document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
@@ -40,7 +43,12 @@ module Crawlscope
40
43
 
41
44
  def read(source)
42
45
  if Url.remote?(source)
43
- connection.get(source).body
46
+ response = connection.get(source)
47
+ unless response.status.to_i.between?(200, 299)
48
+ raise ValidationError, "Sitemap #{source} returned HTTP #{response.status}"
49
+ end
50
+
51
+ response.body
44
52
  else
45
53
  File.read(source)
46
54
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- VERSION = "0.4.0"
4
+ VERSION = "0.5.0"
5
5
  end