crawlscope 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79e8c8f3993c545bf7647c28b8540d3757c7d9c91eeaf885cde6d55c4935ebb5
4
- data.tar.gz: d9b6a987e04546c2d3ee7bb3cc6e1d5510e78963df035cb24d7c8783064afa45
3
+ metadata.gz: 7d9e56ae9a55e3c4bb6b079585b8a302edcc1bfad9110938c9421c5224bf27f9
4
+ data.tar.gz: ab1908aa4a1bef4c15f055800ca9862ba973c9257f39020309de1f5554923cca
5
5
  SHA512:
6
- metadata.gz: eb49361b9f26992682db7622796c4b262a12fca37254aca5e1f1c49c85702b7e4fc347a880af0665f10238f5340cb61bc44433060ba7b3fbde0bdd379c85c763
7
- data.tar.gz: 5fa53f930ef529279e063bd11f9becd112c8abb266078027486f22ad37e968bad744c5a35c9432ccb170ceb51e45d858e23a47c649c6ede1d4dd89fb331fd9f3
6
+ metadata.gz: 8981de1e7bc19737df3048b1e19f28d585f22eed8f2b32ea4eea473ba377d3a261e08df8165114c8d967b7ec7d14a48c47a6b83cfe14261f6c83b56b39134766
7
+ data.tar.gz: 3df1e21bf74c12e994c0a932f9c581e4a6e12b55dd14975e43c5eca073bcc8eb4a49337d3c3c2e52137c4436fb5ddf52accd6fc11954171d77e75ce8f75e69a5
data/CHANGELOG.md CHANGED
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.5.0] - 2026-05-31
9
+
10
+
11
+ ### Added
12
+
13
+ - expand SEO audit checks
14
+
15
+
16
+
8
17
  ## [0.4.0] - 2026-05-21
9
18
 
10
19
 
data/README.md CHANGED
@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
161
161
 
162
162
  ```bash
163
163
  bundle exec rake crawlscope:validate URL=https://example.com
164
+ bundle exec rake 'crawlscope:validate[https://example.com]'
164
165
  bundle exec rake crawlscope:validate:metadata URL=https://example.com
166
+ bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
165
167
  bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
168
+ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
166
169
  ```
167
170
 
168
171
  `crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
169
172
 
173
+ Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
174
+ task-argument form above instead.
175
+
170
176
  `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
171
177
 
172
178
  ### Structured Data URL Audit
@@ -37,11 +37,14 @@ module Crawlscope
37
37
  @err.puts(general_usage)
38
38
  1
39
39
  end
40
- rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
40
+ rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
41
41
  @err.puts(error.message)
42
42
  @err.puts("")
43
43
  @err.puts(general_usage)
44
44
  1
45
+ rescue ValidationError => error
46
+ @err.puts(error.message)
47
+ 1
45
48
  end
46
49
 
47
50
  private
@@ -83,6 +83,7 @@ module Crawlscope
83
83
  issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
84
84
  elsif redirected?(page)
85
85
  issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
86
+ issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
86
87
  end
87
88
  end
88
89
  end
@@ -127,6 +128,7 @@ module Crawlscope
127
128
  crawled: crawled,
128
129
  error: page.error,
129
130
  final_url: page.normalized_final_url || normalized_url,
131
+ html: page.html?,
130
132
  status: page.status
131
133
  }
132
134
  end
@@ -4,25 +4,40 @@ module Crawlscope
4
4
  module RakeTasks
5
5
  module_function
6
6
 
7
- def validate
8
- run("validate")
7
+ def validate(url: nil, sitemap_path: nil, rule_names: nil)
8
+ run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
9
9
  end
10
10
 
11
- def ldjson
12
- run("ldjson")
11
+ def ldjson(urls: nil)
12
+ run("ldjson", argv: ldjson_argv(urls: urls))
13
13
  end
14
14
 
15
- def validate_rule(rule)
16
- original_rules = ENV["RULES"]
17
- ENV["RULES"] = rule
18
- validate
19
- ensure
20
- ENV["RULES"] = original_rules
15
+ def validate_rule(rule, url: nil, sitemap_path: nil)
16
+ validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
21
17
  end
22
18
 
23
- def run(command)
24
- status = Cli.start([command], out: $stdout, err: $stderr)
19
+ def run(command, argv: [])
20
+ status = Cli.start([command, *argv], out: $stdout, err: $stderr)
25
21
  exit(status) unless status.zero?
26
22
  end
23
+
24
+ def validate_argv(url:, sitemap_path:, rule_names:)
25
+ [
26
+ option_pair("--url", url),
27
+ option_pair("--sitemap", sitemap_path),
28
+ option_pair("--rules", rule_names)
29
+ ].compact.flatten
30
+ end
31
+
32
+ def ldjson_argv(urls:)
33
+ Array(urls).flat_map { |url| option_pair("--url", url) }.compact
34
+ end
35
+
36
+ def option_pair(name, value)
37
+ value = value.to_s.strip
38
+ return if value.empty?
39
+
40
+ [name, value]
41
+ end
27
42
  end
28
43
  end
@@ -20,14 +20,29 @@ module Crawlscope
20
20
 
21
21
  @io.puts("Status: FAILED")
22
22
  @io.puts("Issues: #{result.issues.size}")
23
+ @io.puts("")
23
24
 
24
- result.issues.by_severity.sort_by { |severity, _issues| severity.to_s }.each do |severity, issues|
25
- @io.puts("#{severity}: #{issues.size}")
26
- end
25
+ report_grouped_issues("Severity", result.issues.by_severity)
26
+ @io.puts("")
27
+ report_grouped_issues("Category", result.issues.by_category)
28
+ end
29
+
30
+ private
27
31
 
28
- result.issues.each do |issue|
29
- @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}")
32
+ def report_grouped_issues(title, grouped_issues)
33
+ @io.puts("#{title}:")
34
+
35
+ grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
36
+ @io.puts("#{name}: #{issues.size}")
37
+ issues.each do |issue|
38
+ @io.puts(" - #{offense(issue)}")
39
+ end
30
40
  end
31
41
  end
42
+
43
+ def offense(issue)
44
+ parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
45
+ parts.compact.join(" ")
46
+ end
32
47
  end
33
48
  end
@@ -13,45 +13,110 @@ module Crawlscope
13
13
  end
14
14
 
15
15
  def call(urls:, pages:, issues:, context: nil)
16
+ sitemap_urls = normalized_sitemap_urls(urls)
17
+
16
18
  pages.each do |page|
17
- validate_meta_robots(page, issues) if page.html?
18
- validate_x_robots_tag(page, issues)
19
+ validate_meta_robots(page, issues, sitemap_urls) if page.html?
20
+ validate_x_robots_tag(page, issues, sitemap_urls)
19
21
  end
20
22
  end
21
23
 
22
24
  private
23
25
 
26
+ def normalized_sitemap_urls(urls)
27
+ urls.map { |url| Url.normalize(url, base_url: url) }.compact
28
+ end
29
+
24
30
  def header_value(page, name)
25
31
  page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
26
32
  end
27
33
 
28
- def noindex?(value)
34
+ def directives(value)
29
35
  value
30
36
  .split(",")
31
37
  .map { |directive| directive.split(":", 2).last.to_s.strip }
32
- .any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
38
+ .reject(&:empty?)
39
+ end
40
+
41
+ def noindex?(value)
42
+ directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
43
+ end
44
+
45
+ def follow?(value)
46
+ directives(value).any? { |directive| directive.casecmp?("follow") }
33
47
  end
34
48
 
35
- def validate_meta_robots(page, issues)
49
+ def nofollow?(value)
50
+ directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") }
51
+ end
52
+
53
+ def validate_meta_robots(page, issues, sitemap_urls)
36
54
  page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
37
55
  content = tag["content"].to_s
38
- next unless noindex?(content)
39
-
40
- issues.add(
41
- code: :noindex_meta,
42
- severity: :error,
43
- category: :indexability,
44
- url: page.url,
45
- message: "robots meta tag prevents indexing",
46
- details: {content: content, name: tag["name"].to_s}
47
- )
56
+
57
+ report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content)
58
+ report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content)
59
+ report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content)
60
+ report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content)
48
61
  end
49
62
  end
50
63
 
51
- def validate_x_robots_tag(page, issues)
64
+ def validate_x_robots_tag(page, issues, sitemap_urls)
52
65
  content = header_value(page, X_ROBOTS_TAG_HEADER)
53
- return unless noindex?(content)
66
+ return if content.empty?
67
+
68
+ report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content)
69
+ report_nofollow_header(page, issues, content) if nofollow?(content)
70
+ report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content)
71
+ report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content)
72
+ end
54
73
 
74
+ def report_noindex_meta(page, issues, content, name, sitemap_urls)
75
+ issues.add(
76
+ code: :noindex_meta,
77
+ severity: :error,
78
+ category: :indexability,
79
+ url: page.url,
80
+ message: "robots meta tag prevents indexing",
81
+ details: {content: content, name: name}
82
+ )
83
+ report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls)
84
+ end
85
+
86
+ def report_nofollow_meta(page, issues, content, name)
87
+ issues.add(
88
+ code: :nofollow_meta,
89
+ severity: :warning,
90
+ category: :indexability,
91
+ url: page.url,
92
+ message: "robots meta tag prevents following links",
93
+ details: {content: content, name: name}
94
+ )
95
+ end
96
+
97
+ def report_noindex_follow_meta(page, issues, content, name)
98
+ issues.add(
99
+ code: :noindex_follow_meta,
100
+ severity: :warning,
101
+ category: :indexability,
102
+ url: page.url,
103
+ message: "robots meta tag prevents indexing but allows following links",
104
+ details: {content: content, name: name}
105
+ )
106
+ end
107
+
108
+ def report_noindex_nofollow_meta(page, issues, content, name)
109
+ issues.add(
110
+ code: :noindex_nofollow_meta,
111
+ severity: :error,
112
+ category: :indexability,
113
+ url: page.url,
114
+ message: "robots meta tag prevents indexing and following links",
115
+ details: {content: content, name: name}
116
+ )
117
+ end
118
+
119
+ def report_noindex_header(page, issues, content, sitemap_urls)
55
120
  issues.add(
56
121
  code: :noindex_header,
57
122
  severity: :error,
@@ -60,6 +125,54 @@ module Crawlscope
60
125
  message: "X-Robots-Tag header prevents indexing",
61
126
  details: {content: content}
62
127
  )
128
+ report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls)
129
+ end
130
+
131
+ def report_nofollow_header(page, issues, content)
132
+ issues.add(
133
+ code: :nofollow_header,
134
+ severity: :warning,
135
+ category: :indexability,
136
+ url: page.url,
137
+ message: "X-Robots-Tag header prevents following links",
138
+ details: {content: content}
139
+ )
140
+ end
141
+
142
+ def report_noindex_follow_header(page, issues, content)
143
+ issues.add(
144
+ code: :noindex_follow_header,
145
+ severity: :warning,
146
+ category: :indexability,
147
+ url: page.url,
148
+ message: "X-Robots-Tag header prevents indexing but allows following links",
149
+ details: {content: content}
150
+ )
151
+ end
152
+
153
+ def report_noindex_nofollow_header(page, issues, content)
154
+ issues.add(
155
+ code: :noindex_nofollow_header,
156
+ severity: :error,
157
+ category: :indexability,
158
+ url: page.url,
159
+ message: "X-Robots-Tag header prevents indexing and following links",
160
+ details: {content: content}
161
+ )
162
+ end
163
+
164
+ def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:)
165
+ normalized_url = Url.normalize(page.url, base_url: page.url)
166
+ return unless sitemap_urls.include?(normalized_url)
167
+
168
+ issues.add(
169
+ code: :sitemap_noindex_url,
170
+ severity: :error,
171
+ category: :sitemaps,
172
+ url: page.url,
173
+ message: "sitemap URL is noindex",
174
+ details: {content: content, source: source}
175
+ )
63
176
  end
64
177
  end
65
178
  end