crawlscope 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -0
- data/lib/crawlscope/cli.rb +4 -1
- data/lib/crawlscope/crawl.rb +2 -0
- data/lib/crawlscope/rake_tasks.rb +27 -12
- data/lib/crawlscope/reporter.rb +20 -5
- data/lib/crawlscope/rules/indexability.rb +130 -17
- data/lib/crawlscope/rules/links.rb +312 -9
- data/lib/crawlscope/rules/metadata.rb +61 -6
- data/lib/crawlscope/rules/structured_data.rb +31 -0
- data/lib/crawlscope/rules/uniqueness.rb +22 -0
- data/lib/crawlscope/sitemap.rb +9 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +24 -24
- data/test/crawlscope/cli_test.rb +1 -0
- data/test/crawlscope/crawl_test.rb +26 -0
- data/test/crawlscope/indexability_rule_test.rb +33 -0
- data/test/crawlscope/links_rule_test.rb +148 -3
- data/test/crawlscope/metadata_rule_test.rb +36 -0
- data/test/crawlscope/rake_tasks_test.rb +70 -0
- data/test/crawlscope/reporter_test.rb +7 -3
- data/test/crawlscope/sitemap_test.rb +24 -0
- data/test/crawlscope/structured_data_rule_test.rb +56 -0
- data/test/crawlscope/uniqueness_rule_test.rb +17 -2
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7d9e56ae9a55e3c4bb6b079585b8a302edcc1bfad9110938c9421c5224bf27f9
|
|
4
|
+
data.tar.gz: ab1908aa4a1bef4c15f055800ca9862ba973c9257f39020309de1f5554923cca
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8981de1e7bc19737df3048b1e19f28d585f22eed8f2b32ea4eea473ba377d3a261e08df8165114c8d967b7ec7d14a48c47a6b83cfe14261f6c83b56b39134766
|
|
7
|
+
data.tar.gz: 3df1e21bf74c12e994c0a932f9c581e4a6e12b55dd14975e43c5eca073bcc8eb4a49337d3c3c2e52137c4436fb5ddf52accd6fc11954171d77e75ce8f75e69a5
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.5.0] - 2026-05-31
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- expand SEO audit checks
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
8
17
|
## [0.4.0] - 2026-05-21
|
|
9
18
|
|
|
10
19
|
|
data/README.md
CHANGED
|
@@ -161,12 +161,18 @@ The same validation surface is also available in the gem repository itself throu
|
|
|
161
161
|
|
|
162
162
|
```bash
|
|
163
163
|
bundle exec rake crawlscope:validate URL=https://example.com
|
|
164
|
+
bundle exec rake 'crawlscope:validate[https://example.com]'
|
|
164
165
|
bundle exec rake crawlscope:validate:metadata URL=https://example.com
|
|
166
|
+
bundle exec rake 'crawlscope:validate:metadata[https://example.com]'
|
|
165
167
|
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
168
|
+
bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
|
|
166
169
|
```
|
|
167
170
|
|
|
168
171
|
`crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
169
172
|
|
|
173
|
+
Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
|
|
174
|
+
task-argument form above instead.
|
|
175
|
+
|
|
170
176
|
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
|
|
171
177
|
|
|
172
178
|
### Structured Data URL Audit
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -37,11 +37,14 @@ module Crawlscope
|
|
|
37
37
|
@err.puts(general_usage)
|
|
38
38
|
1
|
|
39
39
|
end
|
|
40
|
-
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError,
|
|
40
|
+
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
|
|
41
41
|
@err.puts(error.message)
|
|
42
42
|
@err.puts("")
|
|
43
43
|
@err.puts(general_usage)
|
|
44
44
|
1
|
|
45
|
+
rescue ValidationError => error
|
|
46
|
+
@err.puts(error.message)
|
|
47
|
+
1
|
|
45
48
|
end
|
|
46
49
|
|
|
47
50
|
private
|
data/lib/crawlscope/crawl.rb
CHANGED
|
@@ -83,6 +83,7 @@ module Crawlscope
|
|
|
83
83
|
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
|
|
84
84
|
elsif redirected?(page)
|
|
85
85
|
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
|
|
86
|
+
issues.add(code: :sitemap_redirect_url, severity: :warning, category: :sitemaps, url: page.url, message: "sitemap URL redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
|
|
86
87
|
end
|
|
87
88
|
end
|
|
88
89
|
end
|
|
@@ -127,6 +128,7 @@ module Crawlscope
|
|
|
127
128
|
crawled: crawled,
|
|
128
129
|
error: page.error,
|
|
129
130
|
final_url: page.normalized_final_url || normalized_url,
|
|
131
|
+
html: page.html?,
|
|
130
132
|
status: page.status
|
|
131
133
|
}
|
|
132
134
|
end
|
|
@@ -4,25 +4,40 @@ module Crawlscope
|
|
|
4
4
|
module RakeTasks
|
|
5
5
|
module_function
|
|
6
6
|
|
|
7
|
-
def validate
|
|
8
|
-
run("validate")
|
|
7
|
+
def validate(url: nil, sitemap_path: nil, rule_names: nil)
|
|
8
|
+
run("validate", argv: validate_argv(url: url, sitemap_path: sitemap_path, rule_names: rule_names))
|
|
9
9
|
end
|
|
10
10
|
|
|
11
|
-
def ldjson
|
|
12
|
-
run("ldjson")
|
|
11
|
+
def ldjson(urls: nil)
|
|
12
|
+
run("ldjson", argv: ldjson_argv(urls: urls))
|
|
13
13
|
end
|
|
14
14
|
|
|
15
|
-
def validate_rule(rule)
|
|
16
|
-
|
|
17
|
-
ENV["RULES"] = rule
|
|
18
|
-
validate
|
|
19
|
-
ensure
|
|
20
|
-
ENV["RULES"] = original_rules
|
|
15
|
+
def validate_rule(rule, url: nil, sitemap_path: nil)
|
|
16
|
+
validate(url: url, sitemap_path: sitemap_path, rule_names: rule)
|
|
21
17
|
end
|
|
22
18
|
|
|
23
|
-
def run(command)
|
|
24
|
-
status = Cli.start([command], out: $stdout, err: $stderr)
|
|
19
|
+
def run(command, argv: [])
|
|
20
|
+
status = Cli.start([command, *argv], out: $stdout, err: $stderr)
|
|
25
21
|
exit(status) unless status.zero?
|
|
26
22
|
end
|
|
23
|
+
|
|
24
|
+
def validate_argv(url:, sitemap_path:, rule_names:)
|
|
25
|
+
[
|
|
26
|
+
option_pair("--url", url),
|
|
27
|
+
option_pair("--sitemap", sitemap_path),
|
|
28
|
+
option_pair("--rules", rule_names)
|
|
29
|
+
].compact.flatten
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def ldjson_argv(urls:)
|
|
33
|
+
Array(urls).flat_map { |url| option_pair("--url", url) }.compact
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def option_pair(name, value)
|
|
37
|
+
value = value.to_s.strip
|
|
38
|
+
return if value.empty?
|
|
39
|
+
|
|
40
|
+
[name, value]
|
|
41
|
+
end
|
|
27
42
|
end
|
|
28
43
|
end
|
data/lib/crawlscope/reporter.rb
CHANGED
|
@@ -20,14 +20,29 @@ module Crawlscope
|
|
|
20
20
|
|
|
21
21
|
@io.puts("Status: FAILED")
|
|
22
22
|
@io.puts("Issues: #{result.issues.size}")
|
|
23
|
+
@io.puts("")
|
|
23
24
|
|
|
24
|
-
result.issues.by_severity
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
report_grouped_issues("Severity", result.issues.by_severity)
|
|
26
|
+
@io.puts("")
|
|
27
|
+
report_grouped_issues("Category", result.issues.by_category)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
27
31
|
|
|
28
|
-
|
|
29
|
-
|
|
32
|
+
def report_grouped_issues(title, grouped_issues)
|
|
33
|
+
@io.puts("#{title}:")
|
|
34
|
+
|
|
35
|
+
grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
|
|
36
|
+
@io.puts("#{name}: #{issues.size}")
|
|
37
|
+
issues.each do |issue|
|
|
38
|
+
@io.puts(" - #{offense(issue)}")
|
|
39
|
+
end
|
|
30
40
|
end
|
|
31
41
|
end
|
|
42
|
+
|
|
43
|
+
def offense(issue)
|
|
44
|
+
parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
|
|
45
|
+
parts.compact.join(" ")
|
|
46
|
+
end
|
|
32
47
|
end
|
|
33
48
|
end
|
|
@@ -13,45 +13,110 @@ module Crawlscope
|
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def call(urls:, pages:, issues:, context: nil)
|
|
16
|
+
sitemap_urls = normalized_sitemap_urls(urls)
|
|
17
|
+
|
|
16
18
|
pages.each do |page|
|
|
17
|
-
validate_meta_robots(page, issues) if page.html?
|
|
18
|
-
validate_x_robots_tag(page, issues)
|
|
19
|
+
validate_meta_robots(page, issues, sitemap_urls) if page.html?
|
|
20
|
+
validate_x_robots_tag(page, issues, sitemap_urls)
|
|
19
21
|
end
|
|
20
22
|
end
|
|
21
23
|
|
|
22
24
|
private
|
|
23
25
|
|
|
26
|
+
def normalized_sitemap_urls(urls)
|
|
27
|
+
urls.map { |url| Url.normalize(url, base_url: url) }.compact
|
|
28
|
+
end
|
|
29
|
+
|
|
24
30
|
def header_value(page, name)
|
|
25
31
|
page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
|
|
26
32
|
end
|
|
27
33
|
|
|
28
|
-
def
|
|
34
|
+
def directives(value)
|
|
29
35
|
value
|
|
30
36
|
.split(",")
|
|
31
37
|
.map { |directive| directive.split(":", 2).last.to_s.strip }
|
|
32
|
-
.
|
|
38
|
+
.reject(&:empty?)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def noindex?(value)
|
|
42
|
+
directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def follow?(value)
|
|
46
|
+
directives(value).any? { |directive| directive.casecmp?("follow") }
|
|
33
47
|
end
|
|
34
48
|
|
|
35
|
-
def
|
|
49
|
+
def nofollow?(value)
|
|
50
|
+
directives(value).any? { |directive| directive.casecmp?("nofollow") || directive.casecmp?("none") }
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def validate_meta_robots(page, issues, sitemap_urls)
|
|
36
54
|
page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
|
|
37
55
|
content = tag["content"].to_s
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
issues.
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
category: :indexability,
|
|
44
|
-
url: page.url,
|
|
45
|
-
message: "robots meta tag prevents indexing",
|
|
46
|
-
details: {content: content, name: tag["name"].to_s}
|
|
47
|
-
)
|
|
56
|
+
|
|
57
|
+
report_noindex_meta(page, issues, content, tag["name"].to_s, sitemap_urls) if noindex?(content)
|
|
58
|
+
report_nofollow_meta(page, issues, content, tag["name"].to_s) if nofollow?(content)
|
|
59
|
+
report_noindex_follow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && follow?(content)
|
|
60
|
+
report_noindex_nofollow_meta(page, issues, content, tag["name"].to_s) if noindex?(content) && nofollow?(content)
|
|
48
61
|
end
|
|
49
62
|
end
|
|
50
63
|
|
|
51
|
-
def validate_x_robots_tag(page, issues)
|
|
64
|
+
def validate_x_robots_tag(page, issues, sitemap_urls)
|
|
52
65
|
content = header_value(page, X_ROBOTS_TAG_HEADER)
|
|
53
|
-
return
|
|
66
|
+
return if content.empty?
|
|
67
|
+
|
|
68
|
+
report_noindex_header(page, issues, content, sitemap_urls) if noindex?(content)
|
|
69
|
+
report_nofollow_header(page, issues, content) if nofollow?(content)
|
|
70
|
+
report_noindex_follow_header(page, issues, content) if noindex?(content) && follow?(content)
|
|
71
|
+
report_noindex_nofollow_header(page, issues, content) if noindex?(content) && nofollow?(content)
|
|
72
|
+
end
|
|
54
73
|
|
|
74
|
+
def report_noindex_meta(page, issues, content, name, sitemap_urls)
|
|
75
|
+
issues.add(
|
|
76
|
+
code: :noindex_meta,
|
|
77
|
+
severity: :error,
|
|
78
|
+
category: :indexability,
|
|
79
|
+
url: page.url,
|
|
80
|
+
message: "robots meta tag prevents indexing",
|
|
81
|
+
details: {content: content, name: name}
|
|
82
|
+
)
|
|
83
|
+
report_sitemap_noindex_url(page, issues, content, source: "meta", sitemap_urls: sitemap_urls)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def report_nofollow_meta(page, issues, content, name)
|
|
87
|
+
issues.add(
|
|
88
|
+
code: :nofollow_meta,
|
|
89
|
+
severity: :warning,
|
|
90
|
+
category: :indexability,
|
|
91
|
+
url: page.url,
|
|
92
|
+
message: "robots meta tag prevents following links",
|
|
93
|
+
details: {content: content, name: name}
|
|
94
|
+
)
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def report_noindex_follow_meta(page, issues, content, name)
|
|
98
|
+
issues.add(
|
|
99
|
+
code: :noindex_follow_meta,
|
|
100
|
+
severity: :warning,
|
|
101
|
+
category: :indexability,
|
|
102
|
+
url: page.url,
|
|
103
|
+
message: "robots meta tag prevents indexing but allows following links",
|
|
104
|
+
details: {content: content, name: name}
|
|
105
|
+
)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def report_noindex_nofollow_meta(page, issues, content, name)
|
|
109
|
+
issues.add(
|
|
110
|
+
code: :noindex_nofollow_meta,
|
|
111
|
+
severity: :error,
|
|
112
|
+
category: :indexability,
|
|
113
|
+
url: page.url,
|
|
114
|
+
message: "robots meta tag prevents indexing and following links",
|
|
115
|
+
details: {content: content, name: name}
|
|
116
|
+
)
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def report_noindex_header(page, issues, content, sitemap_urls)
|
|
55
120
|
issues.add(
|
|
56
121
|
code: :noindex_header,
|
|
57
122
|
severity: :error,
|
|
@@ -60,6 +125,54 @@ module Crawlscope
|
|
|
60
125
|
message: "X-Robots-Tag header prevents indexing",
|
|
61
126
|
details: {content: content}
|
|
62
127
|
)
|
|
128
|
+
report_sitemap_noindex_url(page, issues, content, source: "header", sitemap_urls: sitemap_urls)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def report_nofollow_header(page, issues, content)
|
|
132
|
+
issues.add(
|
|
133
|
+
code: :nofollow_header,
|
|
134
|
+
severity: :warning,
|
|
135
|
+
category: :indexability,
|
|
136
|
+
url: page.url,
|
|
137
|
+
message: "X-Robots-Tag header prevents following links",
|
|
138
|
+
details: {content: content}
|
|
139
|
+
)
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
def report_noindex_follow_header(page, issues, content)
|
|
143
|
+
issues.add(
|
|
144
|
+
code: :noindex_follow_header,
|
|
145
|
+
severity: :warning,
|
|
146
|
+
category: :indexability,
|
|
147
|
+
url: page.url,
|
|
148
|
+
message: "X-Robots-Tag header prevents indexing but allows following links",
|
|
149
|
+
details: {content: content}
|
|
150
|
+
)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def report_noindex_nofollow_header(page, issues, content)
|
|
154
|
+
issues.add(
|
|
155
|
+
code: :noindex_nofollow_header,
|
|
156
|
+
severity: :error,
|
|
157
|
+
category: :indexability,
|
|
158
|
+
url: page.url,
|
|
159
|
+
message: "X-Robots-Tag header prevents indexing and following links",
|
|
160
|
+
details: {content: content}
|
|
161
|
+
)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def report_sitemap_noindex_url(page, issues, content, source:, sitemap_urls:)
|
|
165
|
+
normalized_url = Url.normalize(page.url, base_url: page.url)
|
|
166
|
+
return unless sitemap_urls.include?(normalized_url)
|
|
167
|
+
|
|
168
|
+
issues.add(
|
|
169
|
+
code: :sitemap_noindex_url,
|
|
170
|
+
severity: :error,
|
|
171
|
+
category: :sitemaps,
|
|
172
|
+
url: page.url,
|
|
173
|
+
message: "sitemap URL is noindex",
|
|
174
|
+
details: {content: content, source: source}
|
|
175
|
+
)
|
|
63
176
|
end
|
|
64
177
|
end
|
|
65
178
|
end
|