crawlscope 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +67 -0
- data/README.md +46 -9
- data/lib/crawlscope/cli.rb +5 -0
- data/lib/crawlscope/crawl.rb +6 -0
- data/lib/crawlscope/document_text.rb +40 -0
- data/lib/crawlscope/rule_registry.rb +3 -1
- data/lib/crawlscope/rules/content_quality.rb +99 -0
- data/lib/crawlscope/rules/indexability.rb +66 -0
- data/lib/crawlscope/rules/links.rb +24 -6
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +47 -0
- data/lib/crawlscope/rules/uniqueness.rb +76 -4
- data/lib/crawlscope/schemas.rb +52 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +11 -1
- data/test/crawlscope/cli_test.rb +19 -5
- data/test/crawlscope/configuration_test.rb +8 -1
- data/test/crawlscope/content_quality_rule_test.rb +68 -0
- data/test/crawlscope/crawl_test.rb +23 -3
- data/test/crawlscope/indexability_rule_test.rb +96 -0
- data/test/crawlscope/links_rule_test.rb +39 -0
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/structured_data_rule_test.rb +91 -0
- data/test/crawlscope/uniqueness_rule_test.rb +43 -2
- data/test/release_task_test.rb +86 -0
- metadata +9 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 79e8c8f3993c545bf7647c28b8540d3757c7d9c91eeaf885cde6d55c4935ebb5
|
|
4
|
+
data.tar.gz: d9b6a987e04546c2d3ee7bb3cc6e1d5510e78963df035cb24d7c8783064afa45
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: eb49361b9f26992682db7622796c4b262a12fca37254aca5e1f1c49c85702b7e4fc347a880af0665f10238f5340cb61bc44433060ba7b3fbde0bdd379c85c763
|
|
7
|
+
data.tar.gz: 5fa53f930ef529279e063bd11f9becd112c8abb266078027486f22ad37e968bad744c5a35c9432ccb170ceb51e45d858e23a47c649c6ede1d4dd89fb331fd9f3
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,49 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.4.0] - 2026-05-21
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- add indexability and content quality checks
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- preserve release changelog history
|
|
21
|
+
|
|
22
|
+
- scope content ratio to main content
|
|
23
|
+
|
|
24
|
+
- harden indexability and uniqueness rules
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
## [0.3.0] - 2026-04-28
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
### Added
|
|
32
|
+
|
|
33
|
+
- add JobPost structured data
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
### Documentation
|
|
39
|
+
|
|
40
|
+
- fix missing changelog entry
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
### Fixed
|
|
46
|
+
|
|
47
|
+
- ldjson check now uses the same convention for default URL
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
8
51
|
## [0.2.0] - 2026-04-24
|
|
9
52
|
|
|
10
53
|
|
|
@@ -25,3 +68,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
25
68
|
|
|
26
69
|
|
|
27
70
|
|
|
71
|
+
## [0.1.0] - 2026-04-23
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
### Added
|
|
75
|
+
|
|
76
|
+
- add crawlkit release-ready audit gem
|
|
77
|
+
|
|
78
|
+
- add standalone validation commands
|
|
79
|
+
|
|
80
|
+
- move default schema rules into crawlkit
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
### Changed
|
|
86
|
+
|
|
87
|
+
- strengthen public API coverage
|
|
88
|
+
|
|
89
|
+
- load shared test dependencies
|
|
90
|
+
|
|
91
|
+
- rename crawlkit to crawlscope
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
|
data/README.md
CHANGED
|
@@ -23,9 +23,11 @@ It works in three modes:
|
|
|
23
23
|
|
|
24
24
|
The default rule set includes:
|
|
25
25
|
|
|
26
|
+
- indexability blockers
|
|
26
27
|
- metadata validation
|
|
27
28
|
- structured-data validation
|
|
28
29
|
- uniqueness checks
|
|
30
|
+
- content-quality checks
|
|
29
31
|
- internal-link checks
|
|
30
32
|
|
|
31
33
|
## Installation
|
|
@@ -146,11 +148,13 @@ Available tasks:
|
|
|
146
148
|
|
|
147
149
|
```bash
|
|
148
150
|
bin/rails crawlscope:validate
|
|
151
|
+
bin/rails crawlscope:validate:indexability
|
|
149
152
|
bin/rails crawlscope:validate:metadata
|
|
150
153
|
bin/rails crawlscope:validate:structured_data
|
|
151
154
|
bin/rails crawlscope:validate:uniqueness
|
|
155
|
+
bin/rails crawlscope:validate:content_quality
|
|
152
156
|
bin/rails crawlscope:validate:links
|
|
153
|
-
bin/rails crawlscope:validate:ldjson
|
|
157
|
+
bin/rails crawlscope:validate:ldjson
|
|
154
158
|
```
|
|
155
159
|
|
|
156
160
|
The same validation surface is also available in the gem repository itself through plain `rake`:
|
|
@@ -161,9 +165,9 @@ bundle exec rake crawlscope:validate:metadata URL=https://example.com
|
|
|
161
165
|
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
162
166
|
```
|
|
163
167
|
|
|
164
|
-
`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
168
|
+
`crawlscope:validate` runs all default sitemap rules: indexability, metadata, structured data, uniqueness, content quality, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
165
169
|
|
|
166
|
-
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
|
|
170
|
+
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
|
|
167
171
|
|
|
168
172
|
### Structured Data URL Audit
|
|
169
173
|
|
|
@@ -186,11 +190,20 @@ Optional flags:
|
|
|
186
190
|
|
|
187
191
|
Built-in rules:
|
|
188
192
|
|
|
193
|
+
- `indexability`
|
|
189
194
|
- `metadata`
|
|
190
195
|
- `structured_data`
|
|
191
196
|
- `uniqueness`
|
|
197
|
+
- `content_quality`
|
|
192
198
|
- `links`
|
|
193
199
|
|
|
200
|
+
### Indexability
|
|
201
|
+
|
|
202
|
+
Checks:
|
|
203
|
+
|
|
204
|
+
- page-level meta robots `noindex`
|
|
205
|
+
- `X-Robots-Tag: noindex`
|
|
206
|
+
|
|
194
207
|
### Metadata
|
|
195
208
|
|
|
196
209
|
Checks:
|
|
@@ -220,6 +233,19 @@ Checks:
|
|
|
220
233
|
- duplicate titles
|
|
221
234
|
- duplicate meta descriptions
|
|
222
235
|
- duplicate content fingerprints
|
|
236
|
+
- near-duplicate visible content for up to 250 HTML pages
|
|
237
|
+
|
|
238
|
+
For larger crawls, exact duplicate checks still run and Crawlscope reports
|
|
239
|
+
`near_duplicate_scan_skipped`. Configure `Rules::Uniqueness` with
|
|
240
|
+
`max_near_duplicate_pages:` in a custom rule registry to change the limit.
|
|
241
|
+
|
|
242
|
+
### Content Quality
|
|
243
|
+
|
|
244
|
+
Checks:
|
|
245
|
+
|
|
246
|
+
- thin visible text
|
|
247
|
+
- low visible-text-to-HTML ratio
|
|
248
|
+
- low unique-token ratio
|
|
223
249
|
|
|
224
250
|
### Links
|
|
225
251
|
|
|
@@ -268,7 +294,12 @@ bundle exec rake
|
|
|
268
294
|
|
|
269
295
|
### Git hooks
|
|
270
296
|
|
|
271
|
-
We use [lefthook](https://lefthook.dev/) with the Ruby
|
|
297
|
+
We use [lefthook](https://lefthook.dev/) with the Ruby
|
|
298
|
+
[commitlint](https://github.com/arandilopez/commitlint) gem to enforce
|
|
299
|
+
Conventional Commits on every commit. We also use
|
|
300
|
+
[Standard Ruby](https://standardrb.com/) to keep code style consistent. CI
|
|
301
|
+
validates commit messages, Standard Ruby, tests, and git-cliff changelog
|
|
302
|
+
generation on pull requests and pushes to main/master.
|
|
272
303
|
|
|
273
304
|
Run the hook installer once per clone:
|
|
274
305
|
|
|
@@ -284,11 +315,16 @@ rake install
|
|
|
284
315
|
|
|
285
316
|
## Release
|
|
286
317
|
|
|
287
|
-
Releases are tag-driven and published by GitHub Actions to RubyGems.
|
|
318
|
+
Releases are tag-driven and published by GitHub Actions to RubyGems.
|
|
319
|
+
Local release commands never publish directly.
|
|
288
320
|
|
|
289
|
-
Install [git-cliff](https://git-cliff.org/) locally before preparing a
|
|
321
|
+
Install [git-cliff](https://git-cliff.org/) locally before preparing a
|
|
322
|
+
release. The release task prepends the next `CHANGELOG.md` section from
|
|
323
|
+
Conventional Commits.
|
|
290
324
|
|
|
291
|
-
Before preparing a release, make sure you are on `main` or `master` with a
|
|
325
|
+
Before preparing a release, make sure you are on `main` or `master` with a
|
|
326
|
+
clean worktree. If the release contains a breaking public-contract change,
|
|
327
|
+
update `UPGRADE.md` with the host-app migration steps first.
|
|
292
328
|
|
|
293
329
|
Then run one of:
|
|
294
330
|
|
|
@@ -301,12 +337,13 @@ bundle exec rake 'release:prepare[0.1.0]'
|
|
|
301
337
|
|
|
302
338
|
The task will:
|
|
303
339
|
|
|
304
|
-
1.
|
|
340
|
+
1. Prepend the next `CHANGELOG.md` section with `git-cliff`.
|
|
305
341
|
1. Update `lib/crawlscope/version.rb`.
|
|
306
342
|
1. Commit the release changes.
|
|
307
343
|
1. Create and push the `vX.Y.Z` tag.
|
|
308
344
|
|
|
309
|
-
The `Release` workflow then runs tests, publishes the gem to RubyGems,
|
|
345
|
+
The `Release` workflow then runs tests, publishes the gem to RubyGems,
|
|
346
|
+
and creates the GitHub release from the changelog entry.
|
|
310
347
|
|
|
311
348
|
## Contributing
|
|
312
349
|
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -105,6 +105,7 @@ module Crawlscope
|
|
|
105
105
|
parser.parse!(@argv)
|
|
106
106
|
|
|
107
107
|
urls = options[:urls].map(&:strip).reject(&:empty?)
|
|
108
|
+
urls = default_urls if urls.empty?
|
|
108
109
|
raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?
|
|
109
110
|
|
|
110
111
|
configure_renderer(options[:renderer])
|
|
@@ -238,6 +239,10 @@ module Crawlscope
|
|
|
238
239
|
raw_urls.split(";").map(&:strip).reject(&:empty?)
|
|
239
240
|
end
|
|
240
241
|
|
|
242
|
+
def default_urls
|
|
243
|
+
[normalized_string(@configuration.base_url) || "http://localhost:3000"]
|
|
244
|
+
end
|
|
245
|
+
|
|
241
246
|
def task
|
|
242
247
|
@task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
|
|
243
248
|
end
|
data/lib/crawlscope/crawl.rb
CHANGED
|
@@ -81,6 +81,8 @@ module Crawlscope
|
|
|
81
81
|
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
|
|
82
82
|
elsif !@allowed_statuses.include?(page.status)
|
|
83
83
|
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
|
|
84
|
+
elsif redirected?(page)
|
|
85
|
+
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
|
|
84
86
|
end
|
|
85
87
|
end
|
|
86
88
|
end
|
|
@@ -128,5 +130,9 @@ module Crawlscope
|
|
|
128
130
|
status: page.status
|
|
129
131
|
}
|
|
130
132
|
end
|
|
133
|
+
|
|
134
|
+
def redirected?(page)
|
|
135
|
+
page.normalized_url.to_s != page.normalized_final_url.to_s
|
|
136
|
+
end
|
|
131
137
|
end
|
|
132
138
|
end
|
|
data/lib/crawlscope/document_text.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module DocumentText
|
|
5
|
+
REMOVED_SELECTORS = "script, style, noscript, template, svg"
|
|
6
|
+
TOKEN_PATTERN = /[[:alnum:]]+/
|
|
7
|
+
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
def body_text(doc)
|
|
11
|
+
text_for(doc, selector: nil)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def html_for(doc, selector: "main")
|
|
15
|
+
root_for(doc, selector: selector)&.to_html.to_s
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def text_for(doc, selector: "main")
|
|
19
|
+
normalize(root_for(doc, selector: selector)&.text)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def tokens(text)
|
|
23
|
+
normalize(text).downcase.scan(TOKEN_PATTERN).reject { |token| token.length < 2 }
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def normalize(text)
|
|
27
|
+
text.to_s.gsub(/\s+/, " ").strip
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def root_for(doc, selector:)
|
|
31
|
+
return unless doc
|
|
32
|
+
|
|
33
|
+
copy = doc.dup
|
|
34
|
+
copy.css(REMOVED_SELECTORS).remove
|
|
35
|
+
|
|
36
|
+
root = selector.to_s.empty? ? nil : copy.at_css(selector)
|
|
37
|
+
root || copy.at_css("body") || copy
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
data/lib/crawlscope/rule_registry.rb
CHANGED
|
@@ -12,12 +12,14 @@ module Crawlscope
|
|
|
12
12
|
def self.default(site_name: nil)
|
|
13
13
|
new(
|
|
14
14
|
rules: [
|
|
15
|
+
Rules::Indexability.new,
|
|
15
16
|
Rules::Metadata.new(site_name: site_name),
|
|
16
17
|
Rules::StructuredData.new,
|
|
17
18
|
Rules::Uniqueness.new,
|
|
19
|
+
Rules::ContentQuality.new,
|
|
18
20
|
Rules::Links.new
|
|
19
21
|
],
|
|
20
|
-
default_codes: %i[metadata structured_data uniqueness links]
|
|
22
|
+
default_codes: %i[indexability metadata structured_data uniqueness content_quality links]
|
|
21
23
|
)
|
|
22
24
|
end
|
|
23
25
|
|
|
data/lib/crawlscope/rules/content_quality.rb
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module Rules
|
|
5
|
+
class ContentQuality
|
|
6
|
+
MIN_VISIBLE_TEXT_RATIO = 0.08
|
|
7
|
+
MIN_VISIBLE_WORDS = 250
|
|
8
|
+
MIN_UNIQUE_TOKEN_RATIO = 0.25
|
|
9
|
+
|
|
10
|
+
attr_reader :code
|
|
11
|
+
|
|
12
|
+
def initialize(
|
|
13
|
+
min_visible_text_ratio: MIN_VISIBLE_TEXT_RATIO,
|
|
14
|
+
min_visible_words: MIN_VISIBLE_WORDS,
|
|
15
|
+
min_unique_token_ratio: MIN_UNIQUE_TOKEN_RATIO
|
|
16
|
+
)
|
|
17
|
+
@code = :content_quality
|
|
18
|
+
@min_visible_text_ratio = min_visible_text_ratio
|
|
19
|
+
@min_visible_words = min_visible_words
|
|
20
|
+
@min_unique_token_ratio = min_unique_token_ratio
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def call(urls:, pages:, issues:, context: nil)
|
|
24
|
+
pages.each do |page|
|
|
25
|
+
next unless page.html?
|
|
26
|
+
|
|
27
|
+
validate_visible_words(page, issues)
|
|
28
|
+
validate_visible_text_ratio(page, issues)
|
|
29
|
+
validate_unique_token_ratio(page, issues)
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
def validate_unique_token_ratio(page, issues)
|
|
36
|
+
tokens = DocumentText.tokens(DocumentText.text_for(page.doc))
|
|
37
|
+
return if tokens.size < @min_visible_words
|
|
38
|
+
|
|
39
|
+
ratio = tokens.uniq.size.to_f / tokens.size
|
|
40
|
+
return if ratio >= @min_unique_token_ratio
|
|
41
|
+
|
|
42
|
+
issues.add(
|
|
43
|
+
code: :low_unique_token_ratio,
|
|
44
|
+
severity: :warning,
|
|
45
|
+
category: :content_quality,
|
|
46
|
+
url: page.url,
|
|
47
|
+
message: "visible text has low token variety (#{format_ratio(ratio)})",
|
|
48
|
+
details: {
|
|
49
|
+
ratio: ratio.round(3),
|
|
50
|
+
threshold: @min_unique_token_ratio,
|
|
51
|
+
token_count: tokens.size,
|
|
52
|
+
unique_token_count: tokens.uniq.size
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def validate_visible_text_ratio(page, issues)
|
|
58
|
+
html_bytes = DocumentText.html_for(page.doc).bytesize
|
|
59
|
+
return if html_bytes.zero?
|
|
60
|
+
|
|
61
|
+
visible_text = DocumentText.text_for(page.doc)
|
|
62
|
+
ratio = visible_text.bytesize.to_f / html_bytes
|
|
63
|
+
return if ratio >= @min_visible_text_ratio
|
|
64
|
+
|
|
65
|
+
issues.add(
|
|
66
|
+
code: :low_visible_text_ratio,
|
|
67
|
+
severity: :warning,
|
|
68
|
+
category: :content_quality,
|
|
69
|
+
url: page.url,
|
|
70
|
+
message: "low visible text to HTML ratio (#{format_ratio(ratio)})",
|
|
71
|
+
details: {
|
|
72
|
+
html_bytes: html_bytes,
|
|
73
|
+
ratio: ratio.round(3),
|
|
74
|
+
threshold: @min_visible_text_ratio,
|
|
75
|
+
visible_text_bytes: visible_text.bytesize
|
|
76
|
+
}
|
|
77
|
+
)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def validate_visible_words(page, issues)
|
|
81
|
+
word_count = DocumentText.tokens(DocumentText.text_for(page.doc)).size
|
|
82
|
+
return if word_count >= @min_visible_words
|
|
83
|
+
|
|
84
|
+
issues.add(
|
|
85
|
+
code: :thin_visible_text,
|
|
86
|
+
severity: :warning,
|
|
87
|
+
category: :content_quality,
|
|
88
|
+
url: page.url,
|
|
89
|
+
message: "thin visible text (#{word_count} words)",
|
|
90
|
+
details: {word_count: word_count, minimum: @min_visible_words}
|
|
91
|
+
)
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def format_ratio(value)
|
|
95
|
+
format("%.2f", value)
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
data/lib/crawlscope/rules/indexability.rb
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module Rules
|
|
5
|
+
class Indexability
|
|
6
|
+
ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
|
|
7
|
+
X_ROBOTS_TAG_HEADER = "x-robots-tag"
|
|
8
|
+
|
|
9
|
+
attr_reader :code
|
|
10
|
+
|
|
11
|
+
def initialize
|
|
12
|
+
@code = :indexability
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def call(urls:, pages:, issues:, context: nil)
|
|
16
|
+
pages.each do |page|
|
|
17
|
+
validate_meta_robots(page, issues) if page.html?
|
|
18
|
+
validate_x_robots_tag(page, issues)
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
private
|
|
23
|
+
|
|
24
|
+
def header_value(page, name)
|
|
25
|
+
page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def noindex?(value)
|
|
29
|
+
value
|
|
30
|
+
.split(",")
|
|
31
|
+
.map { |directive| directive.split(":", 2).last.to_s.strip }
|
|
32
|
+
.any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def validate_meta_robots(page, issues)
|
|
36
|
+
page.doc.css(ROBOTS_META_SELECTOR).each do |tag|
|
|
37
|
+
content = tag["content"].to_s
|
|
38
|
+
next unless noindex?(content)
|
|
39
|
+
|
|
40
|
+
issues.add(
|
|
41
|
+
code: :noindex_meta,
|
|
42
|
+
severity: :error,
|
|
43
|
+
category: :indexability,
|
|
44
|
+
url: page.url,
|
|
45
|
+
message: "robots meta tag prevents indexing",
|
|
46
|
+
details: {content: content, name: tag["name"].to_s}
|
|
47
|
+
)
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def validate_x_robots_tag(page, issues)
|
|
52
|
+
content = header_value(page, X_ROBOTS_TAG_HEADER)
|
|
53
|
+
return unless noindex?(content)
|
|
54
|
+
|
|
55
|
+
issues.add(
|
|
56
|
+
code: :noindex_header,
|
|
57
|
+
severity: :error,
|
|
58
|
+
category: :indexability,
|
|
59
|
+
url: page.url,
|
|
60
|
+
message: "X-Robots-Tag header prevents indexing",
|
|
61
|
+
details: {content: content}
|
|
62
|
+
)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
data/lib/crawlscope/rules/links.rb
CHANGED
|
@@ -5,7 +5,7 @@ require "uri"
|
|
|
5
5
|
module Crawlscope
|
|
6
6
|
module Rules
|
|
7
7
|
class Links
|
|
8
|
-
|
|
8
|
+
LINK_SELECTORS = "a[href]"
|
|
9
9
|
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
|
|
10
10
|
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
|
|
11
11
|
MAX_SOURCES_IN_ERROR = 3
|
|
@@ -33,10 +33,7 @@ module Crawlscope
|
|
|
33
33
|
private
|
|
34
34
|
|
|
35
35
|
def contextual_links(doc)
|
|
36
|
-
|
|
37
|
-
return links unless links.empty?
|
|
38
|
-
|
|
39
|
-
doc.css("a[href]")
|
|
36
|
+
doc.css(LINK_SELECTORS)
|
|
40
37
|
end
|
|
41
38
|
|
|
42
39
|
def extract_links(pages)
|
|
@@ -45,7 +42,7 @@ module Crawlscope
|
|
|
45
42
|
|
|
46
43
|
def page_links(page)
|
|
47
44
|
source_path = Url.path(page.normalized_url)
|
|
48
|
-
return [] unless
|
|
45
|
+
return [] unless crawlable_source_path?(source_path)
|
|
49
46
|
|
|
50
47
|
contextual_links(page.doc).filter_map do |node|
|
|
51
48
|
link_for(page: page, source_path: source_path, node: node)
|
|
@@ -146,6 +143,7 @@ module Crawlscope
|
|
|
146
143
|
next
|
|
147
144
|
end
|
|
148
145
|
|
|
146
|
+
report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
|
|
149
147
|
next unless crawlable_path?(target.final_path)
|
|
150
148
|
|
|
151
149
|
grouped_links.each do |link|
|
|
@@ -156,6 +154,18 @@ module Crawlscope
|
|
|
156
154
|
resolved_links
|
|
157
155
|
end
|
|
158
156
|
|
|
157
|
+
def report_redirect_target(target_url, grouped_links, issues, target)
|
|
158
|
+
source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
|
|
159
|
+
issues.add(
|
|
160
|
+
code: :internal_link_redirects,
|
|
161
|
+
severity: :warning,
|
|
162
|
+
category: :links,
|
|
163
|
+
url: target_url,
|
|
164
|
+
message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
|
|
165
|
+
details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
|
|
166
|
+
)
|
|
167
|
+
end
|
|
168
|
+
|
|
159
169
|
def resolve_target(target_url)
|
|
160
170
|
resolution = @resolve_target.call(target_url)
|
|
161
171
|
LinkTarget.new(target_url: target_url, resolution: resolution)
|
|
@@ -183,11 +193,19 @@ module Crawlscope
|
|
|
183
193
|
resolution && resolution[:status]
|
|
184
194
|
end
|
|
185
195
|
|
|
196
|
+
def redirect?
|
|
197
|
+
(status && (300..399).cover?(status.to_i)) || final_url != target_url
|
|
198
|
+
end
|
|
199
|
+
|
|
186
200
|
def unresolved?
|
|
187
201
|
resolution.nil? || (status.nil? && !ignored_error?)
|
|
188
202
|
end
|
|
189
203
|
end
|
|
190
204
|
|
|
205
|
+
def crawlable_source_path?(path)
|
|
206
|
+
!path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
|
|
207
|
+
end
|
|
208
|
+
|
|
191
209
|
def skip_internal_path?(path)
|
|
192
210
|
return true if path == "/"
|
|
193
211
|
|
|
data/lib/crawlscope/rules/metadata.rb
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
3
5
|
module Crawlscope
|
|
4
6
|
module Rules
|
|
5
7
|
class Metadata
|
|
6
8
|
TITLE_MAX_LENGTH = 72
|
|
9
|
+
DESCRIPTION_MIN_LENGTH = 110
|
|
7
10
|
DESCRIPTION_MAX_LENGTH = 160
|
|
11
|
+
REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
|
|
8
12
|
|
|
9
13
|
attr_reader :code
|
|
10
14
|
|
|
@@ -21,22 +25,35 @@ module Crawlscope
|
|
|
21
25
|
validate_title(page, issues)
|
|
22
26
|
validate_description(page, issues)
|
|
23
27
|
validate_canonical(page, issues)
|
|
28
|
+
validate_open_graph(page, issues)
|
|
24
29
|
end
|
|
25
30
|
end
|
|
26
31
|
|
|
27
32
|
private
|
|
28
33
|
|
|
29
34
|
def validate_h1(page, issues)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
h1s = page.doc.css("h1")
|
|
36
|
+
return if h1s.one?
|
|
37
|
+
|
|
38
|
+
if h1s.empty?
|
|
39
|
+
issues.add(
|
|
40
|
+
code: :missing_h1,
|
|
41
|
+
severity: :warning,
|
|
42
|
+
category: :metadata,
|
|
43
|
+
url: page.url,
|
|
44
|
+
message: "missing <h1>",
|
|
45
|
+
details: {}
|
|
46
|
+
)
|
|
47
|
+
else
|
|
48
|
+
issues.add(
|
|
49
|
+
code: :multiple_h1,
|
|
50
|
+
severity: :warning,
|
|
51
|
+
category: :metadata,
|
|
52
|
+
url: page.url,
|
|
53
|
+
message: "multiple <h1> tags (#{h1s.size})",
|
|
54
|
+
details: {count: h1s.size}
|
|
55
|
+
)
|
|
56
|
+
end
|
|
40
57
|
end
|
|
41
58
|
|
|
42
59
|
def validate_title(page, issues)
|
|
@@ -56,6 +73,8 @@ module Crawlscope
|
|
|
56
73
|
|
|
57
74
|
if description.empty?
|
|
58
75
|
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
|
|
76
|
+
elsif description.length < DESCRIPTION_MIN_LENGTH
|
|
77
|
+
issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
|
|
59
78
|
elsif description.length > DESCRIPTION_MAX_LENGTH
|
|
60
79
|
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
|
|
61
80
|
end
|
|
@@ -71,7 +90,7 @@ module Crawlscope
|
|
|
71
90
|
|
|
72
91
|
normalized_canonical = Url.normalize(canonical, base_url: page.url)
|
|
73
92
|
normalized_page_url = Url.normalize(page.url, base_url: page.url)
|
|
74
|
-
return if normalized_canonical
|
|
93
|
+
return if canonical_matches_page?(normalized_canonical, normalized_page_url)
|
|
75
94
|
|
|
76
95
|
issues.add(
|
|
77
96
|
code: :canonical_mismatch,
|
|
@@ -88,6 +107,33 @@ module Crawlscope
|
|
|
88
107
|
|
|
89
108
|
title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
|
|
90
109
|
end
|
|
110
|
+
|
|
111
|
+
def validate_open_graph(page, issues)
|
|
112
|
+
missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
|
|
113
|
+
page.doc.at_css(%(meta[property="#{property}"][content]))
|
|
114
|
+
end
|
|
115
|
+
return if missing.empty?
|
|
116
|
+
|
|
117
|
+
issues.add(
|
|
118
|
+
code: :incomplete_open_graph_tags,
|
|
119
|
+
severity: :warning,
|
|
120
|
+
category: :metadata,
|
|
121
|
+
url: page.url,
|
|
122
|
+
message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
|
|
123
|
+
details: {missing: missing}
|
|
124
|
+
)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def canonical_matches_page?(canonical, page_url)
|
|
128
|
+
canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def local_url?(url)
|
|
132
|
+
host = URI.parse(url.to_s).host.to_s
|
|
133
|
+
["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
|
|
134
|
+
rescue URI::InvalidURIError
|
|
135
|
+
false
|
|
136
|
+
end
|
|
91
137
|
end
|
|
92
138
|
end
|
|
93
139
|
end
|