crawlscope 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/LICENSE.txt +21 -0
- data/README.md +323 -0
- data/exe/crawlscope +6 -0
- data/lib/crawlscope/audit.rb +128 -0
- data/lib/crawlscope/browser.rb +88 -0
- data/lib/crawlscope/cli.rb +245 -0
- data/lib/crawlscope/configuration.rb +123 -0
- data/lib/crawlscope/crawler.rb +28 -0
- data/lib/crawlscope/http.rb +77 -0
- data/lib/crawlscope/issue.rb +17 -0
- data/lib/crawlscope/issue_collection.rb +41 -0
- data/lib/crawlscope/page.rb +23 -0
- data/lib/crawlscope/railtie.rb +9 -0
- data/lib/crawlscope/reporter.rb +33 -0
- data/lib/crawlscope/result.rb +9 -0
- data/lib/crawlscope/rule_registry.rb +39 -0
- data/lib/crawlscope/rules/links.rb +220 -0
- data/lib/crawlscope/rules/metadata.rb +93 -0
- data/lib/crawlscope/rules/structured_data.rb +58 -0
- data/lib/crawlscope/rules/uniqueness.rb +88 -0
- data/lib/crawlscope/schema_registry.rb +431 -0
- data/lib/crawlscope/sitemap.rb +67 -0
- data/lib/crawlscope/structured_data/audit.rb +150 -0
- data/lib/crawlscope/structured_data/document.rb +93 -0
- data/lib/crawlscope/structured_data/report.rb +77 -0
- data/lib/crawlscope/structured_data/reporter.rb +73 -0
- data/lib/crawlscope/structured_data/writer.rb +26 -0
- data/lib/crawlscope/task.rb +131 -0
- data/lib/crawlscope/url.rb +43 -0
- data/lib/crawlscope/version.rb +5 -0
- data/lib/crawlscope.rb +34 -0
- data/lib/tasks/crawlscope_tasks.rake +44 -0
- data/test/crawlscope/audit_test.rb +165 -0
- data/test/crawlscope/cli_test.rb +157 -0
- data/test/crawlscope/configuration_test.rb +45 -0
- data/test/crawlscope/links_rule_test.rb +87 -0
- data/test/crawlscope/loader_test.rb +11 -0
- data/test/crawlscope/reporter_test.rb +50 -0
- data/test/crawlscope/schema_registry_test.rb +89 -0
- data/test/crawlscope/sitemap_test.rb +51 -0
- data/test/crawlscope/structured_data_audit_test.rb +118 -0
- data/test/crawlscope/structured_data_document_test.rb +28 -0
- data/test/crawlscope/structured_data_report_test.rb +37 -0
- data/test/crawlscope/structured_data_reporter_test.rb +32 -0
- data/test/crawlscope/structured_data_rule_test.rb +78 -0
- data/test/crawlscope/structured_data_writer_test.rb +32 -0
- data/test/crawlscope/task_test.rb +206 -0
- data/test/crawlscope/uniqueness_rule_test.rb +46 -0
- data/test/test_helper.rb +23 -0
- metadata +271 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 58a83d74a7b2b8422df4f161db9d3a7fe3ff213495f0837fd29a08cc13715b86
|
|
4
|
+
data.tar.gz: 02bd5743bcaae94bfdcc169fb6fe782257527984da68b091f6b75db3420b4244
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: c566f6899f45633db13a8ee47ac15f5e6054a4adff087774ce17ef15c26b10340694bd395e0de0efbdb5b652cf8ea04e3cbbb452d9467fd8167143f3675d5642
|
|
7
|
+
data.tar.gz: 1c087e1f4233224ea2c6b9b14de3bf34f4007b4689cd4fa8b9a3ea7ba688f78beb12431d0ffc7b6f54cae1eead319e3ba8293cef325440c614ca191b6ebf0e8b
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-04-23
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- add crawlkit release-ready audit gem
|
|
14
|
+
|
|
15
|
+
- add standalone validation commands
|
|
16
|
+
|
|
17
|
+
- move default schema rules into crawlkit
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
- strengthen public API coverage
|
|
25
|
+
|
|
26
|
+
- load shared test dependencies
|
|
27
|
+
|
|
28
|
+
- rename crawlkit to crawlscope
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ethos Link
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
# Crawlscope
|
|
2
|
+
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/crawlscope.svg)](https://badge.fury.io/rb/crawlscope)
|
|
4
|
+
[![Ruby](https://github.com/ethos-link/crawlscope/actions/workflows/ruby.yml/badge.svg)](https://github.com/ethos-link/crawlscope/actions/workflows/ruby.yml)
|
|
5
|
+
|
|
6
|
+
`crawlscope` is a small Ruby gem for sitemap-driven SEO validation.
|
|
7
|
+
|
|
8
|
+
It is built by [Ethos Link](https://www.ethos-link.com) and used in production by [Reviato](https://www.reviato.com).
|
|
9
|
+
|
|
10
|
+
It is designed for Rails apps and plain Ruby scripts that want:
|
|
11
|
+
|
|
12
|
+
- deterministic sitemap crawling
|
|
13
|
+
- structured validation issues instead of free-form strings
|
|
14
|
+
- app-configurable rule and schema registries
|
|
15
|
+
- first-party rake tasks instead of a large DSL
|
|
16
|
+
- optional browser rendering for JavaScript-heavy pages
|
|
17
|
+
|
|
18
|
+
It works in three modes:
|
|
19
|
+
|
|
20
|
+
- as a plain Ruby library
|
|
21
|
+
- as a standalone CLI
|
|
22
|
+
- as Rails rake tasks through the included Railtie
|
|
23
|
+
|
|
24
|
+
The default rule set includes:
|
|
25
|
+
|
|
26
|
+
- metadata validation
|
|
27
|
+
- structured-data validation
|
|
28
|
+
- uniqueness checks
|
|
29
|
+
- internal-link checks
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
Add this line to your application's Gemfile:
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
gem "crawlscope"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
And then execute:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
bundle install
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or install it directly:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
gem install crawlscope
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
If you want browser rendering, also add:
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
gem "ferrum"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
`crawlscope` only loads Ferrum when you run in browser mode.
|
|
58
|
+
|
|
59
|
+
## CLI Usage
|
|
60
|
+
|
|
61
|
+
Validate a site directly from the gem:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
crawlscope validate --base-url https://example.com
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Validate only specific rules:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
crawlscope validate --base-url https://example.com --rules metadata,links
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Validate structured data on one or more URLs:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
crawlscope ldjson --url https://example.com/article
|
|
77
|
+
crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
If you do not pass `--sitemap`, `crawlscope` defaults to:
|
|
81
|
+
|
|
82
|
+
- `<base-url>/sitemap.xml` (for example, `https://example.com/sitemap.xml`) for real site URLs
|
|
83
|
+
- `public/sitemap.xml` for localhost-style development URLs when that file exists
|
|
84
|
+
|
|
85
|
+
Child sitemap indexes are supported automatically.
|
|
86
|
+
|
|
87
|
+
## Ruby Usage
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
require "crawlscope"
|
|
91
|
+
|
|
92
|
+
audit = Crawlscope::Audit.new(
|
|
93
|
+
base_url: "https://example.com",
|
|
94
|
+
sitemap_path: "https://example.com/sitemap.xml",
|
|
95
|
+
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
96
|
+
schema_registry: Crawlscope::SchemaRegistry.default
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
result = audit.call
|
|
100
|
+
|
|
101
|
+
puts result.ok?
|
|
102
|
+
puts result.issues.to_a.map(&:message)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Result Shape
|
|
106
|
+
|
|
107
|
+
`Crawlscope::Audit` returns a `Crawlscope::Result` with:
|
|
108
|
+
|
|
109
|
+
- `urls`: sitemap URLs selected for validation
|
|
110
|
+
- `pages`: fetched page snapshots
|
|
111
|
+
- `issues`: structured issues with `code`, `severity`, `category`, `url`, and `message`
|
|
112
|
+
|
|
113
|
+
`result.ok?` returns `false` if any error, warning, or notice is present.
|
|
114
|
+
|
|
115
|
+
## Rails Usage
|
|
116
|
+
|
|
117
|
+
In an initializer:
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
Crawlscope.configure do |config|
|
|
121
|
+
config.base_url = -> { "https://example.com" }
|
|
122
|
+
config.sitemap_path = -> { Rails.public_path.join("sitemap.xml").to_s }
|
|
123
|
+
config.site_name = "Example"
|
|
124
|
+
config.schema_registry = -> { Crawlscope::SchemaRegistry.default }
|
|
125
|
+
end
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Then run:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
bin/rails crawlscope:validate
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Available environment overrides:
|
|
135
|
+
|
|
136
|
+
- `BASE_URL`
|
|
137
|
+
- `SITEMAP`
|
|
138
|
+
- `RULES=metadata,links`
|
|
139
|
+
- `JS=1` or `RENDERER=browser`
|
|
140
|
+
- `TIMEOUT=30`
|
|
141
|
+
- `NETWORK_IDLE_TIMEOUT=10`
|
|
142
|
+
- `CONCURRENCY=5`
|
|
143
|
+
|
|
144
|
+
Available tasks:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
bin/rails crawlscope:validate
|
|
148
|
+
bin/rails crawlscope:validate:metadata
|
|
149
|
+
bin/rails crawlscope:validate:structured_data
|
|
150
|
+
bin/rails crawlscope:validate:uniqueness
|
|
151
|
+
bin/rails crawlscope:validate:links
|
|
152
|
+
bin/rails crawlscope:validate:ldjson URL=https://example.com/article
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The same validation surface is also available in the gem repository itself through plain `rake`:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
bundle exec rake crawlscope:validate BASE_URL=https://example.com
|
|
159
|
+
bundle exec rake crawlscope:validate:metadata BASE_URL=https://example.com
|
|
160
|
+
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Structured Data URL Audit
|
|
164
|
+
|
|
165
|
+
For one-off structured-data checks:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
bin/rails crawlscope:validate:ldjson URL=https://example.com/article
|
|
169
|
+
bin/rails crawlscope:validate:ldjson URL='https://example.com/a;https://example.com/b' SUMMARY=1
|
|
170
|
+
bin/rails crawlscope:validate:ldjson URL=https://example.com/article REPORT_PATH=tmp/structured-data.json
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Optional flags:
|
|
174
|
+
|
|
175
|
+
- `DEBUG=1`: print detected items
|
|
176
|
+
- `SUMMARY=1`: print grouped failures
|
|
177
|
+
- `REPORT_PATH=...`: write a JSON report
|
|
178
|
+
- `JS=1` or `RENDERER=browser`: render with Ferrum
|
|
179
|
+
|
|
180
|
+
## Rules
|
|
181
|
+
|
|
182
|
+
Built-in rules:
|
|
183
|
+
|
|
184
|
+
- `metadata`
|
|
185
|
+
- `structured_data`
|
|
186
|
+
- `uniqueness`
|
|
187
|
+
- `links`
|
|
188
|
+
|
|
189
|
+
### Metadata
|
|
190
|
+
|
|
191
|
+
Checks:
|
|
192
|
+
|
|
193
|
+
- missing `<h1>`
|
|
194
|
+
- missing `<title>`
|
|
195
|
+
- title length
|
|
196
|
+
- repeated site name in the title
|
|
197
|
+
- missing meta description
|
|
198
|
+
- meta description length
|
|
199
|
+
- missing canonical link
|
|
200
|
+
- canonical mismatch
|
|
201
|
+
|
|
202
|
+
### Structured Data
|
|
203
|
+
|
|
204
|
+
Checks:
|
|
205
|
+
|
|
206
|
+
- malformed JSON-LD
|
|
207
|
+
- missing required fields for supported schema types
|
|
208
|
+
- schema validation failures from the configured registry
|
|
209
|
+
- direct URL structured-data audits through `crawlscope:validate:ldjson`
|
|
210
|
+
|
|
211
|
+
### Uniqueness
|
|
212
|
+
|
|
213
|
+
Checks:
|
|
214
|
+
|
|
215
|
+
- duplicate titles
|
|
216
|
+
- duplicate meta descriptions
|
|
217
|
+
- duplicate content fingerprints
|
|
218
|
+
|
|
219
|
+
### Links
|
|
220
|
+
|
|
221
|
+
Checks:
|
|
222
|
+
|
|
223
|
+
- broken internal links
|
|
224
|
+
- unresolved internal links
|
|
225
|
+
- low inbound anchor-link counts
|
|
226
|
+
|
|
227
|
+
## Schema Registry
|
|
228
|
+
|
|
229
|
+
`crawlscope` ships with a default schema registry for common types such as:
|
|
230
|
+
|
|
231
|
+
- `Article`
|
|
232
|
+
- `FAQPage`
|
|
233
|
+
- `Organization`
|
|
234
|
+
- `Product`
|
|
235
|
+
- `Review`
|
|
236
|
+
- `SoftwareApplication`
|
|
237
|
+
- `WebApplication`
|
|
238
|
+
- `WebSite`
|
|
239
|
+
|
|
240
|
+
Host apps can replace or extend the registry:
|
|
241
|
+
|
|
242
|
+
```ruby
|
|
243
|
+
Crawlscope.configure do |config|
|
|
244
|
+
config.schema_registry = -> { MyApp::StructuredData::SchemaRegistry.new }
|
|
245
|
+
end
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
That makes `crawlscope` useful as the audit engine while the app remains the owner of stricter product-specific schema rules.
|
|
249
|
+
|
|
250
|
+
## Development
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
git clone https://github.com/ethos-link/crawlscope.git
|
|
254
|
+
cd crawlscope
|
|
255
|
+
|
|
256
|
+
bundle install
|
|
257
|
+
bundle exec rake test
|
|
258
|
+
bundle exec rake standard
|
|
259
|
+
bundle exec rake
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### Git hooks
|
|
263
|
+
|
|
264
|
+
We use [lefthook](https://lefthook.dev/) with the Ruby [commitlint](https://github.com/arandilopez/commitlint) gem to enforce Conventional Commits on every commit. We also use [Standard Ruby](https://standardrb.com/) to keep code style consistent. CI validates commit messages, Standard Ruby, tests, and git-cliff changelog generation on pull requests and pushes to main/master.
|
|
265
|
+
|
|
266
|
+
Run the hook installer once per clone:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
bundle exec lefthook install
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Install locally
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
rake install
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Release
|
|
279
|
+
|
|
280
|
+
Releases are tag-driven and published by GitHub Actions to RubyGems. Local release commands never publish directly.
|
|
281
|
+
|
|
282
|
+
Install [git-cliff](https://git-cliff.org/) locally before preparing a release. The release task regenerates `CHANGELOG.md` from Conventional Commits.
|
|
283
|
+
|
|
284
|
+
Before preparing a release, make sure you are on `main` or `master` with a clean worktree.
|
|
285
|
+
|
|
286
|
+
Then run one of:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
bundle exec rake 'release:prepare[patch]'
|
|
290
|
+
bundle exec rake 'release:prepare[minor]'
|
|
291
|
+
bundle exec rake 'release:prepare[major]'
|
|
292
|
+
bundle exec rake 'release:prepare[0.1.0]'
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
The task will:
|
|
296
|
+
|
|
297
|
+
1. Regenerate `CHANGELOG.md` with `git-cliff`.
|
|
298
|
+
1. Update `lib/crawlscope/version.rb`.
|
|
299
|
+
1. Commit the release changes.
|
|
300
|
+
1. Create and push the `vX.Y.Z` tag.
|
|
301
|
+
|
|
302
|
+
The `Release` workflow then runs tests, publishes the gem to RubyGems, and creates the GitHub release from the changelog entry.
|
|
303
|
+
|
|
304
|
+
## Contributing
|
|
305
|
+
|
|
306
|
+
1. Fork it
|
|
307
|
+
1. Create a branch (`git checkout -b feature/my-feature`)
|
|
308
|
+
1. Commit your changes
|
|
309
|
+
1. Push (`git push origin feature/my-feature`)
|
|
310
|
+
1. Open a Pull Request
|
|
311
|
+
|
|
312
|
+
Please use [Conventional Commits](https://www.conventionalcommits.org/) for commit messages.
|
|
313
|
+
|
|
314
|
+
## License
|
|
315
|
+
|
|
316
|
+
MIT License, see [LICENSE.txt](LICENSE.txt)
|
|
317
|
+
|
|
318
|
+
## About
|
|
319
|
+
|
|
320
|
+
Made by the team at [Ethos Link](https://www.ethos-link.com) — practical software for growing businesses. We build tools for hospitality operators who need clear workflows, fast onboarding, and real human support.
|
|
321
|
+
|
|
322
|
+
We also build [Reviato](https://www.reviato.com): “Capture. Interpret. Act.”
|
|
323
|
+
Turn guest feedback into clear next steps for your team. Collect private appraisals, spot patterns across reviews, and act before small issues turn into public ones.
|
data/exe/crawlscope
ADDED
[6-line hunk not captured in this extract]
data/lib/crawlscope/audit.rb
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
  # Sitemap-driven site audit.
  #
  # Reads the URLs from a sitemap, fetches each one through either the HTTP
  # client or a browser-backed fetcher, records crawl-level issues, runs every
  # configured rule over the fetched pages, and returns the outcome as a Result.
  class Audit
    # @param base_url [String] site root; used for URL normalization and handed
    #   to the sitemap reader and the page fetcher
    # @param sitemap_path [String] sitemap location handed to Sitemap
    # @param rules [Array<#call>, #call] rule objects; each is invoked with
    #   +urls:+, +pages:+, +issues:+ and +context:+
    # @param schema_registry [Object] exposed to rules as context[:schema_registry]
    # @param browser_factory [#call, nil] builds the page fetcher when +renderer+
    #   is :browser; when nil a Crawlscope::Browser is built instead
    # @param concurrency [Object] forwarded to Crawler
    # @param network_idle_timeout_seconds [Object] forwarded to Crawlscope::Browser
    # @param renderer [Symbol, String] :browser selects the browser fetcher, any
    #   other value selects Http
    # @param scroll_page [Object] forwarded to Crawlscope::Browser
    # @param timeout_seconds [Object] forwarded to the page fetcher
    # @param allowed_statuses [#include?] statuses that do not produce an
    #   :unexpected_status issue
    def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
      @base_url = base_url
      @sitemap_path = sitemap_path
      @rules = Array(rules)
      @schema_registry = schema_registry
      @browser_factory = browser_factory
      @concurrency = concurrency
      @network_idle_timeout_seconds = network_idle_timeout_seconds
      @renderer = renderer.to_sym
      @scroll_page = scroll_page
      @timeout_seconds = timeout_seconds
      @allowed_statuses = allowed_statuses
    end

    # Runs the audit.
    #
    # @return [Result] base URL, sitemap path, URLs, fetched pages and issues
    # @raise [ValidationError] when the sitemap yields no URLs
    def call
      urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
      raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?

      @page_fetcher = build_page
      pages = Crawler.new(
        page_fetcher: @page_fetcher,
        concurrency: @concurrency
      ).call(urls)

      issues = IssueCollection.new
      collect_crawl_issues(pages, issues)
      cache_pages(pages)

      # Rules receive a resolver so they can look up link targets, including
      # ones that were not part of the sitemap crawl.
      context = {
        allowed_statuses: @allowed_statuses,
        base_url: @base_url,
        resolve_target: method(:resolve_target),
        schema_registry: @schema_registry
      }

      @rules.each do |rule|
        rule.call(urls: urls, pages: pages, issues: issues, context: context)
      end

      Result.new(
        base_url: @base_url,
        sitemap_path: @sitemap_path,
        urls: urls,
        pages: pages,
        issues: issues
      )
    ensure
      @page_fetcher&.close
      # Drop the reference so a later run that fails before building a new
      # fetcher does not close this already-closed one a second time.
      @page_fetcher = nil
    end

    private

    # Default factory for the browser-backed fetcher.
    #
    # @raise [ConfigurationError] when building the browser raises LoadError
    def build_browser
      Crawlscope::Browser.new(
        base_url: @base_url,
        timeout_seconds: @timeout_seconds,
        network_idle_timeout_seconds: @network_idle_timeout_seconds,
        scroll_page: @scroll_page
      )
    rescue LoadError => error
      raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
    end

    # Builds the page fetcher selected by the renderer setting.
    def build_page
      return Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds) unless @renderer == :browser

      (@browser_factory || method(:build_browser)).call
    end

    # Shapes the hash handed back to rules by the target resolver.
    def build_target_resolution(page, normalized_target_url, crawled:)
      {
        crawled: crawled,
        error: page.error,
        final_url: page.normalized_final_url || normalized_target_url,
        status: page.status
      }
    end

    # Resets both lookup caches and indexes every crawled page.
    def cache_pages(pages)
      @page_by_url = {}
      @target_resolution_cache = {}

      pages.each { |page| index_page(page) }
    end

    # Registers a page under its normalized requested URL and its normalized
    # final URL, skipping keys that are nil or empty.
    def index_page(page)
      [page.normalized_url, page.normalized_final_url].each do |key|
        @page_by_url[key] = page unless key.to_s.empty?
      end
    end

    # Adds one crawl issue per page that failed to fetch or returned a status
    # outside the allowed set.
    def collect_crawl_issues(pages, issues)
      pages.each do |page|
        if page.error
          issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
        elsif !@allowed_statuses.include?(page.status)
          issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
        end
      end
    end

    # Resolves a link target to a resolution hash, memoized per normalized URL.
    # Already-crawled pages are reused; anything else is fetched on demand.
    def resolve_target(target_url)
      normalized_target_url = Url.normalize(target_url, base_url: @base_url)
      return @target_resolution_cache[normalized_target_url] if @target_resolution_cache.key?(normalized_target_url)

      resolution = resolve_from_crawled_page(normalized_target_url)
      resolution ||= resolve_by_fetching_target(normalized_target_url)
      @target_resolution_cache[normalized_target_url] = resolution
    end

    # Fetches a target that was not crawled and indexes the resulting page.
    def resolve_by_fetching_target(normalized_target_url)
      page = @page_fetcher.fetch(normalized_target_url)
      index_page(page)
      build_target_resolution(page, normalized_target_url, crawled: false)
    end

    # Returns a resolution for an already-indexed page, or nil when unknown.
    def resolve_from_crawled_page(normalized_target_url)
      page = @page_by_url[normalized_target_url]
      return if page.nil?

      build_target_resolution(page, normalized_target_url, crawled: true)
    end
  end
end
|
data/lib/crawlscope/browser.rb
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Crawlscope
  # Page fetcher that loads URLs in a headless Ferrum browser and returns the
  # rendered document as a Page.
  #
  # One browser and one browser page are created up front and reused for every
  # fetch; call #close when done to quit the browser.
  class Browser
    # @param base_url [String] base used when normalizing URLs
    # @param timeout_seconds [Object] passed to Ferrum::Browser as +timeout+
    # @param network_idle_timeout_seconds [Object] upper bound for each
    #   network-idle wait
    # @param scroll_page [Boolean] when truthy, the page is scrolled after load
    # @raise [LoadError] when the ferrum gem cannot be required
    def initialize(base_url:, timeout_seconds:, network_idle_timeout_seconds:, scroll_page:)
      @base_url = base_url
      @timeout_seconds = timeout_seconds
      @network_idle_timeout_seconds = network_idle_timeout_seconds
      @scroll_page = scroll_page
      @browser = build_browser
      begin
        @page = @browser.create_page
      rescue
        # The caller never receives an instance to #close when construction
        # fails, so quit the already-launched browser here before re-raising.
        @browser.quit
        raise
      end
    end

    # Quits the underlying browser.
    def close
      @browser&.quit
    end

    # Loads +url+, waits for the network to go idle (scrolling first when
    # configured) and snapshots the result.
    #
    # @param url [String] URL to load
    # @return [Page] the rendered page; on any StandardError a Page with
    #   +status+, +body+ and +doc+ set to nil and +error+ set to
    #   "ErrorClass: message"
    def fetch(url)
      @page.network.clear(:traffic)
      @page.go_to(url)
      wait_for_network_idle
      scroll_for_render if @scroll_page

      response = @page.network.response
      # Prefer the response URL, then the page's own idea of its URL, and only
      # then the URL that was requested.
      final_url = response&.url.to_s
      final_url = @page.current_url.to_s if final_url.empty?
      final_url = @page.url.to_s if final_url.empty?
      final_url = url if final_url.empty?
      headers = response&.headers || {}
      body = @page.body

      Page.new(
        url: url,
        normalized_url: Url.normalize(url, base_url: @base_url),
        final_url: final_url,
        normalized_final_url: Url.normalize(final_url, base_url: @base_url),
        status: @page.network.status,
        headers: headers,
        body: body,
        doc: Nokogiri::HTML(body)
      )
    rescue => error
      normalized = Url.normalize(url, base_url: @base_url)
      Page.new(
        url: url,
        normalized_url: normalized,
        final_url: url,
        normalized_final_url: normalized,
        status: nil,
        headers: {},
        body: nil,
        doc: nil,
        error: "#{error.class}: #{error.message}"
      )
    end

    private

    # Lazily requires ferrum and launches a headless browser that sends the
    # same User-Agent as the HTTP fetcher.
    def build_browser
      require "ferrum"

      Ferrum::Browser.new(
        headless: true,
        timeout: @timeout_seconds,
        headers: {"User-Agent" => Http::USER_AGENT}
      )
    end

    # Scrolls to the bottom, back to the top, then to the middle of the page,
    # waiting for the network to go idle after each step.
    def scroll_for_render
      ["document.body.scrollHeight", "0", "document.body.scrollHeight / 2"].each do |offset|
        @page.evaluate("(function() { if (document.body) { window.scrollTo(0, #{offset}); } })()")
        wait_for_network_idle
      end
    end

    # Waits until the page's network has been idle for half a second.
    #
    # @raise [Timeout::Error] when Ferrum reports a timeout
    def wait_for_network_idle
      @page.network.wait_for_idle(duration: 0.5, timeout: @network_idle_timeout_seconds)
    rescue Ferrum::TimeoutError
      raise Timeout::Error, "Timed out waiting for browser network idle"
    end
  end
end
|