crawlscope 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -11
- data/README.md +20 -13
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +10 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +76 -43
- data/lib/crawlscope/rules/structured_data.rb +14 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +355 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +128 -6
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +110 -5
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +20 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +14 -5
- data/lib/crawlscope/task.rb +0 -131
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ba21d55a2d9b787d7bb9d4e90f39e655a5fe2a884769dbef6f866d1e5779e076
|
|
4
|
+
data.tar.gz: b7c6b829412f8e436cd81d2d28bcd5fe22327f0bb9fcc34af307b4b5feac722c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d4a6e75c44c7cff4e238ff50168b7807fec8542074bbcbe838c50cf5eba02f181576291f1033620f268484b4c75f588215789515bd6c3ee9d7e76e8e5b94ceaf
|
|
7
|
+
data.tar.gz: 5576d6a31853ebf3e6662e4bbc8f97d4da918a24352e02c4d9c7569e4300ae102d79c9a348e55ce884273c12dfa1717b8b22c16091fa26eb0d69c19b4b7dca36
|
data/CHANGELOG.md
CHANGED
|
@@ -5,27 +5,23 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [0.
|
|
8
|
+
## [0.2.0] - 2026-04-24
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
###
|
|
12
|
-
|
|
13
|
-
- add crawlkit release-ready audit gem
|
|
14
|
-
|
|
15
|
-
- add standalone validation commands
|
|
11
|
+
### Changed
|
|
16
12
|
|
|
17
|
-
-
|
|
13
|
+
- simplify crawl and structured data boundaries
|
|
18
14
|
|
|
15
|
+
- harden validation boundaries
|
|
19
16
|
|
|
20
17
|
|
|
21
18
|
|
|
22
|
-
### Changed
|
|
23
19
|
|
|
24
|
-
|
|
20
|
+
### Fixed
|
|
25
21
|
|
|
26
|
-
-
|
|
22
|
+
- handle child sitemaps
|
|
27
23
|
|
|
28
|
-
-
|
|
24
|
+
- use URL for sitemap validation
|
|
29
25
|
|
|
30
26
|
|
|
31
27
|
|
data/README.md
CHANGED
|
@@ -58,16 +58,16 @@ gem "ferrum"
|
|
|
58
58
|
|
|
59
59
|
## CLI Usage
|
|
60
60
|
|
|
61
|
-
Validate a site
|
|
61
|
+
Validate a site from its default sitemap:
|
|
62
62
|
|
|
63
63
|
```bash
|
|
64
|
-
crawlscope validate --
|
|
64
|
+
crawlscope validate --url https://example.com
|
|
65
65
|
```
|
|
66
66
|
|
|
67
67
|
Validate only specific rules:
|
|
68
68
|
|
|
69
69
|
```bash
|
|
70
|
-
crawlscope validate --
|
|
70
|
+
crawlscope validate --url https://example.com --rules metadata,links
|
|
71
71
|
```
|
|
72
72
|
|
|
73
73
|
Validate structured data on one or more URLs:
|
|
@@ -77,10 +77,11 @@ crawlscope ldjson --url https://example.com/article
|
|
|
77
77
|
crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
|
|
78
78
|
```
|
|
79
79
|
|
|
80
|
-
|
|
80
|
+
To use a non-default sitemap, pass `--sitemap`:
|
|
81
81
|
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
```bash
|
|
83
|
+
crawlscope validate --url https://example.com --sitemap https://example.com/sitemap.xml
|
|
84
|
+
```
|
|
84
85
|
|
|
85
86
|
Child sitemap indexes are supported automatically.
|
|
86
87
|
|
|
@@ -89,14 +90,14 @@ Child sitemap indexes are supported automatically.
|
|
|
89
90
|
```ruby
|
|
90
91
|
require "crawlscope"
|
|
91
92
|
|
|
92
|
-
|
|
93
|
+
crawl = Crawlscope::Crawl.new(
|
|
93
94
|
base_url: "https://example.com",
|
|
94
95
|
sitemap_path: "https://example.com/sitemap.xml",
|
|
95
96
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
96
97
|
schema_registry: Crawlscope::SchemaRegistry.default
|
|
97
98
|
)
|
|
98
99
|
|
|
99
|
-
result =
|
|
100
|
+
result = crawl.call
|
|
100
101
|
|
|
101
102
|
puts result.ok?
|
|
102
103
|
puts result.issues.to_a.map(&:message)
|
|
@@ -104,7 +105,7 @@ puts result.issues.to_a.map(&:message)
|
|
|
104
105
|
|
|
105
106
|
## Result Shape
|
|
106
107
|
|
|
107
|
-
`Crawlscope::
|
|
108
|
+
`Crawlscope::Crawl` returns a `Crawlscope::Result` with:
|
|
108
109
|
|
|
109
110
|
- `urls`: sitemap URLs selected for validation
|
|
110
111
|
- `pages`: fetched page snapshots
|
|
@@ -133,7 +134,7 @@ bin/rails crawlscope:validate
|
|
|
133
134
|
|
|
134
135
|
Available environment overrides:
|
|
135
136
|
|
|
136
|
-
- `
|
|
137
|
+
- `URL`
|
|
137
138
|
- `SITEMAP`
|
|
138
139
|
- `RULES=metadata,links`
|
|
139
140
|
- `JS=1` or `RENDERER=browser`
|
|
@@ -155,11 +156,15 @@ bin/rails crawlscope:validate:ldjson URL=https://example.com/article
|
|
|
155
156
|
The same validation surface is also available in the gem repository itself through plain `rake`:
|
|
156
157
|
|
|
157
158
|
```bash
|
|
158
|
-
bundle exec rake crawlscope:validate
|
|
159
|
-
bundle exec rake crawlscope:validate:metadata
|
|
159
|
+
bundle exec rake crawlscope:validate URL=https://example.com
|
|
160
|
+
bundle exec rake crawlscope:validate:metadata URL=https://example.com
|
|
160
161
|
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
161
162
|
```
|
|
162
163
|
|
|
164
|
+
`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
165
|
+
|
|
166
|
+
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
|
|
167
|
+
|
|
163
168
|
### Structured Data URL Audit
|
|
164
169
|
|
|
165
170
|
For one-off structured-data checks:
|
|
@@ -174,7 +179,7 @@ Optional flags:
|
|
|
174
179
|
|
|
175
180
|
- `DEBUG=1`: print detected items
|
|
176
181
|
- `SUMMARY=1`: print grouped failures
|
|
177
|
-
- `REPORT_PATH=...`: write a JSON report
|
|
182
|
+
- `REPORT_PATH=...`: write a JSON report. Treat this as trusted operator input; Crawlscope writes to the path the task process can access.
|
|
178
183
|
- `JS=1` or `RENDERER=browser`: render with Ferrum
|
|
179
184
|
|
|
180
185
|
## Rules
|
|
@@ -237,6 +242,8 @@ Checks:
|
|
|
237
242
|
- `WebApplication`
|
|
238
243
|
- `WebSite`
|
|
239
244
|
|
|
245
|
+
The default schema definitions live in `Crawlscope::Schemas`; `Crawlscope::SchemaRegistry` owns registration and validation.
|
|
246
|
+
|
|
240
247
|
Host apps can replace or extend the registry:
|
|
241
248
|
|
|
242
249
|
```ruby
|
data/lib/crawlscope/browser.rb
CHANGED
|
@@ -45,6 +45,8 @@ module Crawlscope
|
|
|
45
45
|
doc: Nokogiri::HTML(body)
|
|
46
46
|
)
|
|
47
47
|
rescue => error
|
|
48
|
+
raise unless browser_error?(error)
|
|
49
|
+
|
|
48
50
|
Page.new(
|
|
49
51
|
url: url,
|
|
50
52
|
normalized_url: Url.normalize(url, base_url: @base_url),
|
|
@@ -84,5 +86,11 @@ module Crawlscope
|
|
|
84
86
|
rescue Ferrum::TimeoutError
|
|
85
87
|
raise Timeout::Error, "Timed out waiting for browser network idle"
|
|
86
88
|
end
|
|
89
|
+
|
|
90
|
+
def browser_error?(error)
|
|
91
|
+
error.is_a?(Timeout::Error) ||
|
|
92
|
+
error.is_a?(SystemCallError) ||
|
|
93
|
+
error.class.name.to_s.start_with?("Ferrum::")
|
|
94
|
+
end
|
|
87
95
|
end
|
|
88
96
|
end
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Crawlscope
|
|
|
37
37
|
@err.puts(general_usage)
|
|
38
38
|
1
|
|
39
39
|
end
|
|
40
|
-
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
|
|
40
|
+
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
|
|
41
41
|
@err.puts(error.message)
|
|
42
42
|
@err.puts("")
|
|
43
43
|
@err.puts(general_usage)
|
|
@@ -49,12 +49,12 @@ module Crawlscope
|
|
|
49
49
|
def general_usage
|
|
50
50
|
<<~TEXT
|
|
51
51
|
Usage:
|
|
52
|
-
crawlscope validate --
|
|
52
|
+
crawlscope validate --url https://example.com [options]
|
|
53
53
|
crawlscope ldjson --url https://example.com/page [options]
|
|
54
54
|
crawlscope version
|
|
55
55
|
|
|
56
56
|
Commands:
|
|
57
|
-
validate Audit
|
|
57
|
+
validate Audit URLs for metadata, structured data, uniqueness, and links
|
|
58
58
|
ldjson Validate structured data on one or more URLs
|
|
59
59
|
version Print the gem version
|
|
60
60
|
TEXT
|
|
@@ -109,7 +109,7 @@ module Crawlscope
|
|
|
109
109
|
|
|
110
110
|
configure_renderer(options[:renderer])
|
|
111
111
|
|
|
112
|
-
result = task.
|
|
112
|
+
result = task.validate_json_ld(
|
|
113
113
|
urls: urls,
|
|
114
114
|
debug: options[:debug],
|
|
115
115
|
renderer: options[:renderer],
|
|
@@ -123,7 +123,7 @@ module Crawlscope
|
|
|
123
123
|
|
|
124
124
|
def run_validate
|
|
125
125
|
options = {
|
|
126
|
-
|
|
126
|
+
url: normalized_string(ENV["URL"]),
|
|
127
127
|
rule_names: normalized_string(ENV["RULES"]),
|
|
128
128
|
sitemap_path: normalized_string(ENV["SITEMAP"])
|
|
129
129
|
}
|
|
@@ -134,10 +134,10 @@ module Crawlscope
|
|
|
134
134
|
@configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
|
|
135
135
|
|
|
136
136
|
parser = OptionParser.new do |opts|
|
|
137
|
-
opts.banner = "Usage: crawlscope validate --
|
|
137
|
+
opts.banner = "Usage: crawlscope validate --url https://example.com [options]"
|
|
138
138
|
|
|
139
|
-
opts.on("--
|
|
140
|
-
options[:
|
|
139
|
+
opts.on("--url URL", "Set the site URL") do |value|
|
|
140
|
+
options[:url] = value
|
|
141
141
|
end
|
|
142
142
|
|
|
143
143
|
opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
|
|
@@ -168,7 +168,7 @@ module Crawlscope
|
|
|
168
168
|
parser.parse!(@argv)
|
|
169
169
|
|
|
170
170
|
result = task.validate(
|
|
171
|
-
base_url: options[:
|
|
171
|
+
base_url: options[:url],
|
|
172
172
|
sitemap_path: options[:sitemap_path],
|
|
173
173
|
rule_names: options[:rule_names]
|
|
174
174
|
)
|
|
@@ -239,7 +239,7 @@ module Crawlscope
|
|
|
239
239
|
end
|
|
240
240
|
|
|
241
241
|
def task
|
|
242
|
-
@task ||=
|
|
242
|
+
@task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
|
|
243
243
|
end
|
|
244
244
|
end
|
|
245
245
|
end
|
|
data/lib/crawlscope/configuration.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Crawlscope
|
|
|
7
7
|
DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
|
|
8
8
|
DEFAULT_BROWSER_SCROLL_PAGE = true
|
|
9
9
|
DEFAULT_CONCURRENCY = 10
|
|
10
|
+
RENDERERS = %i[http browser].freeze
|
|
10
11
|
DEFAULT_TIMEOUT_SECONDS = 20
|
|
11
12
|
|
|
12
13
|
attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
|
|
@@ -26,7 +27,7 @@ module Crawlscope
|
|
|
26
27
|
|
|
27
28
|
def concurrency
|
|
28
29
|
value = resolve(@concurrency)
|
|
29
|
-
value
|
|
30
|
+
positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
|
|
30
31
|
end
|
|
31
32
|
|
|
32
33
|
def browser_concurrency
|
|
@@ -42,7 +43,7 @@ module Crawlscope
|
|
|
42
43
|
|
|
43
44
|
def network_idle_timeout_seconds
|
|
44
45
|
value = resolve(@network_idle_timeout_seconds)
|
|
45
|
-
value
|
|
46
|
+
positive_integer(value, default: DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, name: "network_idle_timeout_seconds")
|
|
46
47
|
end
|
|
47
48
|
|
|
48
49
|
def output
|
|
@@ -55,7 +56,10 @@ module Crawlscope
|
|
|
55
56
|
normalized_value = value.to_s.strip
|
|
56
57
|
normalized_value = "http" if normalized_value.empty?
|
|
57
58
|
|
|
58
|
-
normalized_value.to_sym
|
|
59
|
+
renderer = normalized_value.to_sym
|
|
60
|
+
return renderer if RENDERERS.include?(renderer)
|
|
61
|
+
|
|
62
|
+
raise ConfigurationError, "Crawlscope renderer must be http or browser"
|
|
59
63
|
end
|
|
60
64
|
|
|
61
65
|
def rule_registry
|
|
@@ -74,7 +78,7 @@ module Crawlscope
|
|
|
74
78
|
raise ConfigurationError, "Crawlscope sitemap_path is not configured"
|
|
75
79
|
end
|
|
76
80
|
|
|
77
|
-
|
|
81
|
+
Crawl.new(
|
|
78
82
|
base_url: base_url,
|
|
79
83
|
sitemap_path: sitemap_path,
|
|
80
84
|
browser_factory: browser_factory,
|
|
@@ -111,7 +115,7 @@ module Crawlscope
|
|
|
111
115
|
|
|
112
116
|
def timeout_seconds
|
|
113
117
|
value = resolve(@timeout_seconds)
|
|
114
|
-
value
|
|
118
|
+
positive_integer(value, default: DEFAULT_TIMEOUT_SECONDS, name: "timeout_seconds")
|
|
115
119
|
end
|
|
116
120
|
|
|
117
121
|
private
|
|
@@ -119,5 +123,16 @@ module Crawlscope
|
|
|
119
123
|
def resolve(value)
|
|
120
124
|
value.respond_to?(:call) ? value.call : value
|
|
121
125
|
end
|
|
126
|
+
|
|
127
|
+
def positive_integer(value, default:, name:)
|
|
128
|
+
return default if value.nil?
|
|
129
|
+
|
|
130
|
+
integer = value.is_a?(Integer) ? value : Integer(value, 10)
|
|
131
|
+
raise ArgumentError if integer < 1
|
|
132
|
+
|
|
133
|
+
integer
|
|
134
|
+
rescue ArgumentError, TypeError
|
|
135
|
+
raise ConfigurationError, "Crawlscope #{name} must be an integer >= 1"
|
|
136
|
+
end
|
|
122
137
|
end
|
|
123
138
|
end
|
|
data/lib/crawlscope/{audit.rb → crawl.rb}
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Crawlscope
|
|
4
|
-
class
|
|
4
|
+
class Crawl
|
|
5
5
|
def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
|
|
6
6
|
@base_url = base_url
|
|
7
7
|
@sitemap_path = sitemap_path
|
|
@@ -17,28 +17,15 @@ module Crawlscope
|
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def call
|
|
20
|
-
urls =
|
|
21
|
-
raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
|
|
22
|
-
|
|
23
|
-
@page_fetcher = build_page
|
|
24
|
-
pages = Crawler.new(
|
|
25
|
-
page_fetcher: @page_fetcher,
|
|
26
|
-
concurrency: @concurrency
|
|
27
|
-
).call(urls)
|
|
20
|
+
urls = sitemap_urls
|
|
28
21
|
|
|
22
|
+
@page_fetcher = page
|
|
23
|
+
pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
|
|
29
24
|
issues = IssueCollection.new
|
|
30
|
-
collect_crawl_issues(pages, issues)
|
|
31
|
-
cache_pages(pages)
|
|
32
|
-
context = {
|
|
33
|
-
allowed_statuses: @allowed_statuses,
|
|
34
|
-
base_url: @base_url,
|
|
35
|
-
resolve_target: method(:resolve_target),
|
|
36
|
-
schema_registry: @schema_registry
|
|
37
|
-
}
|
|
38
25
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
26
|
+
collect(pages, issues)
|
|
27
|
+
cache(pages)
|
|
28
|
+
scan(urls, pages, issues)
|
|
42
29
|
|
|
43
30
|
Result.new(
|
|
44
31
|
base_url: @base_url,
|
|
@@ -53,8 +40,15 @@ module Crawlscope
|
|
|
53
40
|
|
|
54
41
|
private
|
|
55
42
|
|
|
56
|
-
def
|
|
57
|
-
|
|
43
|
+
def sitemap_urls
|
|
44
|
+
urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
|
|
45
|
+
raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
|
|
46
|
+
|
|
47
|
+
urls
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def browser
|
|
51
|
+
Browser.new(
|
|
58
52
|
base_url: @base_url,
|
|
59
53
|
timeout_seconds: @timeout_seconds,
|
|
60
54
|
network_idle_timeout_seconds: @network_idle_timeout_seconds,
|
|
@@ -64,35 +58,24 @@ module Crawlscope
|
|
|
64
58
|
raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
|
|
65
59
|
end
|
|
66
60
|
|
|
67
|
-
def
|
|
61
|
+
def page
|
|
68
62
|
if @renderer == :browser
|
|
69
|
-
|
|
70
|
-
browser_factory.call
|
|
63
|
+
(@browser_factory || method(:browser)).call
|
|
71
64
|
else
|
|
72
65
|
Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
|
|
73
66
|
end
|
|
74
67
|
end
|
|
75
68
|
|
|
76
|
-
def
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def cache_pages(pages)
|
|
86
|
-
@page_by_url = {}
|
|
87
|
-
@target_resolution_cache = {}
|
|
88
|
-
|
|
89
|
-
pages.each do |page|
|
|
90
|
-
@page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
91
|
-
@page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
92
|
-
end
|
|
69
|
+
def context
|
|
70
|
+
Context.new(
|
|
71
|
+
allowed_statuses: @allowed_statuses,
|
|
72
|
+
base_url: @base_url,
|
|
73
|
+
resolve_target: method(:resolve),
|
|
74
|
+
schema_registry: @schema_registry
|
|
75
|
+
)
|
|
93
76
|
end
|
|
94
77
|
|
|
95
|
-
def
|
|
78
|
+
def collect(pages, issues)
|
|
96
79
|
pages.each do |page|
|
|
97
80
|
if page.error
|
|
98
81
|
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
|
|
@@ -102,27 +85,48 @@ module Crawlscope
|
|
|
102
85
|
end
|
|
103
86
|
end
|
|
104
87
|
|
|
105
|
-
def
|
|
106
|
-
|
|
107
|
-
|
|
88
|
+
def cache(pages)
|
|
89
|
+
@pages = {}
|
|
90
|
+
@targets = {}
|
|
91
|
+
|
|
92
|
+
pages.each do |page|
|
|
93
|
+
@pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
94
|
+
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def scan(urls, pages, issues)
|
|
99
|
+
@rules.each do |rule|
|
|
100
|
+
rule.call(urls: urls, pages: pages, issues: issues, context: context)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def resolve(target_url)
|
|
105
|
+
normalized_url = Url.normalize(target_url, base_url: @base_url)
|
|
106
|
+
return @targets[normalized_url] if @targets.key?(normalized_url)
|
|
108
107
|
|
|
109
|
-
|
|
110
|
-
resolution ||= resolve_by_fetching_target(normalized_target_url)
|
|
111
|
-
@target_resolution_cache[normalized_target_url] = resolution
|
|
108
|
+
@targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
|
|
112
109
|
end
|
|
113
110
|
|
|
114
|
-
def
|
|
115
|
-
page = @page_fetcher.fetch(
|
|
116
|
-
@
|
|
117
|
-
@
|
|
118
|
-
|
|
111
|
+
def fetched_page(normalized_url)
|
|
112
|
+
page = @page_fetcher.fetch(normalized_url)
|
|
113
|
+
@pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
114
|
+
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
115
|
+
resolution(page, normalized_url, crawled: false)
|
|
119
116
|
end
|
|
120
117
|
|
|
121
|
-
def
|
|
122
|
-
page = @
|
|
123
|
-
|
|
118
|
+
def resolved_page(normalized_url)
|
|
119
|
+
page = @pages[normalized_url]
|
|
120
|
+
resolution(page, normalized_url, crawled: true) if page
|
|
121
|
+
end
|
|
124
122
|
|
|
125
|
-
|
|
123
|
+
def resolution(page, normalized_url, crawled:)
|
|
124
|
+
{
|
|
125
|
+
crawled: crawled,
|
|
126
|
+
error: page.error,
|
|
127
|
+
final_url: page.normalized_final_url || normalized_url,
|
|
128
|
+
status: page.status
|
|
129
|
+
}
|
|
126
130
|
end
|
|
127
131
|
end
|
|
128
132
|
end
|
data/lib/crawlscope/crawler.rb
CHANGED
|
@@ -15,7 +15,7 @@ module Crawlscope
|
|
|
15
15
|
|
|
16
16
|
urls.each do |url|
|
|
17
17
|
pool.post do
|
|
18
|
-
pages <<
|
|
18
|
+
pages << fetch(url)
|
|
19
19
|
end
|
|
20
20
|
end
|
|
21
21
|
|
|
@@ -24,5 +24,23 @@ module Crawlscope
|
|
|
24
24
|
|
|
25
25
|
pages.to_a
|
|
26
26
|
end
|
|
27
|
+
|
|
28
|
+
private
|
|
29
|
+
|
|
30
|
+
def fetch(url)
|
|
31
|
+
@page_fetcher.fetch(url)
|
|
32
|
+
rescue => error
|
|
33
|
+
Page.new(
|
|
34
|
+
url: url,
|
|
35
|
+
normalized_url: Url.normalize(url, base_url: url),
|
|
36
|
+
final_url: url,
|
|
37
|
+
normalized_final_url: Url.normalize(url, base_url: url),
|
|
38
|
+
status: nil,
|
|
39
|
+
headers: {},
|
|
40
|
+
body: nil,
|
|
41
|
+
doc: nil,
|
|
42
|
+
error: "#{error.class}: #{error.message}"
|
|
43
|
+
)
|
|
44
|
+
end
|
|
27
45
|
end
|
|
28
46
|
end
|
data/lib/crawlscope/http.rb
CHANGED
|
data/lib/crawlscope/rake_tasks.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module RakeTasks
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def validate
|
|
8
|
+
run("validate")
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def ldjson
|
|
12
|
+
run("ldjson")
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def validate_rule(rule)
|
|
16
|
+
original_rules = ENV["RULES"]
|
|
17
|
+
ENV["RULES"] = rule
|
|
18
|
+
validate
|
|
19
|
+
ensure
|
|
20
|
+
ENV["RULES"] = original_rules
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def run(command)
|
|
24
|
+
status = Cli.start([command], out: $stdout, err: $stderr)
|
|
25
|
+
exit(status) unless status.zero?
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|