crawlscope 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -8
- data/README.md +21 -14
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +15 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +99 -48
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +61 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +406 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +143 -7
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +149 -5
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +111 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +15 -5
- data/lib/crawlscope/task.rb +0 -131
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b49aaaa6fdb5f7d5bd4dc63713d8c0090411e7063363645a900d8f59d803aaaa
|
|
4
|
+
data.tar.gz: 5dfcc35d60745c25db6faf3acaa4344e29e438c758740613d6216e2f47aeac6e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9f66627274ce2ea969b5bb9b53a339215718c37baf47393c75bcf3a528c5c73658c6a71903fdbbf9e53796aaf3680be5f99ab4151b834efbf9450e05abbab83b
|
|
7
|
+
data.tar.gz: 3cf2e2c7f251a6af7b931f00da63436eaa7e09f078d73de112852a10665cf16eefb561c7d61d6bc8b0c3c014ca0db2df217d31c00b9f0ed321565ed554574261
|
data/CHANGELOG.md
CHANGED
|
@@ -5,27 +5,26 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [0.
|
|
8
|
+
## [0.3.0] - 2026-04-28
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
### Added
|
|
12
12
|
|
|
13
|
-
- add
|
|
13
|
+
- add JobPost structured data
|
|
14
14
|
|
|
15
|
-
- add standalone validation commands
|
|
16
15
|
|
|
17
|
-
- move default schema rules into crawlkit
|
|
18
16
|
|
|
19
17
|
|
|
18
|
+
### Documentation
|
|
20
19
|
|
|
20
|
+
- fix missing changelog entry
|
|
21
21
|
|
|
22
|
-
### Changed
|
|
23
22
|
|
|
24
|
-
- strengthen public API coverage
|
|
25
23
|
|
|
26
|
-
- load shared test dependencies
|
|
27
24
|
|
|
28
|
-
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- ldjson check now uses the same convention for default URL
|
|
29
28
|
|
|
30
29
|
|
|
31
30
|
|
data/README.md
CHANGED
|
@@ -58,16 +58,16 @@ gem "ferrum"
|
|
|
58
58
|
|
|
59
59
|
## CLI Usage
|
|
60
60
|
|
|
61
|
-
Validate a site
|
|
61
|
+
Validate a site from its default sitemap:
|
|
62
62
|
|
|
63
63
|
```bash
|
|
64
|
-
crawlscope validate --
|
|
64
|
+
crawlscope validate --url https://example.com
|
|
65
65
|
```
|
|
66
66
|
|
|
67
67
|
Validate only specific rules:
|
|
68
68
|
|
|
69
69
|
```bash
|
|
70
|
-
crawlscope validate --
|
|
70
|
+
crawlscope validate --url https://example.com --rules metadata,links
|
|
71
71
|
```
|
|
72
72
|
|
|
73
73
|
Validate structured data on one or more URLs:
|
|
@@ -77,10 +77,11 @@ crawlscope ldjson --url https://example.com/article
|
|
|
77
77
|
crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
|
|
78
78
|
```
|
|
79
79
|
|
|
80
|
-
|
|
80
|
+
To use a non-default sitemap, pass `--sitemap`:
|
|
81
81
|
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
```bash
|
|
83
|
+
crawlscope validate --url https://example.com --sitemap https://example.com/sitemap.xml
|
|
84
|
+
```
|
|
84
85
|
|
|
85
86
|
Child sitemap indexes are supported automatically.
|
|
86
87
|
|
|
@@ -89,14 +90,14 @@ Child sitemap indexes are supported automatically.
|
|
|
89
90
|
```ruby
|
|
90
91
|
require "crawlscope"
|
|
91
92
|
|
|
92
|
-
|
|
93
|
+
crawl = Crawlscope::Crawl.new(
|
|
93
94
|
base_url: "https://example.com",
|
|
94
95
|
sitemap_path: "https://example.com/sitemap.xml",
|
|
95
96
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
96
97
|
schema_registry: Crawlscope::SchemaRegistry.default
|
|
97
98
|
)
|
|
98
99
|
|
|
99
|
-
result =
|
|
100
|
+
result = crawl.call
|
|
100
101
|
|
|
101
102
|
puts result.ok?
|
|
102
103
|
puts result.issues.to_a.map(&:message)
|
|
@@ -104,7 +105,7 @@ puts result.issues.to_a.map(&:message)
|
|
|
104
105
|
|
|
105
106
|
## Result Shape
|
|
106
107
|
|
|
107
|
-
`Crawlscope::
|
|
108
|
+
`Crawlscope::Crawl` returns a `Crawlscope::Result` with:
|
|
108
109
|
|
|
109
110
|
- `urls`: sitemap URLs selected for validation
|
|
110
111
|
- `pages`: fetched page snapshots
|
|
@@ -133,7 +134,7 @@ bin/rails crawlscope:validate
|
|
|
133
134
|
|
|
134
135
|
Available environment overrides:
|
|
135
136
|
|
|
136
|
-
- `
|
|
137
|
+
- `URL`
|
|
137
138
|
- `SITEMAP`
|
|
138
139
|
- `RULES=metadata,links`
|
|
139
140
|
- `JS=1` or `RENDERER=browser`
|
|
@@ -149,17 +150,21 @@ bin/rails crawlscope:validate:metadata
|
|
|
149
150
|
bin/rails crawlscope:validate:structured_data
|
|
150
151
|
bin/rails crawlscope:validate:uniqueness
|
|
151
152
|
bin/rails crawlscope:validate:links
|
|
152
|
-
bin/rails crawlscope:validate:ldjson
|
|
153
|
+
bin/rails crawlscope:validate:ldjson
|
|
153
154
|
```
|
|
154
155
|
|
|
155
156
|
The same validation surface is also available in the gem repository itself through plain `rake`:
|
|
156
157
|
|
|
157
158
|
```bash
|
|
158
|
-
bundle exec rake crawlscope:validate
|
|
159
|
-
bundle exec rake crawlscope:validate:metadata
|
|
159
|
+
bundle exec rake crawlscope:validate URL=https://example.com
|
|
160
|
+
bundle exec rake crawlscope:validate:metadata URL=https://example.com
|
|
160
161
|
bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
|
|
161
162
|
```
|
|
162
163
|
|
|
164
|
+
`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
|
|
165
|
+
|
|
166
|
+
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
|
|
167
|
+
|
|
163
168
|
### Structured Data URL Audit
|
|
164
169
|
|
|
165
170
|
For one-off structured-data checks:
|
|
@@ -174,7 +179,7 @@ Optional flags:
|
|
|
174
179
|
|
|
175
180
|
- `DEBUG=1`: print detected items
|
|
176
181
|
- `SUMMARY=1`: print grouped failures
|
|
177
|
-
- `REPORT_PATH=...`: write a JSON report
|
|
182
|
+
- `REPORT_PATH=...`: write a JSON report. Treat this as trusted operator input; Crawlscope writes to the path the task process can access.
|
|
178
183
|
- `JS=1` or `RENDERER=browser`: render with Ferrum
|
|
179
184
|
|
|
180
185
|
## Rules
|
|
@@ -237,6 +242,8 @@ Checks:
|
|
|
237
242
|
- `WebApplication`
|
|
238
243
|
- `WebSite`
|
|
239
244
|
|
|
245
|
+
The default schema definitions live in `Crawlscope::Schemas`; `Crawlscope::SchemaRegistry` owns registration and validation.
|
|
246
|
+
|
|
240
247
|
Host apps can replace or extend the registry:
|
|
241
248
|
|
|
242
249
|
```ruby
|
data/lib/crawlscope/browser.rb
CHANGED
|
@@ -45,6 +45,8 @@ module Crawlscope
|
|
|
45
45
|
doc: Nokogiri::HTML(body)
|
|
46
46
|
)
|
|
47
47
|
rescue => error
|
|
48
|
+
raise unless browser_error?(error)
|
|
49
|
+
|
|
48
50
|
Page.new(
|
|
49
51
|
url: url,
|
|
50
52
|
normalized_url: Url.normalize(url, base_url: @base_url),
|
|
@@ -84,5 +86,11 @@ module Crawlscope
|
|
|
84
86
|
rescue Ferrum::TimeoutError
|
|
85
87
|
raise Timeout::Error, "Timed out waiting for browser network idle"
|
|
86
88
|
end
|
|
89
|
+
|
|
90
|
+
def browser_error?(error)
|
|
91
|
+
error.is_a?(Timeout::Error) ||
|
|
92
|
+
error.is_a?(SystemCallError) ||
|
|
93
|
+
error.class.name.to_s.start_with?("Ferrum::")
|
|
94
|
+
end
|
|
87
95
|
end
|
|
88
96
|
end
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -37,7 +37,7 @@ module Crawlscope
|
|
|
37
37
|
@err.puts(general_usage)
|
|
38
38
|
1
|
|
39
39
|
end
|
|
40
|
-
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
|
|
40
|
+
rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
|
|
41
41
|
@err.puts(error.message)
|
|
42
42
|
@err.puts("")
|
|
43
43
|
@err.puts(general_usage)
|
|
@@ -49,12 +49,12 @@ module Crawlscope
|
|
|
49
49
|
def general_usage
|
|
50
50
|
<<~TEXT
|
|
51
51
|
Usage:
|
|
52
|
-
crawlscope validate --
|
|
52
|
+
crawlscope validate --url https://example.com [options]
|
|
53
53
|
crawlscope ldjson --url https://example.com/page [options]
|
|
54
54
|
crawlscope version
|
|
55
55
|
|
|
56
56
|
Commands:
|
|
57
|
-
validate Audit
|
|
57
|
+
validate Audit URLs for metadata, structured data, uniqueness, and links
|
|
58
58
|
ldjson Validate structured data on one or more URLs
|
|
59
59
|
version Print the gem version
|
|
60
60
|
TEXT
|
|
@@ -105,11 +105,12 @@ module Crawlscope
|
|
|
105
105
|
parser.parse!(@argv)
|
|
106
106
|
|
|
107
107
|
urls = options[:urls].map(&:strip).reject(&:empty?)
|
|
108
|
+
urls = default_urls if urls.empty?
|
|
108
109
|
raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?
|
|
109
110
|
|
|
110
111
|
configure_renderer(options[:renderer])
|
|
111
112
|
|
|
112
|
-
result = task.
|
|
113
|
+
result = task.validate_json_ld(
|
|
113
114
|
urls: urls,
|
|
114
115
|
debug: options[:debug],
|
|
115
116
|
renderer: options[:renderer],
|
|
@@ -123,7 +124,7 @@ module Crawlscope
|
|
|
123
124
|
|
|
124
125
|
def run_validate
|
|
125
126
|
options = {
|
|
126
|
-
|
|
127
|
+
url: normalized_string(ENV["URL"]),
|
|
127
128
|
rule_names: normalized_string(ENV["RULES"]),
|
|
128
129
|
sitemap_path: normalized_string(ENV["SITEMAP"])
|
|
129
130
|
}
|
|
@@ -134,10 +135,10 @@ module Crawlscope
|
|
|
134
135
|
@configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
|
|
135
136
|
|
|
136
137
|
parser = OptionParser.new do |opts|
|
|
137
|
-
opts.banner = "Usage: crawlscope validate --
|
|
138
|
+
opts.banner = "Usage: crawlscope validate --url https://example.com [options]"
|
|
138
139
|
|
|
139
|
-
opts.on("--
|
|
140
|
-
options[:
|
|
140
|
+
opts.on("--url URL", "Set the site URL") do |value|
|
|
141
|
+
options[:url] = value
|
|
141
142
|
end
|
|
142
143
|
|
|
143
144
|
opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
|
|
@@ -168,7 +169,7 @@ module Crawlscope
|
|
|
168
169
|
parser.parse!(@argv)
|
|
169
170
|
|
|
170
171
|
result = task.validate(
|
|
171
|
-
base_url: options[:
|
|
172
|
+
base_url: options[:url],
|
|
172
173
|
sitemap_path: options[:sitemap_path],
|
|
173
174
|
rule_names: options[:rule_names]
|
|
174
175
|
)
|
|
@@ -238,8 +239,12 @@ module Crawlscope
|
|
|
238
239
|
raw_urls.split(";").map(&:strip).reject(&:empty?)
|
|
239
240
|
end
|
|
240
241
|
|
|
242
|
+
def default_urls
|
|
243
|
+
[normalized_string(@configuration.base_url) || "http://localhost:3000"]
|
|
244
|
+
end
|
|
245
|
+
|
|
241
246
|
def task
|
|
242
|
-
@task ||=
|
|
247
|
+
@task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
|
|
243
248
|
end
|
|
244
249
|
end
|
|
245
250
|
end
|
data/lib/crawlscope/configuration.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Crawlscope
|
|
|
7
7
|
DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
|
|
8
8
|
DEFAULT_BROWSER_SCROLL_PAGE = true
|
|
9
9
|
DEFAULT_CONCURRENCY = 10
|
|
10
|
+
RENDERERS = %i[http browser].freeze
|
|
10
11
|
DEFAULT_TIMEOUT_SECONDS = 20
|
|
11
12
|
|
|
12
13
|
attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
|
|
@@ -26,7 +27,7 @@ module Crawlscope
|
|
|
26
27
|
|
|
27
28
|
def concurrency
|
|
28
29
|
value = resolve(@concurrency)
|
|
29
|
-
value
|
|
30
|
+
positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
|
|
30
31
|
end
|
|
31
32
|
|
|
32
33
|
def browser_concurrency
|
|
@@ -42,7 +43,7 @@ module Crawlscope
|
|
|
42
43
|
|
|
43
44
|
def network_idle_timeout_seconds
|
|
44
45
|
value = resolve(@network_idle_timeout_seconds)
|
|
45
|
-
value
|
|
46
|
+
positive_integer(value, default: DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, name: "network_idle_timeout_seconds")
|
|
46
47
|
end
|
|
47
48
|
|
|
48
49
|
def output
|
|
@@ -55,7 +56,10 @@ module Crawlscope
|
|
|
55
56
|
normalized_value = value.to_s.strip
|
|
56
57
|
normalized_value = "http" if normalized_value.empty?
|
|
57
58
|
|
|
58
|
-
normalized_value.to_sym
|
|
59
|
+
renderer = normalized_value.to_sym
|
|
60
|
+
return renderer if RENDERERS.include?(renderer)
|
|
61
|
+
|
|
62
|
+
raise ConfigurationError, "Crawlscope renderer must be http or browser"
|
|
59
63
|
end
|
|
60
64
|
|
|
61
65
|
def rule_registry
|
|
@@ -74,7 +78,7 @@ module Crawlscope
|
|
|
74
78
|
raise ConfigurationError, "Crawlscope sitemap_path is not configured"
|
|
75
79
|
end
|
|
76
80
|
|
|
77
|
-
|
|
81
|
+
Crawl.new(
|
|
78
82
|
base_url: base_url,
|
|
79
83
|
sitemap_path: sitemap_path,
|
|
80
84
|
browser_factory: browser_factory,
|
|
@@ -111,7 +115,7 @@ module Crawlscope
|
|
|
111
115
|
|
|
112
116
|
def timeout_seconds
|
|
113
117
|
value = resolve(@timeout_seconds)
|
|
114
|
-
value
|
|
118
|
+
positive_integer(value, default: DEFAULT_TIMEOUT_SECONDS, name: "timeout_seconds")
|
|
115
119
|
end
|
|
116
120
|
|
|
117
121
|
private
|
|
@@ -119,5 +123,16 @@ module Crawlscope
|
|
|
119
123
|
def resolve(value)
|
|
120
124
|
value.respond_to?(:call) ? value.call : value
|
|
121
125
|
end
|
|
126
|
+
|
|
127
|
+
def positive_integer(value, default:, name:)
|
|
128
|
+
return default if value.nil?
|
|
129
|
+
|
|
130
|
+
integer = value.is_a?(Integer) ? value : Integer(value, 10)
|
|
131
|
+
raise ArgumentError if integer < 1
|
|
132
|
+
|
|
133
|
+
integer
|
|
134
|
+
rescue ArgumentError, TypeError
|
|
135
|
+
raise ConfigurationError, "Crawlscope #{name} must be an integer >= 1"
|
|
136
|
+
end
|
|
122
137
|
end
|
|
123
138
|
end
|
data/lib/crawlscope/{audit.rb → crawl.rb}
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Crawlscope
|
|
4
|
-
class
|
|
4
|
+
class Crawl
|
|
5
5
|
def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
|
|
6
6
|
@base_url = base_url
|
|
7
7
|
@sitemap_path = sitemap_path
|
|
@@ -17,28 +17,15 @@ module Crawlscope
|
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def call
|
|
20
|
-
urls =
|
|
21
|
-
raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
|
|
22
|
-
|
|
23
|
-
@page_fetcher = build_page
|
|
24
|
-
pages = Crawler.new(
|
|
25
|
-
page_fetcher: @page_fetcher,
|
|
26
|
-
concurrency: @concurrency
|
|
27
|
-
).call(urls)
|
|
20
|
+
urls = sitemap_urls
|
|
28
21
|
|
|
22
|
+
@page_fetcher = page
|
|
23
|
+
pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
|
|
29
24
|
issues = IssueCollection.new
|
|
30
|
-
collect_crawl_issues(pages, issues)
|
|
31
|
-
cache_pages(pages)
|
|
32
|
-
context = {
|
|
33
|
-
allowed_statuses: @allowed_statuses,
|
|
34
|
-
base_url: @base_url,
|
|
35
|
-
resolve_target: method(:resolve_target),
|
|
36
|
-
schema_registry: @schema_registry
|
|
37
|
-
}
|
|
38
25
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
26
|
+
collect(pages, issues)
|
|
27
|
+
cache(pages)
|
|
28
|
+
scan(urls, pages, issues)
|
|
42
29
|
|
|
43
30
|
Result.new(
|
|
44
31
|
base_url: @base_url,
|
|
@@ -53,8 +40,15 @@ module Crawlscope
|
|
|
53
40
|
|
|
54
41
|
private
|
|
55
42
|
|
|
56
|
-
def
|
|
57
|
-
|
|
43
|
+
def sitemap_urls
|
|
44
|
+
urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
|
|
45
|
+
raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
|
|
46
|
+
|
|
47
|
+
urls
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def browser
|
|
51
|
+
Browser.new(
|
|
58
52
|
base_url: @base_url,
|
|
59
53
|
timeout_seconds: @timeout_seconds,
|
|
60
54
|
network_idle_timeout_seconds: @network_idle_timeout_seconds,
|
|
@@ -64,65 +58,81 @@ module Crawlscope
|
|
|
64
58
|
raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
|
|
65
59
|
end
|
|
66
60
|
|
|
67
|
-
def
|
|
61
|
+
def page
|
|
68
62
|
if @renderer == :browser
|
|
69
|
-
|
|
70
|
-
browser_factory.call
|
|
63
|
+
(@browser_factory || method(:browser)).call
|
|
71
64
|
else
|
|
72
65
|
Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
|
|
73
66
|
end
|
|
74
67
|
end
|
|
75
68
|
|
|
76
|
-
def
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def cache_pages(pages)
|
|
86
|
-
@page_by_url = {}
|
|
87
|
-
@target_resolution_cache = {}
|
|
88
|
-
|
|
89
|
-
pages.each do |page|
|
|
90
|
-
@page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
91
|
-
@page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
92
|
-
end
|
|
69
|
+
def context
|
|
70
|
+
Context.new(
|
|
71
|
+
allowed_statuses: @allowed_statuses,
|
|
72
|
+
base_url: @base_url,
|
|
73
|
+
resolve_target: method(:resolve),
|
|
74
|
+
schema_registry: @schema_registry
|
|
75
|
+
)
|
|
93
76
|
end
|
|
94
77
|
|
|
95
|
-
def
|
|
78
|
+
def collect(pages, issues)
|
|
96
79
|
pages.each do |page|
|
|
97
80
|
if page.error
|
|
98
81
|
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
|
|
99
82
|
elsif !@allowed_statuses.include?(page.status)
|
|
100
83
|
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
|
|
84
|
+
elsif redirected?(page)
|
|
85
|
+
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
|
|
101
86
|
end
|
|
102
87
|
end
|
|
103
88
|
end
|
|
104
89
|
|
|
105
|
-
def
|
|
106
|
-
|
|
107
|
-
|
|
90
|
+
def cache(pages)
|
|
91
|
+
@pages = {}
|
|
92
|
+
@targets = {}
|
|
108
93
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
94
|
+
pages.each do |page|
|
|
95
|
+
@pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
96
|
+
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
97
|
+
end
|
|
112
98
|
end
|
|
113
99
|
|
|
114
|
-
def
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
build_target_resolution(page, normalized_target_url, crawled: false)
|
|
100
|
+
def scan(urls, pages, issues)
|
|
101
|
+
@rules.each do |rule|
|
|
102
|
+
rule.call(urls: urls, pages: pages, issues: issues, context: context)
|
|
103
|
+
end
|
|
119
104
|
end
|
|
120
105
|
|
|
121
|
-
def
|
|
122
|
-
|
|
123
|
-
return if
|
|
106
|
+
def resolve(target_url)
|
|
107
|
+
normalized_url = Url.normalize(target_url, base_url: @base_url)
|
|
108
|
+
return @targets[normalized_url] if @targets.key?(normalized_url)
|
|
109
|
+
|
|
110
|
+
@targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def fetched_page(normalized_url)
|
|
114
|
+
page = @page_fetcher.fetch(normalized_url)
|
|
115
|
+
@pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
116
|
+
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
117
|
+
resolution(page, normalized_url, crawled: false)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def resolved_page(normalized_url)
|
|
121
|
+
page = @pages[normalized_url]
|
|
122
|
+
resolution(page, normalized_url, crawled: true) if page
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def resolution(page, normalized_url, crawled:)
|
|
126
|
+
{
|
|
127
|
+
crawled: crawled,
|
|
128
|
+
error: page.error,
|
|
129
|
+
final_url: page.normalized_final_url || normalized_url,
|
|
130
|
+
status: page.status
|
|
131
|
+
}
|
|
132
|
+
end
|
|
124
133
|
|
|
125
|
-
|
|
134
|
+
def redirected?(page)
|
|
135
|
+
page.normalized_url.to_s != page.normalized_final_url.to_s
|
|
126
136
|
end
|
|
127
137
|
end
|
|
128
138
|
end
|
data/lib/crawlscope/crawler.rb
CHANGED
|
@@ -15,7 +15,7 @@ module Crawlscope
|
|
|
15
15
|
|
|
16
16
|
urls.each do |url|
|
|
17
17
|
pool.post do
|
|
18
|
-
pages <<
|
|
18
|
+
pages << fetch(url)
|
|
19
19
|
end
|
|
20
20
|
end
|
|
21
21
|
|
|
@@ -24,5 +24,23 @@ module Crawlscope
|
|
|
24
24
|
|
|
25
25
|
pages.to_a
|
|
26
26
|
end
|
|
27
|
+
|
|
28
|
+
private
|
|
29
|
+
|
|
30
|
+
def fetch(url)
|
|
31
|
+
@page_fetcher.fetch(url)
|
|
32
|
+
rescue => error
|
|
33
|
+
Page.new(
|
|
34
|
+
url: url,
|
|
35
|
+
normalized_url: Url.normalize(url, base_url: url),
|
|
36
|
+
final_url: url,
|
|
37
|
+
normalized_final_url: Url.normalize(url, base_url: url),
|
|
38
|
+
status: nil,
|
|
39
|
+
headers: {},
|
|
40
|
+
body: nil,
|
|
41
|
+
doc: nil,
|
|
42
|
+
error: "#{error.class}: #{error.message}"
|
|
43
|
+
)
|
|
44
|
+
end
|
|
27
45
|
end
|
|
28
46
|
end
|
data/lib/crawlscope/http.rb
CHANGED
|
data/lib/crawlscope/rake_tasks.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module RakeTasks
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def validate
|
|
8
|
+
run("validate")
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def ldjson
|
|
12
|
+
run("ldjson")
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def validate_rule(rule)
|
|
16
|
+
original_rules = ENV["RULES"]
|
|
17
|
+
ENV["RULES"] = rule
|
|
18
|
+
validate
|
|
19
|
+
ensure
|
|
20
|
+
ENV["RULES"] = original_rules
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def run(command)
|
|
24
|
+
status = Cli.start([command], out: $stdout, err: $stderr)
|
|
25
|
+
exit(status) unless status.zero?
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|