crawlscope 0.1.0 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58a83d74a7b2b8422df4f161db9d3a7fe3ff213495f0837fd29a08cc13715b86
4
- data.tar.gz: 02bd5743bcaae94bfdcc169fb6fe782257527984da68b091f6b75db3420b4244
3
+ metadata.gz: b49aaaa6fdb5f7d5bd4dc63713d8c0090411e7063363645a900d8f59d803aaaa
4
+ data.tar.gz: 5dfcc35d60745c25db6faf3acaa4344e29e438c758740613d6216e2f47aeac6e
5
5
  SHA512:
6
- metadata.gz: c566f6899f45633db13a8ee47ac15f5e6054a4adff087774ce17ef15c26b10340694bd395e0de0efbdb5b652cf8ea04e3cbbb452d9467fd8167143f3675d5642
7
- data.tar.gz: 1c087e1f4233224ea2c6b9b14de3bf34f4007b4689cd4fa8b9a3ea7ba688f78beb12431d0ffc7b6f54cae1eead319e3ba8293cef325440c614ca191b6ebf0e8b
6
+ metadata.gz: 9f66627274ce2ea969b5bb9b53a339215718c37baf47393c75bcf3a528c5c73658c6a71903fdbbf9e53796aaf3680be5f99ab4151b834efbf9450e05abbab83b
7
+ data.tar.gz: 3cf2e2c7f251a6af7b931f00da63436eaa7e09f078d73de112852a10665cf16eefb561c7d61d6bc8b0c3c014ca0db2df217d31c00b9f0ed321565ed554574261
data/CHANGELOG.md CHANGED
@@ -5,27 +5,26 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [0.1.0] - 2026-04-23
8
+ ## [0.3.0] - 2026-04-28
9
9
 
10
10
 
11
11
  ### Added
12
12
 
13
- - add crawlkit release-ready audit gem
13
+ - add JobPost structured data
14
14
 
15
- - add standalone validation commands
16
15
 
17
- - move default schema rules into crawlkit
18
16
 
19
17
 
18
+ ### Documentation
20
19
 
20
+ - fix missing changelog entry
21
21
 
22
- ### Changed
23
22
 
24
- - strengthen public API coverage
25
23
 
26
- - load shared test dependencies
27
24
 
28
- - rename crawlkit to crawlscope
25
+ ### Fixed
26
+
27
+ - ldjson check now uses the same convention for default URL
29
28
 
30
29
 
31
30
 
data/README.md CHANGED
@@ -58,16 +58,16 @@ gem "ferrum"
58
58
 
59
59
  ## CLI Usage
60
60
 
61
- Validate a site directly from the gem:
61
+ Validate a site from its default sitemap:
62
62
 
63
63
  ```bash
64
- crawlscope validate --base-url https://example.com
64
+ crawlscope validate --url https://example.com
65
65
  ```
66
66
 
67
67
  Validate only specific rules:
68
68
 
69
69
  ```bash
70
- crawlscope validate --base-url https://example.com --rules metadata,links
70
+ crawlscope validate --url https://example.com --rules metadata,links
71
71
  ```
72
72
 
73
73
  Validate structured data on one or more URLs:
@@ -77,10 +77,11 @@ crawlscope ldjson --url https://example.com/article
77
77
  crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
78
78
  ```
79
79
 
80
- If you do not pass `--sitemap`, `crawlscope` defaults to:
80
+ To use a non-default sitemap, pass `--sitemap`:
81
81
 
82
- - `https://example.com/sitemap.xml` for real site URLs
83
- - `public/sitemap.xml` for localhost-style development URLs when that file exists
82
+ ```bash
83
+ crawlscope validate --url https://example.com --sitemap https://example.com/sitemap.xml
84
+ ```
84
85
 
85
86
  Child sitemap indexes are supported automatically.
86
87
 
@@ -89,14 +90,14 @@ Child sitemap indexes are supported automatically.
89
90
  ```ruby
90
91
  require "crawlscope"
91
92
 
92
- audit = Crawlscope::Audit.new(
93
+ crawl = Crawlscope::Crawl.new(
93
94
  base_url: "https://example.com",
94
95
  sitemap_path: "https://example.com/sitemap.xml",
95
96
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
96
97
  schema_registry: Crawlscope::SchemaRegistry.default
97
98
  )
98
99
 
99
- result = audit.call
100
+ result = crawl.call
100
101
 
101
102
  puts result.ok?
102
103
  puts result.issues.to_a.map(&:message)
@@ -104,7 +105,7 @@ puts result.issues.to_a.map(&:message)
104
105
 
105
106
  ## Result Shape
106
107
 
107
- `Crawlscope::Audit` returns a `Crawlscope::Result` with:
108
+ `Crawlscope::Crawl` returns a `Crawlscope::Result` with:
108
109
 
109
110
  - `urls`: sitemap URLs selected for validation
110
111
  - `pages`: fetched page snapshots
@@ -133,7 +134,7 @@ bin/rails crawlscope:validate
133
134
 
134
135
  Available environment overrides:
135
136
 
136
- - `BASE_URL`
137
+ - `URL`
137
138
  - `SITEMAP`
138
139
  - `RULES=metadata,links`
139
140
  - `JS=1` or `RENDERER=browser`
@@ -149,17 +150,21 @@ bin/rails crawlscope:validate:metadata
149
150
  bin/rails crawlscope:validate:structured_data
150
151
  bin/rails crawlscope:validate:uniqueness
151
152
  bin/rails crawlscope:validate:links
152
- bin/rails crawlscope:validate:ldjson URL=https://example.com/article
153
+ bin/rails crawlscope:validate:ldjson
153
154
  ```
154
155
 
155
156
  The same validation surface is also available in the gem repository itself through plain `rake`:
156
157
 
157
158
  ```bash
158
- bundle exec rake crawlscope:validate BASE_URL=https://example.com
159
- bundle exec rake crawlscope:validate:metadata BASE_URL=https://example.com
159
+ bundle exec rake crawlscope:validate URL=https://example.com
160
+ bundle exec rake crawlscope:validate:metadata URL=https://example.com
160
161
  bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
161
162
  ```
162
163
 
164
+ `crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.
165
+
166
+ `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
167
+
163
168
  ### Structured Data URL Audit
164
169
 
165
170
  For one-off structured-data checks:
@@ -174,7 +179,7 @@ Optional flags:
174
179
 
175
180
  - `DEBUG=1`: print detected items
176
181
  - `SUMMARY=1`: print grouped failures
177
- - `REPORT_PATH=...`: write a JSON report
182
+ - `REPORT_PATH=...`: write a JSON report. Treat this as trusted operator input; Crawlscope writes to the path the task process can access.
178
183
  - `JS=1` or `RENDERER=browser`: render with Ferrum
179
184
 
180
185
  ## Rules
@@ -237,6 +242,8 @@ Checks:
237
242
  - `WebApplication`
238
243
  - `WebSite`
239
244
 
245
+ The default schema definitions live in `Crawlscope::Schemas`; `Crawlscope::SchemaRegistry` owns registration and validation.
246
+
240
247
  Host apps can replace or extend the registry:
241
248
 
242
249
  ```ruby
data/lib/crawlscope/browser.rb CHANGED
@@ -45,6 +45,8 @@ module Crawlscope
45
45
  doc: Nokogiri::HTML(body)
46
46
  )
47
47
  rescue => error
48
+ raise unless browser_error?(error)
49
+
48
50
  Page.new(
49
51
  url: url,
50
52
  normalized_url: Url.normalize(url, base_url: @base_url),
@@ -84,5 +86,11 @@ module Crawlscope
84
86
  rescue Ferrum::TimeoutError
85
87
  raise Timeout::Error, "Timed out waiting for browser network idle"
86
88
  end
89
+
90
+ def browser_error?(error)
91
+ error.is_a?(Timeout::Error) ||
92
+ error.is_a?(SystemCallError) ||
93
+ error.class.name.to_s.start_with?("Ferrum::")
94
+ end
87
95
  end
88
96
  end
data/lib/crawlscope/cli.rb CHANGED
@@ -37,7 +37,7 @@ module Crawlscope
37
37
  @err.puts(general_usage)
38
38
  1
39
39
  end
40
- rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ArgumentError => error
40
+ rescue OptionParser::InvalidOption, OptionParser::MissingArgument, ConfigurationError, ValidationError, ArgumentError => error
41
41
  @err.puts(error.message)
42
42
  @err.puts("")
43
43
  @err.puts(general_usage)
@@ -49,12 +49,12 @@ module Crawlscope
49
49
  def general_usage
50
50
  <<~TEXT
51
51
  Usage:
52
- crawlscope validate --base-url https://example.com [options]
52
+ crawlscope validate --url https://example.com [options]
53
53
  crawlscope ldjson --url https://example.com/page [options]
54
54
  crawlscope version
55
55
 
56
56
  Commands:
57
- validate Audit sitemap URLs for metadata, structured data, uniqueness, and links
57
+ validate Audit URLs for metadata, structured data, uniqueness, and links
58
58
  ldjson Validate structured data on one or more URLs
59
59
  version Print the gem version
60
60
  TEXT
@@ -105,11 +105,12 @@ module Crawlscope
105
105
  parser.parse!(@argv)
106
106
 
107
107
  urls = options[:urls].map(&:strip).reject(&:empty?)
108
+ urls = default_urls if urls.empty?
108
109
  raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?
109
110
 
110
111
  configure_renderer(options[:renderer])
111
112
 
112
- result = task.validate_ldjson(
113
+ result = task.validate_json_ld(
113
114
  urls: urls,
114
115
  debug: options[:debug],
115
116
  renderer: options[:renderer],
@@ -123,7 +124,7 @@ module Crawlscope
123
124
 
124
125
  def run_validate
125
126
  options = {
126
- base_url: normalized_string(ENV["BASE_URL"]),
127
+ url: normalized_string(ENV["URL"]),
127
128
  rule_names: normalized_string(ENV["RULES"]),
128
129
  sitemap_path: normalized_string(ENV["SITEMAP"])
129
130
  }
@@ -134,10 +135,10 @@ module Crawlscope
134
135
  @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
135
136
 
136
137
  parser = OptionParser.new do |opts|
137
- opts.banner = "Usage: crawlscope validate --base-url https://example.com [options]"
138
+ opts.banner = "Usage: crawlscope validate --url https://example.com [options]"
138
139
 
139
- opts.on("--base-url URL", "Set the site base URL") do |value|
140
- options[:base_url] = value
140
+ opts.on("--url URL", "Set the site URL") do |value|
141
+ options[:url] = value
141
142
  end
142
143
 
143
144
  opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
@@ -168,7 +169,7 @@ module Crawlscope
168
169
  parser.parse!(@argv)
169
170
 
170
171
  result = task.validate(
171
- base_url: options[:base_url],
172
+ base_url: options[:url],
172
173
  sitemap_path: options[:sitemap_path],
173
174
  rule_names: options[:rule_names]
174
175
  )
@@ -238,8 +239,12 @@ module Crawlscope
238
239
  raw_urls.split(";").map(&:strip).reject(&:empty?)
239
240
  end
240
241
 
242
+ def default_urls
243
+ [normalized_string(@configuration.base_url) || "http://localhost:3000"]
244
+ end
245
+
241
246
  def task
242
- @task ||= Task.new(configuration: @configuration, reporter: Reporter.new(io: @out))
247
+ @task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out))
243
248
  end
244
249
  end
245
250
  end
data/lib/crawlscope/configuration.rb CHANGED
@@ -7,6 +7,7 @@ module Crawlscope
7
7
  DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
8
8
  DEFAULT_BROWSER_SCROLL_PAGE = true
9
9
  DEFAULT_CONCURRENCY = 10
10
+ RENDERERS = %i[http browser].freeze
10
11
  DEFAULT_TIMEOUT_SECONDS = 20
11
12
 
12
13
  attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
@@ -26,7 +27,7 @@ module Crawlscope
26
27
 
27
28
  def concurrency
28
29
  value = resolve(@concurrency)
29
- value.nil? ? DEFAULT_CONCURRENCY : value.to_i
30
+ positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
30
31
  end
31
32
 
32
33
  def browser_concurrency
@@ -42,7 +43,7 @@ module Crawlscope
42
43
 
43
44
  def network_idle_timeout_seconds
44
45
  value = resolve(@network_idle_timeout_seconds)
45
- value.nil? ? DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS : value.to_i
46
+ positive_integer(value, default: DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, name: "network_idle_timeout_seconds")
46
47
  end
47
48
 
48
49
  def output
@@ -55,7 +56,10 @@ module Crawlscope
55
56
  normalized_value = value.to_s.strip
56
57
  normalized_value = "http" if normalized_value.empty?
57
58
 
58
- normalized_value.to_sym
59
+ renderer = normalized_value.to_sym
60
+ return renderer if RENDERERS.include?(renderer)
61
+
62
+ raise ConfigurationError, "Crawlscope renderer must be http or browser"
59
63
  end
60
64
 
61
65
  def rule_registry
@@ -74,7 +78,7 @@ module Crawlscope
74
78
  raise ConfigurationError, "Crawlscope sitemap_path is not configured"
75
79
  end
76
80
 
77
- Audit.new(
81
+ Crawl.new(
78
82
  base_url: base_url,
79
83
  sitemap_path: sitemap_path,
80
84
  browser_factory: browser_factory,
@@ -111,7 +115,7 @@ module Crawlscope
111
115
 
112
116
  def timeout_seconds
113
117
  value = resolve(@timeout_seconds)
114
- value.nil? ? DEFAULT_TIMEOUT_SECONDS : value.to_i
118
+ positive_integer(value, default: DEFAULT_TIMEOUT_SECONDS, name: "timeout_seconds")
115
119
  end
116
120
 
117
121
  private
@@ -119,5 +123,16 @@ module Crawlscope
119
123
  def resolve(value)
120
124
  value.respond_to?(:call) ? value.call : value
121
125
  end
126
+
127
+ def positive_integer(value, default:, name:)
128
+ return default if value.nil?
129
+
130
+ integer = value.is_a?(Integer) ? value : Integer(value, 10)
131
+ raise ArgumentError if integer < 1
132
+
133
+ integer
134
+ rescue ArgumentError, TypeError
135
+ raise ConfigurationError, "Crawlscope #{name} must be an integer >= 1"
136
+ end
122
137
  end
123
138
  end
data/lib/crawlscope/context.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
5
+ def fetch(name)
6
+ public_send(name)
7
+ end
8
+ end
9
+ end
data/lib/crawlscope/{audit.rb → crawl.rb} RENAMED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- class Audit
4
+ class Crawl
5
5
  def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
6
6
  @base_url = base_url
7
7
  @sitemap_path = sitemap_path
@@ -17,28 +17,15 @@ module Crawlscope
17
17
  end
18
18
 
19
19
  def call
20
- urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
21
- raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
22
-
23
- @page_fetcher = build_page
24
- pages = Crawler.new(
25
- page_fetcher: @page_fetcher,
26
- concurrency: @concurrency
27
- ).call(urls)
20
+ urls = sitemap_urls
28
21
 
22
+ @page_fetcher = page
23
+ pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
29
24
  issues = IssueCollection.new
30
- collect_crawl_issues(pages, issues)
31
- cache_pages(pages)
32
- context = {
33
- allowed_statuses: @allowed_statuses,
34
- base_url: @base_url,
35
- resolve_target: method(:resolve_target),
36
- schema_registry: @schema_registry
37
- }
38
25
 
39
- @rules.each do |rule|
40
- rule.call(urls: urls, pages: pages, issues: issues, context: context)
41
- end
26
+ collect(pages, issues)
27
+ cache(pages)
28
+ scan(urls, pages, issues)
42
29
 
43
30
  Result.new(
44
31
  base_url: @base_url,
@@ -53,8 +40,15 @@ module Crawlscope
53
40
 
54
41
  private
55
42
 
56
- def build_browser
57
- Crawlscope::Browser.new(
43
+ def sitemap_urls
44
+ urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
45
+ raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
46
+
47
+ urls
48
+ end
49
+
50
+ def browser
51
+ Browser.new(
58
52
  base_url: @base_url,
59
53
  timeout_seconds: @timeout_seconds,
60
54
  network_idle_timeout_seconds: @network_idle_timeout_seconds,
@@ -64,65 +58,81 @@ module Crawlscope
64
58
  raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
65
59
  end
66
60
 
67
- def build_page
61
+ def page
68
62
  if @renderer == :browser
69
- browser_factory = @browser_factory || method(:build_browser)
70
- browser_factory.call
63
+ (@browser_factory || method(:browser)).call
71
64
  else
72
65
  Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
73
66
  end
74
67
  end
75
68
 
76
- def build_target_resolution(page, normalized_target_url, crawled:)
77
- {
78
- crawled: crawled,
79
- error: page.error,
80
- final_url: page.normalized_final_url || normalized_target_url,
81
- status: page.status
82
- }
83
- end
84
-
85
- def cache_pages(pages)
86
- @page_by_url = {}
87
- @target_resolution_cache = {}
88
-
89
- pages.each do |page|
90
- @page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
91
- @page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
92
- end
69
+ def context
70
+ Context.new(
71
+ allowed_statuses: @allowed_statuses,
72
+ base_url: @base_url,
73
+ resolve_target: method(:resolve),
74
+ schema_registry: @schema_registry
75
+ )
93
76
  end
94
77
 
95
- def collect_crawl_issues(pages, issues)
78
+ def collect(pages, issues)
96
79
  pages.each do |page|
97
80
  if page.error
98
81
  issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
99
82
  elsif !@allowed_statuses.include?(page.status)
100
83
  issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
84
+ elsif redirected?(page)
85
+ issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
101
86
  end
102
87
  end
103
88
  end
104
89
 
105
- def resolve_target(target_url)
106
- normalized_target_url = Url.normalize(target_url, base_url: @base_url)
107
- return @target_resolution_cache[normalized_target_url] if @target_resolution_cache.key?(normalized_target_url)
90
+ def cache(pages)
91
+ @pages = {}
92
+ @targets = {}
108
93
 
109
- resolution = resolve_from_crawled_page(normalized_target_url)
110
- resolution ||= resolve_by_fetching_target(normalized_target_url)
111
- @target_resolution_cache[normalized_target_url] = resolution
94
+ pages.each do |page|
95
+ @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
96
+ @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
97
+ end
112
98
  end
113
99
 
114
- def resolve_by_fetching_target(normalized_target_url)
115
- page = @page_fetcher.fetch(normalized_target_url)
116
- @page_by_url[page.normalized_url] = page unless page.normalized_url.to_s.empty?
117
- @page_by_url[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
118
- build_target_resolution(page, normalized_target_url, crawled: false)
100
+ def scan(urls, pages, issues)
101
+ @rules.each do |rule|
102
+ rule.call(urls: urls, pages: pages, issues: issues, context: context)
103
+ end
119
104
  end
120
105
 
121
- def resolve_from_crawled_page(normalized_target_url)
122
- page = @page_by_url[normalized_target_url]
123
- return if page.nil?
106
+ def resolve(target_url)
107
+ normalized_url = Url.normalize(target_url, base_url: @base_url)
108
+ return @targets[normalized_url] if @targets.key?(normalized_url)
109
+
110
+ @targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
111
+ end
112
+
113
+ def fetched_page(normalized_url)
114
+ page = @page_fetcher.fetch(normalized_url)
115
+ @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
116
+ @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
117
+ resolution(page, normalized_url, crawled: false)
118
+ end
119
+
120
+ def resolved_page(normalized_url)
121
+ page = @pages[normalized_url]
122
+ resolution(page, normalized_url, crawled: true) if page
123
+ end
124
+
125
+ def resolution(page, normalized_url, crawled:)
126
+ {
127
+ crawled: crawled,
128
+ error: page.error,
129
+ final_url: page.normalized_final_url || normalized_url,
130
+ status: page.status
131
+ }
132
+ end
124
133
 
125
- build_target_resolution(page, normalized_target_url, crawled: true)
134
+ def redirected?(page)
135
+ page.normalized_url.to_s != page.normalized_final_url.to_s
126
136
  end
127
137
  end
128
138
  end
data/lib/crawlscope/crawler.rb CHANGED
@@ -15,7 +15,7 @@ module Crawlscope
15
15
 
16
16
  urls.each do |url|
17
17
  pool.post do
18
- pages << @page_fetcher.fetch(url)
18
+ pages << fetch(url)
19
19
  end
20
20
  end
21
21
 
@@ -24,5 +24,23 @@ module Crawlscope
24
24
 
25
25
  pages.to_a
26
26
  end
27
+
28
+ private
29
+
30
+ def fetch(url)
31
+ @page_fetcher.fetch(url)
32
+ rescue => error
33
+ Page.new(
34
+ url: url,
35
+ normalized_url: Url.normalize(url, base_url: url),
36
+ final_url: url,
37
+ normalized_final_url: Url.normalize(url, base_url: url),
38
+ status: nil,
39
+ headers: {},
40
+ body: nil,
41
+ doc: nil,
42
+ error: "#{error.class}: #{error.message}"
43
+ )
44
+ end
27
45
  end
28
46
  end
data/lib/crawlscope/http.rb CHANGED
@@ -43,7 +43,7 @@ module Crawlscope
43
43
  body: body,
44
44
  doc: doc
45
45
  )
46
- rescue => error
46
+ rescue Faraday::Error, SocketError, SystemCallError, Timeout::Error => error
47
47
  Page.new(
48
48
  url: url,
49
49
  normalized_url: Url.normalize(url, base_url: @base_url),
data/lib/crawlscope/rake_tasks.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ module RakeTasks
5
+ module_function
6
+
7
+ def validate
8
+ run("validate")
9
+ end
10
+
11
+ def ldjson
12
+ run("ldjson")
13
+ end
14
+
15
+ def validate_rule(rule)
16
+ original_rules = ENV["RULES"]
17
+ ENV["RULES"] = rule
18
+ validate
19
+ ensure
20
+ ENV["RULES"] = original_rules
21
+ end
22
+
23
+ def run(command)
24
+ status = Cli.start([command], out: $stdout, err: $stderr)
25
+ exit(status) unless status.zero?
26
+ end
27
+ end
28
+ end