crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 58a83d74a7b2b8422df4f161db9d3a7fe3ff213495f0837fd29a08cc13715b86
4
+ data.tar.gz: 02bd5743bcaae94bfdcc169fb6fe782257527984da68b091f6b75db3420b4244
5
+ SHA512:
6
+ metadata.gz: c566f6899f45633db13a8ee47ac15f5e6054a4adff087774ce17ef15c26b10340694bd395e0de0efbdb5b652cf8ea04e3cbbb452d9467fd8167143f3675d5642
7
+ data.tar.gz: 1c087e1f4233224ea2c6b9b14de3bf34f4007b4689cd4fa8b9a3ea7ba688f78beb12431d0ffc7b6f54cae1eead319e3ba8293cef325440c614ca191b6ebf0e8b
data/CHANGELOG.md ADDED
@@ -0,0 +1,31 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-04-23
9
+
10
+
11
+ ### Added
12
+
13
+ - add crawlkit release-ready audit gem
14
+
15
+ - add standalone validation commands
16
+
17
+ - move default schema rules into crawlkit
18
+
19
+
20
+
21
+
22
+ ### Changed
23
+
24
+ - strengthen public API coverage
25
+
26
+ - load shared test dependencies
27
+
28
+ - rename crawlkit to crawlscope
29
+
30
+
31
+
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ethos Link
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,323 @@
1
+ # Crawlscope
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/crawlscope.svg)](https://badge.fury.io/rb/crawlscope)
4
+ [![Ruby](https://github.com/ethos-link/crawlscope/actions/workflows/ruby.yml/badge.svg)](https://github.com/ethos-link/crawlscope/actions/workflows/ruby.yml)
5
+
6
+ `crawlscope` is a small Ruby gem for sitemap-driven SEO validation.
7
+
8
+ It is built by [Ethos Link](https://www.ethos-link.com) and used in production by [Reviato](https://www.reviato.com).
9
+
10
+ It is designed for Rails apps and plain Ruby scripts that want:
11
+
12
+ - deterministic sitemap crawling
13
+ - structured validation issues instead of free-form strings
14
+ - app-configurable rule and schema registries
15
+ - first-party rake tasks instead of a large DSL
16
+ - optional browser rendering for JavaScript-heavy pages
17
+
18
+ It works in three modes:
19
+
20
+ - as a plain Ruby library
21
+ - as a standalone CLI
22
+ - as Rails rake tasks through the included Railtie
23
+
24
+ The default rule set includes:
25
+
26
+ - metadata validation
27
+ - structured-data validation
28
+ - uniqueness checks
29
+ - internal-link checks
30
+
31
+ ## Installation
32
+
33
+ Add this line to your application's Gemfile:
34
+
35
+ ```ruby
36
+ gem "crawlscope"
37
+ ```
38
+
39
+ And then execute:
40
+
41
+ ```bash
42
+ bundle install
43
+ ```
44
+
45
+ Or install it directly:
46
+
47
+ ```bash
48
+ gem install crawlscope
49
+ ```
50
+
51
+ If you want browser rendering, also add:
52
+
53
+ ```ruby
54
+ gem "ferrum"
55
+ ```
56
+
57
+ `crawlscope` only loads Ferrum when you run in browser mode.
58
+
59
+ ## CLI Usage
60
+
61
+ Validate a site directly from the gem:
62
+
63
+ ```bash
64
+ crawlscope validate --base-url https://example.com
65
+ ```
66
+
67
+ Validate only specific rules:
68
+
69
+ ```bash
70
+ crawlscope validate --base-url https://example.com --rules metadata,links
71
+ ```
72
+
73
+ Validate structured data on one or more URLs:
74
+
75
+ ```bash
76
+ crawlscope ldjson --url https://example.com/article
77
+ crawlscope ldjson --url https://example.com/a --url https://example.com/b --summary
78
+ ```
79
+
80
+ If you do not pass `--sitemap`, `crawlscope` defaults to:
81
+
82
+ - `<base-url>/sitemap.xml` (for example, `https://example.com/sitemap.xml`) for remote site URLs
83
+ - `public/sitemap.xml` for localhost-style development URLs when that file exists
84
+
85
+ Sitemap index files that point to child sitemaps are supported automatically.
86
+
87
+ ## Ruby Usage
88
+
89
+ ```ruby
90
+ require "crawlscope"
91
+
92
+ audit = Crawlscope::Audit.new(
93
+ base_url: "https://example.com",
94
+ sitemap_path: "https://example.com/sitemap.xml",
95
+ rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
96
+ schema_registry: Crawlscope::SchemaRegistry.default
97
+ )
98
+
99
+ result = audit.call
100
+
101
+ puts result.ok?
102
+ puts result.issues.to_a.map(&:message)
103
+ ```
104
+
105
+ ## Result Shape
106
+
107
+ `Crawlscope::Audit` returns a `Crawlscope::Result` with:
108
+
109
+ - `urls`: sitemap URLs selected for validation
110
+ - `pages`: fetched page snapshots
111
+ - `issues`: structured issues with `code`, `severity`, `category`, `url`, and `message`
112
+
113
+ `result.ok?` returns `false` if any error, warning, or notice is present.
114
+
115
+ ## Rails Usage
116
+
117
+ In an initializer:
118
+
119
+ ```ruby
120
+ Crawlscope.configure do |config|
121
+ config.base_url = -> { "https://example.com" }
122
+ config.sitemap_path = -> { Rails.public_path.join("sitemap.xml").to_s }
123
+ config.site_name = "Example"
124
+ config.schema_registry = -> { Crawlscope::SchemaRegistry.default }
125
+ end
126
+ ```
127
+
128
+ Then run:
129
+
130
+ ```bash
131
+ bin/rails crawlscope:validate
132
+ ```
133
+
134
+ Available environment overrides:
135
+
136
+ - `BASE_URL`
137
+ - `SITEMAP`
138
+ - `RULES=metadata,links`
139
+ - `JS=1` or `RENDERER=browser`
140
+ - `TIMEOUT=30`
141
+ - `NETWORK_IDLE_TIMEOUT=10`
142
+ - `CONCURRENCY=5`
143
+
144
+ Available tasks:
145
+
146
+ ```bash
147
+ bin/rails crawlscope:validate
148
+ bin/rails crawlscope:validate:metadata
149
+ bin/rails crawlscope:validate:structured_data
150
+ bin/rails crawlscope:validate:uniqueness
151
+ bin/rails crawlscope:validate:links
152
+ bin/rails crawlscope:validate:ldjson URL=https://example.com/article
153
+ ```
154
+
155
+ The same validation surface is also available in the gem repository itself through plain `rake`:
156
+
157
+ ```bash
158
+ bundle exec rake crawlscope:validate BASE_URL=https://example.com
159
+ bundle exec rake crawlscope:validate:metadata BASE_URL=https://example.com
160
+ bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article
161
+ ```
162
+
163
+ ### Structured Data URL Audit
164
+
165
+ For one-off structured-data checks:
166
+
167
+ ```bash
168
+ bin/rails crawlscope:validate:ldjson URL=https://example.com/article
169
+ bin/rails crawlscope:validate:ldjson URL='https://example.com/a;https://example.com/b' SUMMARY=1
170
+ bin/rails crawlscope:validate:ldjson URL=https://example.com/article REPORT_PATH=tmp/structured-data.json
171
+ ```
172
+
173
+ Optional flags:
174
+
175
+ - `DEBUG=1`: print detected items
176
+ - `SUMMARY=1`: print grouped failures
177
+ - `REPORT_PATH=...`: write a JSON report
178
+ - `JS=1` or `RENDERER=browser`: render with Ferrum
179
+
180
+ ## Rules
181
+
182
+ Built-in rules:
183
+
184
+ - `metadata`
185
+ - `structured_data`
186
+ - `uniqueness`
187
+ - `links`
188
+
189
+ ### Metadata
190
+
191
+ Checks:
192
+
193
+ - missing `<h1>`
194
+ - missing `<title>`
195
+ - title length
196
+ - repeated site name in the title
197
+ - missing meta description
198
+ - meta description length
199
+ - missing canonical link
200
+ - canonical mismatch
201
+
202
+ ### Structured Data
203
+
204
+ Checks:
205
+
206
+ - malformed JSON-LD
207
+ - missing required fields for supported schema types
208
+ - schema validation failures from the configured registry
209
+ - direct URL structured-data audits through `crawlscope:validate:ldjson`
210
+
211
+ ### Uniqueness
212
+
213
+ Checks:
214
+
215
+ - duplicate titles
216
+ - duplicate meta descriptions
217
+ - duplicate content fingerprints
218
+
219
+ ### Links
220
+
221
+ Checks:
222
+
223
+ - broken internal links
224
+ - unresolved internal links
225
+ - low inbound anchor-link counts
226
+
227
+ ## Schema Registry
228
+
229
+ `crawlscope` ships with a default schema registry for common types such as:
230
+
231
+ - `Article`
232
+ - `FAQPage`
233
+ - `Organization`
234
+ - `Product`
235
+ - `Review`
236
+ - `SoftwareApplication`
237
+ - `WebApplication`
238
+ - `WebSite`
239
+
240
+ Host apps can replace or extend the registry:
241
+
242
+ ```ruby
243
+ Crawlscope.configure do |config|
244
+ config.schema_registry = -> { MyApp::StructuredData::SchemaRegistry.new }
245
+ end
246
+ ```
247
+
248
+ That makes `crawlscope` useful as the audit engine while the app remains the owner of stricter product-specific schema rules.
249
+
250
+ ## Development
251
+
252
+ ```bash
253
+ git clone https://github.com/ethos-link/crawlscope.git
254
+ cd crawlscope
255
+
256
+ bundle install
257
+ bundle exec rake test
258
+ bundle exec rake standard
259
+ bundle exec rake
260
+ ```
261
+
262
+ ### Git hooks
263
+
264
+ We use [lefthook](https://lefthook.dev/) with the Ruby [commitlint](https://github.com/arandilopez/commitlint) gem to enforce Conventional Commits on every commit. We also use [Standard Ruby](https://standardrb.com/) to keep code style consistent. CI validates commit messages, Standard Ruby, tests, and git-cliff changelog generation on pull requests and pushes to main/master.
265
+
266
+ Run the hook installer once per clone:
267
+
268
+ ```bash
269
+ bundle exec lefthook install
270
+ ```
271
+
272
+ ### Install locally
273
+
274
+ ```bash
275
+ rake install
276
+ ```
277
+
278
+ ## Release
279
+
280
+ Releases are tag-driven and published by GitHub Actions to RubyGems. Local release commands never publish directly.
281
+
282
+ Install [git-cliff](https://git-cliff.org/) locally before preparing a release. The release task regenerates `CHANGELOG.md` from Conventional Commits.
283
+
284
+ Before preparing a release, make sure you are on `main` or `master` with a clean worktree.
285
+
286
+ Then run one of:
287
+
288
+ ```bash
289
+ bundle exec rake 'release:prepare[patch]'
290
+ bundle exec rake 'release:prepare[minor]'
291
+ bundle exec rake 'release:prepare[major]'
292
+ bundle exec rake 'release:prepare[0.1.0]'
293
+ ```
294
+
295
+ The task will:
296
+
297
+ 1. Regenerate `CHANGELOG.md` with `git-cliff`.
298
+ 1. Update `lib/crawlscope/version.rb`.
299
+ 1. Commit the release changes.
300
+ 1. Create and push the `vX.Y.Z` tag.
301
+
302
+ The `Release` workflow then runs tests, publishes the gem to RubyGems, and creates the GitHub release from the changelog entry.
303
+
304
+ ## Contributing
305
+
306
+ 1. Fork it
307
+ 1. Create a branch (`git checkout -b feature/my-feature`)
308
+ 1. Commit your changes
309
+ 1. Push (`git push origin feature/my-feature`)
310
+ 1. Open a Pull Request
311
+
312
+ Please use [Conventional Commits](https://www.conventionalcommits.org/) for commit messages.
313
+
314
+ ## License
315
+
316
+ MIT License, see [LICENSE.txt](LICENSE.txt)
317
+
318
+ ## About
319
+
320
+ Made by the team at [Ethos Link](https://www.ethos-link.com) — practical software for growing businesses. We build tools for hospitality operators who need clear workflows, fast onboarding, and real human support.
321
+
322
+ We also build [Reviato](https://www.reviato.com): “Capture. Interpret. Act.”
323
+ Turn guest feedback into clear next steps for your team. Collect private appraisals, spot patterns across reviews, and act before small issues turn into public ones.
data/exe/crawlscope ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line launcher: hands the raw arguments to Crawlscope::Cli and uses
# whatever it returns as the process exit status.
require "crawlscope"

exit_status = Crawlscope::Cli.start(ARGV)
exit exit_status
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Runs one audit pass: reads URLs from the sitemap, fetches them through a
  # page fetcher (plain HTTP or a browser), records crawl-level issues, runs
  # every configured rule over the fetched pages and returns a Result.
  class Audit
    # @param base_url [String] site root, passed to Url.normalize and to the fetchers
    # @param sitemap_path [String] sitemap location handed to Sitemap
    # @param rules [Array<#call>, #call] rule objects; each is invoked with
    #   `urls:`, `pages:`, `issues:` and `context:` keywords
    # @param schema_registry [Object] exposed to rules through the context hash
    # @param browser_factory [#call, nil] returns the page fetcher when
    #   `renderer` is `:browser`; defaults to building a Crawlscope::Browser
    # @param concurrency [Integer] forwarded to Crawler
    # @param network_idle_timeout_seconds [Numeric] forwarded to Crawlscope::Browser
    # @param renderer [Symbol, String] `:browser` selects the browser fetcher,
    #   anything else selects Http
    # @param scroll_page [Boolean] forwarded to Crawlscope::Browser
    # @param timeout_seconds [Numeric] forwarded to the fetcher
    # @param allowed_statuses [#include?] statuses that do not produce an
    #   `unexpected_status` issue
    def initialize(
      base_url:,
      sitemap_path:,
      rules:,
      schema_registry:,
      browser_factory: nil,
      concurrency: Configuration::DEFAULT_CONCURRENCY,
      network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS,
      renderer: :http,
      scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE,
      timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS,
      allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES
    )
      @base_url = base_url
      @sitemap_path = sitemap_path
      @rules = Array(rules)
      @schema_registry = schema_registry
      @browser_factory = browser_factory
      @concurrency = concurrency
      @network_idle_timeout_seconds = network_idle_timeout_seconds
      @renderer = renderer.to_sym
      @scroll_page = scroll_page
      @timeout_seconds = timeout_seconds
      @allowed_statuses = allowed_statuses
    end

    # Executes the audit.
    #
    # @return [Result] base URL, sitemap path, sitemap URLs, fetched pages and collected issues
    # @raise [ValidationError] when the sitemap yields no URLs
    # @raise [ConfigurationError] when the default browser cannot be built because ferrum is missing
    def call
      urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
      raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?

      @page_fetcher = build_page
      pages = Crawler.new(
        page_fetcher: @page_fetcher,
        concurrency: @concurrency
      ).call(urls)

      issues = IssueCollection.new
      collect_crawl_issues(pages, issues)
      cache_pages(pages)
      context = {
        allowed_statuses: @allowed_statuses,
        base_url: @base_url,
        resolve_target: method(:resolve_target),
        schema_registry: @schema_registry
      }

      @rules.each do |rule|
        rule.call(urls: urls, pages: pages, issues: issues, context: context)
      end

      Result.new(
        base_url: @base_url,
        sitemap_path: @sitemap_path,
        urls: urls,
        pages: pages,
        issues: issues
      )
    ensure
      # Drop the reference before closing so a later #call on the same instance
      # that fails before building a new fetcher never re-closes this one.
      fetcher, @page_fetcher = @page_fetcher, nil
      fetcher&.close
    end

    private

    # Default browser factory; turns a missing ferrum gem into a ConfigurationError.
    def build_browser
      Crawlscope::Browser.new(
        base_url: @base_url,
        timeout_seconds: @timeout_seconds,
        network_idle_timeout_seconds: @network_idle_timeout_seconds,
        scroll_page: @scroll_page
      )
    rescue LoadError => error
      raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
    end

    # Builds the page fetcher for the configured renderer.
    def build_page
      return Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds) unless @renderer == :browser

      (@browser_factory || method(:build_browser)).call
    end

    # Shapes the hash handed back to rules by the `resolve_target` callback.
    # `crawled` tells whether the page came from the sitemap crawl (true) or
    # from an on-demand fetch (false).
    def build_target_resolution(page, normalized_target_url, crawled:)
      {
        crawled: crawled,
        error: page.error,
        final_url: page.normalized_final_url || normalized_target_url,
        status: page.status
      }
    end

    # Resets the per-run lookup tables and indexes every crawled page.
    def cache_pages(pages)
      @page_by_url = {}
      @target_resolution_cache = {}

      pages.each { |page| index_page(page) }
    end

    # Records a crawl issue for every page that failed to fetch or answered
    # with a status outside `@allowed_statuses`.
    def collect_crawl_issues(pages, issues)
      pages.each do |page|
        if page.error
          issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
        elsif !@allowed_statuses.include?(page.status)
          issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
        end
      end
    end

    # Makes a page reachable under both its normalized request URL and its
    # normalized final URL, skipping blank keys.
    def index_page(page)
      [page.normalized_url, page.normalized_final_url].each do |key|
        @page_by_url[key] = page unless key.to_s.empty?
      end
    end

    # Callback exposed to rules as `context[:resolve_target]`. Resolves a link
    # target to a resolution hash, preferring already-known pages and fetching
    # the target otherwise. Results are memoized per normalized URL.
    def resolve_target(target_url)
      normalized_target_url = Url.normalize(target_url, base_url: @base_url)
      return @target_resolution_cache[normalized_target_url] if @target_resolution_cache.key?(normalized_target_url)

      resolution = resolve_from_crawled_page(normalized_target_url)
      resolution ||= resolve_by_fetching_target(normalized_target_url)
      @target_resolution_cache[normalized_target_url] = resolution
    end

    # Fetches a target that is not indexed yet and indexes the resulting page.
    def resolve_by_fetching_target(normalized_target_url)
      page = @page_fetcher.fetch(normalized_target_url)
      index_page(page)
      build_target_resolution(page, normalized_target_url, crawled: false)
    end

    # Returns a resolution for an indexed page, or nil when the URL is unknown.
    def resolve_from_crawled_page(normalized_target_url)
      page = @page_by_url[normalized_target_url]
      return if page.nil?

      build_target_resolution(page, normalized_target_url, crawled: true)
    end
  end
end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
module Crawlscope
  # Page fetcher that renders URLs in a headless browser driven by Ferrum.
  # The ferrum gem is required lazily, when the browser is first built.
  class Browser
    # Starts the browser and opens the single page reused by every #fetch.
    #
    # @param base_url [String] passed to Url.normalize when building pages
    # @param timeout_seconds [Numeric] Ferrum browser timeout
    # @param network_idle_timeout_seconds [Numeric] upper bound for each network-idle wait
    # @param scroll_page [Boolean] whether #fetch scrolls the page after loading
    # @raise [LoadError] when the ferrum gem is not installed
    def initialize(base_url:, timeout_seconds:, network_idle_timeout_seconds:, scroll_page:)
      @base_url = base_url
      @timeout_seconds = timeout_seconds
      @network_idle_timeout_seconds = network_idle_timeout_seconds
      @scroll_page = scroll_page
      @browser = build_browser
      @page = open_page
    end

    # Quits the underlying browser.
    def close
      @browser&.quit
    end

    # Loads `url`, waits for the network to go idle (optionally scrolling the
    # page) and snapshots the result.
    #
    # @param url [String]
    # @return [Page] a populated page, or a page whose `error` is
    #   "<ErrorClass>: <message>" when anything raised a StandardError
    def fetch(url)
      @page.network.clear(:traffic)
      @page.go_to(url)
      wait_for_network_idle
      scroll_for_render if @scroll_page

      response = @page.network.response
      final_url = final_url_for(response, url)
      headers = response&.headers || {}
      body = @page.body

      Page.new(
        url: url,
        normalized_url: Url.normalize(url, base_url: @base_url),
        final_url: final_url,
        normalized_final_url: Url.normalize(final_url, base_url: @base_url),
        status: @page.network.status,
        headers: headers,
        body: body,
        doc: Nokogiri::HTML(body)
      )
    rescue => error
      failed_page(url, error)
    end

    private

    # Requires ferrum on demand and starts a headless browser.
    def build_browser
      require "ferrum"

      # NOTE(review): confirm that Ferrum::Browser.new honours a :headers
      # option; if it does not, the custom User-Agent is never applied.
      Ferrum::Browser.new(
        headless: true,
        timeout: @timeout_seconds,
        headers: {"User-Agent" => Http::USER_AGENT}
      )
    end

    # Opens the working page. If that fails, the browser that was just started
    # is quit before re-raising: the constructor is about to fail, so no caller
    # will ever hold an object to #close.
    def open_page
      @browser.create_page
    rescue
      @browser.quit
      raise
    end

    # Picks the first non-empty URL among the last network response, the
    # page's current URL and the page URL, falling back to the requested URL.
    def final_url_for(response, requested_url)
      final_url = response&.url.to_s
      final_url = @page.current_url.to_s if final_url.empty?
      final_url = @page.url.to_s if final_url.empty?
      final_url.empty? ? requested_url : final_url
    end

    # Builds the placeholder page returned when a fetch raised.
    def failed_page(url, error)
      Page.new(
        url: url,
        normalized_url: Url.normalize(url, base_url: @base_url),
        final_url: url,
        normalized_final_url: Url.normalize(url, base_url: @base_url),
        status: nil,
        headers: {},
        body: nil,
        doc: nil,
        error: "#{error.class}: #{error.message}"
      )
    end

    # Scrolls to the bottom, back to the top and then to the middle of the
    # document, waiting for network idle after each step.
    def scroll_for_render
      [
        "(function() { if (document.body) { window.scrollTo(0, document.body.scrollHeight); } })()",
        "(function() { if (document.body) { window.scrollTo(0, 0); } })()",
        "(function() { if (document.body) { window.scrollTo(0, document.body.scrollHeight / 2); } })()"
      ].each do |script|
        @page.evaluate(script)
        wait_for_network_idle
      end
    end

    # Waits for network idle, translating Ferrum's timeout into Timeout::Error.
    def wait_for_network_idle
      @page.network.wait_for_idle(duration: 0.5, timeout: @network_idle_timeout_seconds)
    rescue Ferrum::TimeoutError
      raise Timeout::Error, "Timed out waiting for browser network idle"
    end
  end
end