crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
@@ -1,44 +1,33 @@
1
1
  namespace :crawlscope do
2
- desc "Validate sitemap URLs with the default Crawlscope rules. ENV: BASE_URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
2
+ desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
3
3
  task validate: :environment do
4
- status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
5
- exit(status) unless status.zero?
4
+ Crawlscope::RakeTasks.validate
6
5
  end
7
6
 
8
7
  namespace :validate do
9
- desc "Validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
8
+ desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
10
9
  task ldjson: :environment do
11
- status = Crawlscope::Cli.start(["ldjson"], out: $stdout, err: $stderr)
12
- exit(status) unless status.zero?
10
+ Crawlscope::RakeTasks.ldjson
13
11
  end
14
12
 
15
- desc "Validate sitemap URLs with the metadata rule. ENV: BASE_URL, SITEMAP, JS=1"
13
+ desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
16
14
  task metadata: :environment do
17
- crawlscope_task_with_rules("metadata")
15
+ Crawlscope::RakeTasks.validate_rule("metadata")
18
16
  end
19
17
 
20
- desc "Validate sitemap URLs with the structured_data rule. ENV: BASE_URL, SITEMAP, JS=1"
18
+ desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
21
19
  task structured_data: :environment do
22
- crawlscope_task_with_rules("structured_data")
20
+ Crawlscope::RakeTasks.validate_rule("structured_data")
23
21
  end
24
22
 
25
- desc "Validate sitemap URLs with the uniqueness rule. ENV: BASE_URL, SITEMAP, JS=1"
23
+ desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
26
24
  task uniqueness: :environment do
27
- crawlscope_task_with_rules("uniqueness")
25
+ Crawlscope::RakeTasks.validate_rule("uniqueness")
28
26
  end
29
27
 
30
- desc "Validate sitemap URLs with the links rule. ENV: BASE_URL, SITEMAP, JS=1"
28
+ desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
31
29
  task links: :environment do
32
- crawlscope_task_with_rules("links")
30
+ Crawlscope::RakeTasks.validate_rule("links")
33
31
  end
34
32
  end
35
-
36
- def crawlscope_task_with_rules(rules)
37
- original_rules = ENV["RULES"]
38
- ENV["RULES"] = rules
39
- status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
40
- exit(status) unless status.zero?
41
- ensure
42
- ENV["RULES"] = original_rules
43
- end
44
33
  end
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeBrowserTest < Minitest::Test
6
+ Response = Data.define(:url, :headers)
7
+
8
+ class FakeBrowser
9
+ attr_reader :quit_called
10
+
11
+ def quit
12
+ @quit_called = true
13
+ end
14
+ end
15
+
16
+ class FakeNetwork
17
+ attr_reader :cleared, :idle_waits, :status
18
+
19
+ def initialize(response:, status: 200)
20
+ @response = response
21
+ @status = status
22
+ @cleared = []
23
+ @idle_waits = []
24
+ end
25
+
26
+ def clear(scope)
27
+ @cleared << scope
28
+ end
29
+
30
+ attr_reader :response
31
+
32
+ def wait_for_idle(duration:, timeout:)
33
+ @idle_waits << {duration: duration, timeout: timeout}
34
+ end
35
+ end
36
+
37
+ class FakePage
38
+ attr_reader :evaluations, :network, :visited_url
39
+
40
+ def initialize(network:, body: "<html></html>", current_url: "", url: "")
41
+ @network = network
42
+ @body = body
43
+ @current_url = current_url
44
+ @url = url
45
+ @evaluations = []
46
+ end
47
+
48
+ attr_reader :body
49
+
50
+ attr_reader :current_url
51
+
52
+ def evaluate(script)
53
+ @evaluations << script
54
+ end
55
+
56
+ def go_to(url)
57
+ @visited_url = url
58
+ end
59
+
60
+ attr_reader :url
61
+ end
62
+
63
+ def test_fetch_returns_rendered_page
64
+ network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
65
+ page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
66
+ browser = browser_with(page: page, scroll_page: false)
67
+
68
+ result = browser.fetch("https://example.com/start")
69
+
70
+ assert_equal "https://example.com/start", page.visited_url
71
+ assert_equal [:traffic], network.cleared
72
+ assert_equal "https://example.com/final", result.final_url
73
+ assert_equal "https://example.com/final", result.normalized_final_url
74
+ assert_equal 200, result.status
75
+ assert result.html?
76
+ assert_equal [], page.evaluations
77
+ end
78
+
79
+ def test_fetch_scrolls_when_enabled
80
+ network = FakeNetwork.new(response: Response.new(url: "", headers: {}))
81
+ page = FakePage.new(network: network, current_url: "https://example.com/current")
82
+ browser = browser_with(page: page, scroll_page: true)
83
+
84
+ result = browser.fetch("https://example.com/start")
85
+
86
+ assert_equal "https://example.com/current", result.final_url
87
+ assert_equal 3, page.evaluations.size
88
+ assert_equal 4, network.idle_waits.size
89
+ end
90
+
91
+ def test_fetch_falls_back_to_page_url_and_original_url
92
+ page_url_network = FakeNetwork.new(response: nil)
93
+ page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
94
+ page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
95
+
96
+ original_url_network = FakeNetwork.new(response: nil)
97
+ original_url = FakePage.new(network: original_url_network)
98
+ original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
99
+
100
+ assert_equal "https://example.com/page", page_url_result.final_url
101
+ assert_equal "https://example.com/start", original_url_result.final_url
102
+ end
103
+
104
+ def test_fetch_returns_error_page_when_navigation_fails
105
+ page = Object.new
106
+ def page.network
107
+ raise Timeout::Error, "browser failed"
108
+ end
109
+
110
+ result = browser_with(page: page).fetch("https://example.com/start")
111
+
112
+ assert_equal "https://example.com/start", result.final_url
113
+ assert_nil result.status
114
+ assert_equal "Timeout::Error: browser failed", result.error
115
+ end
116
+
117
+ def test_fetch_reraises_programmer_errors
118
+ page = Object.new
119
+ def page.network
120
+ raise NoMethodError, "bad call"
121
+ end
122
+
123
+ browser = browser_with(page: page)
124
+
125
+ assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
126
+ end
127
+
128
+ def test_close_quits_browser
129
+ fake_browser = FakeBrowser.new
130
+ browser = browser_with(browser: fake_browser)
131
+
132
+ browser.close
133
+
134
+ assert fake_browser.quit_called
135
+ end
136
+
137
+ def test_close_allows_missing_browser
138
+ browser = browser_with(browser: nil)
139
+
140
+ assert_nil browser.close
141
+ end
142
+
143
+ private
144
+
145
+ def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false)
146
+ Crawlscope::Browser.allocate.tap do |instance|
147
+ instance.instance_variable_set(:@base_url, "https://example.com")
148
+ instance.instance_variable_set(:@timeout_seconds, 20)
149
+ instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
150
+ instance.instance_variable_set(:@scroll_page, scroll_page)
151
+ instance.instance_variable_set(:@browser, browser)
152
+ instance.instance_variable_set(:@page, page)
153
+ end
154
+ end
155
+ end
@@ -4,9 +4,10 @@ require "test_helper"
4
4
 
5
5
  class CrawlscopeCliTest < Minitest::Test
6
6
  class FakeConfiguration
7
- attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
7
+ attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
8
8
 
9
9
  def initialize
10
+ @base_url = nil
10
11
  @concurrency = 10
11
12
  @network_idle_timeout_seconds = 5
12
13
  @renderer = :http
@@ -19,7 +20,7 @@ class CrawlscopeCliTest < Minitest::Test
19
20
  end
20
21
 
21
22
  class FakeTask
22
- attr_reader :validate_arguments, :ldjson_arguments
23
+ attr_reader :validate_arguments, :json_ld_arguments
23
24
 
24
25
  def validate(base_url:, sitemap_path:, rule_names:)
25
26
  @validate_arguments = {
@@ -31,8 +32,8 @@ class CrawlscopeCliTest < Minitest::Test
31
32
  success_result
32
33
  end
33
34
 
34
- def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
35
- @ldjson_arguments = {
35
+ def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
36
+ @json_ld_arguments = {
36
37
  urls: urls,
37
38
  debug: debug,
38
39
  renderer: renderer,
@@ -51,6 +52,20 @@ class CrawlscopeCliTest < Minitest::Test
51
52
  end
52
53
  end
53
54
 
55
+ class FailingTask < FakeTask
56
+ private
57
+
58
+ def success_result
59
+ Struct.new(:ok?).new(false)
60
+ end
61
+ end
62
+
63
+ class InvalidTask < FakeTask
64
+ def validate(base_url:, sitemap_path:, rule_names:)
65
+ raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
66
+ end
67
+ end
68
+
54
69
  def test_version_prints_current_version
55
70
  out = StringIO.new
56
71
  err = StringIO.new
@@ -70,7 +85,7 @@ class CrawlscopeCliTest < Minitest::Test
70
85
 
71
86
  assert_equal 1, status
72
87
  assert_includes err.string, "Unknown command: unknown"
73
- assert_includes err.string, "crawlscope validate --base-url"
88
+ assert_includes err.string, "crawlscope validate --url"
74
89
  end
75
90
 
76
91
  def test_validate_passes_arguments_to_task
@@ -80,7 +95,7 @@ class CrawlscopeCliTest < Minitest::Test
80
95
  err = StringIO.new
81
96
 
82
97
  status = Crawlscope::Cli.start(
83
- ["validate", "--base-url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
98
+ ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
84
99
  out: out,
85
100
  err: err,
86
101
  configuration: configuration,
@@ -125,12 +140,133 @@ class CrawlscopeCliTest < Minitest::Test
125
140
  summary: true,
126
141
  timeout_seconds: 20
127
142
  },
128
- task.ldjson_arguments
143
+ task.json_ld_arguments
129
144
  )
130
145
  assert_same out, configuration.output
131
146
  assert_empty err.string
132
147
  end
133
148
 
149
+ def test_ldjson_defaults_to_configured_base_url
150
+ configuration = FakeConfiguration.new
151
+ configuration.base_url = "https://example.com"
152
+ task = FakeTask.new
153
+
154
+ status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
155
+
156
+ assert_equal 0, status
157
+ assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
158
+ end
159
+
160
+ def test_validate_caps_default_browser_concurrency
161
+ configuration = FakeConfiguration.new
162
+ task = FakeTask.new
163
+ out = StringIO.new
164
+ err = StringIO.new
165
+
166
+ with_env("JS" => "1") do
167
+ status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
168
+
169
+ assert_equal 0, status
170
+ end
171
+
172
+ assert_equal :browser, configuration.renderer
173
+ assert_equal 4, configuration.concurrency
174
+ assert_includes out.string, "Default JS concurrency capped at 4"
175
+ end
176
+
177
+ def test_validate_uses_url_environment_as_base_url_for_default_sitemap
178
+ configuration = FakeConfiguration.new
179
+ task = FakeTask.new
180
+
181
+ with_env("URL" => "https://example.com") do
182
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
183
+
184
+ assert_equal 0, status
185
+ end
186
+
187
+ assert_equal "https://example.com", task.validate_arguments[:base_url]
188
+ assert_nil task.validate_arguments[:sitemap_path]
189
+ end
190
+
191
+ def test_validate_uses_sitemap_mode_when_sitemap_is_configured
192
+ task = FakeTask.new
193
+
194
+ with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
195
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
196
+
197
+ assert_equal 0, status
198
+ end
199
+
200
+ assert_equal "https://example.com", task.validate_arguments[:base_url]
201
+ assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
202
+ end
203
+
204
+ def test_ldjson_accepts_repeated_urls_and_options
205
+ configuration = FakeConfiguration.new
206
+ task = FakeTask.new
207
+ out = StringIO.new
208
+ err = StringIO.new
209
+
210
+ status = Crawlscope::Cli.start(
211
+ ["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
212
+ out: out,
213
+ err: err,
214
+ configuration: configuration,
215
+ task: task
216
+ )
217
+
218
+ assert_equal 0, status
219
+ assert_equal(
220
+ {
221
+ urls: ["https://example.com/a", "https://example.com/b"],
222
+ debug: true,
223
+ renderer: :browser,
224
+ report_path: "report.json",
225
+ summary: true,
226
+ timeout_seconds: 12
227
+ },
228
+ task.json_ld_arguments
229
+ )
230
+ assert_equal 3, configuration.network_idle_timeout_seconds
231
+ end
232
+
233
+ def test_ldjson_defaults_to_localhost
234
+ out = StringIO.new
235
+ err = StringIO.new
236
+ task = FakeTask.new
237
+
238
+ status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
239
+
240
+ assert_equal 0, status
241
+ assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
242
+ assert_empty err.string
243
+ end
244
+
245
+ def test_invalid_integer_option_returns_error
246
+ out = StringIO.new
247
+ err = StringIO.new
248
+
249
+ status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
250
+
251
+ assert_equal 1, status
252
+ assert_includes err.string, "timeout must be >= 1"
253
+ end
254
+
255
+ def test_failed_result_returns_failed_status
256
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
257
+
258
+ assert_equal 1, status
259
+ end
260
+
261
+ def test_validation_errors_return_failed_status_without_reraising
262
+ err = StringIO.new
263
+
264
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
265
+
266
+ assert_equal 1, status
267
+ assert_includes err.string, "No URLs found in sitemap"
268
+ end
269
+
134
270
  private
135
271
 
136
272
  def with_env(overrides)
@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
42
42
 
43
43
  assert_equal "Crawlscope sitemap_path is not configured", error.message
44
44
  end
45
+
46
+ def test_defaults_are_normalized
47
+ config = Crawlscope::Configuration.new
48
+
49
+ assert_equal [200, 301, 302], config.allowed_statuses
50
+ assert_equal 10, config.concurrency
51
+ assert_equal 4, config.browser_concurrency
52
+ assert_equal 5, config.network_idle_timeout_seconds
53
+ assert_equal :http, config.renderer
54
+ assert_equal 20, config.timeout_seconds
55
+ assert_equal $stdout, config.output
56
+ assert config.scroll_page?
57
+ end
58
+
59
+ def test_configured_values_are_normalized
60
+ config = Crawlscope::Configuration.new
61
+ config.allowed_statuses = ["200", "404"]
62
+ config.concurrency = "2"
63
+ config.network_idle_timeout_seconds = "7"
64
+ config.renderer = "browser"
65
+ config.timeout_seconds = "9"
66
+ config.scroll_page = false
67
+
68
+ assert_equal [200, 404], config.allowed_statuses
69
+ assert_equal 2, config.concurrency
70
+ assert_equal 2, config.browser_concurrency
71
+ assert_equal 7, config.network_idle_timeout_seconds
72
+ assert_equal :browser, config.renderer
73
+ assert_equal 9, config.timeout_seconds
74
+ refute config.scroll_page?
75
+ end
76
+
77
+ def test_renderer_must_be_supported
78
+ config = Crawlscope::Configuration.new
79
+ config.renderer = "webkit"
80
+
81
+ error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
82
+
83
+ assert_equal "Crawlscope renderer must be http or browser", error.message
84
+ end
85
+
86
+ def test_numeric_values_must_be_positive_integers
87
+ config = Crawlscope::Configuration.new
88
+ config.concurrency = "0"
89
+
90
+ error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
91
+
92
+ assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
93
+ end
45
94
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  require "test_helper"
4
4
 
5
- class CrawlscopeAuditTest < Minitest::Test
5
+ class CrawlscopeCrawlTest < Minitest::Test
6
6
  def setup
7
7
  @tmp_dir = Dir.mktmpdir
8
8
  @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -31,8 +31,16 @@ class CrawlscopeAuditTest < Minitest::Test
31
31
  <html>
32
32
  <head>
33
33
  <title>Pricing</title>
34
- <meta name="description" content="Plans for hotels and restaurants">
34
+ <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
35
35
  <link rel="canonical" href="https://example.com/pricing">
36
+ <meta property="og:title" content="Pricing">
37
+ <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
38
+ <meta property="og:url" content="https://example.com/pricing">
39
+ <meta property="og:type" content="website">
40
+ <meta property="og:image" content="https://example.com/icon.png">
41
+ <script type="application/ld+json">
42
+ {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
43
+ </script>
36
44
  </head>
37
45
  <body>
38
46
  <main>
@@ -43,7 +51,7 @@ class CrawlscopeAuditTest < Minitest::Test
43
51
  HTML
44
52
  )
45
53
 
46
- result = Crawlscope::Audit.new(
54
+ result = Crawlscope::Crawl.new(
47
55
  base_url: "https://example.com",
48
56
  sitemap_path: @sitemap_path,
49
57
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -84,7 +92,7 @@ class CrawlscopeAuditTest < Minitest::Test
84
92
  HTML
85
93
  )
86
94
 
87
- result = Crawlscope::Audit.new(
95
+ result = Crawlscope::Crawl.new(
88
96
  base_url: "https://example.com",
89
97
  sitemap_path: @sitemap_path,
90
98
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -92,7 +100,7 @@ class CrawlscopeAuditTest < Minitest::Test
92
100
  ).call
93
101
 
94
102
  refute result.ok?
95
- assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
103
+ assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
96
104
  end
97
105
 
98
106
  def test_uses_browser_when_renderer_is_browser
@@ -125,8 +133,16 @@ class CrawlscopeAuditTest < Minitest::Test
125
133
  <html>
126
134
  <head>
127
135
  <title>Pricing</title>
128
- <meta name="description" content="Plans for hotels and restaurants">
136
+ <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
129
137
  <link rel="canonical" href="https://example.com/pricing">
138
+ <meta property="og:title" content="Pricing">
139
+ <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
140
+ <meta property="og:url" content="https://example.com/pricing">
141
+ <meta property="og:type" content="website">
142
+ <meta property="og:image" content="https://example.com/icon.png">
143
+ <script type="application/ld+json">
144
+ {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
145
+ </script>
130
146
  </head>
131
147
  <body>
132
148
  <main>
@@ -149,7 +165,7 @@ class CrawlscopeAuditTest < Minitest::Test
149
165
  end
150
166
  end.new
151
167
 
152
- result = Crawlscope::Audit.new(
168
+ result = Crawlscope::Crawl.new(
153
169
  base_url: "https://example.com",
154
170
  sitemap_path: @sitemap_path,
155
171
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeCrawlerTest < Minitest::Test
6
+ class RaisingFetcher
7
+ def fetch(url)
8
+ raise Timeout::Error, "fetch timed out" if url.include?("timeout")
9
+
10
+ Crawlscope::Page.new(
11
+ url: url,
12
+ normalized_url: url,
13
+ final_url: url,
14
+ normalized_final_url: url,
15
+ status: 200,
16
+ headers: {},
17
+ body: "<html></html>",
18
+ doc: Nokogiri::HTML("<html></html>")
19
+ )
20
+ end
21
+ end
22
+
23
+ def test_returns_error_page_when_fetcher_raises
24
+ pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
25
+ ["https://example.com/ok", "https://example.com/timeout"]
26
+ )
27
+
28
+ assert_equal 2, pages.size
29
+ error_page = pages.find { |page| page.url == "https://example.com/timeout" }
30
+
31
+ assert_nil error_page.status
32
+ assert_equal "Timeout::Error: fetch timed out", error_page.error
33
+ end
34
+ end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeHttpTest < Minitest::Test
6
+ def test_fetch_parses_html_response
7
+ stub_request(:get, "https://example.com/page")
8
+ .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
9
+
10
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
11
+
12
+ assert_equal 200, page.status
13
+ assert page.html?
14
+ assert_equal "Hello", page.doc.at_css("body").text
15
+ end
16
+
17
+ def test_fetch_parses_responses_without_content_type_as_html
18
+ stub_request(:get, "https://example.com/page")
19
+ .to_return(status: 200, body: "<html><body>Hello</body></html>")
20
+
21
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
22
+
23
+ assert page.html?
24
+ end
25
+
26
+ def test_fetch_leaves_non_html_response_unparsed
27
+ stub_request(:get, "https://example.com/feed.xml")
28
+ .to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
29
+
30
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
31
+
32
+ assert_equal 200, page.status
33
+ refute page.html?
34
+ assert_equal "<feed></feed>", page.body
35
+ end
36
+
37
+ def test_fetch_returns_error_page_for_failed_requests
38
+ stub_request(:get, "https://example.com/down").to_timeout
39
+
40
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
41
+
42
+ assert_nil page.status
43
+ assert_includes page.error, "Faraday::ConnectionFailed"
44
+ assert_equal "https://example.com/down", page.final_url
45
+ end
46
+
47
+ def test_fetch_reraises_programmer_errors
48
+ http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
49
+
50
+ def http.connection
51
+ raise NoMethodError, "bad call"
52
+ end
53
+
54
+ assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
55
+ end
56
+ end