crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -11
  3. data/README.md +20 -13
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +10 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +76 -43
  13. data/lib/crawlscope/rules/structured_data.rb +14 -1
  14. data/lib/crawlscope/run.rb +60 -0
  15. data/lib/crawlscope/schema_registry.rb +3 -349
  16. data/lib/crawlscope/schemas.rb +355 -0
  17. data/lib/crawlscope/sitemap.rb +18 -6
  18. data/lib/crawlscope/structured_data/audit.rb +7 -7
  19. data/lib/crawlscope/structured_data/check.rb +35 -0
  20. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  21. data/lib/crawlscope/url.rb +14 -0
  22. data/lib/crawlscope/version.rb +1 -1
  23. data/lib/tasks/crawlscope_tasks.rake +12 -23
  24. data/test/crawlscope/browser_test.rb +155 -0
  25. data/test/crawlscope/cli_test.rb +128 -6
  26. data/test/crawlscope/configuration_test.rb +49 -0
  27. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
  28. data/test/crawlscope/crawler_test.rb +34 -0
  29. data/test/crawlscope/http_test.rb +56 -0
  30. data/test/crawlscope/links_rule_test.rb +110 -5
  31. data/test/crawlscope/rule_registry_test.rb +32 -0
  32. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  33. data/test/crawlscope/schema_registry_test.rb +19 -0
  34. data/test/crawlscope/sitemap_test.rb +55 -0
  35. data/test/crawlscope/structured_data_document_test.rb +36 -0
  36. data/test/crawlscope/structured_data_report_test.rb +3 -3
  37. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  38. data/test/crawlscope/structured_data_rule_test.rb +20 -0
  39. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  40. data/test/crawlscope/url_test.rb +31 -0
  41. metadata +14 -5
  42. data/lib/crawlscope/task.rb +0 -131
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeBrowserTest < Minitest::Test
6
+ Response = Data.define(:url, :headers)
7
+
8
+ class FakeBrowser
9
+ attr_reader :quit_called
10
+
11
+ def quit
12
+ @quit_called = true
13
+ end
14
+ end
15
+
16
+ class FakeNetwork
17
+ attr_reader :cleared, :idle_waits, :status
18
+
19
+ def initialize(response:, status: 200)
20
+ @response = response
21
+ @status = status
22
+ @cleared = []
23
+ @idle_waits = []
24
+ end
25
+
26
+ def clear(scope)
27
+ @cleared << scope
28
+ end
29
+
30
+ attr_reader :response
31
+
32
+ def wait_for_idle(duration:, timeout:)
33
+ @idle_waits << {duration: duration, timeout: timeout}
34
+ end
35
+ end
36
+
37
+ class FakePage
38
+ attr_reader :evaluations, :network, :visited_url
39
+
40
+ def initialize(network:, body: "<html></html>", current_url: "", url: "")
41
+ @network = network
42
+ @body = body
43
+ @current_url = current_url
44
+ @url = url
45
+ @evaluations = []
46
+ end
47
+
48
+ attr_reader :body
49
+
50
+ attr_reader :current_url
51
+
52
+ def evaluate(script)
53
+ @evaluations << script
54
+ end
55
+
56
+ def go_to(url)
57
+ @visited_url = url
58
+ end
59
+
60
+ attr_reader :url
61
+ end
62
+
63
+ def test_fetch_returns_rendered_page
64
+ network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
65
+ page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
66
+ browser = browser_with(page: page, scroll_page: false)
67
+
68
+ result = browser.fetch("https://example.com/start")
69
+
70
+ assert_equal "https://example.com/start", page.visited_url
71
+ assert_equal [:traffic], network.cleared
72
+ assert_equal "https://example.com/final", result.final_url
73
+ assert_equal "https://example.com/final", result.normalized_final_url
74
+ assert_equal 200, result.status
75
+ assert result.html?
76
+ assert_equal [], page.evaluations
77
+ end
78
+
79
+ def test_fetch_scrolls_when_enabled
80
+ network = FakeNetwork.new(response: Response.new(url: "", headers: {}))
81
+ page = FakePage.new(network: network, current_url: "https://example.com/current")
82
+ browser = browser_with(page: page, scroll_page: true)
83
+
84
+ result = browser.fetch("https://example.com/start")
85
+
86
+ assert_equal "https://example.com/current", result.final_url
87
+ assert_equal 3, page.evaluations.size
88
+ assert_equal 4, network.idle_waits.size
89
+ end
90
+
91
+ def test_fetch_falls_back_to_page_url_and_original_url
92
+ page_url_network = FakeNetwork.new(response: nil)
93
+ page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
94
+ page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
95
+
96
+ original_url_network = FakeNetwork.new(response: nil)
97
+ original_url = FakePage.new(network: original_url_network)
98
+ original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
99
+
100
+ assert_equal "https://example.com/page", page_url_result.final_url
101
+ assert_equal "https://example.com/start", original_url_result.final_url
102
+ end
103
+
104
+ def test_fetch_returns_error_page_when_navigation_fails
105
+ page = Object.new
106
+ def page.network
107
+ raise Timeout::Error, "browser failed"
108
+ end
109
+
110
+ result = browser_with(page: page).fetch("https://example.com/start")
111
+
112
+ assert_equal "https://example.com/start", result.final_url
113
+ assert_nil result.status
114
+ assert_equal "Timeout::Error: browser failed", result.error
115
+ end
116
+
117
+ def test_fetch_reraises_programmer_errors
118
+ page = Object.new
119
+ def page.network
120
+ raise NoMethodError, "bad call"
121
+ end
122
+
123
+ browser = browser_with(page: page)
124
+
125
+ assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
126
+ end
127
+
128
+ def test_close_quits_browser
129
+ fake_browser = FakeBrowser.new
130
+ browser = browser_with(browser: fake_browser)
131
+
132
+ browser.close
133
+
134
+ assert fake_browser.quit_called
135
+ end
136
+
137
+ def test_close_allows_missing_browser
138
+ browser = browser_with(browser: nil)
139
+
140
+ assert_nil browser.close
141
+ end
142
+
143
+ private
144
+
145
+ def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false)
146
+ Crawlscope::Browser.allocate.tap do |instance|
147
+ instance.instance_variable_set(:@base_url, "https://example.com")
148
+ instance.instance_variable_set(:@timeout_seconds, 20)
149
+ instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
150
+ instance.instance_variable_set(:@scroll_page, scroll_page)
151
+ instance.instance_variable_set(:@browser, browser)
152
+ instance.instance_variable_set(:@page, page)
153
+ end
154
+ end
155
+ end
@@ -19,7 +19,7 @@ class CrawlscopeCliTest < Minitest::Test
19
19
  end
20
20
 
21
21
  class FakeTask
22
- attr_reader :validate_arguments, :ldjson_arguments
22
+ attr_reader :validate_arguments, :json_ld_arguments
23
23
 
24
24
  def validate(base_url:, sitemap_path:, rule_names:)
25
25
  @validate_arguments = {
@@ -31,8 +31,8 @@ class CrawlscopeCliTest < Minitest::Test
31
31
  success_result
32
32
  end
33
33
 
34
- def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
35
- @ldjson_arguments = {
34
+ def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
35
+ @json_ld_arguments = {
36
36
  urls: urls,
37
37
  debug: debug,
38
38
  renderer: renderer,
@@ -51,6 +51,20 @@ class CrawlscopeCliTest < Minitest::Test
51
51
  end
52
52
  end
53
53
 
54
+ class FailingTask < FakeTask
55
+ private
56
+
57
+ def success_result
58
+ Struct.new(:ok?).new(false)
59
+ end
60
+ end
61
+
62
+ class InvalidTask < FakeTask
63
+ def validate(base_url:, sitemap_path:, rule_names:)
64
+ raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
65
+ end
66
+ end
67
+
54
68
  def test_version_prints_current_version
55
69
  out = StringIO.new
56
70
  err = StringIO.new
@@ -70,7 +84,7 @@ class CrawlscopeCliTest < Minitest::Test
70
84
 
71
85
  assert_equal 1, status
72
86
  assert_includes err.string, "Unknown command: unknown"
73
- assert_includes err.string, "crawlscope validate --base-url"
87
+ assert_includes err.string, "crawlscope validate --url"
74
88
  end
75
89
 
76
90
  def test_validate_passes_arguments_to_task
@@ -80,7 +94,7 @@ class CrawlscopeCliTest < Minitest::Test
80
94
  err = StringIO.new
81
95
 
82
96
  status = Crawlscope::Cli.start(
83
- ["validate", "--base-url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
97
+ ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
84
98
  out: out,
85
99
  err: err,
86
100
  configuration: configuration,
@@ -125,12 +139,120 @@ class CrawlscopeCliTest < Minitest::Test
125
139
  summary: true,
126
140
  timeout_seconds: 20
127
141
  },
128
- task.ldjson_arguments
142
+ task.json_ld_arguments
129
143
  )
130
144
  assert_same out, configuration.output
131
145
  assert_empty err.string
132
146
  end
133
147
 
148
+ def test_validate_caps_default_browser_concurrency
149
+ configuration = FakeConfiguration.new
150
+ task = FakeTask.new
151
+ out = StringIO.new
152
+ err = StringIO.new
153
+
154
+ with_env("JS" => "1") do
155
+ status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
156
+
157
+ assert_equal 0, status
158
+ end
159
+
160
+ assert_equal :browser, configuration.renderer
161
+ assert_equal 4, configuration.concurrency
162
+ assert_includes out.string, "Default JS concurrency capped at 4"
163
+ end
164
+
165
+ def test_validate_uses_url_environment_as_base_url_for_default_sitemap
166
+ configuration = FakeConfiguration.new
167
+ task = FakeTask.new
168
+
169
+ with_env("URL" => "https://example.com") do
170
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
171
+
172
+ assert_equal 0, status
173
+ end
174
+
175
+ assert_equal "https://example.com", task.validate_arguments[:base_url]
176
+ assert_nil task.validate_arguments[:sitemap_path]
177
+ end
178
+
179
+ def test_validate_uses_sitemap_mode_when_sitemap_is_configured
180
+ task = FakeTask.new
181
+
182
+ with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
183
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
184
+
185
+ assert_equal 0, status
186
+ end
187
+
188
+ assert_equal "https://example.com", task.validate_arguments[:base_url]
189
+ assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
190
+ end
191
+
192
+ def test_ldjson_accepts_repeated_urls_and_options
193
+ configuration = FakeConfiguration.new
194
+ task = FakeTask.new
195
+ out = StringIO.new
196
+ err = StringIO.new
197
+
198
+ status = Crawlscope::Cli.start(
199
+ ["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
200
+ out: out,
201
+ err: err,
202
+ configuration: configuration,
203
+ task: task
204
+ )
205
+
206
+ assert_equal 0, status
207
+ assert_equal(
208
+ {
209
+ urls: ["https://example.com/a", "https://example.com/b"],
210
+ debug: true,
211
+ renderer: :browser,
212
+ report_path: "report.json",
213
+ summary: true,
214
+ timeout_seconds: 12
215
+ },
216
+ task.json_ld_arguments
217
+ )
218
+ assert_equal 3, configuration.network_idle_timeout_seconds
219
+ end
220
+
221
+ def test_ldjson_requires_urls
222
+ out = StringIO.new
223
+ err = StringIO.new
224
+
225
+ status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
226
+
227
+ assert_equal 1, status
228
+ assert_includes err.string, "Crawlscope URL is not configured"
229
+ end
230
+
231
+ def test_invalid_integer_option_returns_error
232
+ out = StringIO.new
233
+ err = StringIO.new
234
+
235
+ status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
236
+
237
+ assert_equal 1, status
238
+ assert_includes err.string, "timeout must be >= 1"
239
+ end
240
+
241
+ def test_failed_result_returns_failed_status
242
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
243
+
244
+ assert_equal 1, status
245
+ end
246
+
247
+ def test_validation_errors_return_failed_status_without_reraising
248
+ err = StringIO.new
249
+
250
+ status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
251
+
252
+ assert_equal 1, status
253
+ assert_includes err.string, "No URLs found in sitemap"
254
+ end
255
+
134
256
  private
135
257
 
136
258
  def with_env(overrides)
@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
42
42
 
43
43
  assert_equal "Crawlscope sitemap_path is not configured", error.message
44
44
  end
45
+
46
+ def test_defaults_are_normalized
47
+ config = Crawlscope::Configuration.new
48
+
49
+ assert_equal [200, 301, 302], config.allowed_statuses
50
+ assert_equal 10, config.concurrency
51
+ assert_equal 4, config.browser_concurrency
52
+ assert_equal 5, config.network_idle_timeout_seconds
53
+ assert_equal :http, config.renderer
54
+ assert_equal 20, config.timeout_seconds
55
+ assert_equal $stdout, config.output
56
+ assert config.scroll_page?
57
+ end
58
+
59
+ def test_configured_values_are_normalized
60
+ config = Crawlscope::Configuration.new
61
+ config.allowed_statuses = ["200", "404"]
62
+ config.concurrency = "2"
63
+ config.network_idle_timeout_seconds = "7"
64
+ config.renderer = "browser"
65
+ config.timeout_seconds = "9"
66
+ config.scroll_page = false
67
+
68
+ assert_equal [200, 404], config.allowed_statuses
69
+ assert_equal 2, config.concurrency
70
+ assert_equal 2, config.browser_concurrency
71
+ assert_equal 7, config.network_idle_timeout_seconds
72
+ assert_equal :browser, config.renderer
73
+ assert_equal 9, config.timeout_seconds
74
+ refute config.scroll_page?
75
+ end
76
+
77
+ def test_renderer_must_be_supported
78
+ config = Crawlscope::Configuration.new
79
+ config.renderer = "webkit"
80
+
81
+ error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
82
+
83
+ assert_equal "Crawlscope renderer must be http or browser", error.message
84
+ end
85
+
86
+ def test_numeric_values_must_be_positive_integers
87
+ config = Crawlscope::Configuration.new
88
+ config.concurrency = "0"
89
+
90
+ error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
91
+
92
+ assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
93
+ end
45
94
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  require "test_helper"
4
4
 
5
- class CrawlscopeAuditTest < Minitest::Test
5
+ class CrawlscopeCrawlTest < Minitest::Test
6
6
  def setup
7
7
  @tmp_dir = Dir.mktmpdir
8
8
  @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -33,6 +33,9 @@ class CrawlscopeAuditTest < Minitest::Test
33
33
  <title>Pricing</title>
34
34
  <meta name="description" content="Plans for hotels and restaurants">
35
35
  <link rel="canonical" href="https://example.com/pricing">
36
+ <script type="application/ld+json">
37
+ {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
38
+ </script>
36
39
  </head>
37
40
  <body>
38
41
  <main>
@@ -43,7 +46,7 @@ class CrawlscopeAuditTest < Minitest::Test
43
46
  HTML
44
47
  )
45
48
 
46
- result = Crawlscope::Audit.new(
49
+ result = Crawlscope::Crawl.new(
47
50
  base_url: "https://example.com",
48
51
  sitemap_path: @sitemap_path,
49
52
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -84,7 +87,7 @@ class CrawlscopeAuditTest < Minitest::Test
84
87
  HTML
85
88
  )
86
89
 
87
- result = Crawlscope::Audit.new(
90
+ result = Crawlscope::Crawl.new(
88
91
  base_url: "https://example.com",
89
92
  sitemap_path: @sitemap_path,
90
93
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -92,7 +95,7 @@ class CrawlscopeAuditTest < Minitest::Test
92
95
  ).call
93
96
 
94
97
  refute result.ok?
95
- assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
98
+ assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
96
99
  end
97
100
 
98
101
  def test_uses_browser_when_renderer_is_browser
@@ -127,6 +130,9 @@ class CrawlscopeAuditTest < Minitest::Test
127
130
  <title>Pricing</title>
128
131
  <meta name="description" content="Plans for hotels and restaurants">
129
132
  <link rel="canonical" href="https://example.com/pricing">
133
+ <script type="application/ld+json">
134
+ {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
135
+ </script>
130
136
  </head>
131
137
  <body>
132
138
  <main>
@@ -149,7 +155,7 @@ class CrawlscopeAuditTest < Minitest::Test
149
155
  end
150
156
  end.new
151
157
 
152
- result = Crawlscope::Audit.new(
158
+ result = Crawlscope::Crawl.new(
153
159
  base_url: "https://example.com",
154
160
  sitemap_path: @sitemap_path,
155
161
  rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeCrawlerTest < Minitest::Test
6
+ class RaisingFetcher
7
+ def fetch(url)
8
+ raise Timeout::Error, "fetch timed out" if url.include?("timeout")
9
+
10
+ Crawlscope::Page.new(
11
+ url: url,
12
+ normalized_url: url,
13
+ final_url: url,
14
+ normalized_final_url: url,
15
+ status: 200,
16
+ headers: {},
17
+ body: "<html></html>",
18
+ doc: Nokogiri::HTML("<html></html>")
19
+ )
20
+ end
21
+ end
22
+
23
+ def test_returns_error_page_when_fetcher_raises
24
+ pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
25
+ ["https://example.com/ok", "https://example.com/timeout"]
26
+ )
27
+
28
+ assert_equal 2, pages.size
29
+ error_page = pages.find { |page| page.url == "https://example.com/timeout" }
30
+
31
+ assert_nil error_page.status
32
+ assert_equal "Timeout::Error: fetch timed out", error_page.error
33
+ end
34
+ end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeHttpTest < Minitest::Test
6
+ def test_fetch_parses_html_response
7
+ stub_request(:get, "https://example.com/page")
8
+ .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
9
+
10
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
11
+
12
+ assert_equal 200, page.status
13
+ assert page.html?
14
+ assert_equal "Hello", page.doc.at_css("body").text
15
+ end
16
+
17
+ def test_fetch_parses_responses_without_content_type_as_html
18
+ stub_request(:get, "https://example.com/page")
19
+ .to_return(status: 200, body: "<html><body>Hello</body></html>")
20
+
21
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
22
+
23
+ assert page.html?
24
+ end
25
+
26
+ def test_fetch_leaves_non_html_response_unparsed
27
+ stub_request(:get, "https://example.com/feed.xml")
28
+ .to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
29
+
30
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
31
+
32
+ assert_equal 200, page.status
33
+ refute page.html?
34
+ assert_equal "<feed></feed>", page.body
35
+ end
36
+
37
+ def test_fetch_returns_error_page_for_failed_requests
38
+ stub_request(:get, "https://example.com/down").to_timeout
39
+
40
+ page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
41
+
42
+ assert_nil page.status
43
+ assert_includes page.error, "Faraday::ConnectionFailed"
44
+ assert_equal "https://example.com/down", page.final_url
45
+ end
46
+
47
+ def test_fetch_reraises_programmer_errors
48
+ http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
49
+
50
+ def http.connection
51
+ raise NoMethodError, "bad call"
52
+ end
53
+
54
+ assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
55
+ end
56
+ end
@@ -38,19 +38,124 @@ class CrawlscopeLinksRuleTest < Minitest::Test
38
38
  urls: ["https://example.com/guide", "https://example.com/pricing"],
39
39
  pages: pages,
40
40
  issues: issues,
41
- context: {
42
- allowed_statuses: [200, 301, 302],
43
- base_url: "https://example.com",
44
- resolve_target: method(:resolve_target)
45
- }
41
+ context: context
46
42
  )
47
43
 
48
44
  assert_equal [:broken_internal_link], issues.to_a.map(&:code)
49
45
  assert_includes issues.to_a.first.message, "HTTP 404"
50
46
  end
51
47
 
48
+ def test_reports_unresolved_internal_links
49
+ issues = Crawlscope::IssueCollection.new
50
+
51
+ Crawlscope::Rules::Links.new.call(
52
+ urls: [],
53
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/unknown\">Unknown</a></main>")],
54
+ issues: issues,
55
+ context: context(resolver: ->(_target_url) {})
56
+ )
57
+
58
+ assert_includes issues.to_a.map(&:code), :unresolved_internal_link
59
+ assert_includes issues.to_a.find { |issue| issue.code == :unresolved_internal_link }.message, "unable to validate internal link"
60
+ end
61
+
62
+ def test_ignores_fetch_errors_for_urls_already_crawled
63
+ issues = Crawlscope::IssueCollection.new
64
+ resolver = lambda do |target_url|
65
+ {
66
+ crawled: true,
67
+ error: "Timeout::Error: timed out",
68
+ final_url: target_url,
69
+ status: nil
70
+ }
71
+ end
72
+
73
+ Crawlscope::Rules::Links.new.call(
74
+ urls: [],
75
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
76
+ issues: issues,
77
+ context: context(resolver: resolver)
78
+ )
79
+
80
+ assert_empty issues.to_a
81
+ end
82
+
83
+ def test_reports_fetch_errors_for_uncrawled_targets
84
+ issues = Crawlscope::IssueCollection.new
85
+ resolver = lambda do |target_url|
86
+ {
87
+ crawled: false,
88
+ error: "Timeout::Error: timed out",
89
+ final_url: target_url,
90
+ status: nil
91
+ }
92
+ end
93
+
94
+ Crawlscope::Rules::Links.new.call(
95
+ urls: [],
96
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
97
+ issues: issues,
98
+ context: context(resolver: resolver)
99
+ )
100
+
101
+ assert_equal [:unresolved_internal_link], issues.to_a.map(&:code)
102
+ end
103
+
104
+ def test_reports_low_inbound_anchor_links
105
+ issues = Crawlscope::IssueCollection.new
106
+
107
+ Crawlscope::Rules::Links.new.call(
108
+ urls: ["https://example.com/guide", "https://example.com/pricing"],
109
+ pages: [
110
+ page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
111
+ page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
112
+ ],
113
+ issues: issues,
114
+ context: context
115
+ )
116
+
117
+ assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
118
+ assert_equal "https://example.com/guide", issues.to_a.first.url
119
+ end
120
+
121
+ def test_ignores_links_that_should_not_be_crawled
122
+ issues = Crawlscope::IssueCollection.new
123
+
124
+ Crawlscope::Rules::Links.new.call(
125
+ urls: ["https://example.com/guide"],
126
+ pages: [
127
+ page(
128
+ url: "https://example.com/guide",
129
+ body: <<~HTML
130
+ <html>
131
+ <body>
132
+ <a href="#section">Jump</a>
133
+ <a href="mailto:test@example.com">Email</a>
134
+ <a href="https://other.example.com/page">External</a>
135
+ <a href="/rails/info">Rails</a>
136
+ <a href="/empty"> </a>
137
+ </body>
138
+ </html>
139
+ HTML
140
+ )
141
+ ],
142
+ issues: issues,
143
+ context: context
144
+ )
145
+
146
+ assert_empty issues.to_a
147
+ end
148
+
52
149
  private
53
150
 
151
+ def context(resolver: method(:resolve_target))
152
+ {
153
+ allowed_statuses: [200, 301, 302],
154
+ base_url: "https://example.com",
155
+ resolve_target: resolver
156
+ }
157
+ end
158
+
54
159
  def page(url:, body:)
55
160
  doc = Nokogiri::HTML(body)
56
161