crawlscope 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -11
- data/README.md +20 -13
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +10 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +76 -43
- data/lib/crawlscope/rules/structured_data.rb +14 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +355 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +128 -6
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +110 -5
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +20 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +14 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeBrowserTest < Minitest::Test
|
|
6
|
+
Response = Data.define(:url, :headers)
|
|
7
|
+
|
|
8
|
+
class FakeBrowser
|
|
9
|
+
attr_reader :quit_called
|
|
10
|
+
|
|
11
|
+
def quit
|
|
12
|
+
@quit_called = true
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
class FakeNetwork
|
|
17
|
+
attr_reader :cleared, :idle_waits, :status
|
|
18
|
+
|
|
19
|
+
def initialize(response:, status: 200)
|
|
20
|
+
@response = response
|
|
21
|
+
@status = status
|
|
22
|
+
@cleared = []
|
|
23
|
+
@idle_waits = []
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def clear(scope)
|
|
27
|
+
@cleared << scope
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
attr_reader :response
|
|
31
|
+
|
|
32
|
+
def wait_for_idle(duration:, timeout:)
|
|
33
|
+
@idle_waits << {duration: duration, timeout: timeout}
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
class FakePage
|
|
38
|
+
attr_reader :evaluations, :network, :visited_url
|
|
39
|
+
|
|
40
|
+
def initialize(network:, body: "<html></html>", current_url: "", url: "")
|
|
41
|
+
@network = network
|
|
42
|
+
@body = body
|
|
43
|
+
@current_url = current_url
|
|
44
|
+
@url = url
|
|
45
|
+
@evaluations = []
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
attr_reader :body
|
|
49
|
+
|
|
50
|
+
attr_reader :current_url
|
|
51
|
+
|
|
52
|
+
def evaluate(script)
|
|
53
|
+
@evaluations << script
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def go_to(url)
|
|
57
|
+
@visited_url = url
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
attr_reader :url
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def test_fetch_returns_rendered_page
|
|
64
|
+
network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
|
|
65
|
+
page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
|
|
66
|
+
browser = browser_with(page: page, scroll_page: false)
|
|
67
|
+
|
|
68
|
+
result = browser.fetch("https://example.com/start")
|
|
69
|
+
|
|
70
|
+
assert_equal "https://example.com/start", page.visited_url
|
|
71
|
+
assert_equal [:traffic], network.cleared
|
|
72
|
+
assert_equal "https://example.com/final", result.final_url
|
|
73
|
+
assert_equal "https://example.com/final", result.normalized_final_url
|
|
74
|
+
assert_equal 200, result.status
|
|
75
|
+
assert result.html?
|
|
76
|
+
assert_equal [], page.evaluations
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def test_fetch_scrolls_when_enabled
|
|
80
|
+
network = FakeNetwork.new(response: Response.new(url: "", headers: {}))
|
|
81
|
+
page = FakePage.new(network: network, current_url: "https://example.com/current")
|
|
82
|
+
browser = browser_with(page: page, scroll_page: true)
|
|
83
|
+
|
|
84
|
+
result = browser.fetch("https://example.com/start")
|
|
85
|
+
|
|
86
|
+
assert_equal "https://example.com/current", result.final_url
|
|
87
|
+
assert_equal 3, page.evaluations.size
|
|
88
|
+
assert_equal 4, network.idle_waits.size
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def test_fetch_falls_back_to_page_url_and_original_url
|
|
92
|
+
page_url_network = FakeNetwork.new(response: nil)
|
|
93
|
+
page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
|
|
94
|
+
page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
|
|
95
|
+
|
|
96
|
+
original_url_network = FakeNetwork.new(response: nil)
|
|
97
|
+
original_url = FakePage.new(network: original_url_network)
|
|
98
|
+
original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
|
|
99
|
+
|
|
100
|
+
assert_equal "https://example.com/page", page_url_result.final_url
|
|
101
|
+
assert_equal "https://example.com/start", original_url_result.final_url
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def test_fetch_returns_error_page_when_navigation_fails
|
|
105
|
+
page = Object.new
|
|
106
|
+
def page.network
|
|
107
|
+
raise Timeout::Error, "browser failed"
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
result = browser_with(page: page).fetch("https://example.com/start")
|
|
111
|
+
|
|
112
|
+
assert_equal "https://example.com/start", result.final_url
|
|
113
|
+
assert_nil result.status
|
|
114
|
+
assert_equal "Timeout::Error: browser failed", result.error
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def test_fetch_reraises_programmer_errors
|
|
118
|
+
page = Object.new
|
|
119
|
+
def page.network
|
|
120
|
+
raise NoMethodError, "bad call"
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
browser = browser_with(page: page)
|
|
124
|
+
|
|
125
|
+
assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def test_close_quits_browser
|
|
129
|
+
fake_browser = FakeBrowser.new
|
|
130
|
+
browser = browser_with(browser: fake_browser)
|
|
131
|
+
|
|
132
|
+
browser.close
|
|
133
|
+
|
|
134
|
+
assert fake_browser.quit_called
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_close_allows_missing_browser
|
|
138
|
+
browser = browser_with(browser: nil)
|
|
139
|
+
|
|
140
|
+
assert_nil browser.close
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
private
|
|
144
|
+
|
|
145
|
+
def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false)
|
|
146
|
+
Crawlscope::Browser.allocate.tap do |instance|
|
|
147
|
+
instance.instance_variable_set(:@base_url, "https://example.com")
|
|
148
|
+
instance.instance_variable_set(:@timeout_seconds, 20)
|
|
149
|
+
instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
|
|
150
|
+
instance.instance_variable_set(:@scroll_page, scroll_page)
|
|
151
|
+
instance.instance_variable_set(:@browser, browser)
|
|
152
|
+
instance.instance_variable_set(:@page, page)
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
end
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -19,7 +19,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
class FakeTask
|
|
22
|
-
attr_reader :validate_arguments, :
|
|
22
|
+
attr_reader :validate_arguments, :json_ld_arguments
|
|
23
23
|
|
|
24
24
|
def validate(base_url:, sitemap_path:, rule_names:)
|
|
25
25
|
@validate_arguments = {
|
|
@@ -31,8 +31,8 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
31
31
|
success_result
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
def
|
|
35
|
-
@
|
|
34
|
+
def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
|
|
35
|
+
@json_ld_arguments = {
|
|
36
36
|
urls: urls,
|
|
37
37
|
debug: debug,
|
|
38
38
|
renderer: renderer,
|
|
@@ -51,6 +51,20 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
51
51
|
end
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
+
class FailingTask < FakeTask
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
def success_result
|
|
58
|
+
Struct.new(:ok?).new(false)
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
class InvalidTask < FakeTask
|
|
63
|
+
def validate(base_url:, sitemap_path:, rule_names:)
|
|
64
|
+
raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
54
68
|
def test_version_prints_current_version
|
|
55
69
|
out = StringIO.new
|
|
56
70
|
err = StringIO.new
|
|
@@ -70,7 +84,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
70
84
|
|
|
71
85
|
assert_equal 1, status
|
|
72
86
|
assert_includes err.string, "Unknown command: unknown"
|
|
73
|
-
assert_includes err.string, "crawlscope validate --
|
|
87
|
+
assert_includes err.string, "crawlscope validate --url"
|
|
74
88
|
end
|
|
75
89
|
|
|
76
90
|
def test_validate_passes_arguments_to_task
|
|
@@ -80,7 +94,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
80
94
|
err = StringIO.new
|
|
81
95
|
|
|
82
96
|
status = Crawlscope::Cli.start(
|
|
83
|
-
["validate", "--
|
|
97
|
+
["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
|
|
84
98
|
out: out,
|
|
85
99
|
err: err,
|
|
86
100
|
configuration: configuration,
|
|
@@ -125,12 +139,120 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
125
139
|
summary: true,
|
|
126
140
|
timeout_seconds: 20
|
|
127
141
|
},
|
|
128
|
-
task.
|
|
142
|
+
task.json_ld_arguments
|
|
129
143
|
)
|
|
130
144
|
assert_same out, configuration.output
|
|
131
145
|
assert_empty err.string
|
|
132
146
|
end
|
|
133
147
|
|
|
148
|
+
def test_validate_caps_default_browser_concurrency
|
|
149
|
+
configuration = FakeConfiguration.new
|
|
150
|
+
task = FakeTask.new
|
|
151
|
+
out = StringIO.new
|
|
152
|
+
err = StringIO.new
|
|
153
|
+
|
|
154
|
+
with_env("JS" => "1") do
|
|
155
|
+
status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
|
|
156
|
+
|
|
157
|
+
assert_equal 0, status
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
assert_equal :browser, configuration.renderer
|
|
161
|
+
assert_equal 4, configuration.concurrency
|
|
162
|
+
assert_includes out.string, "Default JS concurrency capped at 4"
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def test_validate_uses_url_environment_as_base_url_for_default_sitemap
|
|
166
|
+
configuration = FakeConfiguration.new
|
|
167
|
+
task = FakeTask.new
|
|
168
|
+
|
|
169
|
+
with_env("URL" => "https://example.com") do
|
|
170
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
171
|
+
|
|
172
|
+
assert_equal 0, status
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
assert_equal "https://example.com", task.validate_arguments[:base_url]
|
|
176
|
+
assert_nil task.validate_arguments[:sitemap_path]
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def test_validate_uses_sitemap_mode_when_sitemap_is_configured
|
|
180
|
+
task = FakeTask.new
|
|
181
|
+
|
|
182
|
+
with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
|
|
183
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
|
|
184
|
+
|
|
185
|
+
assert_equal 0, status
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
assert_equal "https://example.com", task.validate_arguments[:base_url]
|
|
189
|
+
assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def test_ldjson_accepts_repeated_urls_and_options
|
|
193
|
+
configuration = FakeConfiguration.new
|
|
194
|
+
task = FakeTask.new
|
|
195
|
+
out = StringIO.new
|
|
196
|
+
err = StringIO.new
|
|
197
|
+
|
|
198
|
+
status = Crawlscope::Cli.start(
|
|
199
|
+
["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
|
|
200
|
+
out: out,
|
|
201
|
+
err: err,
|
|
202
|
+
configuration: configuration,
|
|
203
|
+
task: task
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
assert_equal 0, status
|
|
207
|
+
assert_equal(
|
|
208
|
+
{
|
|
209
|
+
urls: ["https://example.com/a", "https://example.com/b"],
|
|
210
|
+
debug: true,
|
|
211
|
+
renderer: :browser,
|
|
212
|
+
report_path: "report.json",
|
|
213
|
+
summary: true,
|
|
214
|
+
timeout_seconds: 12
|
|
215
|
+
},
|
|
216
|
+
task.json_ld_arguments
|
|
217
|
+
)
|
|
218
|
+
assert_equal 3, configuration.network_idle_timeout_seconds
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def test_ldjson_requires_urls
|
|
222
|
+
out = StringIO.new
|
|
223
|
+
err = StringIO.new
|
|
224
|
+
|
|
225
|
+
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
|
|
226
|
+
|
|
227
|
+
assert_equal 1, status
|
|
228
|
+
assert_includes err.string, "Crawlscope URL is not configured"
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
def test_invalid_integer_option_returns_error
|
|
232
|
+
out = StringIO.new
|
|
233
|
+
err = StringIO.new
|
|
234
|
+
|
|
235
|
+
status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
|
|
236
|
+
|
|
237
|
+
assert_equal 1, status
|
|
238
|
+
assert_includes err.string, "timeout must be >= 1"
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
def test_failed_result_returns_failed_status
|
|
242
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
|
|
243
|
+
|
|
244
|
+
assert_equal 1, status
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
def test_validation_errors_return_failed_status_without_reraising
|
|
248
|
+
err = StringIO.new
|
|
249
|
+
|
|
250
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
|
|
251
|
+
|
|
252
|
+
assert_equal 1, status
|
|
253
|
+
assert_includes err.string, "No URLs found in sitemap"
|
|
254
|
+
end
|
|
255
|
+
|
|
134
256
|
private
|
|
135
257
|
|
|
136
258
|
def with_env(overrides)
|
|
@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
42
42
|
|
|
43
43
|
assert_equal "Crawlscope sitemap_path is not configured", error.message
|
|
44
44
|
end
|
|
45
|
+
|
|
46
|
+
def test_defaults_are_normalized
|
|
47
|
+
config = Crawlscope::Configuration.new
|
|
48
|
+
|
|
49
|
+
assert_equal [200, 301, 302], config.allowed_statuses
|
|
50
|
+
assert_equal 10, config.concurrency
|
|
51
|
+
assert_equal 4, config.browser_concurrency
|
|
52
|
+
assert_equal 5, config.network_idle_timeout_seconds
|
|
53
|
+
assert_equal :http, config.renderer
|
|
54
|
+
assert_equal 20, config.timeout_seconds
|
|
55
|
+
assert_equal $stdout, config.output
|
|
56
|
+
assert config.scroll_page?
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def test_configured_values_are_normalized
|
|
60
|
+
config = Crawlscope::Configuration.new
|
|
61
|
+
config.allowed_statuses = ["200", "404"]
|
|
62
|
+
config.concurrency = "2"
|
|
63
|
+
config.network_idle_timeout_seconds = "7"
|
|
64
|
+
config.renderer = "browser"
|
|
65
|
+
config.timeout_seconds = "9"
|
|
66
|
+
config.scroll_page = false
|
|
67
|
+
|
|
68
|
+
assert_equal [200, 404], config.allowed_statuses
|
|
69
|
+
assert_equal 2, config.concurrency
|
|
70
|
+
assert_equal 2, config.browser_concurrency
|
|
71
|
+
assert_equal 7, config.network_idle_timeout_seconds
|
|
72
|
+
assert_equal :browser, config.renderer
|
|
73
|
+
assert_equal 9, config.timeout_seconds
|
|
74
|
+
refute config.scroll_page?
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def test_renderer_must_be_supported
|
|
78
|
+
config = Crawlscope::Configuration.new
|
|
79
|
+
config.renderer = "webkit"
|
|
80
|
+
|
|
81
|
+
error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
|
|
82
|
+
|
|
83
|
+
assert_equal "Crawlscope renderer must be http or browser", error.message
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def test_numeric_values_must_be_positive_integers
|
|
87
|
+
config = Crawlscope::Configuration.new
|
|
88
|
+
config.concurrency = "0"
|
|
89
|
+
|
|
90
|
+
error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
|
|
91
|
+
|
|
92
|
+
assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
|
|
93
|
+
end
|
|
45
94
|
end
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
|
-
class
|
|
5
|
+
class CrawlscopeCrawlTest < Minitest::Test
|
|
6
6
|
def setup
|
|
7
7
|
@tmp_dir = Dir.mktmpdir
|
|
8
8
|
@sitemap_path = File.join(@tmp_dir, "sitemap.xml")
|
|
@@ -33,6 +33,9 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
33
33
|
<title>Pricing</title>
|
|
34
34
|
<meta name="description" content="Plans for hotels and restaurants">
|
|
35
35
|
<link rel="canonical" href="https://example.com/pricing">
|
|
36
|
+
<script type="application/ld+json">
|
|
37
|
+
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
38
|
+
</script>
|
|
36
39
|
</head>
|
|
37
40
|
<body>
|
|
38
41
|
<main>
|
|
@@ -43,7 +46,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
43
46
|
HTML
|
|
44
47
|
)
|
|
45
48
|
|
|
46
|
-
result = Crawlscope::
|
|
49
|
+
result = Crawlscope::Crawl.new(
|
|
47
50
|
base_url: "https://example.com",
|
|
48
51
|
sitemap_path: @sitemap_path,
|
|
49
52
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -84,7 +87,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
84
87
|
HTML
|
|
85
88
|
)
|
|
86
89
|
|
|
87
|
-
result = Crawlscope::
|
|
90
|
+
result = Crawlscope::Crawl.new(
|
|
88
91
|
base_url: "https://example.com",
|
|
89
92
|
sitemap_path: @sitemap_path,
|
|
90
93
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -92,7 +95,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
92
95
|
).call
|
|
93
96
|
|
|
94
97
|
refute result.ok?
|
|
95
|
-
assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
98
|
+
assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
96
99
|
end
|
|
97
100
|
|
|
98
101
|
def test_uses_browser_when_renderer_is_browser
|
|
@@ -127,6 +130,9 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
127
130
|
<title>Pricing</title>
|
|
128
131
|
<meta name="description" content="Plans for hotels and restaurants">
|
|
129
132
|
<link rel="canonical" href="https://example.com/pricing">
|
|
133
|
+
<script type="application/ld+json">
|
|
134
|
+
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
135
|
+
</script>
|
|
130
136
|
</head>
|
|
131
137
|
<body>
|
|
132
138
|
<main>
|
|
@@ -149,7 +155,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
149
155
|
end
|
|
150
156
|
end.new
|
|
151
157
|
|
|
152
|
-
result = Crawlscope::
|
|
158
|
+
result = Crawlscope::Crawl.new(
|
|
153
159
|
base_url: "https://example.com",
|
|
154
160
|
sitemap_path: @sitemap_path,
|
|
155
161
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeCrawlerTest < Minitest::Test
|
|
6
|
+
class RaisingFetcher
|
|
7
|
+
def fetch(url)
|
|
8
|
+
raise Timeout::Error, "fetch timed out" if url.include?("timeout")
|
|
9
|
+
|
|
10
|
+
Crawlscope::Page.new(
|
|
11
|
+
url: url,
|
|
12
|
+
normalized_url: url,
|
|
13
|
+
final_url: url,
|
|
14
|
+
normalized_final_url: url,
|
|
15
|
+
status: 200,
|
|
16
|
+
headers: {},
|
|
17
|
+
body: "<html></html>",
|
|
18
|
+
doc: Nokogiri::HTML("<html></html>")
|
|
19
|
+
)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def test_returns_error_page_when_fetcher_raises
|
|
24
|
+
pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
|
|
25
|
+
["https://example.com/ok", "https://example.com/timeout"]
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
assert_equal 2, pages.size
|
|
29
|
+
error_page = pages.find { |page| page.url == "https://example.com/timeout" }
|
|
30
|
+
|
|
31
|
+
assert_nil error_page.status
|
|
32
|
+
assert_equal "Timeout::Error: fetch timed out", error_page.error
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeHttpTest < Minitest::Test
|
|
6
|
+
def test_fetch_parses_html_response
|
|
7
|
+
stub_request(:get, "https://example.com/page")
|
|
8
|
+
.to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
|
|
9
|
+
|
|
10
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
|
|
11
|
+
|
|
12
|
+
assert_equal 200, page.status
|
|
13
|
+
assert page.html?
|
|
14
|
+
assert_equal "Hello", page.doc.at_css("body").text
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_fetch_parses_responses_without_content_type_as_html
|
|
18
|
+
stub_request(:get, "https://example.com/page")
|
|
19
|
+
.to_return(status: 200, body: "<html><body>Hello</body></html>")
|
|
20
|
+
|
|
21
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
|
|
22
|
+
|
|
23
|
+
assert page.html?
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def test_fetch_leaves_non_html_response_unparsed
|
|
27
|
+
stub_request(:get, "https://example.com/feed.xml")
|
|
28
|
+
.to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
|
|
29
|
+
|
|
30
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
|
|
31
|
+
|
|
32
|
+
assert_equal 200, page.status
|
|
33
|
+
refute page.html?
|
|
34
|
+
assert_equal "<feed></feed>", page.body
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def test_fetch_returns_error_page_for_failed_requests
|
|
38
|
+
stub_request(:get, "https://example.com/down").to_timeout
|
|
39
|
+
|
|
40
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
|
|
41
|
+
|
|
42
|
+
assert_nil page.status
|
|
43
|
+
assert_includes page.error, "Faraday::ConnectionFailed"
|
|
44
|
+
assert_equal "https://example.com/down", page.final_url
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def test_fetch_reraises_programmer_errors
|
|
48
|
+
http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
|
|
49
|
+
|
|
50
|
+
def http.connection
|
|
51
|
+
raise NoMethodError, "bad call"
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -38,19 +38,124 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
38
38
|
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
39
39
|
pages: pages,
|
|
40
40
|
issues: issues,
|
|
41
|
-
context:
|
|
42
|
-
allowed_statuses: [200, 301, 302],
|
|
43
|
-
base_url: "https://example.com",
|
|
44
|
-
resolve_target: method(:resolve_target)
|
|
45
|
-
}
|
|
41
|
+
context: context
|
|
46
42
|
)
|
|
47
43
|
|
|
48
44
|
assert_equal [:broken_internal_link], issues.to_a.map(&:code)
|
|
49
45
|
assert_includes issues.to_a.first.message, "HTTP 404"
|
|
50
46
|
end
|
|
51
47
|
|
|
48
|
+
def test_reports_unresolved_internal_links
|
|
49
|
+
issues = Crawlscope::IssueCollection.new
|
|
50
|
+
|
|
51
|
+
Crawlscope::Rules::Links.new.call(
|
|
52
|
+
urls: [],
|
|
53
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/unknown\">Unknown</a></main>")],
|
|
54
|
+
issues: issues,
|
|
55
|
+
context: context(resolver: ->(_target_url) {})
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
assert_includes issues.to_a.map(&:code), :unresolved_internal_link
|
|
59
|
+
assert_includes issues.to_a.find { |issue| issue.code == :unresolved_internal_link }.message, "unable to validate internal link"
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def test_ignores_fetch_errors_for_urls_already_crawled
|
|
63
|
+
issues = Crawlscope::IssueCollection.new
|
|
64
|
+
resolver = lambda do |target_url|
|
|
65
|
+
{
|
|
66
|
+
crawled: true,
|
|
67
|
+
error: "Timeout::Error: timed out",
|
|
68
|
+
final_url: target_url,
|
|
69
|
+
status: nil
|
|
70
|
+
}
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
Crawlscope::Rules::Links.new.call(
|
|
74
|
+
urls: [],
|
|
75
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
|
|
76
|
+
issues: issues,
|
|
77
|
+
context: context(resolver: resolver)
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
assert_empty issues.to_a
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def test_reports_fetch_errors_for_uncrawled_targets
|
|
84
|
+
issues = Crawlscope::IssueCollection.new
|
|
85
|
+
resolver = lambda do |target_url|
|
|
86
|
+
{
|
|
87
|
+
crawled: false,
|
|
88
|
+
error: "Timeout::Error: timed out",
|
|
89
|
+
final_url: target_url,
|
|
90
|
+
status: nil
|
|
91
|
+
}
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
Crawlscope::Rules::Links.new.call(
|
|
95
|
+
urls: [],
|
|
96
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/timeout\">Timeout</a></main>")],
|
|
97
|
+
issues: issues,
|
|
98
|
+
context: context(resolver: resolver)
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
assert_equal [:unresolved_internal_link], issues.to_a.map(&:code)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def test_reports_low_inbound_anchor_links
|
|
105
|
+
issues = Crawlscope::IssueCollection.new
|
|
106
|
+
|
|
107
|
+
Crawlscope::Rules::Links.new.call(
|
|
108
|
+
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
109
|
+
pages: [
|
|
110
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
|
|
111
|
+
page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
|
|
112
|
+
],
|
|
113
|
+
issues: issues,
|
|
114
|
+
context: context
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
|
|
118
|
+
assert_equal "https://example.com/guide", issues.to_a.first.url
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def test_ignores_links_that_should_not_be_crawled
|
|
122
|
+
issues = Crawlscope::IssueCollection.new
|
|
123
|
+
|
|
124
|
+
Crawlscope::Rules::Links.new.call(
|
|
125
|
+
urls: ["https://example.com/guide"],
|
|
126
|
+
pages: [
|
|
127
|
+
page(
|
|
128
|
+
url: "https://example.com/guide",
|
|
129
|
+
body: <<~HTML
|
|
130
|
+
<html>
|
|
131
|
+
<body>
|
|
132
|
+
<a href="#section">Jump</a>
|
|
133
|
+
<a href="mailto:test@example.com">Email</a>
|
|
134
|
+
<a href="https://other.example.com/page">External</a>
|
|
135
|
+
<a href="/rails/info">Rails</a>
|
|
136
|
+
<a href="/empty"> </a>
|
|
137
|
+
</body>
|
|
138
|
+
</html>
|
|
139
|
+
HTML
|
|
140
|
+
)
|
|
141
|
+
],
|
|
142
|
+
issues: issues,
|
|
143
|
+
context: context
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
assert_empty issues.to_a
|
|
147
|
+
end
|
|
148
|
+
|
|
52
149
|
private
|
|
53
150
|
|
|
151
|
+
def context(resolver: method(:resolve_target))
|
|
152
|
+
{
|
|
153
|
+
allowed_statuses: [200, 301, 302],
|
|
154
|
+
base_url: "https://example.com",
|
|
155
|
+
resolve_target: resolver
|
|
156
|
+
}
|
|
157
|
+
end
|
|
158
|
+
|
|
54
159
|
def page(url:, body:)
|
|
55
160
|
doc = Nokogiri::HTML(body)
|
|
56
161
|
|