crawlscope 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -8
- data/README.md +21 -14
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +15 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +99 -48
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +61 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +406 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +143 -7
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +149 -5
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +111 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +15 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -1,44 +1,33 @@
|
|
|
1
1
|
namespace :crawlscope do
|
|
2
|
-
desc "Validate
|
|
2
|
+
desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
|
|
3
3
|
task validate: :environment do
|
|
4
|
-
|
|
5
|
-
exit(status) unless status.zero?
|
|
4
|
+
Crawlscope::RakeTasks.validate
|
|
6
5
|
end
|
|
7
6
|
|
|
8
7
|
namespace :validate do
|
|
9
|
-
desc "
|
|
8
|
+
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
|
|
10
9
|
task ldjson: :environment do
|
|
11
|
-
|
|
12
|
-
exit(status) unless status.zero?
|
|
10
|
+
Crawlscope::RakeTasks.ldjson
|
|
13
11
|
end
|
|
14
12
|
|
|
15
|
-
desc "Validate
|
|
13
|
+
desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
|
|
16
14
|
task metadata: :environment do
|
|
17
|
-
|
|
15
|
+
Crawlscope::RakeTasks.validate_rule("metadata")
|
|
18
16
|
end
|
|
19
17
|
|
|
20
|
-
desc "Validate sitemap URLs with the structured_data rule. ENV:
|
|
18
|
+
desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
|
|
21
19
|
task structured_data: :environment do
|
|
22
|
-
|
|
20
|
+
Crawlscope::RakeTasks.validate_rule("structured_data")
|
|
23
21
|
end
|
|
24
22
|
|
|
25
|
-
desc "Validate
|
|
23
|
+
desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
|
|
26
24
|
task uniqueness: :environment do
|
|
27
|
-
|
|
25
|
+
Crawlscope::RakeTasks.validate_rule("uniqueness")
|
|
28
26
|
end
|
|
29
27
|
|
|
30
|
-
desc "Validate
|
|
28
|
+
desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
|
|
31
29
|
task links: :environment do
|
|
32
|
-
|
|
30
|
+
Crawlscope::RakeTasks.validate_rule("links")
|
|
33
31
|
end
|
|
34
32
|
end
|
|
35
|
-
|
|
36
|
-
def crawlscope_task_with_rules(rules)
|
|
37
|
-
original_rules = ENV["RULES"]
|
|
38
|
-
ENV["RULES"] = rules
|
|
39
|
-
status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
|
|
40
|
-
exit(status) unless status.zero?
|
|
41
|
-
ensure
|
|
42
|
-
ENV["RULES"] = original_rules
|
|
43
|
-
end
|
|
44
33
|
end
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeBrowserTest < Minitest::Test
|
|
6
|
+
Response = Data.define(:url, :headers)
|
|
7
|
+
|
|
8
|
+
class FakeBrowser
|
|
9
|
+
attr_reader :quit_called
|
|
10
|
+
|
|
11
|
+
def quit
|
|
12
|
+
@quit_called = true
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
class FakeNetwork
|
|
17
|
+
attr_reader :cleared, :idle_waits, :status
|
|
18
|
+
|
|
19
|
+
def initialize(response:, status: 200)
|
|
20
|
+
@response = response
|
|
21
|
+
@status = status
|
|
22
|
+
@cleared = []
|
|
23
|
+
@idle_waits = []
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def clear(scope)
|
|
27
|
+
@cleared << scope
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
attr_reader :response
|
|
31
|
+
|
|
32
|
+
def wait_for_idle(duration:, timeout:)
|
|
33
|
+
@idle_waits << {duration: duration, timeout: timeout}
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
class FakePage
|
|
38
|
+
attr_reader :evaluations, :network, :visited_url
|
|
39
|
+
|
|
40
|
+
def initialize(network:, body: "<html></html>", current_url: "", url: "")
|
|
41
|
+
@network = network
|
|
42
|
+
@body = body
|
|
43
|
+
@current_url = current_url
|
|
44
|
+
@url = url
|
|
45
|
+
@evaluations = []
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
attr_reader :body
|
|
49
|
+
|
|
50
|
+
attr_reader :current_url
|
|
51
|
+
|
|
52
|
+
def evaluate(script)
|
|
53
|
+
@evaluations << script
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def go_to(url)
|
|
57
|
+
@visited_url = url
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
attr_reader :url
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def test_fetch_returns_rendered_page
|
|
64
|
+
network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
|
|
65
|
+
page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
|
|
66
|
+
browser = browser_with(page: page, scroll_page: false)
|
|
67
|
+
|
|
68
|
+
result = browser.fetch("https://example.com/start")
|
|
69
|
+
|
|
70
|
+
assert_equal "https://example.com/start", page.visited_url
|
|
71
|
+
assert_equal [:traffic], network.cleared
|
|
72
|
+
assert_equal "https://example.com/final", result.final_url
|
|
73
|
+
assert_equal "https://example.com/final", result.normalized_final_url
|
|
74
|
+
assert_equal 200, result.status
|
|
75
|
+
assert result.html?
|
|
76
|
+
assert_equal [], page.evaluations
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def test_fetch_scrolls_when_enabled
|
|
80
|
+
network = FakeNetwork.new(response: Response.new(url: "", headers: {}))
|
|
81
|
+
page = FakePage.new(network: network, current_url: "https://example.com/current")
|
|
82
|
+
browser = browser_with(page: page, scroll_page: true)
|
|
83
|
+
|
|
84
|
+
result = browser.fetch("https://example.com/start")
|
|
85
|
+
|
|
86
|
+
assert_equal "https://example.com/current", result.final_url
|
|
87
|
+
assert_equal 3, page.evaluations.size
|
|
88
|
+
assert_equal 4, network.idle_waits.size
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def test_fetch_falls_back_to_page_url_and_original_url
|
|
92
|
+
page_url_network = FakeNetwork.new(response: nil)
|
|
93
|
+
page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
|
|
94
|
+
page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
|
|
95
|
+
|
|
96
|
+
original_url_network = FakeNetwork.new(response: nil)
|
|
97
|
+
original_url = FakePage.new(network: original_url_network)
|
|
98
|
+
original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
|
|
99
|
+
|
|
100
|
+
assert_equal "https://example.com/page", page_url_result.final_url
|
|
101
|
+
assert_equal "https://example.com/start", original_url_result.final_url
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def test_fetch_returns_error_page_when_navigation_fails
|
|
105
|
+
page = Object.new
|
|
106
|
+
def page.network
|
|
107
|
+
raise Timeout::Error, "browser failed"
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
result = browser_with(page: page).fetch("https://example.com/start")
|
|
111
|
+
|
|
112
|
+
assert_equal "https://example.com/start", result.final_url
|
|
113
|
+
assert_nil result.status
|
|
114
|
+
assert_equal "Timeout::Error: browser failed", result.error
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def test_fetch_reraises_programmer_errors
|
|
118
|
+
page = Object.new
|
|
119
|
+
def page.network
|
|
120
|
+
raise NoMethodError, "bad call"
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
browser = browser_with(page: page)
|
|
124
|
+
|
|
125
|
+
assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def test_close_quits_browser
|
|
129
|
+
fake_browser = FakeBrowser.new
|
|
130
|
+
browser = browser_with(browser: fake_browser)
|
|
131
|
+
|
|
132
|
+
browser.close
|
|
133
|
+
|
|
134
|
+
assert fake_browser.quit_called
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_close_allows_missing_browser
|
|
138
|
+
browser = browser_with(browser: nil)
|
|
139
|
+
|
|
140
|
+
assert_nil browser.close
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
private
|
|
144
|
+
|
|
145
|
+
def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false)
|
|
146
|
+
Crawlscope::Browser.allocate.tap do |instance|
|
|
147
|
+
instance.instance_variable_set(:@base_url, "https://example.com")
|
|
148
|
+
instance.instance_variable_set(:@timeout_seconds, 20)
|
|
149
|
+
instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
|
|
150
|
+
instance.instance_variable_set(:@scroll_page, scroll_page)
|
|
151
|
+
instance.instance_variable_set(:@browser, browser)
|
|
152
|
+
instance.instance_variable_set(:@page, page)
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
end
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -4,9 +4,10 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeCliTest < Minitest::Test
|
|
6
6
|
class FakeConfiguration
|
|
7
|
-
attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
7
|
+
attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
8
8
|
|
|
9
9
|
def initialize
|
|
10
|
+
@base_url = nil
|
|
10
11
|
@concurrency = 10
|
|
11
12
|
@network_idle_timeout_seconds = 5
|
|
12
13
|
@renderer = :http
|
|
@@ -19,7 +20,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
19
20
|
end
|
|
20
21
|
|
|
21
22
|
class FakeTask
|
|
22
|
-
attr_reader :validate_arguments, :
|
|
23
|
+
attr_reader :validate_arguments, :json_ld_arguments
|
|
23
24
|
|
|
24
25
|
def validate(base_url:, sitemap_path:, rule_names:)
|
|
25
26
|
@validate_arguments = {
|
|
@@ -31,8 +32,8 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
31
32
|
success_result
|
|
32
33
|
end
|
|
33
34
|
|
|
34
|
-
def
|
|
35
|
-
@
|
|
35
|
+
def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
|
|
36
|
+
@json_ld_arguments = {
|
|
36
37
|
urls: urls,
|
|
37
38
|
debug: debug,
|
|
38
39
|
renderer: renderer,
|
|
@@ -51,6 +52,20 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
51
52
|
end
|
|
52
53
|
end
|
|
53
54
|
|
|
55
|
+
class FailingTask < FakeTask
|
|
56
|
+
private
|
|
57
|
+
|
|
58
|
+
def success_result
|
|
59
|
+
Struct.new(:ok?).new(false)
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
class InvalidTask < FakeTask
|
|
64
|
+
def validate(base_url:, sitemap_path:, rule_names:)
|
|
65
|
+
raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
|
|
54
69
|
def test_version_prints_current_version
|
|
55
70
|
out = StringIO.new
|
|
56
71
|
err = StringIO.new
|
|
@@ -70,7 +85,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
70
85
|
|
|
71
86
|
assert_equal 1, status
|
|
72
87
|
assert_includes err.string, "Unknown command: unknown"
|
|
73
|
-
assert_includes err.string, "crawlscope validate --
|
|
88
|
+
assert_includes err.string, "crawlscope validate --url"
|
|
74
89
|
end
|
|
75
90
|
|
|
76
91
|
def test_validate_passes_arguments_to_task
|
|
@@ -80,7 +95,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
80
95
|
err = StringIO.new
|
|
81
96
|
|
|
82
97
|
status = Crawlscope::Cli.start(
|
|
83
|
-
["validate", "--
|
|
98
|
+
["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
|
|
84
99
|
out: out,
|
|
85
100
|
err: err,
|
|
86
101
|
configuration: configuration,
|
|
@@ -125,12 +140,133 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
125
140
|
summary: true,
|
|
126
141
|
timeout_seconds: 20
|
|
127
142
|
},
|
|
128
|
-
task.
|
|
143
|
+
task.json_ld_arguments
|
|
129
144
|
)
|
|
130
145
|
assert_same out, configuration.output
|
|
131
146
|
assert_empty err.string
|
|
132
147
|
end
|
|
133
148
|
|
|
149
|
+
def test_ldjson_defaults_to_configured_base_url
|
|
150
|
+
configuration = FakeConfiguration.new
|
|
151
|
+
configuration.base_url = "https://example.com"
|
|
152
|
+
task = FakeTask.new
|
|
153
|
+
|
|
154
|
+
status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
155
|
+
|
|
156
|
+
assert_equal 0, status
|
|
157
|
+
assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def test_validate_caps_default_browser_concurrency
|
|
161
|
+
configuration = FakeConfiguration.new
|
|
162
|
+
task = FakeTask.new
|
|
163
|
+
out = StringIO.new
|
|
164
|
+
err = StringIO.new
|
|
165
|
+
|
|
166
|
+
with_env("JS" => "1") do
|
|
167
|
+
status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
|
|
168
|
+
|
|
169
|
+
assert_equal 0, status
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
assert_equal :browser, configuration.renderer
|
|
173
|
+
assert_equal 4, configuration.concurrency
|
|
174
|
+
assert_includes out.string, "Default JS concurrency capped at 4"
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
def test_validate_uses_url_environment_as_base_url_for_default_sitemap
|
|
178
|
+
configuration = FakeConfiguration.new
|
|
179
|
+
task = FakeTask.new
|
|
180
|
+
|
|
181
|
+
with_env("URL" => "https://example.com") do
|
|
182
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
183
|
+
|
|
184
|
+
assert_equal 0, status
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
assert_equal "https://example.com", task.validate_arguments[:base_url]
|
|
188
|
+
assert_nil task.validate_arguments[:sitemap_path]
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
def test_validate_uses_sitemap_mode_when_sitemap_is_configured
|
|
192
|
+
task = FakeTask.new
|
|
193
|
+
|
|
194
|
+
with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
|
|
195
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
|
|
196
|
+
|
|
197
|
+
assert_equal 0, status
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
assert_equal "https://example.com", task.validate_arguments[:base_url]
|
|
201
|
+
assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
def test_ldjson_accepts_repeated_urls_and_options
|
|
205
|
+
configuration = FakeConfiguration.new
|
|
206
|
+
task = FakeTask.new
|
|
207
|
+
out = StringIO.new
|
|
208
|
+
err = StringIO.new
|
|
209
|
+
|
|
210
|
+
status = Crawlscope::Cli.start(
|
|
211
|
+
["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
|
|
212
|
+
out: out,
|
|
213
|
+
err: err,
|
|
214
|
+
configuration: configuration,
|
|
215
|
+
task: task
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
assert_equal 0, status
|
|
219
|
+
assert_equal(
|
|
220
|
+
{
|
|
221
|
+
urls: ["https://example.com/a", "https://example.com/b"],
|
|
222
|
+
debug: true,
|
|
223
|
+
renderer: :browser,
|
|
224
|
+
report_path: "report.json",
|
|
225
|
+
summary: true,
|
|
226
|
+
timeout_seconds: 12
|
|
227
|
+
},
|
|
228
|
+
task.json_ld_arguments
|
|
229
|
+
)
|
|
230
|
+
assert_equal 3, configuration.network_idle_timeout_seconds
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
def test_ldjson_defaults_to_localhost
|
|
234
|
+
out = StringIO.new
|
|
235
|
+
err = StringIO.new
|
|
236
|
+
task = FakeTask.new
|
|
237
|
+
|
|
238
|
+
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
|
|
239
|
+
|
|
240
|
+
assert_equal 0, status
|
|
241
|
+
assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
|
|
242
|
+
assert_empty err.string
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
def test_invalid_integer_option_returns_error
|
|
246
|
+
out = StringIO.new
|
|
247
|
+
err = StringIO.new
|
|
248
|
+
|
|
249
|
+
status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
|
|
250
|
+
|
|
251
|
+
assert_equal 1, status
|
|
252
|
+
assert_includes err.string, "timeout must be >= 1"
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
def test_failed_result_returns_failed_status
|
|
256
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
|
|
257
|
+
|
|
258
|
+
assert_equal 1, status
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
def test_validation_errors_return_failed_status_without_reraising
|
|
262
|
+
err = StringIO.new
|
|
263
|
+
|
|
264
|
+
status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
|
|
265
|
+
|
|
266
|
+
assert_equal 1, status
|
|
267
|
+
assert_includes err.string, "No URLs found in sitemap"
|
|
268
|
+
end
|
|
269
|
+
|
|
134
270
|
private
|
|
135
271
|
|
|
136
272
|
def with_env(overrides)
|
|
@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
42
42
|
|
|
43
43
|
assert_equal "Crawlscope sitemap_path is not configured", error.message
|
|
44
44
|
end
|
|
45
|
+
|
|
46
|
+
def test_defaults_are_normalized
|
|
47
|
+
config = Crawlscope::Configuration.new
|
|
48
|
+
|
|
49
|
+
assert_equal [200, 301, 302], config.allowed_statuses
|
|
50
|
+
assert_equal 10, config.concurrency
|
|
51
|
+
assert_equal 4, config.browser_concurrency
|
|
52
|
+
assert_equal 5, config.network_idle_timeout_seconds
|
|
53
|
+
assert_equal :http, config.renderer
|
|
54
|
+
assert_equal 20, config.timeout_seconds
|
|
55
|
+
assert_equal $stdout, config.output
|
|
56
|
+
assert config.scroll_page?
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def test_configured_values_are_normalized
|
|
60
|
+
config = Crawlscope::Configuration.new
|
|
61
|
+
config.allowed_statuses = ["200", "404"]
|
|
62
|
+
config.concurrency = "2"
|
|
63
|
+
config.network_idle_timeout_seconds = "7"
|
|
64
|
+
config.renderer = "browser"
|
|
65
|
+
config.timeout_seconds = "9"
|
|
66
|
+
config.scroll_page = false
|
|
67
|
+
|
|
68
|
+
assert_equal [200, 404], config.allowed_statuses
|
|
69
|
+
assert_equal 2, config.concurrency
|
|
70
|
+
assert_equal 2, config.browser_concurrency
|
|
71
|
+
assert_equal 7, config.network_idle_timeout_seconds
|
|
72
|
+
assert_equal :browser, config.renderer
|
|
73
|
+
assert_equal 9, config.timeout_seconds
|
|
74
|
+
refute config.scroll_page?
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def test_renderer_must_be_supported
|
|
78
|
+
config = Crawlscope::Configuration.new
|
|
79
|
+
config.renderer = "webkit"
|
|
80
|
+
|
|
81
|
+
error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
|
|
82
|
+
|
|
83
|
+
assert_equal "Crawlscope renderer must be http or browser", error.message
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def test_numeric_values_must_be_positive_integers
|
|
87
|
+
config = Crawlscope::Configuration.new
|
|
88
|
+
config.concurrency = "0"
|
|
89
|
+
|
|
90
|
+
error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
|
|
91
|
+
|
|
92
|
+
assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
|
|
93
|
+
end
|
|
45
94
|
end
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
|
-
class CrawlscopeAuditTest < Minitest::Test
|
|
5
|
+
class CrawlscopeCrawlTest < Minitest::Test
|
|
6
6
|
def setup
|
|
7
7
|
@tmp_dir = Dir.mktmpdir
|
|
8
8
|
@sitemap_path = File.join(@tmp_dir, "sitemap.xml")
|
|
@@ -31,8 +31,16 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
31
31
|
<html>
|
|
32
32
|
<head>
|
|
33
33
|
<title>Pricing</title>
|
|
34
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
34
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
35
35
|
<link rel="canonical" href="https://example.com/pricing">
|
|
36
|
+
<meta property="og:title" content="Pricing">
|
|
37
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
38
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
39
|
+
<meta property="og:type" content="website">
|
|
40
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
41
|
+
<script type="application/ld+json">
|
|
42
|
+
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
43
|
+
</script>
|
|
36
44
|
</head>
|
|
37
45
|
<body>
|
|
38
46
|
<main>
|
|
@@ -43,7 +51,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
43
51
|
HTML
|
|
44
52
|
)
|
|
45
53
|
|
|
46
|
-
result = Crawlscope::Audit.new(
|
|
54
|
+
result = Crawlscope::Crawl.new(
|
|
47
55
|
base_url: "https://example.com",
|
|
48
56
|
sitemap_path: @sitemap_path,
|
|
49
57
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -84,7 +92,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
84
92
|
HTML
|
|
85
93
|
)
|
|
86
94
|
|
|
87
|
-
result = Crawlscope::Audit.new(
|
|
95
|
+
result = Crawlscope::Crawl.new(
|
|
88
96
|
base_url: "https://example.com",
|
|
89
97
|
sitemap_path: @sitemap_path,
|
|
90
98
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -92,7 +100,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
92
100
|
).call
|
|
93
101
|
|
|
94
102
|
refute result.ok?
|
|
95
|
-
assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
103
|
+
assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
|
|
96
104
|
end
|
|
97
105
|
|
|
98
106
|
def test_uses_browser_when_renderer_is_browser
|
|
@@ -125,8 +133,16 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
125
133
|
<html>
|
|
126
134
|
<head>
|
|
127
135
|
<title>Pricing</title>
|
|
128
|
-
<meta name="description" content="Plans for hotels and restaurants">
|
|
136
|
+
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
129
137
|
<link rel="canonical" href="https://example.com/pricing">
|
|
138
|
+
<meta property="og:title" content="Pricing">
|
|
139
|
+
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
|
|
140
|
+
<meta property="og:url" content="https://example.com/pricing">
|
|
141
|
+
<meta property="og:type" content="website">
|
|
142
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
143
|
+
<script type="application/ld+json">
|
|
144
|
+
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
|
|
145
|
+
</script>
|
|
130
146
|
</head>
|
|
131
147
|
<body>
|
|
132
148
|
<main>
|
|
@@ -149,7 +165,7 @@ class CrawlscopeAuditTest < Minitest::Test
|
|
|
149
165
|
end
|
|
150
166
|
end.new
|
|
151
167
|
|
|
152
|
-
result = Crawlscope::Audit.new(
|
|
168
|
+
result = Crawlscope::Crawl.new(
|
|
153
169
|
base_url: "https://example.com",
|
|
154
170
|
sitemap_path: @sitemap_path,
|
|
155
171
|
rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeCrawlerTest < Minitest::Test
|
|
6
|
+
class RaisingFetcher
|
|
7
|
+
def fetch(url)
|
|
8
|
+
raise Timeout::Error, "fetch timed out" if url.include?("timeout")
|
|
9
|
+
|
|
10
|
+
Crawlscope::Page.new(
|
|
11
|
+
url: url,
|
|
12
|
+
normalized_url: url,
|
|
13
|
+
final_url: url,
|
|
14
|
+
normalized_final_url: url,
|
|
15
|
+
status: 200,
|
|
16
|
+
headers: {},
|
|
17
|
+
body: "<html></html>",
|
|
18
|
+
doc: Nokogiri::HTML("<html></html>")
|
|
19
|
+
)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def test_returns_error_page_when_fetcher_raises
|
|
24
|
+
pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
|
|
25
|
+
["https://example.com/ok", "https://example.com/timeout"]
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
assert_equal 2, pages.size
|
|
29
|
+
error_page = pages.find { |page| page.url == "https://example.com/timeout" }
|
|
30
|
+
|
|
31
|
+
assert_nil error_page.status
|
|
32
|
+
assert_equal "Timeout::Error: fetch timed out", error_page.error
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeHttpTest < Minitest::Test
|
|
6
|
+
def test_fetch_parses_html_response
|
|
7
|
+
stub_request(:get, "https://example.com/page")
|
|
8
|
+
.to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
|
|
9
|
+
|
|
10
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
|
|
11
|
+
|
|
12
|
+
assert_equal 200, page.status
|
|
13
|
+
assert page.html?
|
|
14
|
+
assert_equal "Hello", page.doc.at_css("body").text
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_fetch_parses_responses_without_content_type_as_html
|
|
18
|
+
stub_request(:get, "https://example.com/page")
|
|
19
|
+
.to_return(status: 200, body: "<html><body>Hello</body></html>")
|
|
20
|
+
|
|
21
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
|
|
22
|
+
|
|
23
|
+
assert page.html?
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def test_fetch_leaves_non_html_response_unparsed
|
|
27
|
+
stub_request(:get, "https://example.com/feed.xml")
|
|
28
|
+
.to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
|
|
29
|
+
|
|
30
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
|
|
31
|
+
|
|
32
|
+
assert_equal 200, page.status
|
|
33
|
+
refute page.html?
|
|
34
|
+
assert_equal "<feed></feed>", page.body
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def test_fetch_returns_error_page_for_failed_requests
|
|
38
|
+
stub_request(:get, "https://example.com/down").to_timeout
|
|
39
|
+
|
|
40
|
+
page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
|
|
41
|
+
|
|
42
|
+
assert_nil page.status
|
|
43
|
+
assert_includes page.error, "Faraday::ConnectionFailed"
|
|
44
|
+
assert_equal "https://example.com/down", page.final_url
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def test_fetch_reraises_programmer_errors
|
|
48
|
+
http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
|
|
49
|
+
|
|
50
|
+
def http.connection
|
|
51
|
+
raise NoMethodError, "bad call"
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
|
|
55
|
+
end
|
|
56
|
+
end
|