crawlscope 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -0
- data/lib/crawlscope/cli.rb +4 -1
- data/lib/crawlscope/crawl.rb +2 -0
- data/lib/crawlscope/rake_tasks.rb +27 -12
- data/lib/crawlscope/reporter.rb +20 -5
- data/lib/crawlscope/rules/indexability.rb +130 -17
- data/lib/crawlscope/rules/links.rb +312 -9
- data/lib/crawlscope/rules/metadata.rb +61 -6
- data/lib/crawlscope/rules/structured_data.rb +31 -0
- data/lib/crawlscope/rules/uniqueness.rb +22 -0
- data/lib/crawlscope/sitemap.rb +9 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +24 -24
- data/test/crawlscope/cli_test.rb +1 -0
- data/test/crawlscope/crawl_test.rb +26 -0
- data/test/crawlscope/indexability_rule_test.rb +33 -0
- data/test/crawlscope/links_rule_test.rb +148 -3
- data/test/crawlscope/metadata_rule_test.rb +36 -0
- data/test/crawlscope/rake_tasks_test.rb +70 -0
- data/test/crawlscope/reporter_test.rb +7 -3
- data/test/crawlscope/sitemap_test.rb +24 -0
- data/test/crawlscope/structured_data_rule_test.rb +56 -0
- data/test/crawlscope/uniqueness_rule_test.rb +17 -2
- metadata +2 -1
|
@@ -1,43 +1,43 @@
|
|
|
1
1
|
namespace :crawlscope do
|
|
2
|
-
desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
|
|
3
|
-
task validate: :environment do
|
|
4
|
-
Crawlscope::RakeTasks.validate
|
|
2
|
+
desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
|
|
3
|
+
task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
|
|
4
|
+
Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
|
|
5
5
|
end
|
|
6
6
|
|
|
7
7
|
namespace :validate do
|
|
8
|
-
desc "Directly validate JSON-LD on one
|
|
9
|
-
task ldjson: :environment do
|
|
10
|
-
Crawlscope::RakeTasks.ldjson
|
|
8
|
+
desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
|
|
9
|
+
task :ldjson, [:url] => :environment do |_task, args|
|
|
10
|
+
Crawlscope::RakeTasks.ldjson(urls: args[:url])
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1"
|
|
14
|
-
task indexability: :environment do
|
|
15
|
-
Crawlscope::RakeTasks.validate_rule("indexability")
|
|
13
|
+
desc "Validate URLs with the indexability rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
14
|
+
task :indexability, [:url, :sitemap] => :environment do |_task, args|
|
|
15
|
+
Crawlscope::RakeTasks.validate_rule("indexability", url: args[:url], sitemap_path: args[:sitemap])
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
-
desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
|
|
19
|
-
task metadata: :environment do
|
|
20
|
-
Crawlscope::RakeTasks.validate_rule("metadata")
|
|
18
|
+
desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
19
|
+
task :metadata, [:url, :sitemap] => :environment do |_task, args|
|
|
20
|
+
Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
-
desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
|
|
24
|
-
task structured_data: :environment do
|
|
25
|
-
Crawlscope::RakeTasks.validate_rule("structured_data")
|
|
23
|
+
desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
24
|
+
task :structured_data, [:url, :sitemap] => :environment do |_task, args|
|
|
25
|
+
Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
|
|
29
|
-
task uniqueness: :environment do
|
|
30
|
-
Crawlscope::RakeTasks.validate_rule("uniqueness")
|
|
28
|
+
desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
29
|
+
task :uniqueness, [:url, :sitemap] => :environment do |_task, args|
|
|
30
|
+
Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1"
|
|
34
|
-
task content_quality: :environment do
|
|
35
|
-
Crawlscope::RakeTasks.validate_rule("content_quality")
|
|
33
|
+
desc "Validate URLs with the content_quality rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
34
|
+
task :content_quality, [:url, :sitemap] => :environment do |_task, args|
|
|
35
|
+
Crawlscope::RakeTasks.validate_rule("content_quality", url: args[:url], sitemap_path: args[:sitemap])
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
-
desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
|
|
39
|
-
task links: :environment do
|
|
40
|
-
Crawlscope::RakeTasks.validate_rule("links")
|
|
38
|
+
desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
|
|
39
|
+
task :links, [:url, :sitemap] => :environment do |_task, args|
|
|
40
|
+
Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
|
|
41
41
|
end
|
|
42
42
|
end
|
|
43
43
|
end
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -188,4 +188,30 @@ class CrawlscopeCrawlTest < Minitest::Test
|
|
|
188
188
|
assert_equal ["https://example.com/pricing"], fake_browser.urls
|
|
189
189
|
assert fake_browser.closed
|
|
190
190
|
end
|
|
191
|
+
|
|
192
|
+
def test_reports_sitemap_redirect_url
|
|
193
|
+
File.write(
|
|
194
|
+
@sitemap_path,
|
|
195
|
+
<<~XML
|
|
196
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
197
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
198
|
+
<url><loc>https://example.com/old</loc></url>
|
|
199
|
+
</urlset>
|
|
200
|
+
XML
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
stub_request(:get, "https://example.com/old")
|
|
204
|
+
.to_return(status: 301, headers: {"Location" => "https://example.com/new"}, body: "")
|
|
205
|
+
stub_request(:get, "https://example.com/new")
|
|
206
|
+
.to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Moved</body></html>")
|
|
207
|
+
|
|
208
|
+
result = Crawlscope::Crawl.new(
|
|
209
|
+
base_url: "https://example.com",
|
|
210
|
+
sitemap_path: @sitemap_path,
|
|
211
|
+
rules: [],
|
|
212
|
+
schema_registry: Crawlscope::SchemaRegistry.default
|
|
213
|
+
).call
|
|
214
|
+
|
|
215
|
+
assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
|
|
216
|
+
end
|
|
191
217
|
end
|
|
@@ -20,6 +20,39 @@ class CrawlscopeIndexabilityRuleTest < Minitest::Test
|
|
|
20
20
|
assert_equal :noindex_meta, issue.code
|
|
21
21
|
assert_equal :error, issue.severity
|
|
22
22
|
assert_equal "noindex, follow", issue.details[:content]
|
|
23
|
+
|
|
24
|
+
codes = issues.to_a.map(&:code)
|
|
25
|
+
assert_includes codes, :noindex_follow_meta
|
|
26
|
+
assert_includes codes, :sitemap_noindex_url
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def test_reports_meta_nofollow
|
|
30
|
+
issues = Crawlscope::IssueCollection.new
|
|
31
|
+
page = page_with(
|
|
32
|
+
body: <<~HTML
|
|
33
|
+
<html>
|
|
34
|
+
<head><meta name="robots" content="nofollow"></head>
|
|
35
|
+
<body><main>Visible content</main></body>
|
|
36
|
+
</html>
|
|
37
|
+
HTML
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
41
|
+
|
|
42
|
+
assert_equal [:nofollow_meta], issues.to_a.map(&:code)
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def test_reports_noindex_nofollow_header
|
|
46
|
+
issues = Crawlscope::IssueCollection.new
|
|
47
|
+
page = page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"})
|
|
48
|
+
|
|
49
|
+
Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
|
|
50
|
+
|
|
51
|
+
codes = issues.to_a.map(&:code)
|
|
52
|
+
assert_includes codes, :noindex_header
|
|
53
|
+
assert_includes codes, :nofollow_header
|
|
54
|
+
assert_includes codes, :noindex_nofollow_header
|
|
55
|
+
assert_includes codes, :sitemap_noindex_url
|
|
23
56
|
end
|
|
24
57
|
|
|
25
58
|
def test_reports_x_robots_tag_noindex
|
|
@@ -41,7 +41,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
41
41
|
context: context
|
|
42
42
|
)
|
|
43
43
|
|
|
44
|
-
|
|
44
|
+
assert_includes issues.to_a.map(&:code), :broken_internal_link
|
|
45
45
|
assert_includes issues.to_a.first.message, "HTTP 404"
|
|
46
46
|
end
|
|
47
47
|
|
|
@@ -114,8 +114,151 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
114
114
|
context: context
|
|
115
115
|
)
|
|
116
116
|
|
|
117
|
-
|
|
118
|
-
|
|
117
|
+
orphan_issue = issues.to_a.find { |item| item.code == :orphan_page }
|
|
118
|
+
assert orphan_issue
|
|
119
|
+
assert_includes issues.to_a.map(&:code), :low_dofollow_inlinks
|
|
120
|
+
assert_equal "https://example.com/guide", orphan_issue.url
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def test_reports_pages_with_no_outgoing_internal_links
|
|
124
|
+
issues = Crawlscope::IssueCollection.new
|
|
125
|
+
|
|
126
|
+
Crawlscope::Rules::Links.new.call(
|
|
127
|
+
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
128
|
+
pages: [
|
|
129
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
|
|
130
|
+
page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
|
|
131
|
+
],
|
|
132
|
+
issues: issues,
|
|
133
|
+
context: context
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
issue = issues.to_a.find { |item| item.code == :page_has_no_outgoing_links }
|
|
137
|
+
assert issue
|
|
138
|
+
assert_equal "https://example.com/pricing", issue.url
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def test_reports_nofollow_outlinks_and_inlink_follow_mix
|
|
142
|
+
issues = Crawlscope::IssueCollection.new
|
|
143
|
+
|
|
144
|
+
Crawlscope::Rules::Links.new.call(
|
|
145
|
+
urls: ["https://example.com/guide", "https://example.com/pricing", "https://example.com/about"],
|
|
146
|
+
pages: [
|
|
147
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a><a href=\"/about\">About</a></main>"),
|
|
148
|
+
page(url: "https://example.com/about", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
|
|
149
|
+
page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
|
|
150
|
+
],
|
|
151
|
+
issues: issues,
|
|
152
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
codes = issues.to_a.map(&:code)
|
|
156
|
+
assert_includes codes, :nofollow_internal_outlinks
|
|
157
|
+
assert_includes codes, :mixed_follow_internal_inlinks
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def test_reports_only_nofollow_internal_inlinks
|
|
161
|
+
issues = Crawlscope::IssueCollection.new
|
|
162
|
+
|
|
163
|
+
Crawlscope::Rules::Links.new.call(
|
|
164
|
+
urls: ["https://example.com/guide", "https://example.com/pricing"],
|
|
165
|
+
pages: [
|
|
166
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a></main>"),
|
|
167
|
+
page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
|
|
168
|
+
],
|
|
169
|
+
issues: issues,
|
|
170
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
assert_includes issues.to_a.map(&:code), :only_nofollow_internal_inlinks
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def test_reports_https_pages_linking_to_internal_http_urls
|
|
177
|
+
issues = Crawlscope::IssueCollection.new
|
|
178
|
+
|
|
179
|
+
Crawlscope::Rules::Links.new.call(
|
|
180
|
+
urls: ["https://example.com/guide"],
|
|
181
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"http://example.com/pricing\">Pricing</a></main>")],
|
|
182
|
+
issues: issues,
|
|
183
|
+
context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
assert_includes issues.to_a.map(&:code), :http_internal_link
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
def test_reports_canonical_target_link_issues
|
|
190
|
+
issues = Crawlscope::IssueCollection.new
|
|
191
|
+
resolver = lambda do |target_url|
|
|
192
|
+
redirects = target_url == "https://example.com/canonical-about"
|
|
193
|
+
status = redirects ? 301 : 200
|
|
194
|
+
final_url = redirects ? "https://example.com/about" : target_url
|
|
195
|
+
{crawled: false, error: nil, final_url: final_url, status: status}
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
Crawlscope::Rules::Links.new.call(
|
|
199
|
+
urls: ["https://example.com/guide", "https://example.com/about"],
|
|
200
|
+
pages: [
|
|
201
|
+
page(url: "https://example.com/guide", body: "<main><a href=\"/about\">About</a></main>"),
|
|
202
|
+
page(
|
|
203
|
+
url: "https://example.com/about",
|
|
204
|
+
body: <<~HTML
|
|
205
|
+
<html>
|
|
206
|
+
<head><link rel="canonical" href="https://example.com/canonical-about"></head>
|
|
207
|
+
<body><main><p>About</p></main></body>
|
|
208
|
+
</html>
|
|
209
|
+
HTML
|
|
210
|
+
)
|
|
211
|
+
],
|
|
212
|
+
issues: issues,
|
|
213
|
+
context: context(resolver: resolver)
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
codes = issues.to_a.map(&:code)
|
|
217
|
+
assert_includes codes, :canonical_no_internal_inlinks
|
|
218
|
+
assert_includes codes, :canonical_points_to_redirect
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def test_reports_indexable_internal_pages_missing_from_sitemap
|
|
222
|
+
issues = Crawlscope::IssueCollection.new
|
|
223
|
+
resolver = lambda do |target_url|
|
|
224
|
+
{
|
|
225
|
+
crawled: false,
|
|
226
|
+
error: nil,
|
|
227
|
+
final_url: target_url,
|
|
228
|
+
html: true,
|
|
229
|
+
status: 200
|
|
230
|
+
}
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
Crawlscope::Rules::Links.new.call(
|
|
234
|
+
urls: ["https://example.com/guide"],
|
|
235
|
+
pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
|
|
236
|
+
issues: issues,
|
|
237
|
+
context: context(resolver: resolver)
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
issue = issues.to_a.find { |item| item.code == :indexable_page_missing_from_sitemap }
|
|
241
|
+
assert issue
|
|
242
|
+
assert_equal "https://example.com/hidden", issue.url
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
def test_reports_url_hygiene_issues
|
|
246
|
+
issues = Crawlscope::IssueCollection.new
|
|
247
|
+
long_path = "a" * 2_050
|
|
248
|
+
|
|
249
|
+
Crawlscope::Rules::Links.new.call(
|
|
250
|
+
urls: ["https://example.com//bad", "https://example.com/#{long_path}"],
|
|
251
|
+
pages: [
|
|
252
|
+
page(url: "https://example.com//bad", body: "<main><a href=\"/ok\">OK</a></main>"),
|
|
253
|
+
page(url: "https://example.com/#{long_path}", body: "<main><a href=\"/ok\">OK</a></main>")
|
|
254
|
+
],
|
|
255
|
+
issues: issues,
|
|
256
|
+
context: context(resolver: ->(target_url) { {crawled: false, error: nil, final_url: target_url, html: true, status: 200} })
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
codes = issues.to_a.map(&:code)
|
|
260
|
+
assert_includes codes, :url_double_slash
|
|
261
|
+
assert_includes codes, :url_too_long
|
|
119
262
|
end
|
|
120
263
|
|
|
121
264
|
def test_counts_root_page_links_as_inbound_links
|
|
@@ -217,6 +360,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
217
360
|
crawled: true,
|
|
218
361
|
error: nil,
|
|
219
362
|
final_url: target_url,
|
|
363
|
+
html: true,
|
|
220
364
|
status: 200
|
|
221
365
|
}
|
|
222
366
|
when "https://example.com/missing"
|
|
@@ -224,6 +368,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
|
|
|
224
368
|
crawled: false,
|
|
225
369
|
error: nil,
|
|
226
370
|
final_url: target_url,
|
|
371
|
+
html: false,
|
|
227
372
|
status: 404
|
|
228
373
|
}
|
|
229
374
|
end
|
|
@@ -48,6 +48,42 @@ class CrawlscopeMetadataRuleTest < Minitest::Test
|
|
|
48
48
|
refute_includes issues.to_a.map(&:code), :canonical_mismatch
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
def test_reports_multiple_title_multiple_descriptions_empty_h1_and_sitemap_canonical_mismatch
|
|
52
|
+
issues = Crawlscope::IssueCollection.new
|
|
53
|
+
invalid_page = page(
|
|
54
|
+
body: <<~HTML
|
|
55
|
+
<html>
|
|
56
|
+
<head>
|
|
57
|
+
<title>About</title>
|
|
58
|
+
<title>Duplicate About</title>
|
|
59
|
+
<meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
|
|
60
|
+
<meta name="description" content="Duplicate description">
|
|
61
|
+
<link rel="canonical" href="https://example.com/canonical-about">
|
|
62
|
+
<meta property="og:title" content="About">
|
|
63
|
+
<meta property="og:description" content="About page">
|
|
64
|
+
<meta property="og:url" content="https://example.com/about">
|
|
65
|
+
<meta property="og:type" content="website">
|
|
66
|
+
<meta property="og:image" content="https://example.com/icon.png">
|
|
67
|
+
</head>
|
|
68
|
+
<body><main><h1> </h1></main></body>
|
|
69
|
+
</html>
|
|
70
|
+
HTML
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
74
|
+
urls: [invalid_page.url],
|
|
75
|
+
pages: [invalid_page],
|
|
76
|
+
issues: issues
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
codes = issues.to_a.map(&:code)
|
|
80
|
+
assert_includes codes, :multiple_title_tags
|
|
81
|
+
assert_includes codes, :multiple_meta_descriptions
|
|
82
|
+
assert_includes codes, :empty_h1
|
|
83
|
+
assert_includes codes, :canonical_mismatch
|
|
84
|
+
assert_includes codes, :non_canonical_page_in_sitemap
|
|
85
|
+
end
|
|
86
|
+
|
|
51
87
|
private
|
|
52
88
|
|
|
53
89
|
def page(url: "https://example.com/about", body: nil)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeRakeTasksTest < Minitest::Test
|
|
6
|
+
def setup
|
|
7
|
+
@original_start = Crawlscope::Cli.method(:start)
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def teardown
|
|
11
|
+
singleton_class = class << Crawlscope::Cli; self; end
|
|
12
|
+
original_start = @original_start
|
|
13
|
+
singleton_class.define_method(:start) do |*args, **kwargs|
|
|
14
|
+
original_start.call(*args, **kwargs)
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def test_validate_passes_rake_arguments_to_cli
|
|
19
|
+
calls = capture_cli_calls
|
|
20
|
+
|
|
21
|
+
Crawlscope::RakeTasks.validate(
|
|
22
|
+
url: "http://localhost:3001",
|
|
23
|
+
sitemap_path: "http://localhost:3001/sitemap.xml",
|
|
24
|
+
rule_names: "metadata,links"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
assert_equal(
|
|
28
|
+
["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata,links"],
|
|
29
|
+
calls.fetch(0).fetch(:argv)
|
|
30
|
+
)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def test_validate_rule_passes_rule_and_rake_arguments_to_cli
|
|
34
|
+
calls = capture_cli_calls
|
|
35
|
+
|
|
36
|
+
Crawlscope::RakeTasks.validate_rule(
|
|
37
|
+
"metadata",
|
|
38
|
+
url: "http://localhost:3001",
|
|
39
|
+
sitemap_path: "http://localhost:3001/sitemap.xml"
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
assert_equal(
|
|
43
|
+
["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata"],
|
|
44
|
+
calls.fetch(0).fetch(:argv)
|
|
45
|
+
)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def test_ldjson_passes_rake_url_argument_to_cli
|
|
49
|
+
calls = capture_cli_calls
|
|
50
|
+
|
|
51
|
+
Crawlscope::RakeTasks.ldjson(urls: "http://localhost:3001/article")
|
|
52
|
+
|
|
53
|
+
assert_equal(
|
|
54
|
+
["ldjson", "--url", "http://localhost:3001/article"],
|
|
55
|
+
calls.fetch(0).fetch(:argv)
|
|
56
|
+
)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
private
|
|
60
|
+
|
|
61
|
+
def capture_cli_calls
|
|
62
|
+
calls = []
|
|
63
|
+
singleton_class = class << Crawlscope::Cli; self; end
|
|
64
|
+
singleton_class.define_method(:start) do |argv, **kwargs|
|
|
65
|
+
calls << {argv: argv, kwargs: kwargs}
|
|
66
|
+
0
|
|
67
|
+
end
|
|
68
|
+
calls
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -23,7 +23,7 @@ class CrawlscopeReporterTest < Minitest::Test
|
|
|
23
23
|
refute_includes output, "Status: FAILED"
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
def
|
|
26
|
+
def test_reports_failed_result_with_grouped_counts_and_offenses
|
|
27
27
|
io = StringIO.new
|
|
28
28
|
issues = Crawlscope::IssueCollection.new
|
|
29
29
|
issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
|
|
@@ -42,9 +42,13 @@ class CrawlscopeReporterTest < Minitest::Test
|
|
|
42
42
|
|
|
43
43
|
assert_includes output, "Status: FAILED"
|
|
44
44
|
assert_includes output, "Issues: 2"
|
|
45
|
+
assert_includes output, "Severity:"
|
|
45
46
|
assert_includes output, "notice: 1"
|
|
46
47
|
assert_includes output, "warning: 1"
|
|
47
|
-
assert_includes output, "
|
|
48
|
-
assert_includes output, "
|
|
48
|
+
assert_includes output, "Category:"
|
|
49
|
+
assert_includes output, "links: 1"
|
|
50
|
+
assert_includes output, "metadata: 1"
|
|
51
|
+
assert_includes output, " - [warning] missing_title https://example.com/a missing <title>"
|
|
52
|
+
assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link"
|
|
49
53
|
end
|
|
50
54
|
end
|
|
@@ -49,6 +49,30 @@ class CrawlscopeSitemapTest < Minitest::Test
|
|
|
49
49
|
assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
def test_remote_sitemap_http_error_is_explicit
|
|
53
|
+
stub_request(:get, "https://www.example.com/sitemap.xml")
|
|
54
|
+
.to_return(status: 500, body: "<html><body>Error</body></html>")
|
|
55
|
+
|
|
56
|
+
parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
|
|
57
|
+
|
|
58
|
+
error = assert_raises(Crawlscope::ValidationError) do
|
|
59
|
+
parser.urls(base_url: "https://www.example.com")
|
|
60
|
+
end
|
|
61
|
+
assert_equal "Sitemap https://www.example.com/sitemap.xml returned HTTP 500", error.message
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def test_invalid_sitemap_root_is_explicit
|
|
65
|
+
stub_request(:get, "https://www.example.com/sitemap.xml")
|
|
66
|
+
.to_return(status: 200, body: "<html><body>Error</body></html>")
|
|
67
|
+
|
|
68
|
+
parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
|
|
69
|
+
|
|
70
|
+
error = assert_raises(Crawlscope::ValidationError) do
|
|
71
|
+
parser.urls(base_url: "https://www.example.com")
|
|
72
|
+
end
|
|
73
|
+
assert_equal 'Sitemap https://www.example.com/sitemap.xml has unexpected root "html"', error.message
|
|
74
|
+
end
|
|
75
|
+
|
|
52
76
|
def test_rebases_remote_sitemap_index_children_to_base_url
|
|
53
77
|
stub_request(:get, "http://localhost:3000/sitemap.xml")
|
|
54
78
|
.to_return(
|
|
@@ -79,6 +79,62 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
|
|
|
79
79
|
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
|
|
80
80
|
end
|
|
81
81
|
|
|
82
|
+
def test_reports_structured_data_missing_type
|
|
83
|
+
issues = Crawlscope::IssueCollection.new
|
|
84
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
85
|
+
page = page(
|
|
86
|
+
url: "https://example.com/articles/test",
|
|
87
|
+
body: <<~HTML
|
|
88
|
+
<html>
|
|
89
|
+
<head>
|
|
90
|
+
<script type="application/ld+json">
|
|
91
|
+
{"@context":"https://schema.org","headline":"Untyped article"}
|
|
92
|
+
</script>
|
|
93
|
+
</head>
|
|
94
|
+
<body><h1>Article</h1></body>
|
|
95
|
+
</html>
|
|
96
|
+
HTML
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
rule.call(
|
|
100
|
+
urls: [page.url],
|
|
101
|
+
pages: [page],
|
|
102
|
+
issues: issues,
|
|
103
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
assert_includes issues.to_a.map(&:code), :structured_data_missing_type
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def test_reports_graph_entries_missing_type
|
|
110
|
+
issues = Crawlscope::IssueCollection.new
|
|
111
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
112
|
+
page = page(
|
|
113
|
+
url: "https://example.com/articles/test",
|
|
114
|
+
body: <<~HTML
|
|
115
|
+
<html>
|
|
116
|
+
<head>
|
|
117
|
+
<script type="application/ld+json">
|
|
118
|
+
{"@context":"https://schema.org","@type":"WebPage","@graph":[{"name":"Untyped node"}]}
|
|
119
|
+
</script>
|
|
120
|
+
</head>
|
|
121
|
+
<body><h1>Article</h1></body>
|
|
122
|
+
</html>
|
|
123
|
+
HTML
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
rule.call(
|
|
127
|
+
urls: [page.url],
|
|
128
|
+
pages: [page],
|
|
129
|
+
issues: issues,
|
|
130
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
issue = issues.to_a.find { |item| item.code == :structured_data_missing_type }
|
|
134
|
+
assert issue
|
|
135
|
+
assert_equal ["$.@graph[0]"], issue.details[:paths]
|
|
136
|
+
end
|
|
137
|
+
|
|
82
138
|
def test_validates_job_posting_markup
|
|
83
139
|
issues = Crawlscope::IssueCollection.new
|
|
84
140
|
rule = Crawlscope::Rules::StructuredData.new
|
|
@@ -13,7 +13,20 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
|
|
|
13
13
|
|
|
14
14
|
rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
|
|
15
15
|
|
|
16
|
-
assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort
|
|
16
|
+
assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_pages_without_canonical duplicate_title].sort, issues.to_a.map(&:code).sort
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def test_allows_duplicate_pages_when_canonicals_are_present
|
|
20
|
+
issues = Crawlscope::IssueCollection.new
|
|
21
|
+
rule = Crawlscope::Rules::Uniqueness.new
|
|
22
|
+
pages = [
|
|
23
|
+
page(url: "https://example.com/a", canonical: "https://example.com/a"),
|
|
24
|
+
page(url: "https://example.com/b", canonical: "https://example.com/a")
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
|
|
28
|
+
|
|
29
|
+
refute_includes issues.to_a.map(&:code), :duplicate_pages_without_canonical
|
|
17
30
|
end
|
|
18
31
|
|
|
19
32
|
def test_reports_near_duplicate_content
|
|
@@ -59,13 +72,15 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
|
|
|
59
72
|
TEXT
|
|
60
73
|
end
|
|
61
74
|
|
|
62
|
-
def page(url:, content: nil)
|
|
75
|
+
def page(url:, content: nil, canonical: nil)
|
|
63
76
|
repeated_text = content || ("Useful content " * 30).strip
|
|
77
|
+
canonical_tag = canonical ? %(<link rel="canonical" href="#{canonical}">) : ""
|
|
64
78
|
body = <<~HTML
|
|
65
79
|
<html>
|
|
66
80
|
<head>
|
|
67
81
|
<title>Example Title</title>
|
|
68
82
|
<meta name="description" content="Example description">
|
|
83
|
+
#{canonical_tag}
|
|
69
84
|
</head>
|
|
70
85
|
<body>
|
|
71
86
|
<main>#{repeated_text}</main>
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -239,6 +239,7 @@ files:
|
|
|
239
239
|
- test/crawlscope/links_rule_test.rb
|
|
240
240
|
- test/crawlscope/loader_test.rb
|
|
241
241
|
- test/crawlscope/metadata_rule_test.rb
|
|
242
|
+
- test/crawlscope/rake_tasks_test.rb
|
|
242
243
|
- test/crawlscope/reporter_test.rb
|
|
243
244
|
- test/crawlscope/rule_registry_test.rb
|
|
244
245
|
- test/crawlscope/run_test.rb
|