crawlscope 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,43 +1,43 @@
1
1
  namespace :crawlscope do
2
- desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
3
- task validate: :environment do
4
- Crawlscope::RakeTasks.validate
2
+ desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
3
+ task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
4
+ Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
5
5
  end
6
6
 
7
7
  namespace :validate do
8
- desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
9
- task ldjson: :environment do
10
- Crawlscope::RakeTasks.ldjson
8
+ desc "Directly validate JSON-LD on one URL. Args: [url]. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
9
+ task :ldjson, [:url] => :environment do |_task, args|
10
+ Crawlscope::RakeTasks.ldjson(urls: args[:url])
11
11
  end
12
12
 
13
- desc "Validate URLs with the indexability rule. ENV: URL, SITEMAP, JS=1"
14
- task indexability: :environment do
15
- Crawlscope::RakeTasks.validate_rule("indexability")
13
+ desc "Validate URLs with the indexability rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
14
+ task :indexability, [:url, :sitemap] => :environment do |_task, args|
15
+ Crawlscope::RakeTasks.validate_rule("indexability", url: args[:url], sitemap_path: args[:sitemap])
16
16
  end
17
17
 
18
- desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
19
- task metadata: :environment do
20
- Crawlscope::RakeTasks.validate_rule("metadata")
18
+ desc "Validate URLs with the metadata rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
19
+ task :metadata, [:url, :sitemap] => :environment do |_task, args|
20
+ Crawlscope::RakeTasks.validate_rule("metadata", url: args[:url], sitemap_path: args[:sitemap])
21
21
  end
22
22
 
23
- desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
24
- task structured_data: :environment do
25
- Crawlscope::RakeTasks.validate_rule("structured_data")
23
+ desc "Validate sitemap URLs with the structured_data rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
24
+ task :structured_data, [:url, :sitemap] => :environment do |_task, args|
25
+ Crawlscope::RakeTasks.validate_rule("structured_data", url: args[:url], sitemap_path: args[:sitemap])
26
26
  end
27
27
 
28
- desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
29
- task uniqueness: :environment do
30
- Crawlscope::RakeTasks.validate_rule("uniqueness")
28
+ desc "Validate URLs with the uniqueness rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
29
+ task :uniqueness, [:url, :sitemap] => :environment do |_task, args|
30
+ Crawlscope::RakeTasks.validate_rule("uniqueness", url: args[:url], sitemap_path: args[:sitemap])
31
31
  end
32
32
 
33
- desc "Validate URLs with the content_quality rule. ENV: URL, SITEMAP, JS=1"
34
- task content_quality: :environment do
35
- Crawlscope::RakeTasks.validate_rule("content_quality")
33
+ desc "Validate URLs with the content_quality rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
34
+ task :content_quality, [:url, :sitemap] => :environment do |_task, args|
35
+ Crawlscope::RakeTasks.validate_rule("content_quality", url: args[:url], sitemap_path: args[:sitemap])
36
36
  end
37
37
 
38
- desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
39
- task links: :environment do
40
- Crawlscope::RakeTasks.validate_rule("links")
38
+ desc "Validate URLs with the links rule. Args: [url,sitemap]. ENV: URL, SITEMAP, JS=1"
39
+ task :links, [:url, :sitemap] => :environment do |_task, args|
40
+ Crawlscope::RakeTasks.validate_rule("links", url: args[:url], sitemap_path: args[:sitemap])
41
41
  end
42
42
  end
43
43
  end
@@ -265,6 +265,7 @@ class CrawlscopeCliTest < Minitest::Test
265
265
 
266
266
  assert_equal 1, status
267
267
  assert_includes err.string, "No URLs found in sitemap"
268
+ refute_includes err.string, "Usage:"
268
269
  end
269
270
 
270
271
  private
@@ -188,4 +188,30 @@ class CrawlscopeCrawlTest < Minitest::Test
188
188
  assert_equal ["https://example.com/pricing"], fake_browser.urls
189
189
  assert fake_browser.closed
190
190
  end
191
+
192
+ def test_reports_sitemap_redirect_url
193
+ File.write(
194
+ @sitemap_path,
195
+ <<~XML
196
+ <?xml version="1.0" encoding="UTF-8"?>
197
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
198
+ <url><loc>https://example.com/old</loc></url>
199
+ </urlset>
200
+ XML
201
+ )
202
+
203
+ stub_request(:get, "https://example.com/old")
204
+ .to_return(status: 301, headers: {"Location" => "https://example.com/new"}, body: "")
205
+ stub_request(:get, "https://example.com/new")
206
+ .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Moved</body></html>")
207
+
208
+ result = Crawlscope::Crawl.new(
209
+ base_url: "https://example.com",
210
+ sitemap_path: @sitemap_path,
211
+ rules: [],
212
+ schema_registry: Crawlscope::SchemaRegistry.default
213
+ ).call
214
+
215
+ assert_includes result.issues.to_a.map(&:code), :sitemap_redirect_url
216
+ end
191
217
  end
@@ -20,6 +20,39 @@ class CrawlscopeIndexabilityRuleTest < Minitest::Test
20
20
  assert_equal :noindex_meta, issue.code
21
21
  assert_equal :error, issue.severity
22
22
  assert_equal "noindex, follow", issue.details[:content]
23
+
24
+ codes = issues.to_a.map(&:code)
25
+ assert_includes codes, :noindex_follow_meta
26
+ assert_includes codes, :sitemap_noindex_url
27
+ end
28
+
29
+ def test_reports_meta_nofollow
30
+ issues = Crawlscope::IssueCollection.new
31
+ page = page_with(
32
+ body: <<~HTML
33
+ <html>
34
+ <head><meta name="robots" content="nofollow"></head>
35
+ <body><main>Visible content</main></body>
36
+ </html>
37
+ HTML
38
+ )
39
+
40
+ Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
41
+
42
+ assert_equal [:nofollow_meta], issues.to_a.map(&:code)
43
+ end
44
+
45
+ def test_reports_noindex_nofollow_header
46
+ issues = Crawlscope::IssueCollection.new
47
+ page = page_with(headers: {"X-Robots-Tag" => "googlebot: noindex, nofollow"})
48
+
49
+ Crawlscope::Rules::Indexability.new.call(urls: [page.url], pages: [page], issues: issues)
50
+
51
+ codes = issues.to_a.map(&:code)
52
+ assert_includes codes, :noindex_header
53
+ assert_includes codes, :nofollow_header
54
+ assert_includes codes, :noindex_nofollow_header
55
+ assert_includes codes, :sitemap_noindex_url
23
56
  end
24
57
 
25
58
  def test_reports_x_robots_tag_noindex
@@ -41,7 +41,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
41
41
  context: context
42
42
  )
43
43
 
44
- assert_equal [:broken_internal_link], issues.to_a.map(&:code)
44
+ assert_includes issues.to_a.map(&:code), :broken_internal_link
45
45
  assert_includes issues.to_a.first.message, "HTTP 404"
46
46
  end
47
47
 
@@ -114,8 +114,151 @@ class CrawlscopeLinksRuleTest < Minitest::Test
114
114
  context: context
115
115
  )
116
116
 
117
- assert_equal [:low_inbound_anchor_links], issues.to_a.map(&:code)
118
- assert_equal "https://example.com/guide", issues.to_a.first.url
117
+ orphan_issue = issues.to_a.find { |item| item.code == :orphan_page }
118
+ assert orphan_issue
119
+ assert_includes issues.to_a.map(&:code), :low_dofollow_inlinks
120
+ assert_equal "https://example.com/guide", orphan_issue.url
121
+ end
122
+
123
+ def test_reports_pages_with_no_outgoing_internal_links
124
+ issues = Crawlscope::IssueCollection.new
125
+
126
+ Crawlscope::Rules::Links.new.call(
127
+ urls: ["https://example.com/guide", "https://example.com/pricing"],
128
+ pages: [
129
+ page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
130
+ page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
131
+ ],
132
+ issues: issues,
133
+ context: context
134
+ )
135
+
136
+ issue = issues.to_a.find { |item| item.code == :page_has_no_outgoing_links }
137
+ assert issue
138
+ assert_equal "https://example.com/pricing", issue.url
139
+ end
140
+
141
+ def test_reports_nofollow_outlinks_and_inlink_follow_mix
142
+ issues = Crawlscope::IssueCollection.new
143
+
144
+ Crawlscope::Rules::Links.new.call(
145
+ urls: ["https://example.com/guide", "https://example.com/pricing", "https://example.com/about"],
146
+ pages: [
147
+ page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a><a href=\"/about\">About</a></main>"),
148
+ page(url: "https://example.com/about", body: "<main><a href=\"/pricing\">Pricing</a></main>"),
149
+ page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
150
+ ],
151
+ issues: issues,
152
+ context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
153
+ )
154
+
155
+ codes = issues.to_a.map(&:code)
156
+ assert_includes codes, :nofollow_internal_outlinks
157
+ assert_includes codes, :mixed_follow_internal_inlinks
158
+ end
159
+
160
+ def test_reports_only_nofollow_internal_inlinks
161
+ issues = Crawlscope::IssueCollection.new
162
+
163
+ Crawlscope::Rules::Links.new.call(
164
+ urls: ["https://example.com/guide", "https://example.com/pricing"],
165
+ pages: [
166
+ page(url: "https://example.com/guide", body: "<main><a href=\"/pricing\" rel=\"nofollow\">Pricing</a></main>"),
167
+ page(url: "https://example.com/pricing", body: "<main><p>Pricing</p></main>")
168
+ ],
169
+ issues: issues,
170
+ context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
171
+ )
172
+
173
+ assert_includes issues.to_a.map(&:code), :only_nofollow_internal_inlinks
174
+ end
175
+
176
+ def test_reports_https_pages_linking_to_internal_http_urls
177
+ issues = Crawlscope::IssueCollection.new
178
+
179
+ Crawlscope::Rules::Links.new.call(
180
+ urls: ["https://example.com/guide"],
181
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"http://example.com/pricing\">Pricing</a></main>")],
182
+ issues: issues,
183
+ context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
184
+ )
185
+
186
+ assert_includes issues.to_a.map(&:code), :http_internal_link
187
+ end
188
+
189
+ def test_reports_canonical_target_link_issues
190
+ issues = Crawlscope::IssueCollection.new
191
+ resolver = lambda do |target_url|
192
+ redirects = target_url == "https://example.com/canonical-about"
193
+ status = redirects ? 301 : 200
194
+ final_url = redirects ? "https://example.com/about" : target_url
195
+ {crawled: false, error: nil, final_url: final_url, status: status}
196
+ end
197
+
198
+ Crawlscope::Rules::Links.new.call(
199
+ urls: ["https://example.com/guide", "https://example.com/about"],
200
+ pages: [
201
+ page(url: "https://example.com/guide", body: "<main><a href=\"/about\">About</a></main>"),
202
+ page(
203
+ url: "https://example.com/about",
204
+ body: <<~HTML
205
+ <html>
206
+ <head><link rel="canonical" href="https://example.com/canonical-about"></head>
207
+ <body><main><p>About</p></main></body>
208
+ </html>
209
+ HTML
210
+ )
211
+ ],
212
+ issues: issues,
213
+ context: context(resolver: resolver)
214
+ )
215
+
216
+ codes = issues.to_a.map(&:code)
217
+ assert_includes codes, :canonical_no_internal_inlinks
218
+ assert_includes codes, :canonical_points_to_redirect
219
+ end
220
+
221
+ def test_reports_indexable_internal_pages_missing_from_sitemap
222
+ issues = Crawlscope::IssueCollection.new
223
+ resolver = lambda do |target_url|
224
+ {
225
+ crawled: false,
226
+ error: nil,
227
+ final_url: target_url,
228
+ html: true,
229
+ status: 200
230
+ }
231
+ end
232
+
233
+ Crawlscope::Rules::Links.new.call(
234
+ urls: ["https://example.com/guide"],
235
+ pages: [page(url: "https://example.com/guide", body: "<main><a href=\"/hidden\">Hidden</a></main>")],
236
+ issues: issues,
237
+ context: context(resolver: resolver)
238
+ )
239
+
240
+ issue = issues.to_a.find { |item| item.code == :indexable_page_missing_from_sitemap }
241
+ assert issue
242
+ assert_equal "https://example.com/hidden", issue.url
243
+ end
244
+
245
+ def test_reports_url_hygiene_issues
246
+ issues = Crawlscope::IssueCollection.new
247
+ long_path = "a" * 2_050
248
+
249
+ Crawlscope::Rules::Links.new.call(
250
+ urls: ["https://example.com//bad", "https://example.com/#{long_path}"],
251
+ pages: [
252
+ page(url: "https://example.com//bad", body: "<main><a href=\"/ok\">OK</a></main>"),
253
+ page(url: "https://example.com/#{long_path}", body: "<main><a href=\"/ok\">OK</a></main>")
254
+ ],
255
+ issues: issues,
256
+ context: context(resolver: ->(target_url) { {crawled: false, error: nil, final_url: target_url, html: true, status: 200} })
257
+ )
258
+
259
+ codes = issues.to_a.map(&:code)
260
+ assert_includes codes, :url_double_slash
261
+ assert_includes codes, :url_too_long
119
262
  end
120
263
 
121
264
  def test_counts_root_page_links_as_inbound_links
@@ -217,6 +360,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
217
360
  crawled: true,
218
361
  error: nil,
219
362
  final_url: target_url,
363
+ html: true,
220
364
  status: 200
221
365
  }
222
366
  when "https://example.com/missing"
@@ -224,6 +368,7 @@ class CrawlscopeLinksRuleTest < Minitest::Test
224
368
  crawled: false,
225
369
  error: nil,
226
370
  final_url: target_url,
371
+ html: false,
227
372
  status: 404
228
373
  }
229
374
  end
@@ -48,6 +48,42 @@ class CrawlscopeMetadataRuleTest < Minitest::Test
48
48
  refute_includes issues.to_a.map(&:code), :canonical_mismatch
49
49
  end
50
50
 
51
+ def test_reports_multiple_title_multiple_descriptions_empty_h1_and_sitemap_canonical_mismatch
52
+ issues = Crawlscope::IssueCollection.new
53
+ invalid_page = page(
54
+ body: <<~HTML
55
+ <html>
56
+ <head>
57
+ <title>About</title>
58
+ <title>Duplicate About</title>
59
+ <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
60
+ <meta name="description" content="Duplicate description">
61
+ <link rel="canonical" href="https://example.com/canonical-about">
62
+ <meta property="og:title" content="About">
63
+ <meta property="og:description" content="About page">
64
+ <meta property="og:url" content="https://example.com/about">
65
+ <meta property="og:type" content="website">
66
+ <meta property="og:image" content="https://example.com/icon.png">
67
+ </head>
68
+ <body><main><h1> </h1></main></body>
69
+ </html>
70
+ HTML
71
+ )
72
+
73
+ Crawlscope::Rules::Metadata.new.call(
74
+ urls: [invalid_page.url],
75
+ pages: [invalid_page],
76
+ issues: issues
77
+ )
78
+
79
+ codes = issues.to_a.map(&:code)
80
+ assert_includes codes, :multiple_title_tags
81
+ assert_includes codes, :multiple_meta_descriptions
82
+ assert_includes codes, :empty_h1
83
+ assert_includes codes, :canonical_mismatch
84
+ assert_includes codes, :non_canonical_page_in_sitemap
85
+ end
86
+
51
87
  private
52
88
 
53
89
  def page(url: "https://example.com/about", body: nil)
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeRakeTasksTest < Minitest::Test
6
+ def setup
7
+ @original_start = Crawlscope::Cli.method(:start)
8
+ end
9
+
10
+ def teardown
11
+ singleton_class = class << Crawlscope::Cli; self; end
12
+ original_start = @original_start
13
+ singleton_class.define_method(:start) do |*args, **kwargs|
14
+ original_start.call(*args, **kwargs)
15
+ end
16
+ end
17
+
18
+ def test_validate_passes_rake_arguments_to_cli
19
+ calls = capture_cli_calls
20
+
21
+ Crawlscope::RakeTasks.validate(
22
+ url: "http://localhost:3001",
23
+ sitemap_path: "http://localhost:3001/sitemap.xml",
24
+ rule_names: "metadata,links"
25
+ )
26
+
27
+ assert_equal(
28
+ ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata,links"],
29
+ calls.fetch(0).fetch(:argv)
30
+ )
31
+ end
32
+
33
+ def test_validate_rule_passes_rule_and_rake_arguments_to_cli
34
+ calls = capture_cli_calls
35
+
36
+ Crawlscope::RakeTasks.validate_rule(
37
+ "metadata",
38
+ url: "http://localhost:3001",
39
+ sitemap_path: "http://localhost:3001/sitemap.xml"
40
+ )
41
+
42
+ assert_equal(
43
+ ["validate", "--url", "http://localhost:3001", "--sitemap", "http://localhost:3001/sitemap.xml", "--rules", "metadata"],
44
+ calls.fetch(0).fetch(:argv)
45
+ )
46
+ end
47
+
48
+ def test_ldjson_passes_rake_url_argument_to_cli
49
+ calls = capture_cli_calls
50
+
51
+ Crawlscope::RakeTasks.ldjson(urls: "http://localhost:3001/article")
52
+
53
+ assert_equal(
54
+ ["ldjson", "--url", "http://localhost:3001/article"],
55
+ calls.fetch(0).fetch(:argv)
56
+ )
57
+ end
58
+
59
+ private
60
+
61
+ def capture_cli_calls
62
+ calls = []
63
+ singleton_class = class << Crawlscope::Cli; self; end
64
+ singleton_class.define_method(:start) do |argv, **kwargs|
65
+ calls << {argv: argv, kwargs: kwargs}
66
+ 0
67
+ end
68
+ calls
69
+ end
70
+ end
@@ -23,7 +23,7 @@ class CrawlscopeReporterTest < Minitest::Test
23
23
  refute_includes output, "Status: FAILED"
24
24
  end
25
25
 
26
- def test_reports_failed_result_with_severity_counts
26
+ def test_reports_failed_result_with_grouped_counts_and_offenses
27
27
  io = StringIO.new
28
28
  issues = Crawlscope::IssueCollection.new
29
29
  issues.add(code: :missing_title, severity: :warning, category: :metadata, url: "https://example.com/a", message: "missing <title>", details: {})
@@ -42,9 +42,13 @@ class CrawlscopeReporterTest < Minitest::Test
42
42
 
43
43
  assert_includes output, "Status: FAILED"
44
44
  assert_includes output, "Issues: 2"
45
+ assert_includes output, "Severity:"
45
46
  assert_includes output, "notice: 1"
46
47
  assert_includes output, "warning: 1"
47
- assert_includes output, "- [warning] https://example.com/a missing <title>"
48
- assert_includes output, "- [notice] https://example.com/b broken internal link"
48
+ assert_includes output, "Category:"
49
+ assert_includes output, "links: 1"
50
+ assert_includes output, "metadata: 1"
51
+ assert_includes output, " - [warning] missing_title https://example.com/a missing <title>"
52
+ assert_includes output, " - [notice] broken_internal_link https://example.com/b broken internal link"
49
53
  end
50
54
  end
@@ -49,6 +49,30 @@ class CrawlscopeSitemapTest < Minitest::Test
49
49
  assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
50
50
  end
51
51
 
52
+ def test_remote_sitemap_http_error_is_explicit
53
+ stub_request(:get, "https://www.example.com/sitemap.xml")
54
+ .to_return(status: 500, body: "<html><body>Error</body></html>")
55
+
56
+ parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
57
+
58
+ error = assert_raises(Crawlscope::ValidationError) do
59
+ parser.urls(base_url: "https://www.example.com")
60
+ end
61
+ assert_equal "Sitemap https://www.example.com/sitemap.xml returned HTTP 500", error.message
62
+ end
63
+
64
+ def test_invalid_sitemap_root_is_explicit
65
+ stub_request(:get, "https://www.example.com/sitemap.xml")
66
+ .to_return(status: 200, body: "<html><body>Error</body></html>")
67
+
68
+ parser = Crawlscope::Sitemap.new(path: "https://www.example.com/sitemap.xml")
69
+
70
+ error = assert_raises(Crawlscope::ValidationError) do
71
+ parser.urls(base_url: "https://www.example.com")
72
+ end
73
+ assert_equal 'Sitemap https://www.example.com/sitemap.xml has unexpected root "html"', error.message
74
+ end
75
+
52
76
  def test_rebases_remote_sitemap_index_children_to_base_url
53
77
  stub_request(:get, "http://localhost:3000/sitemap.xml")
54
78
  .to_return(
@@ -79,6 +79,62 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
79
79
  assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
80
80
  end
81
81
 
82
+ def test_reports_structured_data_missing_type
83
+ issues = Crawlscope::IssueCollection.new
84
+ rule = Crawlscope::Rules::StructuredData.new
85
+ page = page(
86
+ url: "https://example.com/articles/test",
87
+ body: <<~HTML
88
+ <html>
89
+ <head>
90
+ <script type="application/ld+json">
91
+ {"@context":"https://schema.org","headline":"Untyped article"}
92
+ </script>
93
+ </head>
94
+ <body><h1>Article</h1></body>
95
+ </html>
96
+ HTML
97
+ )
98
+
99
+ rule.call(
100
+ urls: [page.url],
101
+ pages: [page],
102
+ issues: issues,
103
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
104
+ )
105
+
106
+ assert_includes issues.to_a.map(&:code), :structured_data_missing_type
107
+ end
108
+
109
+ def test_reports_graph_entries_missing_type
110
+ issues = Crawlscope::IssueCollection.new
111
+ rule = Crawlscope::Rules::StructuredData.new
112
+ page = page(
113
+ url: "https://example.com/articles/test",
114
+ body: <<~HTML
115
+ <html>
116
+ <head>
117
+ <script type="application/ld+json">
118
+ {"@context":"https://schema.org","@type":"WebPage","@graph":[{"name":"Untyped node"}]}
119
+ </script>
120
+ </head>
121
+ <body><h1>Article</h1></body>
122
+ </html>
123
+ HTML
124
+ )
125
+
126
+ rule.call(
127
+ urls: [page.url],
128
+ pages: [page],
129
+ issues: issues,
130
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
131
+ )
132
+
133
+ issue = issues.to_a.find { |item| item.code == :structured_data_missing_type }
134
+ assert issue
135
+ assert_equal ["$.@graph[0]"], issue.details[:paths]
136
+ end
137
+
82
138
  def test_validates_job_posting_markup
83
139
  issues = Crawlscope::IssueCollection.new
84
140
  rule = Crawlscope::Rules::StructuredData.new
@@ -13,7 +13,20 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
13
13
 
14
14
  rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
15
15
 
16
- assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort
16
+ assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_pages_without_canonical duplicate_title].sort, issues.to_a.map(&:code).sort
17
+ end
18
+
19
+ def test_allows_duplicate_pages_when_canonicals_are_present
20
+ issues = Crawlscope::IssueCollection.new
21
+ rule = Crawlscope::Rules::Uniqueness.new
22
+ pages = [
23
+ page(url: "https://example.com/a", canonical: "https://example.com/a"),
24
+ page(url: "https://example.com/b", canonical: "https://example.com/a")
25
+ ]
26
+
27
+ rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
28
+
29
+ refute_includes issues.to_a.map(&:code), :duplicate_pages_without_canonical
17
30
  end
18
31
 
19
32
  def test_reports_near_duplicate_content
@@ -59,13 +72,15 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
59
72
  TEXT
60
73
  end
61
74
 
62
- def page(url:, content: nil)
75
+ def page(url:, content: nil, canonical: nil)
63
76
  repeated_text = content || ("Useful content " * 30).strip
77
+ canonical_tag = canonical ? %(<link rel="canonical" href="#{canonical}">) : ""
64
78
  body = <<~HTML
65
79
  <html>
66
80
  <head>
67
81
  <title>Example Title</title>
68
82
  <meta name="description" content="Example description">
83
+ #{canonical_tag}
69
84
  </head>
70
85
  <body>
71
86
  <main>#{repeated_text}</main>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlscope
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paulo Fidalgo
@@ -239,6 +239,7 @@ files:
239
239
  - test/crawlscope/links_rule_test.rb
240
240
  - test/crawlscope/loader_test.rb
241
241
  - test/crawlscope/metadata_rule_test.rb
242
+ - test/crawlscope/rake_tasks_test.rb
242
243
  - test/crawlscope/reporter_test.rb
243
244
  - test/crawlscope/rule_registry_test.rb
244
245
  - test/crawlscope/run_test.rb