crawlscope 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeMetadataRuleTest < Minitest::Test
6
+ def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
7
+ issues = Crawlscope::IssueCollection.new
8
+
9
+ Crawlscope::Rules::Metadata.new.call(
10
+ urls: [page.url],
11
+ pages: [page],
12
+ issues: issues
13
+ )
14
+
15
+ codes = issues.to_a.map(&:code)
16
+ assert_includes codes, :meta_description_too_short
17
+ assert_includes codes, :multiple_h1
18
+ assert_includes codes, :incomplete_open_graph_tags
19
+ end
20
+
21
+ def test_allows_localhost_page_with_matching_production_canonical_path
22
+ issues = Crawlscope::IssueCollection.new
23
+ local_page = page(
24
+ url: "http://localhost:3000/about",
25
+ body: <<~HTML
26
+ <html>
27
+ <head>
28
+ <title>About</title>
29
+ <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
30
+ <link rel="canonical" href="https://www.example.com/about">
31
+ <meta property="og:title" content="About">
32
+ <meta property="og:description" content="About page">
33
+ <meta property="og:url" content="https://www.example.com/about">
34
+ <meta property="og:type" content="website">
35
+ <meta property="og:image" content="https://www.example.com/icon.png">
36
+ </head>
37
+ <body><main><h1>About</h1></main></body>
38
+ </html>
39
+ HTML
40
+ )
41
+
42
+ Crawlscope::Rules::Metadata.new.call(
43
+ urls: [local_page.url],
44
+ pages: [local_page],
45
+ issues: issues
46
+ )
47
+
48
+ refute_includes issues.to_a.map(&:code), :canonical_mismatch
49
+ end
50
+
51
+ private
52
+
53
+ def page(url: "https://example.com/about", body: nil)
54
+ body ||= <<~HTML
55
+ <html>
56
+ <head>
57
+ <title>About</title>
58
+ <meta name="description" content="Too short">
59
+ <link rel="canonical" href="https://example.com/about">
60
+ <meta property="og:title" content="About">
61
+ </head>
62
+ <body><main><h1>About</h1><h1>Team</h1></main></body>
63
+ </html>
64
+ HTML
65
+
66
+ Crawlscope::Page.new(
67
+ url: url,
68
+ normalized_url: Crawlscope::Url.normalize(url, base_url: url),
69
+ final_url: url,
70
+ normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
71
+ status: 200,
72
+ headers: {"content-type" => "text/html"},
73
+ body: body,
74
+ doc: Nokogiri::HTML(body)
75
+ )
76
+ end
77
+ end
@@ -79,6 +79,97 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
79
79
  assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
80
80
  end
81
81
 
82
+ def test_validates_job_posting_markup
83
+ issues = Crawlscope::IssueCollection.new
84
+ rule = Crawlscope::Rules::StructuredData.new
85
+ page = page(
86
+ url: "https://example.com/careers/sales-partner",
87
+ body: <<~HTML
88
+ <html>
89
+ <head>
90
+ <script type="application/ld+json">
91
+ {
92
+ "@context":"https://schema.org/",
93
+ "@type":"JobPosting",
94
+ "title":"Sales Partner",
95
+ "description":"A real role description.",
96
+ "datePosted":"2026-04-28",
97
+ "hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
98
+ "jobLocationType":"TELECOMMUTE",
99
+ "applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
100
+ }
101
+ </script>
102
+ </head>
103
+ <body><h1>Sales Partner</h1></body>
104
+ </html>
105
+ HTML
106
+ )
107
+
108
+ rule.call(
109
+ urls: [page.url],
110
+ pages: [page],
111
+ issues: issues,
112
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
113
+ )
114
+
115
+ assert_empty issues.to_a
116
+ end
117
+
118
+ def test_reports_schema_errors_for_invalid_job_posting_markup
119
+ issues = Crawlscope::IssueCollection.new
120
+ rule = Crawlscope::Rules::StructuredData.new
121
+ page = page(
122
+ url: "https://example.com/careers/sales-partner",
123
+ body: <<~HTML
124
+ <html>
125
+ <head>
126
+ <script type="application/ld+json">
127
+ {"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
128
+ </script>
129
+ </head>
130
+ <body><h1>Sales Partner</h1></body>
131
+ </html>
132
+ HTML
133
+ )
134
+
135
+ rule.call(
136
+ urls: [page.url],
137
+ pages: [page],
138
+ issues: issues,
139
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
140
+ )
141
+
142
+ assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
143
+ assert_includes issues.to_a.first.message, "description"
144
+ end
145
+
146
+ def test_reports_missing_job_posting_for_career_detail_pages
147
+ issues = Crawlscope::IssueCollection.new
148
+ rule = Crawlscope::Rules::StructuredData.new
149
+ page = page(
150
+ url: "https://example.com/careers/sales-partner",
151
+ body: <<~HTML
152
+ <html>
153
+ <head>
154
+ <script type="application/ld+json">
155
+ {"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
156
+ </script>
157
+ </head>
158
+ <body><h1>Sales Partner</h1></body>
159
+ </html>
160
+ HTML
161
+ )
162
+
163
+ rule.call(
164
+ urls: [page.url],
165
+ pages: [page],
166
+ issues: issues,
167
+ context: {schema_registry: Crawlscope::SchemaRegistry.default}
168
+ )
169
+
170
+ assert_equal [:missing_job_posting], issues.to_a.map(&:code)
171
+ end
172
+
82
173
  private
83
174
 
84
175
  def page(url:, body:)
@@ -16,10 +16,51 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
16
16
  assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort
17
17
  end
18
18
 
19
+ def test_reports_near_duplicate_content
20
+ issues = Crawlscope::IssueCollection.new
21
+ rule = Crawlscope::Rules::Uniqueness.new
22
+ pages = [
23
+ page(url: "https://example.com/a", content: near_duplicate_content("reliable")),
24
+ page(url: "https://example.com/b", content: near_duplicate_content("dependable"))
25
+ ]
26
+
27
+ rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
28
+
29
+ issue = issues.to_a.find { |item| item.code == :near_duplicate_content }
30
+ assert issue
31
+ assert_operator issue.details[:similarity], :>=, issue.details[:threshold]
32
+ end
33
+
34
+ def test_skips_near_duplicate_scan_when_page_count_exceeds_limit
35
+ issues = Crawlscope::IssueCollection.new
36
+ rule = Crawlscope::Rules::Uniqueness.new(max_near_duplicate_pages: 1)
37
+ pages = [
38
+ page(url: "https://example.com/a", content: near_duplicate_content("reliable")),
39
+ page(url: "https://example.com/b", content: near_duplicate_content("dependable"))
40
+ ]
41
+
42
+ rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
43
+
44
+ skip_issue = issues.to_a.find { |item| item.code == :near_duplicate_scan_skipped }
45
+ refute issues.to_a.any? { |item| item.code == :near_duplicate_content }
46
+ assert_equal :warning, skip_issue.severity
47
+ assert_equal({max_pages: 1, page_count: 2}, skip_issue.details)
48
+ end
49
+
19
50
  private
20
51
 
21
- def page(url:)
22
- repeated_text = ("Useful content " * 30).strip
52
+ def near_duplicate_content(adjective)
53
+ <<~TEXT.gsub(/\s+/, " ").strip
54
+ This page summarizes practical hotel review patterns for operators who need #{adjective}
55
+ service insights across locations. It compares recurring comments about staff, rooms,
56
+ cleanliness, check-in, breakfast, parking, and amenities so teams can prioritize fixes.
57
+ The analysis highlights repeat themes, explains why guests mention them, and keeps the
58
+ wording focused on decisions that improve daily operations.
59
+ TEXT
60
+ end
61
+
62
+ def page(url:, content: nil)
63
+ repeated_text = content || ("Useful content " * 30).strip
23
64
  body = <<~HTML
24
65
  <html>
25
66
  <head>
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+ require "rake"
5
+
6
+ unless respond_to?(:release_version, true)
7
+ load File.expand_path("../Rakefile", __dir__)
8
+ end
9
+
10
+ class ReleaseTaskTest < Minitest::Test
11
+ def test_release_version_increments_patch_from_current_version
12
+ major, minor, patch = Crawlscope::VERSION.split(".").map(&:to_i)
13
+
14
+ assert_equal "#{major}.#{minor}.#{patch + 1}", release_version("patch")
15
+ end
16
+
17
+ def test_release_version_accepts_explicit_semantic_version
18
+ assert_equal "0.3.0", release_version("0.3.0")
19
+ end
20
+
21
+ def test_validate_release_version_rejects_current_version
22
+ error = assert_raises(ArgumentError) do
23
+ validate_release_version!("0.2.7", "0.2.7")
24
+ end
25
+
26
+ assert_equal(
27
+ "Release version 0.2.7 must be newer than current version 0.2.7.",
28
+ error.message
29
+ )
30
+ end
31
+
32
+ def test_validate_release_version_rejects_existing_local_tag
33
+ @local_release_tag_exists = true
34
+ @remote_release_tag_exists = false
35
+
36
+ error = assert_raises(ArgumentError) do
37
+ validate_release_version!("0.2.8", "0.2.7")
38
+ end
39
+
40
+ assert_equal "Release tag v0.2.8 already exists locally.", error.message
41
+ end
42
+
43
+ def test_validate_release_version_rejects_existing_remote_tag
44
+ @local_release_tag_exists = false
45
+ @remote_release_tag_exists = true
46
+
47
+ error = assert_raises(ArgumentError) do
48
+ validate_release_version!("0.2.8", "0.2.7")
49
+ end
50
+
51
+ assert_equal "Release tag v0.2.8 already exists on origin.", error.message
52
+ end
53
+
54
+ def test_remote_release_tag_command_asks_git_to_fail_when_no_tag_matches
55
+ assert_equal(
56
+ "git ls-remote --exit-code --tags origin refs/tags/v0.2.8",
57
+ remote_release_tag_command("v0.2.8")
58
+ )
59
+ end
60
+
61
+ def test_changelog_command_prepends_the_next_release
62
+ assert_equal(
63
+ [
64
+ "git-cliff",
65
+ "-c",
66
+ "cliff.toml",
67
+ "--unreleased",
68
+ "--tag",
69
+ "v0.2.8",
70
+ "--prepend",
71
+ "CHANGELOG.md"
72
+ ],
73
+ changelog_command("0.2.8")
74
+ )
75
+ end
76
+
77
+ private
78
+
79
+ def local_release_tag_exists?(_tag)
80
+ @local_release_tag_exists || false
81
+ end
82
+
83
+ def remote_release_tag_exists?(_tag)
84
+ @remote_release_tag_exists || false
85
+ end
86
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlscope
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paulo Fidalgo
@@ -199,6 +199,7 @@ files:
199
199
  - lib/crawlscope/context.rb
200
200
  - lib/crawlscope/crawl.rb
201
201
  - lib/crawlscope/crawler.rb
202
+ - lib/crawlscope/document_text.rb
202
203
  - lib/crawlscope/http.rb
203
204
  - lib/crawlscope/issue.rb
204
205
  - lib/crawlscope/issue_collection.rb
@@ -208,6 +209,8 @@ files:
208
209
  - lib/crawlscope/reporter.rb
209
210
  - lib/crawlscope/result.rb
210
211
  - lib/crawlscope/rule_registry.rb
212
+ - lib/crawlscope/rules/content_quality.rb
213
+ - lib/crawlscope/rules/indexability.rb
211
214
  - lib/crawlscope/rules/links.rb
212
215
  - lib/crawlscope/rules/metadata.rb
213
216
  - lib/crawlscope/rules/structured_data.rb
@@ -228,11 +231,14 @@ files:
228
231
  - test/crawlscope/browser_test.rb
229
232
  - test/crawlscope/cli_test.rb
230
233
  - test/crawlscope/configuration_test.rb
234
+ - test/crawlscope/content_quality_rule_test.rb
231
235
  - test/crawlscope/crawl_test.rb
232
236
  - test/crawlscope/crawler_test.rb
233
237
  - test/crawlscope/http_test.rb
238
+ - test/crawlscope/indexability_rule_test.rb
234
239
  - test/crawlscope/links_rule_test.rb
235
240
  - test/crawlscope/loader_test.rb
241
+ - test/crawlscope/metadata_rule_test.rb
236
242
  - test/crawlscope/reporter_test.rb
237
243
  - test/crawlscope/rule_registry_test.rb
238
244
  - test/crawlscope/run_test.rb
@@ -246,6 +252,7 @@ files:
246
252
  - test/crawlscope/structured_data_writer_test.rb
247
253
  - test/crawlscope/uniqueness_rule_test.rb
248
254
  - test/crawlscope/url_test.rb
255
+ - test/release_task_test.rb
249
256
  - test/test_helper.rb
250
257
  homepage: https://www.ethos-link.com/opensource/crawlscope
251
258
  licenses:
@@ -274,7 +281,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
281
  - !ruby/object:Gem::Version
275
282
  version: '0'
276
283
  requirements: []
277
- rubygems_version: 4.0.6
284
+ rubygems_version: 4.0.10
278
285
  specification_version: 4
279
286
  summary: Audit sitemap URLs for metadata, structured data, uniqueness, and links
280
287
  test_files: []