crawlscope 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +67 -0
- data/README.md +46 -9
- data/lib/crawlscope/cli.rb +5 -0
- data/lib/crawlscope/crawl.rb +6 -0
- data/lib/crawlscope/document_text.rb +40 -0
- data/lib/crawlscope/rule_registry.rb +3 -1
- data/lib/crawlscope/rules/content_quality.rb +99 -0
- data/lib/crawlscope/rules/indexability.rb +66 -0
- data/lib/crawlscope/rules/links.rb +24 -6
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +47 -0
- data/lib/crawlscope/rules/uniqueness.rb +76 -4
- data/lib/crawlscope/schemas.rb +52 -1
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +11 -1
- data/test/crawlscope/cli_test.rb +19 -5
- data/test/crawlscope/configuration_test.rb +8 -1
- data/test/crawlscope/content_quality_rule_test.rb +68 -0
- data/test/crawlscope/crawl_test.rb +23 -3
- data/test/crawlscope/indexability_rule_test.rb +96 -0
- data/test/crawlscope/links_rule_test.rb +39 -0
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/structured_data_rule_test.rb +91 -0
- data/test/crawlscope/uniqueness_rule_test.rb +43 -2
- data/test/release_task_test.rb +86 -0
- metadata +9 -2
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeMetadataRuleTest < Minitest::Test
|
|
6
|
+
def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
|
|
7
|
+
issues = Crawlscope::IssueCollection.new
|
|
8
|
+
|
|
9
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
10
|
+
urls: [page.url],
|
|
11
|
+
pages: [page],
|
|
12
|
+
issues: issues
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
codes = issues.to_a.map(&:code)
|
|
16
|
+
assert_includes codes, :meta_description_too_short
|
|
17
|
+
assert_includes codes, :multiple_h1
|
|
18
|
+
assert_includes codes, :incomplete_open_graph_tags
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_allows_localhost_page_with_matching_production_canonical_path
|
|
22
|
+
issues = Crawlscope::IssueCollection.new
|
|
23
|
+
local_page = page(
|
|
24
|
+
url: "http://localhost:3000/about",
|
|
25
|
+
body: <<~HTML
|
|
26
|
+
<html>
|
|
27
|
+
<head>
|
|
28
|
+
<title>About</title>
|
|
29
|
+
<meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
|
|
30
|
+
<link rel="canonical" href="https://www.example.com/about">
|
|
31
|
+
<meta property="og:title" content="About">
|
|
32
|
+
<meta property="og:description" content="About page">
|
|
33
|
+
<meta property="og:url" content="https://www.example.com/about">
|
|
34
|
+
<meta property="og:type" content="website">
|
|
35
|
+
<meta property="og:image" content="https://www.example.com/icon.png">
|
|
36
|
+
</head>
|
|
37
|
+
<body><main><h1>About</h1></main></body>
|
|
38
|
+
</html>
|
|
39
|
+
HTML
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
Crawlscope::Rules::Metadata.new.call(
|
|
43
|
+
urls: [local_page.url],
|
|
44
|
+
pages: [local_page],
|
|
45
|
+
issues: issues
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
refute_includes issues.to_a.map(&:code), :canonical_mismatch
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
def page(url: "https://example.com/about", body: nil)
|
|
54
|
+
body ||= <<~HTML
|
|
55
|
+
<html>
|
|
56
|
+
<head>
|
|
57
|
+
<title>About</title>
|
|
58
|
+
<meta name="description" content="Too short">
|
|
59
|
+
<link rel="canonical" href="https://example.com/about">
|
|
60
|
+
<meta property="og:title" content="About">
|
|
61
|
+
</head>
|
|
62
|
+
<body><main><h1>About</h1><h1>Team</h1></main></body>
|
|
63
|
+
</html>
|
|
64
|
+
HTML
|
|
65
|
+
|
|
66
|
+
Crawlscope::Page.new(
|
|
67
|
+
url: url,
|
|
68
|
+
normalized_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
69
|
+
final_url: url,
|
|
70
|
+
normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
|
|
71
|
+
status: 200,
|
|
72
|
+
headers: {"content-type" => "text/html"},
|
|
73
|
+
body: body,
|
|
74
|
+
doc: Nokogiri::HTML(body)
|
|
75
|
+
)
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -79,6 +79,97 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
|
|
|
79
79
|
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
|
|
80
80
|
end
|
|
81
81
|
|
|
82
|
+
def test_validates_job_posting_markup
|
|
83
|
+
issues = Crawlscope::IssueCollection.new
|
|
84
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
85
|
+
page = page(
|
|
86
|
+
url: "https://example.com/careers/sales-partner",
|
|
87
|
+
body: <<~HTML
|
|
88
|
+
<html>
|
|
89
|
+
<head>
|
|
90
|
+
<script type="application/ld+json">
|
|
91
|
+
{
|
|
92
|
+
"@context":"https://schema.org/",
|
|
93
|
+
"@type":"JobPosting",
|
|
94
|
+
"title":"Sales Partner",
|
|
95
|
+
"description":"A real role description.",
|
|
96
|
+
"datePosted":"2026-04-28",
|
|
97
|
+
"hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
|
|
98
|
+
"jobLocationType":"TELECOMMUTE",
|
|
99
|
+
"applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
|
|
100
|
+
}
|
|
101
|
+
</script>
|
|
102
|
+
</head>
|
|
103
|
+
<body><h1>Sales Partner</h1></body>
|
|
104
|
+
</html>
|
|
105
|
+
HTML
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
rule.call(
|
|
109
|
+
urls: [page.url],
|
|
110
|
+
pages: [page],
|
|
111
|
+
issues: issues,
|
|
112
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
assert_empty issues.to_a
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def test_reports_schema_errors_for_invalid_job_posting_markup
|
|
119
|
+
issues = Crawlscope::IssueCollection.new
|
|
120
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
121
|
+
page = page(
|
|
122
|
+
url: "https://example.com/careers/sales-partner",
|
|
123
|
+
body: <<~HTML
|
|
124
|
+
<html>
|
|
125
|
+
<head>
|
|
126
|
+
<script type="application/ld+json">
|
|
127
|
+
{"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
|
|
128
|
+
</script>
|
|
129
|
+
</head>
|
|
130
|
+
<body><h1>Sales Partner</h1></body>
|
|
131
|
+
</html>
|
|
132
|
+
HTML
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
rule.call(
|
|
136
|
+
urls: [page.url],
|
|
137
|
+
pages: [page],
|
|
138
|
+
issues: issues,
|
|
139
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
assert_equal [:structured_data_schema_error], issues.to_a.map(&:code)
|
|
143
|
+
assert_includes issues.to_a.first.message, "description"
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def test_reports_missing_job_posting_for_career_detail_pages
|
|
147
|
+
issues = Crawlscope::IssueCollection.new
|
|
148
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
149
|
+
page = page(
|
|
150
|
+
url: "https://example.com/careers/sales-partner",
|
|
151
|
+
body: <<~HTML
|
|
152
|
+
<html>
|
|
153
|
+
<head>
|
|
154
|
+
<script type="application/ld+json">
|
|
155
|
+
{"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
|
|
156
|
+
</script>
|
|
157
|
+
</head>
|
|
158
|
+
<body><h1>Sales Partner</h1></body>
|
|
159
|
+
</html>
|
|
160
|
+
HTML
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
rule.call(
|
|
164
|
+
urls: [page.url],
|
|
165
|
+
pages: [page],
|
|
166
|
+
issues: issues,
|
|
167
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
assert_equal [:missing_job_posting], issues.to_a.map(&:code)
|
|
171
|
+
end
|
|
172
|
+
|
|
82
173
|
private
|
|
83
174
|
|
|
84
175
|
def page(url:, body:)
|
|
@@ -16,10 +16,51 @@ class CrawlscopeUniquenessRuleTest < Minitest::Test
|
|
|
16
16
|
assert_equal %i[duplicate_content_fingerprint duplicate_meta_description duplicate_title].sort, issues.to_a.map(&:code).sort
|
|
17
17
|
end
|
|
18
18
|
|
|
19
|
+
def test_reports_near_duplicate_content
|
|
20
|
+
issues = Crawlscope::IssueCollection.new
|
|
21
|
+
rule = Crawlscope::Rules::Uniqueness.new
|
|
22
|
+
pages = [
|
|
23
|
+
page(url: "https://example.com/a", content: near_duplicate_content("reliable")),
|
|
24
|
+
page(url: "https://example.com/b", content: near_duplicate_content("dependable"))
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
|
|
28
|
+
|
|
29
|
+
issue = issues.to_a.find { |item| item.code == :near_duplicate_content }
|
|
30
|
+
assert issue
|
|
31
|
+
assert_operator issue.details[:similarity], :>=, issue.details[:threshold]
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def test_skips_near_duplicate_scan_when_page_count_exceeds_limit
|
|
35
|
+
issues = Crawlscope::IssueCollection.new
|
|
36
|
+
rule = Crawlscope::Rules::Uniqueness.new(max_near_duplicate_pages: 1)
|
|
37
|
+
pages = [
|
|
38
|
+
page(url: "https://example.com/a", content: near_duplicate_content("reliable")),
|
|
39
|
+
page(url: "https://example.com/b", content: near_duplicate_content("dependable"))
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
rule.call(urls: pages.map(&:url), pages: pages, issues: issues, context: {})
|
|
43
|
+
|
|
44
|
+
skip_issue = issues.to_a.find { |item| item.code == :near_duplicate_scan_skipped }
|
|
45
|
+
refute issues.to_a.any? { |item| item.code == :near_duplicate_content }
|
|
46
|
+
assert_equal :warning, skip_issue.severity
|
|
47
|
+
assert_equal({max_pages: 1, page_count: 2}, skip_issue.details)
|
|
48
|
+
end
|
|
49
|
+
|
|
19
50
|
private
|
|
20
51
|
|
|
21
|
-
def
|
|
22
|
-
|
|
52
|
+
def near_duplicate_content(adjective)
|
|
53
|
+
<<~TEXT.gsub(/\s+/, " ").strip
|
|
54
|
+
This page summarizes practical hotel review patterns for operators who need #{adjective}
|
|
55
|
+
service insights across locations. It compares recurring comments about staff, rooms,
|
|
56
|
+
cleanliness, check-in, breakfast, parking, and amenities so teams can prioritize fixes.
|
|
57
|
+
The analysis highlights repeat themes, explains why guests mention them, and keeps the
|
|
58
|
+
wording focused on decisions that improve daily operations.
|
|
59
|
+
TEXT
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def page(url:, content: nil)
|
|
63
|
+
repeated_text = content || ("Useful content " * 30).strip
|
|
23
64
|
body = <<~HTML
|
|
24
65
|
<html>
|
|
25
66
|
<head>
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
require "rake"
|
|
5
|
+
|
|
6
|
+
unless respond_to?(:release_version, true)
|
|
7
|
+
load File.expand_path("../Rakefile", __dir__)
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
class ReleaseTaskTest < Minitest::Test
|
|
11
|
+
def test_release_version_increments_patch_from_current_version
|
|
12
|
+
major, minor, patch = Crawlscope::VERSION.split(".").map(&:to_i)
|
|
13
|
+
|
|
14
|
+
assert_equal "#{major}.#{minor}.#{patch + 1}", release_version("patch")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_release_version_accepts_explicit_semantic_version
|
|
18
|
+
assert_equal "0.3.0", release_version("0.3.0")
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_validate_release_version_rejects_current_version
|
|
22
|
+
error = assert_raises(ArgumentError) do
|
|
23
|
+
validate_release_version!("0.2.7", "0.2.7")
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
assert_equal(
|
|
27
|
+
"Release version 0.2.7 must be newer than current version 0.2.7.",
|
|
28
|
+
error.message
|
|
29
|
+
)
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def test_validate_release_version_rejects_existing_local_tag
|
|
33
|
+
@local_release_tag_exists = true
|
|
34
|
+
@remote_release_tag_exists = false
|
|
35
|
+
|
|
36
|
+
error = assert_raises(ArgumentError) do
|
|
37
|
+
validate_release_version!("0.2.8", "0.2.7")
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
assert_equal "Release tag v0.2.8 already exists locally.", error.message
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def test_validate_release_version_rejects_existing_remote_tag
|
|
44
|
+
@local_release_tag_exists = false
|
|
45
|
+
@remote_release_tag_exists = true
|
|
46
|
+
|
|
47
|
+
error = assert_raises(ArgumentError) do
|
|
48
|
+
validate_release_version!("0.2.8", "0.2.7")
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
assert_equal "Release tag v0.2.8 already exists on origin.", error.message
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def test_remote_release_tag_command_asks_git_to_fail_when_no_tag_matches
|
|
55
|
+
assert_equal(
|
|
56
|
+
"git ls-remote --exit-code --tags origin refs/tags/v0.2.8",
|
|
57
|
+
remote_release_tag_command("v0.2.8")
|
|
58
|
+
)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def test_changelog_command_prepends_the_next_release
|
|
62
|
+
assert_equal(
|
|
63
|
+
[
|
|
64
|
+
"git-cliff",
|
|
65
|
+
"-c",
|
|
66
|
+
"cliff.toml",
|
|
67
|
+
"--unreleased",
|
|
68
|
+
"--tag",
|
|
69
|
+
"v0.2.8",
|
|
70
|
+
"--prepend",
|
|
71
|
+
"CHANGELOG.md"
|
|
72
|
+
],
|
|
73
|
+
changelog_command("0.2.8")
|
|
74
|
+
)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
private
|
|
78
|
+
|
|
79
|
+
def local_release_tag_exists?(_tag)
|
|
80
|
+
@local_release_tag_exists || false
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def remote_release_tag_exists?(_tag)
|
|
84
|
+
@remote_release_tag_exists || false
|
|
85
|
+
end
|
|
86
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -199,6 +199,7 @@ files:
|
|
|
199
199
|
- lib/crawlscope/context.rb
|
|
200
200
|
- lib/crawlscope/crawl.rb
|
|
201
201
|
- lib/crawlscope/crawler.rb
|
|
202
|
+
- lib/crawlscope/document_text.rb
|
|
202
203
|
- lib/crawlscope/http.rb
|
|
203
204
|
- lib/crawlscope/issue.rb
|
|
204
205
|
- lib/crawlscope/issue_collection.rb
|
|
@@ -208,6 +209,8 @@ files:
|
|
|
208
209
|
- lib/crawlscope/reporter.rb
|
|
209
210
|
- lib/crawlscope/result.rb
|
|
210
211
|
- lib/crawlscope/rule_registry.rb
|
|
212
|
+
- lib/crawlscope/rules/content_quality.rb
|
|
213
|
+
- lib/crawlscope/rules/indexability.rb
|
|
211
214
|
- lib/crawlscope/rules/links.rb
|
|
212
215
|
- lib/crawlscope/rules/metadata.rb
|
|
213
216
|
- lib/crawlscope/rules/structured_data.rb
|
|
@@ -228,11 +231,14 @@ files:
|
|
|
228
231
|
- test/crawlscope/browser_test.rb
|
|
229
232
|
- test/crawlscope/cli_test.rb
|
|
230
233
|
- test/crawlscope/configuration_test.rb
|
|
234
|
+
- test/crawlscope/content_quality_rule_test.rb
|
|
231
235
|
- test/crawlscope/crawl_test.rb
|
|
232
236
|
- test/crawlscope/crawler_test.rb
|
|
233
237
|
- test/crawlscope/http_test.rb
|
|
238
|
+
- test/crawlscope/indexability_rule_test.rb
|
|
234
239
|
- test/crawlscope/links_rule_test.rb
|
|
235
240
|
- test/crawlscope/loader_test.rb
|
|
241
|
+
- test/crawlscope/metadata_rule_test.rb
|
|
236
242
|
- test/crawlscope/reporter_test.rb
|
|
237
243
|
- test/crawlscope/rule_registry_test.rb
|
|
238
244
|
- test/crawlscope/run_test.rb
|
|
@@ -246,6 +252,7 @@ files:
|
|
|
246
252
|
- test/crawlscope/structured_data_writer_test.rb
|
|
247
253
|
- test/crawlscope/uniqueness_rule_test.rb
|
|
248
254
|
- test/crawlscope/url_test.rb
|
|
255
|
+
- test/release_task_test.rb
|
|
249
256
|
- test/test_helper.rb
|
|
250
257
|
homepage: https://www.ethos-link.com/opensource/crawlscope
|
|
251
258
|
licenses:
|
|
@@ -274,7 +281,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
274
281
|
- !ruby/object:Gem::Version
|
|
275
282
|
version: '0'
|
|
276
283
|
requirements: []
|
|
277
|
-
rubygems_version: 4.0.
|
|
284
|
+
rubygems_version: 4.0.10
|
|
278
285
|
specification_version: 4
|
|
279
286
|
summary: Audit sitemap URLs for metadata, structured data, uniqueness, and links
|
|
280
287
|
test_files: []
|