seobuddy 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seobuddy/__init__.py +10 -0
- seobuddy/__main__.py +3 -0
- seobuddy/auditor.py +85 -0
- seobuddy/checks/__init__.py +1 -0
- seobuddy/checks/base.py +167 -0
- seobuddy/checks/canonical.py +64 -0
- seobuddy/checks/content.py +61 -0
- seobuddy/checks/headings.py +57 -0
- seobuddy/checks/hreflang.py +155 -0
- seobuddy/checks/images.py +49 -0
- seobuddy/checks/jsonld.py +100 -0
- seobuddy/checks/links.py +119 -0
- seobuddy/checks/meta.py +59 -0
- seobuddy/checks/opengraph.py +37 -0
- seobuddy/checks/robots_check.py +51 -0
- seobuddy/checks/sitemap_check.py +100 -0
- seobuddy/checks/technical.py +53 -0
- seobuddy/checks/title.py +59 -0
- seobuddy/cli.py +225 -0
- seobuddy/crawler.py +180 -0
- seobuddy/display.py +257 -0
- seobuddy/html_utils.py +47 -0
- seobuddy/models.py +99 -0
- seobuddy/report.py +148 -0
- seobuddy/site_resources.py +251 -0
- seobuddy/url_utils.py +155 -0
- seobuddy-0.2.0.dist-info/METADATA +449 -0
- seobuddy-0.2.0.dist-info/RECORD +31 -0
- seobuddy-0.2.0.dist-info/WHEEL +4 -0
- seobuddy-0.2.0.dist-info/entry_points.txt +2 -0
- seobuddy-0.2.0.dist-info/licenses/LICENSE +21 -0
seobuddy/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""SEObuddy — technical SEO audit CLI."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.2.0"
|
|
4
|
+
|
|
5
|
+
AUTHOR_NAME = "NikitaY.com"
|
|
6
|
+
AUTHOR_URL = "https://nikitay.com/"
|
|
7
|
+
GITHUB_URL = "https://github.com/nikitaycs50/SEObuddy"
|
|
8
|
+
COPYRIGHT = f"Copyright © 2026 {AUTHOR_NAME}. All rights reserved."
|
|
9
|
+
CREATED_BY = f"Created by {AUTHOR_NAME}"
|
|
10
|
+
CLI_EPILOG = f"{CREATED_BY} — {AUTHOR_URL}"
|
seobuddy/__main__.py
ADDED
seobuddy/auditor.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Orchestrates all SEO checks for a single page."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
|
|
8
|
+
from seobuddy.checks import (
|
|
9
|
+
canonical,
|
|
10
|
+
content,
|
|
11
|
+
headings,
|
|
12
|
+
hreflang,
|
|
13
|
+
images,
|
|
14
|
+
jsonld,
|
|
15
|
+
links,
|
|
16
|
+
meta,
|
|
17
|
+
opengraph,
|
|
18
|
+
technical,
|
|
19
|
+
title,
|
|
20
|
+
)
|
|
21
|
+
from seobuddy.checks.base import CATEGORY_ORDER, weighted_page_score
|
|
22
|
+
from seobuddy.html_utils import is_html_content, parse_html
|
|
23
|
+
from seobuddy.models import AuditConfig, CheckResult, CheckStatus, PageAudit, PageData, SiteContext
|
|
24
|
+
from seobuddy.url_utils import path_display
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _empty_result(name: str, weight: float) -> CheckResult:
    """Build the failing placeholder result used when a page has no HTML.

    Args:
        name: Category key the placeholder stands in for.
        weight: Fallback weight, used only when ``name`` is not a key of
            ``CATEGORY_WEIGHTS``.

    Returns:
        A zero-score ``CheckResult`` with FAIL status and a single finding.
    """
    # Imported locally, as in the original; only CATEGORY_ORDER and
    # weighted_page_score are imported at module level.
    from seobuddy.checks.base import CATEGORY_WEIGHTS

    return CheckResult(
        name=name,
        score=0,
        weight=CATEGORY_WEIGHTS.get(name, weight),
        status=CheckStatus.FAIL,
        findings=["No HTML content to analyze"],
        suggestions=[],
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _minimal_soup() -> BeautifulSoup:
    """Return an empty ``<html>`` document parsed with lxml.

    Used as a stand-in so the technical check can still run when the page
    itself yielded no parseable HTML.
    """
    placeholder_markup = "<html></html>"
    return BeautifulSoup(placeholder_markup, "lxml")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def audit_page(
    page: PageData,
    context: SiteContext,
    config: AuditConfig,
    client: httpx.AsyncClient | None = None,
) -> PageAudit:
    """Run every page-level check for ``page`` and compute its weighted score.

    Args:
        page: Fetched page (HTML body, headers, status code, final URL).
        context: Site-wide state shared between pages; passed to every check.
        config: Audit configuration, forwarded to the async links check.
        client: Optional HTTP client, forwarded to the async links check.

    Returns:
        A ``PageAudit`` holding one ``CheckResult`` per category, the weighted
        page score and the display path of the final URL.
    """
    html = page.html or ""
    content_type = page.headers.get("content-type") or ""
    # Only successful responses with a non-blank HTML body are parsed.
    renderable = page.status_code < 400 and html.strip() and is_html_content(content_type, html)
    soup = parse_html(html, content_type) if renderable else None

    results: dict[str, CheckResult] = {}

    if soup:
        # Order matters: checks receive the shared context and the results
        # dict is filled in this sequence.
        pipeline = (
            ("title", title),
            ("meta", meta),
            ("opengraph", opengraph),
            ("jsonld", jsonld),
            ("headings", headings),
            ("content", content),
            ("links", links),
            ("images", images),
            ("canonical", canonical),
            ("hreflang", hreflang),
            ("technical", technical),
        )
        for category, module in pipeline:
            if module is links:
                # The links check is the only asynchronous one.
                results[category] = await links.check_async(soup, page, context, config, client)
            else:
                results[category] = module.check(soup, page, context)
    else:
        from seobuddy.checks.base import CATEGORY_WEIGHTS

        # No parseable HTML: every category fails with a placeholder except
        # the technical check, which runs against an empty document.
        for category in CATEGORY_ORDER:
            if category != "technical":
                results[category] = _empty_result(category, CATEGORY_WEIGHTS[category])
                continue
            results[category] = technical.check(_minimal_soup(), page, context)

    return PageAudit(
        page=page,
        results=results,
        score=weighted_page_score(results),
        path_display=path_display(page.final_url),
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""SEO audit check modules."""
|
seobuddy/checks/base.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Scoring helpers and category metadata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from seobuddy.models import CheckResult, CheckStatus, PageAudit
|
|
6
|
+
|
|
7
|
+
# Display order and labels for reports/UI
|
|
8
|
+
CATEGORY_ORDER = [
|
|
9
|
+
"title",
|
|
10
|
+
"meta",
|
|
11
|
+
"opengraph",
|
|
12
|
+
"jsonld",
|
|
13
|
+
"headings",
|
|
14
|
+
"content",
|
|
15
|
+
"links",
|
|
16
|
+
"images",
|
|
17
|
+
"canonical",
|
|
18
|
+
"hreflang",
|
|
19
|
+
"technical",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
CATEGORY_LABELS = {
|
|
23
|
+
"title": "Title",
|
|
24
|
+
"meta": "Meta Desc",
|
|
25
|
+
"opengraph": "Open Graph",
|
|
26
|
+
"jsonld": "JSON-LD",
|
|
27
|
+
"headings": "Headings",
|
|
28
|
+
"content": "Content",
|
|
29
|
+
"links": "Links",
|
|
30
|
+
"images": "Images",
|
|
31
|
+
"canonical": "Canonical",
|
|
32
|
+
"hreflang": "Hreflang",
|
|
33
|
+
"technical": "Technical",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
SITE_CHECK_ORDER = ("robots", "sitemap")
|
|
37
|
+
|
|
38
|
+
SITE_CATEGORY_LABELS = {
|
|
39
|
+
"robots": "Robots.txt",
|
|
40
|
+
"sitemap": "Sitemap",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def site_check_pages_ok(score: int) -> str:
    """Format the "Pages OK" cell for a site-wide check.

    A site-level check counts as a single page: ``"1/1"`` when ``score`` is at
    least 80, otherwise ``"0/1"``.
    """
    return "1/1" if score >= 80 else "0/1"
|
|
48
|
+
|
|
49
|
+
CATEGORY_WEIGHTS = {
|
|
50
|
+
"title": 0.15,
|
|
51
|
+
"meta": 0.10,
|
|
52
|
+
"opengraph": 0.09,
|
|
53
|
+
"jsonld": 0.09,
|
|
54
|
+
"headings": 0.09,
|
|
55
|
+
"content": 0.15,
|
|
56
|
+
"links": 0.09,
|
|
57
|
+
"images": 0.09,
|
|
58
|
+
"canonical": 0.05,
|
|
59
|
+
"hreflang": 0.05,
|
|
60
|
+
"technical": 0.05,
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def clamp_score(value: float) -> int:
    """Round ``value`` to the nearest integer and limit it to 0..100."""
    rounded = round(value)
    if rounded > 100:
        return 100
    if rounded < 0:
        return 0
    return rounded
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def status_from_score(score: int) -> CheckStatus:
    """Map a 0-100 score to a status: PASS at 80+, WARN at 60+, else FAIL."""
    thresholds = ((80, CheckStatus.PASS), (60, CheckStatus.WARN))
    for floor, status in thresholds:
        if score >= floor:
            return status
    return CheckStatus.FAIL
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def weighted_page_score(results: dict[str, CheckResult]) -> int:
    """Combine per-category scores into one page score.

    Each category present in ``results`` contributes ``score * weight`` using
    ``CATEGORY_WEIGHTS``; absent categories contribute nothing. The sum is
    rounded and clamped to 0..100.
    """
    weighted_sum = 0.0
    for category in CATEGORY_WEIGHTS:
        entry = results.get(category)
        if not entry:
            continue
        weighted_sum += entry.score * CATEGORY_WEIGHTS[category]
    return clamp_score(weighted_sum)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def letter_grade(score: int) -> str:
    """Translate a numeric score into a letter grade.

    Bands: 90+ A, 80+ B, 70+ C+, 60+ C, 50+ D, anything lower F.
    """
    bands = ((90, "A"), (80, "B"), (70, "C+"), (60, "C"), (50, "D"))
    return next((grade for floor, grade in bands if score >= floor), "F")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def aggregate_category_scores(
    pages: list[PageAudit],
) -> dict[str, dict[str, int]]:
    """Summarise each category across all audited pages.

    Returns:
        For every key in ``CATEGORY_ORDER`` a dict with ``score`` (rounded mean
        of the available results, 0 when there are none), ``pages_ok`` (results
        scoring at least 80) and ``pages_total`` (number of pages passed in).
    """
    page_count = len(pages)
    summary: dict[str, dict[str, int]] = {}
    for category in CATEGORY_ORDER:
        found = [res for res in (audit.results.get(category) for audit in pages) if res]
        values = [res.score for res in found]
        summary[category] = {
            "score": round(sum(values) / len(values)) if values else 0,
            "pages_ok": sum(1 for value in values if value >= 80),
            "pages_total": page_count,
        }
    return summary
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def top_issues(pages: list[PageAudit], limit: int = 5) -> list[str]:
    """List the weakest categories as readable lines, highest impact first.

    A category is reported when its average score is below 80. Impact is the
    category weight multiplied by the points missing from 100. The first five
    lines are prefixed with circled digits, later ones with plain numbers.
    """
    if not pages:
        return []

    summary = aggregate_category_scores(pages)
    ranked: list[tuple[float, str]] = []
    for category in CATEGORY_ORDER:
        stats = summary[category]
        average = stats["score"]
        impact = CATEGORY_WEIGHTS[category] * (100 - average)
        label = CATEGORY_LABELS[category]
        if average >= 80:
            continue
        ok = stats["pages_ok"]
        total = stats["pages_total"]
        ranked.append((impact, f"{label} weak (avg {average}/100, {ok}/{total} pages OK)"))

    # sorted() is stable, so equal impacts keep category order.
    ranked = sorted(ranked, key=lambda pair: pair[0], reverse=True)

    marks = "①②③④⑤"
    return [
        f"{marks[index] if index < 5 else str(index + 1)} {text}"
        for index, (_, text) in enumerate(ranked[:limit])
    ]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def collect_recommendations(pages: list[PageAudit]) -> list[str]:
    """Gather unique suggestions from all pages, highest-impact category first.

    A suggestion is kept the first time it is seen (walking categories in
    ``CATEGORY_ORDER``, then pages in order) and carries the impact of that
    category: weight times points missing from 100 on the category average.
    """
    summary = aggregate_category_scores(pages)
    already_listed: set[str] = set()
    ranked: list[tuple[float, str]] = []

    for category in CATEGORY_ORDER:
        impact = CATEGORY_WEIGHTS[category] * (100 - summary[category]["score"])
        for audit in pages:
            result = audit.results.get(category)
            if not result:
                continue
            for suggestion in result.suggestions:
                if suggestion in already_listed:
                    continue
                already_listed.add(suggestion)
                ranked.append((impact, suggestion))

    # Stable sort: ties stay in discovery order.
    ordered = sorted(ranked, key=lambda pair: pair[0], reverse=True)
    return [suggestion for _, suggestion in ordered]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Canonical URL check."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
from seobuddy.checks.base import CATEGORY_WEIGHTS, clamp_score, status_from_score
|
|
10
|
+
from seobuddy.models import CheckResult, CheckStatus, PageData, SiteContext
|
|
11
|
+
from seobuddy.url_utils import normalize_url
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def check(soup: BeautifulSoup, page: PageData, context: SiteContext) -> CheckResult:
    """Evaluate the page's ``<link rel="canonical">`` tag.

    Scoring: 0 when the tag or its href is missing, 40 when the href has no
    URL scheme, 100 when the normalised canonical equals the normalised final
    page URL, 70 otherwise. Canonicals with a scheme are recorded in
    ``context.canonicals_seen``.
    """
    weight = CATEGORY_WEIGHTS["canonical"]

    link = soup.find("link", rel=lambda value: value and "canonical" in value.lower())
    href = ""
    if link:
        href = (link.get("href") or "").strip()

    if not href:
        return CheckResult(
            name="canonical",
            score=0,
            weight=weight,
            status=CheckStatus.FAIL,
            findings=["Missing canonical link tag"],
            suggestions=["Add <link rel='canonical' href='...'> pointing to preferred URL"],
        )

    if not urlparse(href).scheme:
        return CheckResult(
            name="canonical",
            score=40,
            weight=weight,
            status=CheckStatus.WARN,
            findings=["Canonical URL is not absolute"],
            suggestions=["Use absolute URL in canonical tag"],
        )

    # Fall back to the raw strings when normalisation yields nothing.
    canonical_url = normalize_url(href) or href
    page_url = normalize_url(page.final_url) or page.final_url

    findings: list[str] = []
    suggestions: list[str] = []
    if canonical_url != page_url:
        score = 70
        findings.append(f"Canonical points elsewhere: {href}")
        suggestions.append("Ensure canonical matches preferred URL for this content")
    else:
        score = 100
        findings.append("Canonical points to this page")

    context.canonicals_seen.add(canonical_url)
    score = clamp_score(score)

    return CheckResult(
        name="canonical",
        score=score,
        weight=weight,
        status=status_from_score(score),
        findings=findings,
        suggestions=suggestions,
    )
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Content quality check (word count, text/HTML ratio)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
from seobuddy.checks.base import CATEGORY_WEIGHTS, clamp_score, status_from_score
|
|
8
|
+
from seobuddy.models import CheckResult, CheckStatus, PageData, SiteContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _word_count(soup: BeautifulSoup) -> int:
|
|
12
|
+
for tag in soup(["script", "style", "noscript"]):
|
|
13
|
+
tag.decompose()
|
|
14
|
+
text = soup.get_text(separator=" ", strip=True)
|
|
15
|
+
words = [w for w in text.split() if w]
|
|
16
|
+
return len(words)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def check(soup: BeautifulSoup, page: PageData, context: SiteContext) -> CheckResult:
    """Score content depth (word count) and the visible text/HTML ratio.

    Word score: 0 below 300 words, 60 below 500, 80 below 800, otherwise 100.
    Ratio score: 100 when visible text is at least 15% of the raw HTML length,
    scaled down linearly below that. The final score is the mean of the two.

    Both metrics are taken from the same cleaned copy of the document, with
    ``script``, ``style`` and ``noscript`` removed, so inline JS/CSS is not
    counted as visible text. The caller's ``soup`` is never mutated.
    """
    weight = CATEGORY_WEIGHTS["content"]
    findings: list[str] = []
    suggestions: list[str] = []

    # Re-parse so stripping non-visible elements does not touch the caller's soup.
    cleaned = BeautifulSoup(str(soup), "lxml")
    for hidden in cleaned(["script", "style", "noscript"]):
        hidden.decompose()

    words = _word_count(cleaned)

    if words < 300:
        word_score = 0
        findings.append(f"Thin content ({words} words; minimum 300)")
        suggestions.append("Add substantive content (300+ words)")
    elif words < 500:
        word_score = 60
        findings.append(f"Moderate content ({words} words)")
    elif words < 800:
        word_score = 80
        findings.append(f"Good content depth ({words} words)")
    else:
        word_score = 100
        findings.append(f"Strong content depth ({words} words)")

    html_len = len(page.html or "")
    # Use the cleaned copy: text inside <script>/<style> is not visible content.
    visible = cleaned.get_text(separator=" ", strip=True)
    ratio = (len(visible) / html_len * 100) if html_len else 0

    ratio_score = 100 if ratio >= 15 else max(0, int(ratio / 15 * 100))
    if ratio < 15:
        findings.append(f"Low text/HTML ratio ({ratio:.1f}%; aim ≥15%)")
        suggestions.append("Reduce boilerplate markup; increase visible text")

    score = clamp_score((word_score + ratio_score) / 2)

    return CheckResult(
        name="content",
        score=score,
        weight=weight,
        status=status_from_score(score),
        findings=findings,
        suggestions=suggestions,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Heading hierarchy check."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
from seobuddy.checks.base import CATEGORY_WEIGHTS, clamp_score, status_from_score
|
|
10
|
+
from seobuddy.models import CheckResult, CheckStatus, PageData, SiteContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def check(soup: BeautifulSoup, page: PageData, context: SiteContext) -> CheckResult:
    """Check H1 usage and heading-level continuity.

    Base score: 100 with exactly one non-empty H1, 40 with several, 20 with
    none. Each jump of more than one level between consecutive headings
    (e.g. H2 followed by H4) costs a further 20 points.
    """
    weight = CATEGORY_WEIGHTS["headings"]
    findings: list[str] = []
    suggestions: list[str] = []

    # Only H1 elements that actually contain text are counted.
    h1_count = sum(1 for h1 in soup.find_all("h1") if h1.get_text(strip=True))
    if h1_count == 1:
        score = 100
        findings.append("Single H1 present")
    elif h1_count == 0:
        score = 20
        findings.append("No H1 heading found")
        suggestions.append("Add exactly one H1 with primary page topic")
    else:
        score = 40
        findings.append(f"Multiple H1 headings ({h1_count})")
        suggestions.append("Use a single H1 per page")

    # Heading levels in document order.
    levels = [
        int(node.name[1])
        for node in soup.find_all(re.compile(r"^h[1-6]$", re.I))
        if node.name
    ]

    # Compare each heading with its predecessor; the first has none to compare.
    gap_penalty = 0
    for previous, current in zip(levels, levels[1:]):
        if current > previous + 1:
            gap_penalty += 20
            findings.append(f"Heading hierarchy skip (H{previous} → H{current})")

    score = clamp_score(max(0, score - gap_penalty))
    # The hierarchy hint is only added when no H1 suggestion is present.
    if gap_penalty and not suggestions:
        suggestions.append("Maintain sequential heading levels (e.g. H1 → H2 → H3)")

    return CheckResult(
        name="headings",
        score=score,
        weight=weight,
        status=status_from_score(score),
        findings=findings,
        suggestions=suggestions,
    )
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Hreflang alternate link audit."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
9
|
+
|
|
10
|
+
from seobuddy.checks.base import CATEGORY_WEIGHTS, clamp_score, status_from_score
|
|
11
|
+
from seobuddy.models import CheckResult, PageAudit, PageData, SiteContext
|
|
12
|
+
from seobuddy.url_utils import crawl_dedup_key, normalize_url
|
|
13
|
+
|
|
14
|
+
HREFLANG_RE = re.compile(
|
|
15
|
+
r"^[a-z]{2,3}(?:-[A-Za-z0-9]{2,8})*$|^x-default$",
|
|
16
|
+
re.IGNORECASE,
|
|
17
|
+
)
|
|
18
|
+
MAX_EDGES = 200
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _page_key(url: str) -> str:
    """Return the key used to identify ``url`` in hreflang edges.

    Thin wrapper around ``crawl_dedup_key``.
    """
    key = crawl_dedup_key(url)
    return key
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _extract_alternates(soup: BeautifulSoup, page_url: str) -> list[tuple[str, str]]:
    """Collect ``(hreflang, target_url)`` pairs from alternate link tags.

    Args:
        soup: Parsed document to scan for ``<link rel="alternate">`` elements.
        page_url: URL of the page, passed to ``normalize_url`` together with
            each href.

    Returns:
        One tuple per link that has both a non-empty ``hreflang`` and ``href``.
        The code is lower-cased; the target is the normalised href, or the raw
        href when normalisation yields nothing.
    """
    alternates: list[tuple[str, str]] = []
    # rel values are matched case-insensitively, as the canonical check does,
    # so rel="Alternate" is not silently skipped.
    for link in soup.find_all("link", rel=lambda v: v and "alternate" in v.lower()):
        hreflang = (link.get("hreflang") or "").strip()
        href = (link.get("href") or "").strip()
        if not hreflang or not href:
            continue
        target = normalize_url(href, page_url) or href
        alternates.append((hreflang.lower(), target))
    return alternates
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def check(soup: BeautifulSoup, page: PageData, context: SiteContext) -> CheckResult:
    """Audit the page's hreflang alternates.

    Starts at 100 and subtracts: 25 per invalid code, 20 per code repeated
    with a different href, 15 per target without a URL scheme, 20 when two or
    more language codes exist without ``x-default``, and 15 when no alternate
    points back at this page. A page with no alternates scores 100.

    Side effect: each alternate is appended to ``context.hreflang_edges`` as
    ``(from_key, code, to_key)`` until the list holds ``MAX_EDGES`` entries.
    """
    weight = CATEGORY_WEIGHTS["hreflang"]
    findings: list[str] = []
    suggestions: list[str] = []
    score = 100

    def _suggest(text: str) -> None:
        # The same advice applies to every offending link; list it once.
        if text not in suggestions:
            suggestions.append(text)

    alternates = _extract_alternates(soup, page.final_url)
    if not alternates:
        findings.append("No hreflang alternates")
        return CheckResult(
            name="hreflang",
            score=100,
            weight=weight,
            status=status_from_score(100),
            findings=findings,
            suggestions=suggestions,
        )

    from_key = _page_key(page.final_url)
    codes_seen: dict[str, str] = {}
    has_x_default = False
    lang_codes: set[str] = set()
    self_ref = False

    for code, target in alternates:
        # Computed once; used for both the edge list and the self-reference test.
        target_key = _page_key(target)

        if len(context.hreflang_edges) < MAX_EDGES:
            context.hreflang_edges.append((from_key, code, target_key))

        if not HREFLANG_RE.match(code):
            score -= 25
            findings.append(f"Invalid hreflang code: {code}")
            _suggest("Use BCP 47 tags (e.g. en, en-US) or x-default")

        if code in codes_seen and codes_seen[code] != target:
            score -= 20
            findings.append(f"Duplicate hreflang '{code}' with different hrefs")
        codes_seen[code] = target

        if code == "x-default":
            has_x_default = True
        else:
            lang_codes.add(code)

        # NOTE(review): `target` comes from _extract_alternates, which passes
        # the href through normalize_url with the page URL. If that resolves
        # relative hrefs, this branch only fires when normalisation fails.
        # Confirm against normalize_url.
        parsed = urlparse(target)
        if not parsed.scheme:
            score -= 15
            findings.append(f"Non-absolute hreflang href: {target[:60]}")
            _suggest("Use absolute URLs in hreflang link tags")

        if target_key == from_key:
            self_ref = True

    if len(lang_codes) >= 2 and not has_x_default:
        score -= 20
        findings.append("Missing x-default with multiple language alternates")
        _suggest("Add <link rel='alternate' hreflang='x-default' href='...'>")

    if not self_ref:
        score -= 15
        findings.append("Page does not self-reference in hreflang cluster")
        _suggest("Include a self-referencing hreflang link for this URL")

    if not findings:
        findings.append(f"Hreflang cluster OK ({len(alternates)} alternate(s))")

    score = clamp_score(score)
    return CheckResult(
        name="hreflang",
        score=score,
        weight=weight,
        status=status_from_score(score),
        findings=findings,
        suggestions=suggestions,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def validate_hreflang_reciprocity(
    context: SiteContext,
    pages: list[PageAudit],
) -> None:
    """Penalise pages whose hreflang targets do not link back to them.

    For every recorded edge ``(from, code, to)`` other than ``x-default``, the
    target page must have at least one hreflang edge pointing back at ``from``.
    The return link carries the source page's own language code, so the code
    is deliberately not compared. At most ten missing return links are
    reported; each new finding lowers the page's hreflang score by 15.

    Args:
        context: Site context whose ``hreflang_edges`` were filled by the
            per-page hreflang check (only the first ``MAX_EDGES`` are used).
        pages: Audited pages; their ``"hreflang"`` results are updated in place.
    """
    edges = context.hreflang_edges[:MAX_EDGES]
    if not edges:
        return

    # Every page key each page points at, regardless of hreflang code.
    targets_by_page: dict[str, set[str]] = {}
    for from_key, _code, to_key in edges:
        targets_by_page.setdefault(from_key, set()).add(to_key)

    page_by_key: dict[str, PageAudit] = {}
    for pa in pages:
        page_by_key[_page_key(pa.page.final_url)] = pa

    # NOTE(review): targets that were never audited (or whose edges fell past
    # MAX_EDGES) have no recorded edges and are still reported as missing a
    # return link, as before. Confirm whether they should be skipped.
    missing: list[tuple[str, str, str]] = [
        (from_key, code, to_key)
        for from_key, code, to_key in edges
        if code != "x-default" and from_key not in targets_by_page.get(to_key, ())
    ]
    if not missing:
        return

    suggestion = "Ensure each hreflang target page links back to this page (return tag)"
    for from_key, code, to_key in missing[:10]:
        pa = page_by_key.get(from_key)
        if not pa:
            continue
        r = pa.results.get("hreflang")
        if not r:
            continue
        msg = f"Missing return hreflang '{code}' from {to_key}"
        if msg not in r.findings:
            r.findings.append(msg)
            # One copy of the advice is enough however many links are missing.
            if suggestion not in r.suggestions:
                r.suggestions.append(suggestion)
            r.score = clamp_score(r.score - 15)
            r.status = status_from_score(r.score)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Image alt text and lazy loading check."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
from seobuddy.checks.base import CATEGORY_WEIGHTS, clamp_score, status_from_score
|
|
8
|
+
from seobuddy.models import CheckResult, CheckStatus, PageData, SiteContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def check(soup: BeautifulSoup, page: PageData, context: SiteContext) -> CheckResult:
    """Score image alt-text coverage and note lazy loading.

    The score is the percentage of ``<img>`` elements with a non-blank ``alt``
    attribute, or 100 when the page has no images. Lazy loading is reported as
    a finding only and does not affect the score.
    """
    weight = CATEGORY_WEIGHTS["images"]

    imgs = soup.find_all("img")
    if not imgs:
        return CheckResult(
            name="images",
            score=100,
            weight=weight,
            status=CheckStatus.PASS,
            findings=["No images on page"],
        )

    total = len(imgs)
    with_alt = 0
    lazy = False
    for img in imgs:
        if (img.get("alt") or "").strip():
            with_alt += 1
        if (img.get("loading") or "").lower() == "lazy":
            lazy = True

    missing = total - with_alt
    score = clamp_score(with_alt / total * 100)

    findings: list[str] = []
    suggestions: list[str] = []
    if missing:
        findings.append(f"{missing}/{total} images missing alt text")
        suggestions.append("Add descriptive alt attributes to all images")
    if lazy:
        findings.append("Lazy loading detected on at least one image")
    if not missing:
        findings.append("All images have alt text")

    return CheckResult(
        name="images",
        score=score,
        weight=weight,
        status=status_from_score(score),
        findings=findings,
        suggestions=suggestions,
    )
|