seoextract 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seoextract/__init__.py +3 -0
- seoextract/crawler.py +148 -0
- seoextract/init.py +84 -0
- seoextract/models.py +116 -0
- seoextract/parser.py +140 -0
- seoextract/rules.py +268 -0
- seoextract/safe_browsing.py +61 -0
- seoextract/scorer.py +48 -0
- seoextract-0.1.0.dist-info/METADATA +294 -0
- seoextract-0.1.0.dist-info/RECORD +12 -0
- seoextract-0.1.0.dist-info/WHEEL +5 -0
- seoextract-0.1.0.dist-info/top_level.txt +1 -0
seoextract/__init__.py
ADDED
seoextract/crawler.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import requests
|
|
3
|
+
from urllib.parse import urljoin, urlparse
|
|
4
|
+
from urllib.robotparser import RobotFileParser
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Request headers used by the crawler; the User-Agent identifies the audit bot.
HEADERS = {
    "User-Agent": "SEOExtractHF/1.0 (SEO Audit Bot; +https://github.com/Britto1221/seoextracthf)",
}

# Timeout applied to each page fetch, in seconds.
TIMEOUT = 10
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ── Robots.txt Checker ────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
def _load_robots(base_url: str) -> RobotFileParser | None:
    """
    Download and parse the site's robots.txt.

    Args:
        base_url: Site root (scheme + host) that "/robots.txt" is joined onto.

    Returns:
        A populated RobotFileParser when robots.txt answers with HTTP 200.
        None when the request fails or answers with any other status;
        the caller treats None as allow-all.
    """
    robots_url = urljoin(base_url, "/robots.txt")
    try:
        resp = requests.get(robots_url, headers=HEADERS, timeout=5)
    except (requests.RequestException, ValueError):
        # Best effort: an unreachable or malformed robots.txt URL means allow-all.
        return None

    if resp.status_code != 200:
        return None  # can't read robots.txt → allow everything

    rp = RobotFileParser()
    rp.set_url(robots_url)
    # Parse the body already downloaded. rp.read() would fetch the file a
    # second time through urllib, with a different User-Agent and no timeout.
    rp.parse(resp.text.splitlines())
    return rp
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── Single Page Fetch ─────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
def fetch_page(url: str, session: requests.Session) -> dict:
    """
    Fetch a single URL, following redirects.

    Args:
        url: Address to request.
        session: Session used to issue the GET request.

    Returns:
        A dict with keys url, final_url, status_code, html and
        response_time_ms. html is the response text only when the
        Content-Type contains "text/html", otherwise "".
        On failure html is "", response_time_ms is 0.0, an "error" key holds
        the exception text, and status_code is 408 for a timeout, 310 for too
        many redirects, or 0 for any other error.
    """

    def _failure(status_code: int, exc: Exception) -> dict:
        # Every failure path returns the same shape so consumers can rely on it.
        return {
            "url": url,
            "final_url": url,
            "status_code": status_code,
            "html": "",
            "response_time_ms": 0.0,
            "error": str(exc),
        }

    try:
        # perf_counter is monotonic, so the measurement is immune to clock changes.
        start = time.perf_counter()
        response = session.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        elapsed = (time.perf_counter() - start) * 1000  # ms

        # Media types are case-insensitive, so normalise before matching.
        content_type = response.headers.get("Content-Type", "")
        html = response.text if "text/html" in content_type.lower() else ""

        return {
            "url": url,
            "final_url": response.url,
            "status_code": response.status_code,
            "html": html,
            "response_time_ms": round(elapsed, 2),
        }
    except requests.exceptions.Timeout as exc:
        return _failure(408, exc)
    except requests.exceptions.TooManyRedirects as exc:
        return _failure(310, exc)
    except Exception as exc:
        # Connection errors and anything unexpected: report as "no response".
        return _failure(0, exc)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ── Internal Link Extractor ───────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
def _extract_internal_links(html: str, base_url: str) -> list[str]:
    """
    Extract the internal links from a page's HTML.

    Args:
        html: Page markup to scan for <a href> tags.
        base_url: URL that relative hrefs are resolved against; its host
            defines what counts as internal.

    Returns:
        Unique http/https URLs on the same host as base_url, with fragments
        removed, in order of first appearance.
    """
    soup = BeautifulSoup(html, "lxml")
    base_domain = urlparse(base_url).netloc

    # A dict keeps first-seen order while de-duplicating. A set would yield a
    # different order on each run because string hashing is randomised.
    links: dict[str, None] = {}

    for tag in soup.find_all("a", href=True):
        href = tag["href"].strip()
        if not href or href.startswith(("#", "mailto:", "tel:")):
            continue

        parsed = urlparse(urljoin(base_url, href))

        # Keep only http/https same-domain links
        if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
            # Normalize: remove fragment
            links[parsed._replace(fragment="").geturl()] = None

    return list(links)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── BFS Crawler ───────────────────────────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
def crawl(seed_url: str, max_pages: int = 20) -> list[dict]:
    """
    Breadth-first crawl starting from seed_url.

    Args:
        seed_url: Start URL; "https://" is prepended when it has no scheme.
        max_pages: Maximum number of fetch results to collect.

    Returns:
        One fetch_page() result dict per fetched URL, in crawl order.
        URLs disallowed by robots.txt are skipped and do not count.
    """
    from collections import deque  # function-scope import; module imports stay untouched

    # Normalize seed
    if not urlparse(seed_url).scheme:
        seed_url = "https://" + seed_url

    parsed_seed = urlparse(seed_url)
    base_url = f"{parsed_seed.scheme}://{parsed_seed.netloc}"

    robots = _load_robots(base_url)

    # `seen` holds every URL ever queued, giving O(1) duplicate checks in
    # place of scanning the queue; the deque gives O(1) pops from the front.
    seen = {seed_url}
    queue = deque([seed_url])
    results: list[dict] = []

    # The context manager closes the session even if a fetch or parse raises.
    with requests.Session() as session:
        while queue and len(results) < max_pages:
            url = queue.popleft()

            # Respect robots.txt (None means unreachable → allow all)
            if robots is not None and not robots.can_fetch(HEADERS["User-Agent"], url):
                continue

            result = fetch_page(url, session)
            results.append(result)

            # Only follow links from successful HTML pages
            if result["status_code"] != 200 or not result["html"]:
                continue

            # Resolve relative hrefs against the page itself, not the site
            # root, so "next.html" on /blog/ becomes /blog/next.html.
            page_url = result.get("final_url") or url
            if urlparse(page_url).netloc != parsed_seed.netloc:
                # Redirected to another host: fall back to the seed root so
                # the crawl never drifts onto a different site.
                page_url = base_url

            for link in _extract_internal_links(result["html"], page_url):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)

    return results
|
seoextract/init.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from .crawler import crawl
|
|
3
|
+
from .parser import parse
|
|
4
|
+
from .rules import detect_issues
|
|
5
|
+
from .scorer import score_site
|
|
6
|
+
from .models import AuditResult, Severity,SafeBrowsingResult
|
|
7
|
+
from .safe_browsing import check_safe_browsing
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SEOExtract:
    """Facade exposing the audit pipeline as a single static call."""

    @staticmethod
    def audit(
        url: str,
        max_pages: int = 20,
        safe_browsing_api_key: str | None = None,
    ) -> AuditResult:
        """
        Main entry point: run a Safe Browsing check, then crawl and audit.

        Usage:
            result = SEOExtract.audit("https://example.com")

        Args:
            url: Site to audit.
            max_pages: Page cap forwarded to the crawler.
            safe_browsing_api_key: Optional key forwarded to the Safe
                Browsing check.

        Returns:
            An AuditResult. When the Safe Browsing check reports the URL as
            unsafe, nothing is crawled and the result has score 0.0, grade
            "F" and empty page/issue lists.
        """
        verdict = SafeBrowsingResult(
            **check_safe_browsing(url, api_key=safe_browsing_api_key)
        )

        # Only an explicit False blocks the audit; None (no verdict) proceeds.
        if verdict.is_safe is False:
            return AuditResult(
                url=url,
                audit_date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                pages_crawled=0,
                site_score=0.0,
                grade="F",
                total_issues=0,
                critical_count=0,
                warning_count=0,
                info_count=0,
                pages=[],
                issues=[],
                safe_browsing=verdict,
            )

        # Crawl → parse → detect issues → score
        parsed_pages = [parse(raw) for raw in crawl(url, max_pages=max_pages)]
        found_issues = detect_issues(parsed_pages)
        overall_score, letter = score_site(parsed_pages, found_issues)

        # Tally issues by severity in a single pass
        tally = {Severity.CRITICAL: 0, Severity.WARNING: 0, Severity.INFO: 0}
        for issue in found_issues:
            if issue.severity in tally:
                tally[issue.severity] += 1

        return AuditResult(
            url=url,
            audit_date=datetime.now().strftime("%Y-%m-%d %H:%M"),
            pages_crawled=len(parsed_pages),
            site_score=overall_score,
            grade=letter,
            total_issues=len(found_issues),
            critical_count=tally[Severity.CRITICAL],
            warning_count=tally[Severity.WARNING],
            info_count=tally[Severity.INFO],
            pages=parsed_pages,
            issues=found_issues,
            safe_browsing=verdict,
        )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
    # Manual smoke run: audit one site and print the result as indented JSON.
    audit_result = SEOExtract.audit("https://rootpro.in/")
    print(audit_result.model_dump_json(indent=2))
|
seoextract/models.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from pydantic import BaseModel, HttpUrl
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# ── Severity Levels ──────────────────────────────────────────────────────────
|
|
7
|
+
|
|
8
|
+
class Severity(str, Enum):
    """Severity level of an SEO issue; each member is also a plain string."""

    CRITICAL = "CRITICAL"
    WARNING = "WARNING"
    INFO = "INFO"
|
|
12
|
+
|
|
13
|
+
class SafeBrowsingResult(BaseModel):
    """Outcome of the Safe Browsing check for the audited URL."""

    # True or False when a verdict was obtained; None when there is none
    is_safe: bool | None
    # Threat type names reported for the URL (empty when nothing matched)
    threats: list[str] = []
    # Explanation of why no verdict is available, if any
    error: str | None = None
|
|
17
|
+
# ── Issue Types ───────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
class IssueType(str, Enum):
    """Catalogue of detectable SEO issues; values are human-readable labels."""

    MISSING_VIEWPORT = "Missing Viewport Meta Tag"
    META_TOO_SHORT = "Meta Description Too Short"
    MISSING_TITLE = "Missing Title"
    TITLE_TOO_SHORT = "Title Too Short"
    TITLE_TOO_LONG = "Title Too Long"
    DUPLICATE_TITLE = "Duplicate Title"
    MISSING_META = "Missing Meta Description"
    META_TOO_LONG = "Meta Description Too Long"
    MISSING_H1 = "Missing H1 Tag"
    MULTIPLE_H1 = "Multiple H1 Tags"
    THIN_CONTENT = "Thin Content"
    MISSING_ALT_TEXT = "Missing Image Alt Text"
    BROKEN_LINK = "Broken Internal Link"
    MISSING_CANONICAL = "Missing Canonical Tag"
    POOR_INTERNAL_LINKING = "Poor Internal Linking"
    NO_SCHEMA = "No Schema Markup"
    MISSING_ROBOTS_META = "Missing Robots Meta Tag"
    DUPLICATE_META = "Duplicate Meta Description"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Single SEO Issue ──────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
class SEOIssue(BaseModel):
    """One detected SEO problem on one page."""

    page_url: str
    issue_type: IssueType
    severity: Severity
    # What was found on the page (empty string when the element is missing)
    current_value: str
    # One-line hint for fixing the problem
    suggestion: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── Per-Page Extracted Data ───────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
class PageData(BaseModel):
    """SEO-relevant data extracted from a single fetched page."""

    # Fetch outcome
    url: str
    status_code: int
    response_time_ms: float

    # <title>
    title: str | None = None
    title_length: int = 0

    # Meta description
    meta_description: str | None = None
    meta_description_length: int = 0

    # Robots meta (e.g. "noindex, nofollow") and canonical link
    robots_meta: str | None = None
    canonical: str | None = None

    # Heading texts and their counts
    h1_tags: list[str] = []
    h2_tags: list[str] = []
    h3_tags: list[str] = []
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0

    # Content size, in words
    word_count: int = 0

    # Images
    total_images: int = 0
    images_missing_alt: int = 0

    # Links
    internal_links: list[str] = []
    external_links: list[str] = []
    internal_count: int = 0
    external_count: int = 0

    # Structured data (JSON-LD) present
    schema_found: bool = False

    # Open Graph
    og_title: str | None = None
    og_description: str | None = None

    # Score assigned by the scorer
    page_score: float = 0.0

    # URL after redirects
    final_url: str | None = None

    # Content of the viewport meta tag
    viewport: str | None = None
|
|
103
|
+
|
|
104
|
+
class AuditResult(BaseModel):
    """Top-level result of a site audit."""

    url: str
    audit_date: str
    pages_crawled: int
    site_score: float
    # Letter grade: A / B / C / D / F
    grade: str

    # Issue totals, overall and per severity
    total_issues: int
    critical_count: int
    warning_count: int
    info_count: int

    # Full detail
    pages: list[PageData]
    issues: list[SEOIssue]
    safe_browsing: SafeBrowsingResult
|
seoextract/parser.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from urllib.parse import urljoin, urlparse
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
|
|
5
|
+
from .models import PageData
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse(fetch_result: dict) -> PageData:
    """
    Convert one crawler fetch result into a PageData object.

    Args:
        fetch_result: Dict with "url", "status_code" and "response_time_ms",
            plus optional "final_url" and "html".

    Returns:
        A fully populated PageData. When there is no HTML (error, timeout,
        non-HTML response) only the URL, status and timing fields are set.
    """

    def _meta_content(attrs: dict) -> str | None:
        # Stripped "content" of the first matching <meta>, or None if absent/empty.
        tag = soup.find("meta", attrs=attrs)
        content = tag.get("content") if tag else None
        return content.strip() if content else None

    def _named(wanted: str):
        # Case-insensitive matcher for the meta "name" attribute.
        return lambda n: n and n.lower() == wanted

    url = fetch_result["url"]
    final_url = fetch_result.get("final_url", url)
    status_code = fetch_result["status_code"]
    response_time_ms = fetch_result["response_time_ms"]
    html = fetch_result.get("html", "")

    # If no HTML (error page, non-HTML, timeout) return minimal PageData
    if not html:
        return PageData(
            url=url,
            final_url=final_url,
            status_code=status_code,
            response_time_ms=response_time_ms,
        )

    # Host of the final URL decides which links count as internal
    base_domain = urlparse(final_url).netloc
    soup = BeautifulSoup(html, "lxml")

    # ── Title ─────────────────────────────────────────────────────────────────
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else None
    title_length = len(title) if title else 0

    # ── Meta tags ─────────────────────────────────────────────────────────────
    meta_description = _meta_content({"name": _named("description")})
    meta_description_length = len(meta_description) if meta_description else 0
    robots_meta = _meta_content({"name": _named("robots")})
    viewport = _meta_content({"name": _named("viewport")})

    # ── Canonical ─────────────────────────────────────────────────────────────
    canonical_tag = soup.find("link", attrs={"rel": lambda r: r and "canonical" in r})
    canonical = canonical_tag["href"].strip() if canonical_tag and canonical_tag.get("href") else None

    # ── Headings ──────────────────────────────────────────────────────────────
    h1_tags = [tag.get_text(strip=True) for tag in soup.find_all("h1")]
    h2_tags = [tag.get_text(strip=True) for tag in soup.find_all("h2")]
    h3_tags = [tag.get_text(strip=True) for tag in soup.find_all("h3")]

    # ── Schema Markup ─────────────────────────────────────────────────────────
    # Must run BEFORE the word-count step: that step removes every <script>
    # from the tree, after which no JSON-LD block could ever be found.
    schema_found = False
    for script_tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            if json.loads(script_tag.string or ""):
                schema_found = True
                break
        except (json.JSONDecodeError, TypeError):
            continue

    # ── Word Count ────────────────────────────────────────────────────────────
    # Remove script and style tags before counting
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    word_count = len(soup.get_text(separator=" ", strip=True).split())

    # ── Images ────────────────────────────────────────────────────────────────
    all_images = soup.find_all("img")
    images_missing_alt = sum(
        1 for img in all_images
        if not img.get("alt") or img["alt"].strip() == ""
    )

    # ── Links ─────────────────────────────────────────────────────────────────
    # Dicts de-duplicate while keeping document order, so output is stable.
    internal: dict[str, None] = {}
    external: dict[str, None] = {}

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()
        if not href or href.startswith(("#", "mailto:", "tel:")):
            continue

        full_url = urljoin(final_url, href)
        parsed_link = urlparse(full_url)

        # Skip javascript:, data: and other non-web schemes; they are not links
        if parsed_link.scheme not in ("http", "https"):
            continue

        bucket = internal if parsed_link.netloc == base_domain else external
        bucket[full_url] = None

    internal_links = list(internal)
    external_links = list(external)

    # ── Open Graph ────────────────────────────────────────────────────────────
    og_title = _meta_content({"property": "og:title"})
    og_description = _meta_content({"property": "og:description"})

    return PageData(
        url=url,
        final_url=final_url,
        status_code=status_code,
        response_time_ms=response_time_ms,
        title=title,
        title_length=title_length,
        meta_description=meta_description,
        meta_description_length=meta_description_length,
        robots_meta=robots_meta,
        canonical=canonical,
        h1_tags=h1_tags,
        h2_tags=h2_tags,
        h3_tags=h3_tags,
        h1_count=len(h1_tags),
        h2_count=len(h2_tags),
        h3_count=len(h3_tags),
        word_count=word_count,
        total_images=len(all_images),
        images_missing_alt=images_missing_alt,
        internal_links=internal_links,
        external_links=external_links,
        internal_count=len(internal_links),
        external_count=len(external_links),
        schema_found=schema_found,
        og_title=og_title,
        og_description=og_description,
        viewport=viewport,
    )
|
seoextract/rules.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from .models import PageData, SEOIssue, IssueType, Severity
|
|
2
|
+
|
|
3
|
+
# Title length bounds, in characters
TITLE_MIN = 50
TITLE_MAX = 60

# Meta description length bounds, in characters
META_MIN = 50
META_MAX = 160

# Pages with fewer words than this are reported as thin content
THIN_CONTENT = 300

# Pages with fewer internal links than this are reported as poorly linked
MIN_INTERNAL = 2
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _check_title(page: PageData) -> list[SEOIssue]:
    """
    Check that the page has a title of acceptable length.

    Returns:
        At most one issue: MISSING_TITLE (critical) when there is no title,
        otherwise TITLE_TOO_SHORT or TITLE_TOO_LONG (warning) when the
        length falls outside TITLE_MIN..TITLE_MAX.
    """
    issues = []

    if not page.title:
        issues.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MISSING_TITLE,
            severity=Severity.CRITICAL,
            current_value="",
            # Derive the range from the thresholds so the advice cannot
            # contradict the too-short check (it used to say "30–60").
            suggestion=f"Add a descriptive <title> tag between {TITLE_MIN}–{TITLE_MAX} characters.",
        ))
    elif page.title_length < TITLE_MIN:
        issues.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.TITLE_TOO_SHORT,
            severity=Severity.WARNING,
            current_value=page.title,
            suggestion=f"Title is {page.title_length} chars. Expand to at least {TITLE_MIN} characters.",
        ))
    elif page.title_length > TITLE_MAX:
        issues.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.TITLE_TOO_LONG,
            severity=Severity.WARNING,
            current_value=page.title,
            suggestion=f"Title is {page.title_length} chars. Trim to under {TITLE_MAX} characters.",
        ))

    return issues
|
|
40
|
+
|
|
41
|
+
def _check_duplicate_meta_descriptions(pages: list[PageData]) -> list[SEOIssue]:
    """
    Site-level check: flag pages that reuse an earlier page's meta description.

    Descriptions are compared after stripping and lower-casing. The first
    page with a given description is not flagged; every later one is.
    """
    first_seen: dict[str, str] = {}  # normalised description → first page url
    duplicates = []

    for page in pages:
        description = page.meta_description
        if not description:
            continue

        key = description.strip().lower()
        if key not in first_seen:
            first_seen[key] = page.url
            continue

        duplicates.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.DUPLICATE_META,
            severity=Severity.WARNING,
            current_value=description,
            suggestion=f"This meta description duplicates '{first_seen[key]}'. Each page should have a unique meta description.",
        ))

    return duplicates
|
|
63
|
+
|
|
64
|
+
def _check_meta(page: PageData) -> list[SEOIssue]:
    """
    Check that the page has a meta description of acceptable length.

    Returns at most one warning: missing, too short (< META_MIN) or
    too long (> META_MAX).
    """

    def _warning(issue_type: IssueType, current: str, hint: str) -> list[SEOIssue]:
        # All meta-description findings share severity and page URL.
        return [SEOIssue(
            page_url=page.url,
            issue_type=issue_type,
            severity=Severity.WARNING,
            current_value=current,
            suggestion=hint,
        )]

    if not page.meta_description:
        return _warning(
            IssueType.MISSING_META,
            "",
            "Add a meta description between 50–160 characters summarising the page.",
        )

    length = page.meta_description_length

    if length < META_MIN:
        return _warning(
            IssueType.META_TOO_SHORT,
            page.meta_description,
            f"Meta description is only {length} chars. Expand to at least {META_MIN}.",
        )

    if length > META_MAX:
        return _warning(
            IssueType.META_TOO_LONG,
            page.meta_description,
            f"Meta description is {length} chars. Trim to under {META_MAX}.",
        )

    return []
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _check_headings(page: PageData) -> list[SEOIssue]:
    """
    Check that the page has exactly one H1.

    Returns a critical issue when there is none, a warning when there are
    several, and an empty list when there is exactly one.
    """
    h1_total = page.h1_count

    if h1_total == 0:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MISSING_H1,
            severity=Severity.CRITICAL,
            current_value="0 H1 tags found",
            suggestion="Add exactly one <h1> tag that describes the main topic of this page.",
        )]

    if h1_total > 1:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MULTIPLE_H1,
            severity=Severity.WARNING,
            current_value=f"{h1_total} H1 tags: {page.h1_tags}",
            suggestion=f"Reduce to one H1 tag. Found: {page.h1_tags}",
        )]

    return []
|
|
118
|
+
|
|
119
|
+
def _check_status_code(page: PageData) -> list[SEOIssue]:
    """
    Flag pages that did not return HTTP 200.

    Any status other than 200 is reported, matching the rest of the
    pipeline: detect_issues skips the content checks for such pages and the
    scorer gives them 0. Without this, a page with a status such as 310
    would score 0 with no issue explaining why.

    Returns:
        A single critical BROKEN_LINK issue, or an empty list for a 200 page.
    """
    if page.status_code == 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.BROKEN_LINK,
        severity=Severity.CRITICAL,
        current_value=str(page.status_code),
        suggestion="Fix this page because it does not return a successful HTTP 200 response.",
    )]
|
|
132
|
+
|
|
133
|
+
def _check_content(page: PageData) -> list[SEOIssue]:
    """Warn when a 200 page has fewer than THIN_CONTENT words."""
    if page.word_count >= THIN_CONTENT or page.status_code != 200:
        return []

    words = page.word_count
    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.THIN_CONTENT,
        severity=Severity.WARNING,
        current_value=f"{words} words",
        suggestion=f"Page has only {words} words. Aim for at least {THIN_CONTENT} words of meaningful content.",
    )]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _check_images(page: PageData) -> list[SEOIssue]:
    """Warn when at least one image on the page lacks alt text."""
    missing = page.images_missing_alt
    if not missing > 0:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_ALT_TEXT,
        severity=Severity.WARNING,
        current_value=f"{missing} of {page.total_images} images missing alt text",
        suggestion=f"Add descriptive alt text to all {missing} images missing it.",
    )]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _check_canonical(page: PageData) -> list[SEOIssue]:
    """Report (info) a 200 page that has no canonical link."""
    if page.canonical or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_CANONICAL,
        severity=Severity.INFO,
        current_value="",
        suggestion="Add a <link rel='canonical'> tag to prevent duplicate content issues.",
    )]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _check_internal_linking(page: PageData) -> list[SEOIssue]:
    """Report (info) a 200 page with fewer than MIN_INTERNAL internal links."""
    if page.internal_count >= MIN_INTERNAL or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.POOR_INTERNAL_LINKING,
        severity=Severity.INFO,
        current_value=f"{page.internal_count} internal links",
        suggestion=f"Add at least {MIN_INTERNAL} internal links to help search engines discover related pages.",
    )]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _check_schema(page: PageData) -> list[SEOIssue]:
    """Report (info) a 200 page on which no schema markup was found."""
    if page.schema_found or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.NO_SCHEMA,
        severity=Severity.INFO,
        current_value="No JSON-LD schema found",
        suggestion="Add Schema.org structured data (JSON-LD) to improve search result appearance.",
    )]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ── Duplicate Title Check (site-level) ───────────────────────────────────────
|
|
209
|
+
|
|
210
|
+
def _check_duplicate_titles(pages: list[PageData]) -> list[SEOIssue]:
    """
    Site-level check: flag pages whose title repeats an earlier page's title.

    Titles are compared after stripping and lower-casing. The first page
    with a given title is not flagged; every later one is (critical).
    """
    first_seen: dict[str, str] = {}  # normalised title → first page url
    duplicates = []

    for page in pages:
        if not page.title:
            continue

        key = page.title.strip().lower()
        if key not in first_seen:
            first_seen[key] = page.url
            continue

        duplicates.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.DUPLICATE_TITLE,
            severity=Severity.CRITICAL,
            current_value=page.title,
            suggestion=f"This title duplicates '{first_seen[key]}'. Each page needs a unique title.",
        ))

    return duplicates
|
|
230
|
+
|
|
231
|
+
def _check_viewport(page: PageData) -> list[SEOIssue]:
    """Warn when a 200 page has no viewport meta tag."""
    if page.viewport or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_VIEWPORT,
        severity=Severity.WARNING,
        current_value="",
        suggestion="Add a viewport meta tag for mobile responsiveness.",
    )]
|
|
244
|
+
|
|
245
|
+
def detect_issues(pages: list[PageData]) -> list[SEOIssue]:
    """
    Run every SEO rule over the crawled pages.

    Per-page rules run once for each page that returned HTTP 200; pages with
    any other status only get the status-code check. Site-level duplicate
    checks run exactly once over the whole page list.

    Args:
        pages: Parsed pages from the crawl; may be empty.

    Returns:
        All detected issues: per-page issues in page order, followed by the
        site-level duplicate-title and duplicate-meta issues.
    """
    # Rules evaluated independently for each successfully fetched page.
    # The viewport check belongs here so every page is covered, not just
    # whichever page the loop variable last pointed at.
    per_page_checks = (
        _check_title,
        _check_meta,
        _check_headings,
        _check_content,
        _check_images,
        _check_canonical,
        _check_internal_linking,
        _check_schema,
        _check_viewport,
    )

    all_issues = []

    for page in pages:
        all_issues.extend(_check_status_code(page))

        # skip SEO checks if page failed
        if page.status_code != 200:
            continue

        for check in per_page_checks:
            all_issues.extend(check(page))

    # Site-level rules need the whole page list and must run only once;
    # running the duplicate-title check twice double-counted every hit.
    all_issues.extend(_check_duplicate_titles(pages))
    all_issues.extend(_check_duplicate_meta_descriptions(pages))

    return all_issues
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
|
|
5
|
+
load_dotenv()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def check_safe_browsing(url: str, api_key: str | None = None) -> dict:
|
|
9
|
+
# manual key has first priority
|
|
10
|
+
if api_key is None:
|
|
11
|
+
api_key = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
|
|
12
|
+
|
|
13
|
+
if not api_key:
|
|
14
|
+
return {
|
|
15
|
+
"is_safe": None,
|
|
16
|
+
"threats": [],
|
|
17
|
+
"error": "Google Safe Browsing API key not provided and GOOGLE_SAFE_BROWSING_API_KEY not found in .env",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
endpoint = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"
|
|
21
|
+
|
|
22
|
+
payload = {
|
|
23
|
+
"client": {
|
|
24
|
+
"clientId": "seoextracthf",
|
|
25
|
+
"clientVersion": "1.0.0",
|
|
26
|
+
},
|
|
27
|
+
"threatInfo": {
|
|
28
|
+
"threatTypes": [
|
|
29
|
+
"MALWARE",
|
|
30
|
+
"SOCIAL_ENGINEERING",
|
|
31
|
+
"UNWANTED_SOFTWARE",
|
|
32
|
+
"POTENTIALLY_HARMFUL_APPLICATION",
|
|
33
|
+
],
|
|
34
|
+
"platformTypes": ["ANY_PLATFORM"],
|
|
35
|
+
"threatEntryTypes": ["URL"],
|
|
36
|
+
"threatEntries": [{"url": url}],
|
|
37
|
+
},
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
try:
|
|
41
|
+
response = requests.post(endpoint, json=payload, timeout=10)
|
|
42
|
+
response.raise_for_status()
|
|
43
|
+
data = response.json()
|
|
44
|
+
|
|
45
|
+
matches = data.get("matches", [])
|
|
46
|
+
|
|
47
|
+
if not matches:
|
|
48
|
+
return {"is_safe": True, "threats": [], "error": None}
|
|
49
|
+
|
|
50
|
+
return {
|
|
51
|
+
"is_safe": False,
|
|
52
|
+
"threats": [match.get("threatType") for match in matches],
|
|
53
|
+
"error": None,
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
except requests.RequestException as e:
|
|
57
|
+
return {
|
|
58
|
+
"is_safe": None,
|
|
59
|
+
"threats": [],
|
|
60
|
+
"error": str(e),
|
|
61
|
+
}
|
seoextract/scorer.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from .models import PageData, SEOIssue, Severity
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# ── Severity Penalty Weights ──────────────────────────────────────────────────
|
|
5
|
+
|
|
6
|
+
# Points subtracted from a page's score for each issue of the given severity.
PENALTY = {
    Severity.CRITICAL: 20,
    Severity.WARNING: 8,
    Severity.INFO: 3,
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _grade(score: float) -> str:
    """Translate a numeric score into a letter grade.

    The cut-offs are inclusive lower bounds: 90 -> "A", 75 -> "B",
    60 -> "C", 40 -> "D". Anything below 40 is "F".
    """
    cutoffs = ((90, "A"), (75, "B"), (60, "C"), (40, "D"))
    for floor, letter in cutoffs:
        if score >= floor:
            return letter
    return "F"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def score_page(page: PageData, issues: list[SEOIssue]) -> float:
    """Score a single page on a 0-100 scale.

    Only issues whose ``page_url`` equals ``page.url`` are counted. A page
    whose ``status_code`` is not 200 scores 0.0 outright. Otherwise the
    score is 100 minus the ``PENALTY`` of every counted issue, floored at
    0.0 and rounded to one decimal place.
    """
    own_issues = [issue for issue in issues if issue.page_url == page.url]

    if page.status_code != 200:
        return 0.0

    remaining = 100.0 - sum(PENALTY[issue.severity] for issue in own_issues)
    return round(max(remaining, 0.0), 1)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def score_site(pages: list[PageData], issues: list[SEOIssue]) -> tuple[float, str]:
    """Score every page, then derive the site-level score and grade.

    Each page's ``page_score`` attribute is set as a side effect. The site
    score is the mean of the page scores rounded to one decimal place.

    Returns:
        ``(site_score, grade)``; ``(0.0, "F")`` when *pages* is empty.
    """
    if not pages:
        return 0.0, "F"

    # Bucket the issues by URL once so each page is scored against only its
    # own issues, instead of rescanning the full issue list for every page.
    issues_by_url = {}
    for issue in issues:
        issues_by_url.setdefault(issue.page_url, []).append(issue)

    for page in pages:
        page.page_score = score_page(page, issues_by_url.get(page.url, []))

    site_score = round(sum(p.page_score for p in pages) / len(pages), 1)
    return site_score, _grade(site_score)
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seoextract
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python SEO audit engine that returns Pydantic structured output.
|
|
5
|
+
Author: Britto K
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: requests>=2.31.0
|
|
9
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
10
|
+
Requires-Dist: lxml>=5.0.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
13
|
+
|
|
14
|
+
# SEOExtractHF
|
|
15
|
+
|
|
16
|
+
<div align="center">
|
|
17
|
+
|
|
18
|
+
**A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
|
|
19
|
+
|
|
20
|
+
Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
|
|
21
|
+
|
|
22
|
+
</div>
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- Website crawler
|
|
29
|
+
- Google Safe Browsing validation
|
|
30
|
+
- Technical SEO auditing
|
|
31
|
+
- Pydantic structured output
|
|
32
|
+
- Page-level SEO metrics
|
|
33
|
+
- Site-level SEO scoring
|
|
34
|
+
- Severity-based issue detection
|
|
35
|
+
- Duplicate title detection
|
|
36
|
+
- Duplicate meta description detection
|
|
37
|
+
- Canonical tag detection
|
|
38
|
+
- Viewport detection
|
|
39
|
+
- Schema.org detection
|
|
40
|
+
- Image alt-text validation
|
|
41
|
+
- Internal linking analysis
|
|
42
|
+
- Thin content detection
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
# Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install seoextract
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
or install from source
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
# Requirements
|
|
61
|
+
|
|
62
|
+
- Python 3.10+
|
|
63
|
+
- Google Safe Browsing API key (optional — without one, `safe_browsing.is_safe` is `None`)
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
# Google Safe Browsing Setup
|
|
68
|
+
|
|
69
|
+
SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
|
|
70
|
+
|
|
71
|
+
If Google reports the website as unsafe, crawling is stopped automatically.
|
|
72
|
+
|
|
73
|
+
If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.

---
|
|
75
|
+
|
|
76
|
+
## Option 1 (Recommended)
|
|
77
|
+
|
|
78
|
+
Create a `.env` file.
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
.env
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Add your API key.
|
|
85
|
+
|
|
86
|
+
```env
|
|
87
|
+
GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
SEOExtractHF automatically loads the API key.
|
|
91
|
+
|
|
92
|
+
No additional code is required.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Option 2
|
|
97
|
+
|
|
98
|
+
Pass the API key manually.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from seoextract import SEOExtract
|
|
102
|
+
|
|
103
|
+
result = SEOExtract.audit(
|
|
104
|
+
"https://example.com",
|
|
105
|
+
safe_browsing_api_key="YOUR_API_KEY"
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
When an API key is supplied manually, the `.env` file is **not used**.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
# Quick Start
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from seoextract import SEOExtract
|
|
117
|
+
|
|
118
|
+
result = SEOExtract.audit(
|
|
119
|
+
"https://example.com"
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
print(result.model_dump_json(indent=2))
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
# Returned Object
|
|
128
|
+
|
|
129
|
+
SEOExtractHF returns a validated Pydantic model.
|
|
130
|
+
|
|
131
|
+
```text
|
|
132
|
+
AuditResult
|
|
133
|
+
│
|
|
134
|
+
├── url
|
|
135
|
+
├── audit_date
|
|
136
|
+
├── pages_crawled
|
|
137
|
+
├── site_score
|
|
138
|
+
├── grade
|
|
139
|
+
├── total_issues
|
|
140
|
+
├── critical_count
|
|
141
|
+
├── warning_count
|
|
142
|
+
├── info_count
|
|
143
|
+
├── pages
|
|
144
|
+
├── issues
|
|
145
|
+
└── safe_browsing
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
# Example
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from seoextract import SEOExtract
|
|
154
|
+
|
|
155
|
+
result = SEOExtract.audit("https://example.com")
|
|
156
|
+
|
|
157
|
+
print(result.site_score)
|
|
158
|
+
print(result.grade)
|
|
159
|
+
print(result.safe_browsing)
|
|
160
|
+
|
|
161
|
+
for issue in result.issues:
|
|
162
|
+
print(issue.issue_type)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
# Safe Browsing Result
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
{
|
|
171
|
+
"is_safe": True,
|
|
172
|
+
"threats": [],
|
|
173
|
+
"error": None
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
If Google reports a threat:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
{
|
|
181
|
+
"is_safe": False,
|
|
182
|
+
"threats": [
|
|
183
|
+
"MALWARE"
|
|
184
|
+
],
|
|
185
|
+
"error": None
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
SEOExtractHF immediately stops crawling unsafe websites.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
# Current SEO Checks
|
|
194
|
+
|
|
195
|
+
## Page Quality
|
|
196
|
+
|
|
197
|
+
- Title validation
|
|
198
|
+
- Meta description validation
|
|
199
|
+
- H1 validation
|
|
200
|
+
- Thin content detection
|
|
201
|
+
|
|
202
|
+
## Technical SEO
|
|
203
|
+
|
|
204
|
+
- Canonical tag
|
|
205
|
+
- Viewport meta tag
|
|
206
|
+
- Schema.org JSON-LD
|
|
207
|
+
- HTTP status validation
|
|
208
|
+
|
|
209
|
+
## Images
|
|
210
|
+
|
|
211
|
+
- Missing ALT attributes
|
|
212
|
+
|
|
213
|
+
## Links
|
|
214
|
+
|
|
215
|
+
- Internal link analysis
|
|
216
|
+
|
|
217
|
+
## Site-wide Checks
|
|
218
|
+
|
|
219
|
+
- Duplicate titles
|
|
220
|
+
- Duplicate meta descriptions
|
|
221
|
+
|
|
222
|
+
## Security
|
|
223
|
+
|
|
224
|
+
- Google Safe Browsing validation
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
# Example Output
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
AuditResult(
|
|
232
|
+
site_score=91.0,
|
|
233
|
+
grade="A",
|
|
234
|
+
total_issues=4,
|
|
235
|
+
pages_crawled=15
|
|
236
|
+
)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
# Project Structure
|
|
242
|
+
|
|
243
|
+
```
|
|
244
|
+
seoextract/
|
|
245
|
+
│
|
|
246
|
+
├── crawler.py
|
|
247
|
+
├── parser.py
|
|
248
|
+
├── rules.py
|
|
249
|
+
├── scorer.py
|
|
250
|
+
├── safe_browsing.py
|
|
251
|
+
├── models.py
|
|
252
|
+
└── __init__.py
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
# Designed For
|
|
258
|
+
|
|
259
|
+
SEOExtractHF is designed to be used inside:
|
|
260
|
+
|
|
261
|
+
- AI SEO Agents
|
|
262
|
+
- LangGraph workflows
|
|
263
|
+
- FastAPI applications
|
|
264
|
+
- Streamlit dashboards
|
|
265
|
+
- Report generators
|
|
266
|
+
- CI/CD quality checks
|
|
267
|
+
- Data pipelines
|
|
268
|
+
- SEO automation tools
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
# Dependencies
|
|
273
|
+
|
|
274
|
+
- beautifulsoup4
|
|
275
|
+
- lxml
|
|
276
|
+
- requests
|
|
277
|
+
- pydantic
|
|
278
|
+
- python-dotenv
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
# License
|
|
283
|
+
|
|
284
|
+
MIT License
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
# Author
|
|
289
|
+
|
|
290
|
+
**Britto K**
|
|
291
|
+
|
|
292
|
+
GitHub:
|
|
293
|
+
|
|
294
|
+
https://github.com/Britto1221
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
seoextract/__init__.py,sha256=mYxMjSFt3xUJBhQ2vS_CQ6i-i7EKAr5O5t4RWEjj22Y,56
|
|
2
|
+
seoextract/crawler.py,sha256=-oCnPiPHDKpJiLajRWn8OPqJzgPNkGzJu8AFzO_GKYg,5528
|
|
3
|
+
seoextract/init.py,sha256=5T64sdKW4mJmVz-WziebfOYuX4bofuc9af0GHdXAFS4,2731
|
|
4
|
+
seoextract/models.py,sha256=dtxV9VPiZFzWJiQJMLtKkU-3fMnvXxMWsfwva8q67qM,3912
|
|
5
|
+
seoextract/parser.py,sha256=FqazLOWNA-RhEpHfjUZytO8d1RaMYUV_TuggChhDO1c,7120
|
|
6
|
+
seoextract/rules.py,sha256=yqmb12HT722UsXgHJAxaljzMiyTljWNB3maXB93qkl0,9521
|
|
7
|
+
seoextract/safe_browsing.py,sha256=WiBCM_NuOhpS_xHsO0QLFuO740rMa8okfgSwlxZhKC4,1726
|
|
8
|
+
seoextract/scorer.py,sha256=eOavcwXJbIJRPO9S85f65bZAG2xu1G39dXnyIdeOP60,1394
|
|
9
|
+
seoextract-0.1.0.dist-info/METADATA,sha256=fkeKaE4JmHwXjCfTqLCCuY5W4X6P3QCILBclJ4VZPyg,4609
|
|
10
|
+
seoextract-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
seoextract-0.1.0.dist-info/top_level.txt,sha256=YuTAdTr8NcUNo7p5ygQtcRd42rbDqR0KdFcSUyUuGrw,11
|
|
12
|
+
seoextract-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
seoextract
|