seoextract 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seoextract/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .init import SEOExtract
2
+
3
+ __all__ = ["SEOExtract"]
seoextract/crawler.py ADDED
@@ -0,0 +1,148 @@
1
+ import time
2
+ import requests
3
+ from urllib.parse import urljoin, urlparse
4
+ from urllib.robotparser import RobotFileParser
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
+ HEADERS = {
9
+ "User-Agent": "SEOExtractHF/1.0 (SEO Audit Bot; +https://github.com/Britto1221/seoextracthf)"
10
+ }
11
+
12
+ TIMEOUT = 10 # seconds per request
13
+
14
+
15
+ # ── Robots.txt Checker ────────────────────────────────────────────────────────
16
+
17
def _load_robots(base_url: str) -> RobotFileParser | None:
    """
    Download and parse the robots.txt that belongs to ``base_url``.

    Returns a RobotFileParser when robots.txt answers with HTTP 200.
    Returns None for any other status or when the request fails — the
    caller treats None as allow-all.
    """
    robots_url = urljoin(base_url, "/robots.txt")
    try:
        resp = requests.get(robots_url, headers=HEADERS, timeout=5)
        if resp.status_code != 200:
            return None  # can't read robots.txt → allow everything

        rp = RobotFileParser()
        rp.set_url(robots_url)
        # Parse the body we already downloaded instead of calling rp.read():
        # read() would fetch the file a second time with urllib's default
        # User-Agent, and a 401/403 on that second request flips the parser
        # to disallow-all.
        rp.parse(resp.text.splitlines())
        return rp
    except Exception:
        # Deliberate best-effort: any failure here means "no usable rules".
        return None
33
+
34
+
35
+ # ── Single Page Fetch ─────────────────────────────────────────────────────────
36
+
37
def fetch_page(url: str, session: requests.Session) -> dict:
    """
    Fetch a single URL, following redirects.

    Returns a dict with the keys url, final_url, status_code, html and
    response_time_ms. ``html`` is empty unless the response declares a
    ``text/html`` content type.

    Failures never raise. They return empty html, response_time_ms 0.0,
    an ``error`` message and a synthetic status_code: 408 for a timeout,
    310 for too many redirects, 0 for every other failure.
    """
    def _failed(status_code: int, exc: Exception) -> dict:
        # Single shape for every failure so callers can rely on the keys.
        return {
            "url": url,
            "final_url": url,
            "status_code": status_code,
            "html": "",
            "response_time_ms": 0.0,
            "error": str(exc),
        }

    try:
        # perf_counter is monotonic; time.time() can jump with clock changes.
        start = time.perf_counter()
        response = session.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        elapsed = (time.perf_counter() - start) * 1000  # ms

        # Media types are case-insensitive.
        content_type = response.headers.get("Content-Type", "")
        html = response.text if "text/html" in content_type.lower() else ""

        return {
            "url": url,
            "final_url": response.url,
            "status_code": response.status_code,
            "html": html,
            "response_time_ms": round(elapsed, 2),
        }
    except requests.exceptions.Timeout as e:
        return _failed(408, e)
    except requests.exceptions.TooManyRedirects as e:
        return _failed(310, e)
    except Exception as e:
        # Connection errors and anything unexpected share status 0.
        return _failed(0, e)
74
+
75
+
76
+ # ── Internal Link Extractor ───────────────────────────────────────────────────
77
+
78
def _extract_internal_links(html: str, base_url: str) -> list[str]:
    """
    Collect the same-host http(s) links found in ``html``.

    Hrefs are resolved against ``base_url``; a link is kept when its host
    equals the host of ``base_url``. Fragments are stripped. The result is
    de-duplicated and keeps document order, so repeated crawls of the same
    page enqueue links in the same order.
    """
    soup = BeautifulSoup(html, "lxml")
    base_domain = urlparse(base_url).netloc
    links: dict[str, None] = {}  # dict keys: ordered, unique

    for tag in soup.find_all("a", href=True):
        href = tag["href"].strip()
        if not href or href.startswith(("#", "mailto:", "tel:")):
            continue

        try:
            parsed = urlparse(urljoin(base_url, href))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # rather than abort the whole crawl.
            continue

        # Keep only http/https same-domain links
        if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
            links[parsed._replace(fragment="").geturl()] = None

    return list(links)
99
+
100
+
101
+ # ── BFS Crawler ───────────────────────────────────────────────────────────────
102
+
103
def crawl(seed_url: str, max_pages: int = 20) -> list[dict]:
    """
    Breadth-first crawl of one site, starting at ``seed_url``.

    Args:
        seed_url: Start URL; ``https://`` is prepended when it has no scheme.
        max_pages: Maximum number of fetch results to return.

    Returns:
        One ``fetch_page`` result dict per fetched URL, in crawl order.
        URLs disallowed by robots.txt are skipped and do not count
        towards ``max_pages``.
    """
    from collections import deque  # local import: O(1) pops from the queue head

    # Normalize seed
    if not urlparse(seed_url).scheme:
        seed_url = "https://" + seed_url

    parsed_seed = urlparse(seed_url)
    base_url = f"{parsed_seed.scheme}://{parsed_seed.netloc}"

    robots = _load_robots(base_url)
    queue = deque([seed_url])
    seen = {seed_url}  # every URL ever enqueued, for O(1) de-duplication
    results = []

    # Context manager closes the session even if a page raises.
    with requests.Session() as session:
        while queue and len(results) < max_pages:
            url = queue.popleft()

            # Respect robots.txt (None means unreachable → allow all)
            if robots is not None and not robots.can_fetch(HEADERS["User-Agent"], url):
                continue

            result = fetch_page(url, session)
            results.append(result)

            # Only follow links from successful HTML pages
            if result["status_code"] != 200 or not result["html"]:
                continue

            # Relative hrefs must be resolved against the page they appear on,
            # not the site root. Fall back to the root when a redirect left
            # the seed host, so the crawl never wanders onto another site.
            final_url = result.get("final_url") or url
            same_host = urlparse(final_url).netloc == parsed_seed.netloc
            link_base = final_url if same_host else base_url

            for link in _extract_internal_links(result["html"], link_base):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)

    return results
seoextract/init.py ADDED
@@ -0,0 +1,84 @@
1
+ from datetime import datetime
2
+ from .crawler import crawl
3
+ from .parser import parse
4
+ from .rules import detect_issues
5
+ from .scorer import score_site
6
+ from .models import AuditResult, Severity,SafeBrowsingResult
7
+ from .safe_browsing import check_safe_browsing
8
+
9
+
10
class SEOExtract:
    """Entry point that runs the whole audit pipeline in one call."""

    @staticmethod
    def audit(
        url: str,
        max_pages: int = 20,
        safe_browsing_api_key: str | None = None,
    ) -> AuditResult:
        """
        Audit a website and return the structured result.

        Usage:
            from seoextract import SEOExtract
            result = SEOExtract.audit("https://example.com")

        Args:
            url: Site to audit; also the crawl seed.
            max_pages: Upper bound passed to the crawler.
            safe_browsing_api_key: Optional key forwarded to the
                Safe Browsing check.

        Returns an AuditResult object with:
            result.site_score
            result.grade
            result.pages          → list of PageData
            result.issues         → list of SEOIssue
            result.safe_browsing  → SafeBrowsingResult

        When the Safe Browsing check reports the site as unsafe
        (``is_safe`` is exactly False) nothing is crawled and an empty
        grade-"F" result is returned. An unknown verdict (None) does not
        block the crawl.
        """
        safe_result = check_safe_browsing(url, api_key=safe_browsing_api_key)
        safe_browsing = SafeBrowsingResult(**safe_result)

        if safe_browsing.is_safe is False:
            return AuditResult(
                url=url,
                audit_date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                pages_crawled=0,
                site_score=0.0,
                grade="F",
                total_issues=0,
                critical_count=0,
                warning_count=0,
                info_count=0,
                pages=[],
                issues=[],
                safe_browsing=safe_browsing,
            )

        # 1. Crawl  2. Parse each page
        pages = [parse(raw) for raw in crawl(url, max_pages=max_pages)]

        # 3. Detect issues
        issues = detect_issues(pages)

        # 4. Score (also fills page.page_score on every page)
        site_score, grade = score_site(pages, issues)

        # 5. Count by severity
        def _count(level: Severity) -> int:
            return sum(1 for issue in issues if issue.severity == level)

        return AuditResult(
            url=url,
            audit_date=datetime.now().strftime("%Y-%m-%d %H:%M"),
            pages_crawled=len(pages),
            site_score=site_score,
            grade=grade,
            total_issues=len(issues),
            critical_count=_count(Severity.CRITICAL),
            warning_count=_count(Severity.WARNING),
            info_count=_count(Severity.INFO),
            pages=pages,
            issues=issues,
            safe_browsing=safe_browsing,
        )
80
+
81
+
82
+ if __name__ == "__main__":
83
+ result = SEOExtract.audit("https://rootpro.in/")
84
+ print(result.model_dump_json(indent=2))
seoextract/models.py ADDED
@@ -0,0 +1,116 @@
1
+ from pydantic import BaseModel, HttpUrl
2
+ from typing import Optional
3
+ from enum import Enum
4
+
5
+
6
+ # ── Severity Levels ──────────────────────────────────────────────────────────
7
+
8
class Severity(str, Enum):
    """Severity level attached to every SEO issue; each value equals its name."""

    CRITICAL = "CRITICAL"
    WARNING = "WARNING"
    INFO = "INFO"
12
+
13
class SafeBrowsingResult(BaseModel):
    """Outcome of the Safe Browsing lookup for the audited URL."""

    # True / False as reported by the lookup; None when no verdict is available.
    is_safe: bool | None
    # Threat type names reported for the URL (empty when none).
    threats: list[str] = []
    # Message describing why the lookup produced no verdict, if any.
    error: str | None = None
17
+ # ── Issue Types ───────────────────────────────────────────────────────────────
18
+
19
class IssueType(str, Enum):
    """Closed set of issue kinds; each value is the human-readable label."""

    MISSING_VIEWPORT = "Missing Viewport Meta Tag"
    META_TOO_SHORT = "Meta Description Too Short"
    MISSING_TITLE = "Missing Title"
    TITLE_TOO_SHORT = "Title Too Short"
    TITLE_TOO_LONG = "Title Too Long"
    DUPLICATE_TITLE = "Duplicate Title"
    MISSING_META = "Missing Meta Description"
    META_TOO_LONG = "Meta Description Too Long"
    MISSING_H1 = "Missing H1 Tag"
    MULTIPLE_H1 = "Multiple H1 Tags"
    THIN_CONTENT = "Thin Content"
    MISSING_ALT_TEXT = "Missing Image Alt Text"
    BROKEN_LINK = "Broken Internal Link"
    MISSING_CANONICAL = "Missing Canonical Tag"
    POOR_INTERNAL_LINKING = "Poor Internal Linking"
    NO_SCHEMA = "No Schema Markup"
    MISSING_ROBOTS_META = "Missing Robots Meta Tag"
    DUPLICATE_META = "Duplicate Meta Description"
38
+
39
+
40
+ # ── Single SEO Issue ──────────────────────────────────────────────────────────
41
+
42
class SEOIssue(BaseModel):
    """One detected problem on one page."""

    page_url: str
    issue_type: IssueType
    severity: Severity
    # What was found on the page (empty string when the element is missing).
    current_value: str
    # One-line hint for fixing the issue.
    suggestion: str
48
+
49
+
50
+ # ── Per-Page Extracted Data ───────────────────────────────────────────────────
51
+
52
class PageData(BaseModel):
    """Everything extracted from one fetched page, plus its score."""

    # Request outcome
    url: str
    status_code: int
    response_time_ms: float

    # <title>
    title: Optional[str] = None
    title_length: int = 0

    # Meta description
    meta_description: Optional[str] = None
    meta_description_length: int = 0

    # Robots meta (e.g. "noindex, nofollow") and canonical link
    robots_meta: Optional[str] = None
    canonical: Optional[str] = None

    # Heading texts and their counts
    h1_tags: list[str] = []
    h2_tags: list[str] = []
    h3_tags: list[str] = []
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0

    # Visible-text word count
    word_count: int = 0

    # Images
    total_images: int = 0
    images_missing_alt: int = 0

    # Links, split by host, and their counts
    internal_links: list[str] = []
    external_links: list[str] = []
    internal_count: int = 0
    external_count: int = 0

    # JSON-LD structured data present
    schema_found: bool = False

    # Open Graph
    og_title: Optional[str] = None
    og_description: Optional[str] = None

    # Filled in by the scorer
    page_score: float = 0.0

    # URL after redirects
    final_url: Optional[str] = None

    # Content of the viewport meta tag
    viewport: Optional[str] = None
103
+
104
class AuditResult(BaseModel):
    """Top-level result of one site audit."""

    url: str
    audit_date: str
    pages_crawled: int
    site_score: float
    grade: str  # A / B / C / D / F
    total_issues: int
    critical_count: int
    warning_count: int
    info_count: int
    pages: list[PageData]
    issues: list[SEOIssue]
    safe_browsing: SafeBrowsingResult
seoextract/parser.py ADDED
@@ -0,0 +1,140 @@
1
+ import json
2
+ from urllib.parse import urljoin, urlparse
3
+ from bs4 import BeautifulSoup
4
+
5
+ from .models import PageData
6
+
7
+
8
def parse(fetch_result: dict) -> PageData:
    """
    Turn one crawler fetch result into a PageData object.

    Args:
        fetch_result: Dict with the keys url, status_code and
            response_time_ms, plus optional final_url and html.

    Returns:
        A PageData with every extracted field populated. When there is no
        HTML (error page, non-HTML response, timeout) only the request
        fields are set.
    """
    url = fetch_result["url"]
    final_url = fetch_result.get("final_url", url)
    status_code = fetch_result["status_code"]
    response_time_ms = fetch_result["response_time_ms"]
    html = fetch_result.get("html", "")

    # If no HTML (error page, non-HTML, timeout) return minimal PageData
    if not html:
        return PageData(
            url=url,
            final_url=final_url,
            status_code=status_code,
            response_time_ms=response_time_ms,
        )

    # Host used to tell internal links from external ones
    base_domain = urlparse(final_url).netloc

    soup = BeautifulSoup(html, "lxml")

    def _content(tag) -> str | None:
        # Stripped ``content`` attribute of a meta tag, or None when absent/empty.
        value = tag.get("content") if tag else None
        return value.strip() if value else None

    def _named_meta(name: str):
        # First <meta> whose ``name`` attribute equals ``name``, ignoring case.
        return soup.find("meta", attrs={"name": lambda n: n and n.lower() == name})

    # ── Title ─────────────────────────────────────────────────────────────────
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else None
    title_length = len(title) if title else 0

    # ── Meta Description / Robots / Viewport ──────────────────────────────────
    meta_description = _content(_named_meta("description"))
    meta_description_length = len(meta_description) if meta_description else 0
    robots_meta = _content(_named_meta("robots"))
    viewport = _content(_named_meta("viewport"))

    # ── Canonical ─────────────────────────────────────────────────────────────
    canonical_tag = soup.find("link", attrs={"rel": lambda r: r and "canonical" in r})
    canonical = canonical_tag["href"].strip() if canonical_tag and canonical_tag.get("href") else None

    # ── Headings ──────────────────────────────────────────────────────────────
    h1_tags = [tag.get_text(strip=True) for tag in soup.find_all("h1")]
    h2_tags = [tag.get_text(strip=True) for tag in soup.find_all("h2")]
    h3_tags = [tag.get_text(strip=True) for tag in soup.find_all("h3")]

    # ── Schema Markup ─────────────────────────────────────────────────────────
    # Must run BEFORE the word-count step: that step removes every <script>
    # from the tree, which would also remove the JSON-LD blocks.
    schema_found = False
    for script_tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            data = json.loads(script_tag.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        if data:
            schema_found = True
            break

    # ── Word Count ────────────────────────────────────────────────────────────
    # Remove script and style tags before counting (mutates the tree)
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    word_count = len(soup.get_text(separator=" ", strip=True).split())

    # ── Images ────────────────────────────────────────────────────────────────
    all_images = soup.find_all("img")
    total_images = len(all_images)
    images_missing_alt = sum(
        1 for img in all_images
        if not (img.get("alt") or "").strip()
    )

    # ── Links ─────────────────────────────────────────────────────────────────
    # dict keys de-duplicate while keeping document order
    internal: dict[str, None] = {}
    external: dict[str, None] = {}

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()
        if not href or href.startswith(("#", "mailto:", "tel:")):
            continue

        try:
            full_url = urljoin(final_url, href)
            parsed_link = urlparse(full_url)
        except ValueError:
            continue  # malformed href

        # javascript:, data:, ftp: … are not navigable web links
        if parsed_link.scheme not in ("http", "https"):
            continue

        if parsed_link.netloc == base_domain:
            internal[full_url] = None
        else:
            external[full_url] = None

    internal_links = list(internal)
    external_links = list(external)

    # ── Open Graph ────────────────────────────────────────────────────────────
    og_title = _content(soup.find("meta", attrs={"property": "og:title"}))
    og_description = _content(soup.find("meta", attrs={"property": "og:description"}))

    return PageData(
        url=url,
        final_url=final_url,
        status_code=status_code,
        response_time_ms=response_time_ms,
        title=title,
        title_length=title_length,
        meta_description=meta_description,
        meta_description_length=meta_description_length,
        robots_meta=robots_meta,
        canonical=canonical,
        viewport=viewport,
        h1_tags=h1_tags,
        h2_tags=h2_tags,
        h3_tags=h3_tags,
        h1_count=len(h1_tags),
        h2_count=len(h2_tags),
        h3_count=len(h3_tags),
        word_count=word_count,
        total_images=total_images,
        images_missing_alt=images_missing_alt,
        internal_links=internal_links,
        external_links=external_links,
        internal_count=len(internal_links),
        external_count=len(external_links),
        schema_found=schema_found,
        og_title=og_title,
        og_description=og_description,
    )
seoextract/rules.py ADDED
@@ -0,0 +1,268 @@
1
+ from .models import PageData, SEOIssue, IssueType, Severity
2
+
3
+ TITLE_MIN = 50
4
+ TITLE_MAX = 60
5
+ META_MIN = 50
6
+ META_MAX = 160
7
+ THIN_CONTENT = 300
8
+ MIN_INTERNAL = 2
9
+
10
+
11
def _check_title(page: PageData) -> list[SEOIssue]:
    """
    Validate the page title.

    Returns at most one issue: CRITICAL when the title is missing, WARNING
    when its length is below TITLE_MIN or above TITLE_MAX.
    """
    if not page.title:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MISSING_TITLE,
            severity=Severity.CRITICAL,
            current_value="",
            # Quote the real thresholds so the hint cannot contradict the
            # length checks below.
            suggestion=f"Add a descriptive <title> tag between {TITLE_MIN}–{TITLE_MAX} characters.",
        )]

    if page.title_length < TITLE_MIN:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.TITLE_TOO_SHORT,
            severity=Severity.WARNING,
            current_value=page.title,
            suggestion=f"Title is {page.title_length} chars. Expand to at least {TITLE_MIN} characters.",
        )]

    if page.title_length > TITLE_MAX:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.TITLE_TOO_LONG,
            severity=Severity.WARNING,
            current_value=page.title,
            suggestion=f"Title is {page.title_length} chars. Trim to under {TITLE_MAX} characters.",
        )]

    return []
40
+
41
def _check_duplicate_meta_descriptions(pages: list[PageData]) -> list[SEOIssue]:
    """
    Flag pages whose meta description repeats an earlier page's.

    Descriptions are compared stripped and lower-cased. The first page with
    a given description is not flagged; every later one gets a WARNING that
    names the first page's URL.
    """
    first_url_by_text: dict[str, str] = {}
    found = []

    for page in pages:
        description = page.meta_description
        if not description:
            continue

        key = description.strip().lower()
        if key not in first_url_by_text:
            first_url_by_text[key] = page.url
            continue

        found.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.DUPLICATE_META,
            severity=Severity.WARNING,
            current_value=description,
            suggestion=f"This meta description duplicates '{first_url_by_text[key]}'. Each page should have a unique meta description.",
        ))

    return found
63
+
64
def _check_meta(page: PageData) -> list[SEOIssue]:
    """
    Validate the meta description.

    Returns at most one WARNING: missing, shorter than META_MIN, or longer
    than META_MAX.
    """
    if not page.meta_description:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MISSING_META,
            severity=Severity.WARNING,
            current_value="",
            # Built from the thresholds so the hint stays in sync with them.
            suggestion=f"Add a meta description between {META_MIN}–{META_MAX} characters summarising the page.",
        )]

    length = page.meta_description_length

    if length < META_MIN:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.META_TOO_SHORT,
            severity=Severity.WARNING,
            current_value=page.meta_description,
            suggestion=f"Meta description is only {length} chars. Expand to at least {META_MIN}.",
        )]

    if length > META_MAX:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.META_TOO_LONG,
            severity=Severity.WARNING,
            current_value=page.meta_description,
            suggestion=f"Meta description is {length} chars. Trim to under {META_MAX}.",
        )]

    return []
95
+
96
+
97
def _check_headings(page: PageData) -> list[SEOIssue]:
    """Return a CRITICAL issue for zero H1 tags, a WARNING for more than one."""
    h1_total = page.h1_count

    if h1_total == 0:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MISSING_H1,
            severity=Severity.CRITICAL,
            current_value="0 H1 tags found",
            suggestion="Add exactly one <h1> tag that describes the main topic of this page.",
        )]

    if h1_total > 1:
        return [SEOIssue(
            page_url=page.url,
            issue_type=IssueType.MULTIPLE_H1,
            severity=Severity.WARNING,
            current_value=f"{h1_total} H1 tags: {page.h1_tags}",
            suggestion=f"Reduce to one H1 tag. Found: {page.h1_tags}",
        )]

    return []
118
+
119
def _check_status_code(page: PageData) -> list[SEOIssue]:
    """Return a CRITICAL issue when the status code is 0 or 400 and above."""
    code = page.status_code
    if code != 0 and code < 400:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.BROKEN_LINK,
        severity=Severity.CRITICAL,
        current_value=str(code),
        suggestion="Fix this page because it does not return a successful HTTP 200 response.",
    )]
132
+
133
def _check_content(page: PageData) -> list[SEOIssue]:
    """Return a WARNING when a 200 page has fewer than THIN_CONTENT words."""
    if page.status_code != 200 or page.word_count >= THIN_CONTENT:
        return []

    words = page.word_count
    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.THIN_CONTENT,
        severity=Severity.WARNING,
        current_value=f"{words} words",
        suggestion=f"Page has only {words} words. Aim for at least {THIN_CONTENT} words of meaningful content.",
    )]
146
+
147
+
148
def _check_images(page: PageData) -> list[SEOIssue]:
    """Return a WARNING when at least one image has no alt text."""
    missing = page.images_missing_alt
    if not missing > 0:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_ALT_TEXT,
        severity=Severity.WARNING,
        current_value=f"{missing} of {page.total_images} images missing alt text",
        suggestion=f"Add descriptive alt text to all {missing} images missing it.",
    )]
161
+
162
+
163
def _check_canonical(page: PageData) -> list[SEOIssue]:
    """Return an INFO issue when a 200 page has no canonical link."""
    if page.canonical or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_CANONICAL,
        severity=Severity.INFO,
        current_value="",
        suggestion="Add a <link rel='canonical'> tag to prevent duplicate content issues.",
    )]
176
+
177
+
178
def _check_internal_linking(page: PageData) -> list[SEOIssue]:
    """Return an INFO issue when a 200 page has fewer than MIN_INTERNAL internal links."""
    if page.status_code != 200 or page.internal_count >= MIN_INTERNAL:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.POOR_INTERNAL_LINKING,
        severity=Severity.INFO,
        current_value=f"{page.internal_count} internal links",
        suggestion=f"Add at least {MIN_INTERNAL} internal links to help search engines discover related pages.",
    )]
191
+
192
+
193
def _check_schema(page: PageData) -> list[SEOIssue]:
    """Return an INFO issue when a 200 page has no JSON-LD schema."""
    if page.schema_found or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.NO_SCHEMA,
        severity=Severity.INFO,
        current_value="No JSON-LD schema found",
        suggestion="Add Schema.org structured data (JSON-LD) to improve search result appearance.",
    )]
206
+
207
+
208
+ # ── Duplicate Title Check (site-level) ───────────────────────────────────────
209
+
210
def _check_duplicate_titles(pages: list[PageData]) -> list[SEOIssue]:
    """
    Flag pages whose title repeats an earlier page's title.

    Titles are compared stripped and lower-cased. The first page with a
    given title is not flagged; every later one gets a CRITICAL issue that
    names the first page's URL.
    """
    first_url_by_title: dict[str, str] = {}  # normalized title → first page url
    found = []

    for page in pages:
        if not page.title:
            continue

        key = page.title.strip().lower()
        if key not in first_url_by_title:
            first_url_by_title[key] = page.url
            continue

        found.append(SEOIssue(
            page_url=page.url,
            issue_type=IssueType.DUPLICATE_TITLE,
            severity=Severity.CRITICAL,
            current_value=page.title,
            suggestion=f"This title duplicates '{first_url_by_title[key]}'. Each page needs a unique title.",
        ))

    return found
230
+
231
def _check_viewport(page: PageData) -> list[SEOIssue]:
    """Return a WARNING when a 200 page has no viewport meta tag."""
    if page.viewport or page.status_code != 200:
        return []

    return [SEOIssue(
        page_url=page.url,
        issue_type=IssueType.MISSING_VIEWPORT,
        severity=Severity.WARNING,
        current_value="",
        suggestion="Add a viewport meta tag for mobile responsiveness.",
    )]
244
+
245
def detect_issues(pages: list[PageData]) -> list[SEOIssue]:
    """
    Run every SEO rule over the crawled pages.

    Each page gets the status-code check. Pages that returned 200 also get
    the per-page rules. The site-wide duplicate checks run exactly once,
    after the loop, over the 200 pages only — running them per page would
    report every duplicate once per crawled page.

    Returns:
        Per-page issues in page order, followed by the site-wide issues.
    """
    per_page_checks = (
        _check_title,
        _check_meta,
        _check_headings,
        _check_content,
        _check_images,
        _check_canonical,
        _check_internal_linking,
        _check_schema,
        _check_viewport,
    )

    all_issues = []
    ok_pages = []

    for page in pages:
        all_issues.extend(_check_status_code(page))

        # skip SEO checks if page failed
        if page.status_code != 200:
            continue

        ok_pages.append(page)
        for check in per_page_checks:
            all_issues.extend(check(page))

    # Site-level checks: once per audit, not once per page.
    all_issues.extend(_check_duplicate_titles(ok_pages))
    all_issues.extend(_check_duplicate_meta_descriptions(ok_pages))

    return all_issues
@@ -0,0 +1,61 @@
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+
8
+ def check_safe_browsing(url: str, api_key: str | None = None) -> dict:
9
+ # manual key has first priority
10
+ if api_key is None:
11
+ api_key = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
12
+
13
+ if not api_key:
14
+ return {
15
+ "is_safe": None,
16
+ "threats": [],
17
+ "error": "Google Safe Browsing API key not provided and GOOGLE_SAFE_BROWSING_API_KEY not found in .env",
18
+ }
19
+
20
+ endpoint = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"
21
+
22
+ payload = {
23
+ "client": {
24
+ "clientId": "seoextracthf",
25
+ "clientVersion": "1.0.0",
26
+ },
27
+ "threatInfo": {
28
+ "threatTypes": [
29
+ "MALWARE",
30
+ "SOCIAL_ENGINEERING",
31
+ "UNWANTED_SOFTWARE",
32
+ "POTENTIALLY_HARMFUL_APPLICATION",
33
+ ],
34
+ "platformTypes": ["ANY_PLATFORM"],
35
+ "threatEntryTypes": ["URL"],
36
+ "threatEntries": [{"url": url}],
37
+ },
38
+ }
39
+
40
+ try:
41
+ response = requests.post(endpoint, json=payload, timeout=10)
42
+ response.raise_for_status()
43
+ data = response.json()
44
+
45
+ matches = data.get("matches", [])
46
+
47
+ if not matches:
48
+ return {"is_safe": True, "threats": [], "error": None}
49
+
50
+ return {
51
+ "is_safe": False,
52
+ "threats": [match.get("threatType") for match in matches],
53
+ "error": None,
54
+ }
55
+
56
+ except requests.RequestException as e:
57
+ return {
58
+ "is_safe": None,
59
+ "threats": [],
60
+ "error": str(e),
61
+ }
seoextract/scorer.py ADDED
@@ -0,0 +1,48 @@
1
+ from .models import PageData, SEOIssue, Severity
2
+
3
+
4
+ # ── Severity Penalty Weights ──────────────────────────────────────────────────
5
+
6
+ PENALTY = {
7
+ Severity.CRITICAL : 20,
8
+ Severity.WARNING : 8,
9
+ Severity.INFO : 3,
10
+ }
11
+
12
+
13
+ def _grade(score: float) -> str:
14
+ if score >= 90: return "A"
15
+ if score >= 75: return "B"
16
+ if score >= 60: return "C"
17
+ if score >= 40: return "D"
18
+ return "F"
19
+
20
+
21
def score_page(page: PageData, issues: list[SEOIssue]) -> float:
    """
    Score one page on a 0–100 scale.

    A page whose status code is not 200 scores 0.0. Otherwise the score is
    100 minus the severity penalty of every issue whose page_url equals
    this page's url, floored at 0 and rounded to one decimal.
    """
    if page.status_code != 200:
        return 0.0

    deducted = sum(
        PENALTY[issue.severity]
        for issue in issues
        if issue.page_url == page.url
    )
    return round(max(100.0 - deducted, 0.0), 1)
34
+
35
+
36
def score_site(pages: list[PageData], issues: list[SEOIssue]) -> tuple[float, str]:
    """
    Score the whole site.

    Stores each page's score in ``page.page_score``, then averages those
    scores (rounded to one decimal) and maps the average to a grade.
    Returns (0.0, "F") when there are no pages.

    Returns: (site_score, grade)
    """
    if len(pages) == 0:
        return 0.0, "F"

    total = 0.0
    for current in pages:
        current.page_score = score_page(current, issues)
    for current in pages:
        total += current.page_score

    average = round(total / len(pages), 1)
    return average, _grade(average)
@@ -0,0 +1,294 @@
1
+ Metadata-Version: 2.4
2
+ Name: seoextract
3
+ Version: 0.1.0
4
+ Summary: A lightweight Python SEO audit engine that returns Pydantic structured output.
5
+ Author: Britto K
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: requests>=2.31.0
9
+ Requires-Dist: beautifulsoup4>=4.12.0
10
+ Requires-Dist: lxml>=5.0.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: python-dotenv>=1.0.0
13
+
14
+ # SEOExtractHF
15
+
16
+ <div align="center">
17
+
18
+ **A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
19
+
20
+ Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
21
+
22
+ </div>
23
+
24
+ ---
25
+
26
+ ## Features
27
+
28
+ - Website crawler
29
+ - Google Safe Browsing validation
30
+ - Technical SEO auditing
31
+ - Pydantic structured output
32
+ - Page-level SEO metrics
33
+ - Site-level SEO scoring
34
+ - Severity-based issue detection
35
+ - Duplicate title detection
36
+ - Duplicate meta description detection
37
+ - Canonical tag detection
38
+ - Viewport detection
39
+ - Schema.org detection
40
+ - Image alt-text validation
41
+ - Internal linking analysis
42
+ - Thin content detection
43
+
44
+ ---
45
+
46
+ # Installation
47
+
48
+ ```bash
49
+ pip install seoextract
50
+ ```
51
+
52
+ or install from source
53
+
54
+ ```bash
55
+ pip install -e .
56
+ ```
57
+
58
+ ---
59
+
60
+ # Requirements
61
+
62
+ - Python 3.10+
63
+ - Google Safe Browsing API Key
64
+
65
+ ---
66
+
67
+ # Google Safe Browsing Setup
68
+
69
+ SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
70
+
71
+ If Google reports the website as unsafe, crawling is stopped automatically.
72
+
73
+ If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.
+
74
+ ---
75
+
76
+ ## Option 1 (Recommended)
77
+
78
+ Create a `.env` file.
79
+
80
+ ```text
81
+ .env
82
+ ```
83
+
84
+ Add your API key.
85
+
86
+ ```env
87
+ GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
88
+ ```
89
+
90
+ SEOExtractHF automatically loads the API key.
91
+
92
+ No additional code is required.
93
+
94
+ ---
95
+
96
+ ## Option 2
97
+
98
+ Pass the API key manually.
99
+
100
+ ```python
101
+ from seoextract import SEOExtract
102
+
103
+ result = SEOExtract.audit(
104
+ "https://example.com",
105
+ safe_browsing_api_key="YOUR_API_KEY"
106
+ )
107
+ ```
108
+
109
+ When an API key is supplied manually, the `.env` file is **not used**.
110
+
111
+ ---
112
+
113
+ # Quick Start
114
+
115
+ ```python
116
+ from seoextract import SEOExtract
117
+
118
+ result = SEOExtract.audit(
119
+ "https://example.com"
120
+ )
121
+
122
+ print(result.model_dump_json(indent=2))
123
+ ```
124
+
125
+ ---
126
+
127
+ # Returned Object
128
+
129
+ SEOExtractHF returns a validated Pydantic model.
130
+
131
+ ```text
132
+ AuditResult
133
+
134
+ ├── url
135
+ ├── audit_date
136
+ ├── pages_crawled
137
+ ├── site_score
138
+ ├── grade
139
+ ├── total_issues
140
+ ├── critical_count
141
+ ├── warning_count
142
+ ├── info_count
143
+ ├── pages
144
+ ├── issues
145
+ └── safe_browsing
146
+ ```
147
+
148
+ ---
149
+
150
+ # Example
151
+
152
+ ```python
153
+ from seoextract import SEOExtract
154
+
155
+ result = SEOExtract.audit("https://example.com")
156
+
157
+ print(result.site_score)
158
+ print(result.grade)
159
+ print(result.safe_browsing)
160
+
161
+ for issue in result.issues:
162
+ print(issue.issue_type)
163
+ ```
164
+
165
+ ---
166
+
167
+ # Safe Browsing Result
168
+
169
+ ```python
170
+ {
171
+ "is_safe": True,
172
+ "threats": [],
173
+ "error": None
174
+ }
175
+ ```
176
+
177
+ If Google reports a threat:
178
+
179
+ ```python
180
+ {
181
+ "is_safe": False,
182
+ "threats": [
183
+ "MALWARE"
184
+ ],
185
+ "error": None
186
+ }
187
+ ```
188
+
189
+ SEOExtractHF immediately stops crawling unsafe websites.
190
+
191
+ ---
192
+
193
+ # Current SEO Checks
194
+
195
+ ## Page Quality
196
+
197
+ - Title validation
198
+ - Meta description validation
199
+ - H1 validation
200
+ - Thin content detection
201
+
202
+ ## Technical SEO
203
+
204
+ - Canonical tag
205
+ - Viewport meta tag
206
+ - Schema.org JSON-LD
207
+ - HTTP status validation
208
+
209
+ ## Images
210
+
211
+ - Missing ALT attributes
212
+
213
+ ## Links
214
+
215
+ - Internal link analysis
216
+
217
+ ## Site-wide Checks
218
+
219
+ - Duplicate titles
220
+ - Duplicate meta descriptions
221
+
222
+ ## Security
223
+
224
+ - Google Safe Browsing validation
225
+
226
+ ---
227
+
228
+ # Example Output
229
+
230
+ ```python
231
+ AuditResult(
232
+ site_score=91.0,
233
+ grade="A",
234
+ total_issues=4,
235
+ pages_crawled=15
236
+ )
237
+ ```
238
+
239
+ ---
240
+
241
+ # Project Structure
242
+
243
+ ```
244
+ seoextract/
245
+
246
+ ├── crawler.py
247
+ ├── parser.py
248
+ ├── rules.py
249
+ ├── scorer.py
250
+ ├── safe_browsing.py
251
+ ├── models.py
252
+ └── __init__.py
253
+ ```
254
+
255
+ ---
256
+
257
+ # Designed For
258
+
259
+ SEOExtractHF is designed to be used inside:
260
+
261
+ - AI SEO Agents
262
+ - LangGraph workflows
263
+ - FastAPI applications
264
+ - Streamlit dashboards
265
+ - Report generators
266
+ - CI/CD quality checks
267
+ - Data pipelines
268
+ - SEO automation tools
269
+
270
+ ---
271
+
272
+ # Dependencies
273
+
274
+ - beautifulsoup4
275
+ - lxml
276
+ - requests
277
+ - pydantic
278
+ - python-dotenv
279
+
280
+ ---
281
+
282
+ # License
283
+
284
+ MIT License
285
+
286
+ ---
287
+
288
+ # Author
289
+
290
+ **Britto K**
291
+
292
+ GitHub:
293
+
294
+ https://github.com/Britto1221
+
+ # seoextract
@@ -0,0 +1,12 @@
1
+ seoextract/__init__.py,sha256=mYxMjSFt3xUJBhQ2vS_CQ6i-i7EKAr5O5t4RWEjj22Y,56
2
+ seoextract/crawler.py,sha256=-oCnPiPHDKpJiLajRWn8OPqJzgPNkGzJu8AFzO_GKYg,5528
3
+ seoextract/init.py,sha256=5T64sdKW4mJmVz-WziebfOYuX4bofuc9af0GHdXAFS4,2731
4
+ seoextract/models.py,sha256=dtxV9VPiZFzWJiQJMLtKkU-3fMnvXxMWsfwva8q67qM,3912
5
+ seoextract/parser.py,sha256=FqazLOWNA-RhEpHfjUZytO8d1RaMYUV_TuggChhDO1c,7120
6
+ seoextract/rules.py,sha256=yqmb12HT722UsXgHJAxaljzMiyTljWNB3maXB93qkl0,9521
7
+ seoextract/safe_browsing.py,sha256=WiBCM_NuOhpS_xHsO0QLFuO740rMa8okfgSwlxZhKC4,1726
8
+ seoextract/scorer.py,sha256=eOavcwXJbIJRPO9S85f65bZAG2xu1G39dXnyIdeOP60,1394
9
+ seoextract-0.1.0.dist-info/METADATA,sha256=fkeKaE4JmHwXjCfTqLCCuY5W4X6P3QCILBclJ4VZPyg,4609
10
+ seoextract-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
11
+ seoextract-0.1.0.dist-info/top_level.txt,sha256=YuTAdTr8NcUNo7p5ygQtcRd42rbDqR0KdFcSUyUuGrw,11
12
+ seoextract-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ seoextract