bmad-plus 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -0
- package/README.md +1 -0
- package/oveanet-pack/seo-audit-360/README.md +59 -53
- package/oveanet-pack/seo-audit-360/agent/seo-chief.md +275 -0
- package/oveanet-pack/seo-audit-360/agent/seo-judge.md +241 -0
- package/oveanet-pack/seo-audit-360/agent/seo-scout.md +171 -0
- package/oveanet-pack/seo-audit-360/agent.yaml +69 -70
- package/oveanet-pack/seo-audit-360/ref/cwv-thresholds.md +87 -0
- package/oveanet-pack/seo-audit-360/ref/eeat-criteria.md +123 -0
- package/oveanet-pack/seo-audit-360/ref/geo-signals.md +167 -0
- package/oveanet-pack/seo-audit-360/ref/quality-gates.md +133 -0
- package/oveanet-pack/seo-audit-360/ref/schema-catalog.md +91 -0
- package/oveanet-pack/seo-audit-360/ref/schema-templates.json +356 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_crawl.py +282 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_fetch.py +231 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_parse.py +255 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_screenshot.py +202 -0
- package/oveanet-pack/seo-audit-360/templates/seo-audit-workflow.md +241 -0
- package/package.json +1 -1
- package/oveanet-pack/seo-audit-360/agent/seo-geo-360-auditor.md +0 -441
- package/oveanet-pack/seo-audit-360/templates/llms.txt +0 -73
- package/oveanet-pack/seo-audit-360/templates/robots.txt +0 -38
- package/oveanet-pack/seo-audit-360/templates/schema-templates.json +0 -116
package/oveanet-pack/seo-audit-360/scripts/seo_fetch.py
@@ -0,0 +1,231 @@

```python
#!/usr/bin/env python3
"""
SEO Fetch — Secure HTTP page fetcher for SEO analysis.

Features:
- SSRF protection (blocks private/loopback/reserved IPs)
- Multi-UA support (standard, Googlebot, GPTBot, ClaudeBot)
- Redirect chain tracking
- Cookie handling
- Configurable timeout

Author: Laurent Rochetta
License: MIT
"""

import argparse
import ipaddress
import json
import socket
import sys
import time
from urllib.parse import urlparse

try:
    import requests
except ImportError:
    print("Error: requests library required. Install: pip install requests", file=sys.stderr)
    sys.exit(1)


# ── User-Agent Presets ──────────────────────────────────────────────

USER_AGENTS = {
    "default": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 BMADSEOEngine/2.0"
    ),
    "googlebot": (
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    ),
    "gptbot": (
        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; "
        "+https://openai.com/gptbot)"
    ),
    "claudebot": (
        "Mozilla/5.0 (compatible; ClaudeBot/1.0; +https://www.anthropic.com/claudebot)"
    ),
    "mobile": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
    ),
}

DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9,fr;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cache-Control": "no-cache",
}


# ── Security: SSRF Prevention ──────────────────────────────────────

def is_safe_url(url: str) -> bool:
    """Block requests to private, loopback, and reserved IP addresses."""
    parsed = urlparse(url)
    hostname = parsed.hostname

    if not hostname:
        return False

    try:
        resolved_ip = socket.gethostbyname(hostname)
        ip = ipaddress.ip_address(resolved_ip)
        if ip.is_private or ip.is_loopback or ip.is_reserved or ip.is_link_local:
            return False
    except (socket.gaierror, ValueError):
        pass  # DNS failure handled by requests

    return True


# ── Core Fetcher ───────────────────────────────────────────────────

def fetch_page(
    url: str,
    timeout: int = 30,
    follow_redirects: bool = True,
    max_redirects: int = 5,
    user_agent: str = "default",
) -> dict:
    """
    Fetch a web page with security checks and detailed response tracking.

    Returns dict with: url, final_url, status_code, content, headers,
    redirect_chain, content_length, response_time_ms, error
    """
    result = {
        "url": url,
        "final_url": None,
        "status_code": None,
        "content": None,
        "headers": {},
        "redirect_chain": [],
        "content_length": 0,
        "response_time_ms": 0,
        "error": None,
    }

    # Normalize URL
    parsed = urlparse(url)
    if not parsed.scheme:
        url = f"https://{url}"
        parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        result["error"] = f"Invalid URL scheme: {parsed.scheme}"
        return result

    # SSRF check
    if not is_safe_url(url):
        resolved = "unknown"
        try:
            resolved = socket.gethostbyname(parsed.hostname)
        except Exception:
            pass
        result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved})"
        return result

    try:
        session = requests.Session()
        session.max_redirects = max_redirects

        headers = dict(DEFAULT_HEADERS)
        ua_string = USER_AGENTS.get(user_agent, user_agent)
        headers["User-Agent"] = ua_string

        start = time.monotonic()

        response = session.get(
            url,
            headers=headers,
            timeout=timeout,
            allow_redirects=follow_redirects,
        )

        elapsed_ms = round((time.monotonic() - start) * 1000)

        result["final_url"] = response.url
        result["status_code"] = response.status_code
        result["content"] = response.text
        result["headers"] = dict(response.headers)
        result["content_length"] = len(response.content)
        result["response_time_ms"] = elapsed_ms

        if response.history:
            result["redirect_chain"] = [
                {"url": r.url, "status": r.status_code}
                for r in response.history
            ]

    except requests.exceptions.Timeout:
        result["error"] = f"Request timed out after {timeout}s"
    except requests.exceptions.TooManyRedirects:
        result["error"] = f"Too many redirects (max {max_redirects})"
    except requests.exceptions.SSLError as e:
        result["error"] = f"SSL error: {e}"
    except requests.exceptions.ConnectionError as e:
        result["error"] = f"Connection error: {e}"
    except requests.exceptions.RequestException as e:
        result["error"] = f"Request failed: {e}"

    return result


# ── CLI ────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="SEO Fetch — Secure HTTP fetcher for SEO analysis (BMAD+ SEO Engine)"
    )
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--output", "-o", help="Save HTML to file")
    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
    parser.add_argument("--no-redirects", action="store_true", help="Don't follow redirects")
    parser.add_argument(
        "--ua", choices=list(USER_AGENTS.keys()), default="default",
        help="User-Agent preset (default, googlebot, gptbot, claudebot, mobile)"
    )
    parser.add_argument("--json", "-j", action="store_true", help="Output full result as JSON")

    args = parser.parse_args()

    result = fetch_page(
        args.url,
        timeout=args.timeout,
        follow_redirects=not args.no_redirects,
        user_agent=args.ua,
    )

    if result["error"]:
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    if args.json:
        # Output metadata as JSON (without full HTML content for readability)
        output = {k: v for k, v in result.items() if k != "content"}
        output["content_preview"] = result["content"][:500] if result["content"] else None
        print(json.dumps(output, indent=2))
    elif args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(result["content"])
        print(f"Saved to {args.output}")
    else:
        print(result["content"])

    # Metadata to stderr
    print("\n--- Fetch Summary ---", file=sys.stderr)
    print(f"Final URL: {result['final_url']}", file=sys.stderr)
    print(f"Status: {result['status_code']}", file=sys.stderr)
    print(f"Size: {result['content_length']:,} bytes", file=sys.stderr)
    print(f"Time: {result['response_time_ms']}ms", file=sys.stderr)
    if result["redirect_chain"]:
        chain = " → ".join(r["url"] for r in result["redirect_chain"])
        print(f"Redirects: {chain}", file=sys.stderr)


if __name__ == "__main__":
    main()
```
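The multi-UA presets are what make `seo_fetch.py` useful for cloaking checks: fetch the same URL as a regular browser and as an AI crawler, then compare what comes back. Below is a minimal usage sketch under the assumption that the script is importable as `seo_fetch` (the diff does not show a package layout, so that import path is an assumption); the URL is a placeholder.

```python
# Hypothetical usage sketch: assumes seo_fetch.py is on the import path.
from seo_fetch import fetch_page

# Fetch the same page under two User-Agent presets to spot UA-based cloaking.
for ua in ("default", "gptbot"):
    r = fetch_page("https://example.com", user_agent=ua)
    if r["error"]:
        print(f"{ua}: error: {r['error']}")
    else:
        print(f"{ua}: HTTP {r['status_code']}, {r['content_length']:,} bytes, "
              f"{len(r['redirect_chain'])} redirect(s)")
```

A large gap in status code or byte count between presets is a signal worth auditing further.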
package/oveanet-pack/seo-audit-360/scripts/seo_parse.py
@@ -0,0 +1,255 @@

```python
#!/usr/bin/env python3
"""
SEO Parse — HTML parser for SEO element extraction.

Extracts: title, meta tags, canonicals, headings, images, links (internal/external),
schema (JSON-LD), Open Graph, Twitter Cards, hreflang, word count, text/code ratio.

Author: Laurent Rochetta
License: MIT
"""

import argparse
import json
import os
import re
import sys
from typing import Optional
from urllib.parse import urljoin, urlparse

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 required. Install: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(1)

# Use lxml if available for speed, fall back to html.parser
try:
    import lxml  # noqa: F401
    HTML_PARSER = "lxml"
except ImportError:
    HTML_PARSER = "html.parser"


def parse_html(html: str, base_url: Optional[str] = None) -> dict:
    """
    Parse HTML and extract all SEO-relevant elements.

    Args:
        html: Raw HTML content
        base_url: Base URL for resolving relative links

    Returns:
        Comprehensive dictionary of SEO data
    """
    soup = BeautifulSoup(html, HTML_PARSER)

    result = {
        "title": None,
        "title_length": 0,
        "meta_description": None,
        "meta_description_length": 0,
        "meta_robots": None,
        "meta_viewport": None,
        "canonical": None,
        "headings": {"h1": [], "h2": [], "h3": [], "h4": []},
        "images": [],
        "links": {"internal": [], "external": [], "broken_candidates": []},
        "schema_blocks": [],
        "open_graph": {},
        "twitter_card": {},
        "hreflang": [],
        "word_count": 0,
        "html_size_bytes": len(html.encode("utf-8")),
        "text_ratio": 0.0,
        "has_lang_attr": False,
        "lang": None,
        "scripts_count": 0,
        "stylesheets_count": 0,
        "dom_depth_estimate": 0,
        "security_headers_hints": {},
    }

    # ── Title ──
    title_tag = soup.find("title")
    if title_tag:
        result["title"] = title_tag.get_text(strip=True)
        result["title_length"] = len(result["title"])

    # ── Meta Tags ──
    for meta in soup.find_all("meta"):
        name = (meta.get("name") or "").lower()
        property_attr = (meta.get("property") or "").lower()
        content = meta.get("content", "")

        if name == "description":
            result["meta_description"] = content
            result["meta_description_length"] = len(content)
        elif name == "robots":
            result["meta_robots"] = content
        elif name == "viewport":
            result["meta_viewport"] = content

        # Open Graph
        if property_attr.startswith("og:"):
            result["open_graph"][property_attr] = content

        # Twitter Card
        if name.startswith("twitter:"):
            result["twitter_card"][name] = content

    # ── Language ──
    html_tag = soup.find("html")
    if html_tag and html_tag.get("lang"):
        result["has_lang_attr"] = True
        result["lang"] = html_tag.get("lang")

    # ── Canonical ──
    canonical = soup.find("link", rel="canonical")
    if canonical:
        result["canonical"] = canonical.get("href")

    # ── Hreflang ──
    for link in soup.find_all("link", rel="alternate"):
        hreflang = link.get("hreflang")
        if hreflang:
            result["hreflang"].append({
                "lang": hreflang,
                "href": link.get("href"),
            })

    # ── Headings ──
    for level in ["h1", "h2", "h3", "h4"]:
        for tag in soup.find_all(level):
            text = tag.get_text(strip=True)
            if text:
                result["headings"][level].append(text)

    # ── Images ──
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if base_url and src:
            src = urljoin(base_url, src)

        has_alt = img.get("alt") is not None
        alt_text = img.get("alt", "")
        has_dimensions = bool(img.get("width") and img.get("height"))

        result["images"].append({
            "src": src,
            "alt": alt_text,
            "has_alt": has_alt,
            "alt_empty": has_alt and alt_text.strip() == "",
            "width": img.get("width"),
            "height": img.get("height"),
            "has_dimensions": has_dimensions,
            "loading": img.get("loading"),
            "srcset": img.get("srcset") is not None,
        })

    # ── Links ──
    if base_url:
        base_domain = urlparse(base_url).netloc

        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            if not href or href.startswith("#") or href.startswith("javascript:"):
                continue

            full_url = urljoin(base_url, href)
            parsed = urlparse(full_url)

            link_data = {
                "href": full_url,
                "text": a.get_text(strip=True)[:100],
                "rel": a.get("rel", []),
                "is_nofollow": "nofollow" in (a.get("rel") or []),
                "target": a.get("target"),
            }

            if parsed.netloc == base_domain:
                result["links"]["internal"].append(link_data)
            else:
                result["links"]["external"].append(link_data)

    # ── Schema (JSON-LD) ──
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            schema_data = json.loads(script.string)
            if isinstance(schema_data, dict):
                result["schema_blocks"].append({
                    "type": schema_data.get("@type", "unknown"),
                    "data": schema_data,
                })
            elif isinstance(schema_data, list):
                for item in schema_data:
                    if isinstance(item, dict):
                        result["schema_blocks"].append({
                            "type": item.get("@type", "unknown"),
                            "data": item,
                        })
        except (json.JSONDecodeError, TypeError):
            result["schema_blocks"].append({"type": "PARSE_ERROR", "data": None})

    # ── Resource Counts ──
    result["scripts_count"] = len(soup.find_all("script"))
    result["stylesheets_count"] = len(soup.find_all("link", rel="stylesheet"))

    # ── Word Count & Text Ratio ──
    text_soup = BeautifulSoup(html, HTML_PARSER)
    for element in text_soup(["script", "style", "nav", "footer", "header", "noscript"]):
        element.decompose()

    visible_text = text_soup.get_text(separator=" ", strip=True)
    words = re.findall(r"\b\w+\b", visible_text)
    result["word_count"] = len(words)

    text_bytes = len(visible_text.encode("utf-8"))
    if result["html_size_bytes"] > 0:
        result["text_ratio"] = round(text_bytes / result["html_size_bytes"], 3)

    return result


# ── CLI ────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="SEO Parse — HTML parser for SEO analysis (BMAD+ SEO Engine)"
    )
    parser.add_argument("file", nargs="?", help="HTML file to parse")
    parser.add_argument("--url", "-u", help="Base URL for resolving relative links")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    if args.file:
        real_path = os.path.realpath(args.file)
        if not os.path.isfile(real_path):
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        with open(real_path, "r", encoding="utf-8") as f:
            html = f.read()
    else:
        html = sys.stdin.read()

    result = parse_html(html, args.url)

    if args.json:
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:
        desc = result["meta_description"]
        if desc and len(desc) > 80:
            desc = desc[:80] + "..."
        print(f"Title: {result['title']} ({result['title_length']} chars)")
        print(f"Meta Description: {desc}")
        print(f"Canonical: {result['canonical']}")
        print(f"Language: {result['lang']}")
        print(f"H1: {len(result['headings']['h1'])} | H2: {len(result['headings']['h2'])} | H3: {len(result['headings']['h3'])}")
        print(f"Images: {len(result['images'])} (missing alt: {sum(1 for i in result['images'] if not i['has_alt'])})")
        print(f"Internal Links: {len(result['links']['internal'])} | External: {len(result['links']['external'])}")
        print(f"Schema Blocks: {len(result['schema_blocks'])} ({', '.join(s['type'] for s in result['schema_blocks'])})")
        print(f"Word Count: {result['word_count']:,}")
        print(f"Text/HTML Ratio: {result['text_ratio']:.1%}")
        print(f"Scripts: {result['scripts_count']} | Stylesheets: {result['stylesheets_count']}")


if __name__ == "__main__":
    main()
```
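To make the shape of `parse_html`'s output concrete, here is a small sketch, assuming the script is importable as `seo_parse`; the HTML fragment and the expected values in the comments are invented for illustration.

```python
# Hypothetical usage sketch: the HTML fragment below is invented for illustration.
from seo_parse import parse_html

html = """
<html lang="en"><head>
  <title>Acme Widgets | Home</title>
  <meta name="description" content="Widgets for every occasion.">
  <link rel="canonical" href="https://example.com/">
</head><body>
  <h1>Acme Widgets</h1>
  <img src="/hero.jpg" alt="">
  <a href="/pricing">Pricing</a>
  <a href="https://partner.example.org">Partner</a>
</body></html>
"""

data = parse_html(html, base_url="https://example.com/")
print(data["title"], data["title_length"])   # Acme Widgets | Home 19
print(data["canonical"])                     # https://example.com/
print(len(data["links"]["internal"]),        # 1 internal (/pricing),
      len(data["links"]["external"]))        # 1 external (partner.example.org)
print(data["images"][0]["alt_empty"])        # True: alt attribute present but empty
```

Note the `alt_empty` distinction: an image with `alt=""` still counts as having the attribute (`has_alt`) but is flagged separately, which matters when auditing decorative versus informative images.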
package/oveanet-pack/seo-audit-360/scripts/seo_screenshot.py
@@ -0,0 +1,202 @@

```python
#!/usr/bin/env python3
"""
SEO Screenshot — Viewport screenshot capture for visual SEO analysis.

Features:
- Mobile and desktop viewport presets
- Above-the-fold element detection
- Full-page capture option
- PNG output with configurable quality

Requires: playwright (pip install playwright && playwright install chromium)

Author: Laurent Rochetta
License: MIT
"""

import argparse
import json
import sys


VIEWPORTS = {
    "mobile": {"width": 375, "height": 812, "device_scale_factor": 3, "is_mobile": True},
    "tablet": {"width": 768, "height": 1024, "device_scale_factor": 2, "is_mobile": True},
    "desktop": {"width": 1440, "height": 900, "device_scale_factor": 1, "is_mobile": False},
    "desktop-hd": {"width": 1920, "height": 1080, "device_scale_factor": 1, "is_mobile": False},
}


def capture_screenshot(
    url: str,
    output: str = "screenshot.png",
    viewport: str = "desktop",
    full_page: bool = False,
    wait_ms: int = 2000,
) -> dict:
    """
    Capture a viewport screenshot of a URL using Playwright.

    Args:
        url: URL to capture
        output: Output file path (.png)
        viewport: Viewport preset (mobile, tablet, desktop, desktop-hd)
        full_page: Capture the full page scroll height or just the viewport
        wait_ms: Wait time after page load (ms)

    Returns:
        Above-the-fold metrics gathered in the page context
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(
            "Error: playwright required.\n"
            "Install: pip install playwright && playwright install chromium",
            file=sys.stderr,
        )
        sys.exit(1)

    vp = VIEWPORTS.get(viewport, VIEWPORTS["desktop"])

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            viewport={"width": vp["width"], "height": vp["height"]},
            device_scale_factor=vp["device_scale_factor"],
            is_mobile=vp["is_mobile"],
            user_agent=(
                "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
                "AppleWebKit/605.1.15 Mobile/15E148 Safari/604.1"
                if vp["is_mobile"]
                else "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 BMADSEOEngine/2.0"
            ),
        )

        page = context.new_page()

        try:
            page.goto(url, wait_until="networkidle", timeout=30000)
        except Exception:
            # Fallback: wait for load event instead
            page.goto(url, wait_until="load", timeout=30000)

        # Wait for dynamic content
        page.wait_for_timeout(wait_ms)

        # Capture screenshot
        page.screenshot(path=output, full_page=full_page)

        # Gather above-the-fold metrics
        metrics = page.evaluate("""() => {
            const viewportHeight = window.innerHeight;
            const viewportWidth = window.innerWidth;

            // Find CTAs above the fold
            const ctas = [];
            const buttons = document.querySelectorAll('a, button, [role="button"]');
            buttons.forEach(el => {
                const rect = el.getBoundingClientRect();
                if (rect.top < viewportHeight && rect.bottom > 0) {
                    const text = el.textContent.trim().substring(0, 50);
                    if (text && (
                        /sign.?up|get.?start|try|buy|contact|demo|free|download|subscribe/i.test(text)
                    )) {
                        ctas.push({
                            text: text,
                            tag: el.tagName,
                            top: Math.round(rect.top),
                            visible: rect.width > 0 && rect.height > 0,
                        });
                    }
                }
            });

            // Find hero/LCP candidate
            const images = document.querySelectorAll('img');
            let largestImage = null;
            let largestArea = 0;
            images.forEach(img => {
                const rect = img.getBoundingClientRect();
                const area = rect.width * rect.height;
                if (area > largestArea && rect.top < viewportHeight) {
                    largestArea = area;
                    largestImage = {
                        src: img.src.substring(0, 100),
                        width: Math.round(rect.width),
                        height: Math.round(rect.height),
                        top: Math.round(rect.top),
                    };
                }
            });

            // Check for horizontal scroll
            const hasHorizontalScroll = document.documentElement.scrollWidth > viewportWidth;

            // Font size check
            const body = document.body;
            const bodyFontSize = body ? parseFloat(getComputedStyle(body).fontSize) : 16;

            return {
                viewportWidth,
                viewportHeight,
                ctas_above_fold: ctas.length,
                cta_details: ctas.slice(0, 5),
                largest_image_above_fold: largestImage,
                has_horizontal_scroll: hasHorizontalScroll,
                body_font_size_px: bodyFontSize,
                dom_element_count: document.querySelectorAll('*').length,
            };
        }""")

        browser.close()

        return metrics


# ── CLI ────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="SEO Screenshot — Viewport capture (BMAD+ SEO Engine)"
    )
    parser.add_argument("url", help="URL to capture")
    parser.add_argument("--output", "-o", default="screenshot.png", help="Output file path")
    parser.add_argument(
        "--viewport", "-v",
        choices=list(VIEWPORTS.keys()), default="desktop",
        help="Viewport preset"
    )
    parser.add_argument("--full", action="store_true", help="Capture full page (not just viewport)")
    parser.add_argument("--wait", "-w", type=int, default=2000, help="Wait after load (ms)")
    parser.add_argument("--json", "-j", action="store_true", help="Output metrics as JSON")

    args = parser.parse_args()

    metrics = capture_screenshot(
        url=args.url,
        output=args.output,
        viewport=args.viewport,
        full_page=args.full,
        wait_ms=args.wait,
    )

    print(f"Screenshot saved: {args.output}", file=sys.stderr)

    if args.json:
        print(json.dumps(metrics, indent=2))
    else:
        print(f"\nAbove-the-Fold Analysis ({args.viewport}):")
        print(f"  Viewport: {metrics['viewportWidth']}×{metrics['viewportHeight']}")
        print(f"  CTAs above fold: {metrics['ctas_above_fold']}")
        for cta in metrics.get("cta_details", []):
            print(f"    - \"{cta['text']}\" ({cta['tag']}, top: {cta['top']}px)")
        if metrics.get("largest_image_above_fold"):
            img = metrics["largest_image_above_fold"]
            print(f"  Largest image: {img['width']}×{img['height']} at y={img['top']}px")
        print(f"  Horizontal scroll: {'⚠️ YES' if metrics['has_horizontal_scroll'] else '✅ No'}")
        print(f"  Body font size: {metrics['body_font_size_px']}px {'✅' if metrics['body_font_size_px'] >= 16 else '⚠️ <16px'}")
        print(f"  DOM elements: {metrics['dom_element_count']:,}")


if __name__ == "__main__":
    main()
```
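Since `capture_screenshot` returns the above-the-fold metrics as a plain dict, it can be driven from another script as easily as from the CLI. A minimal sketch, assuming the module is importable as `seo_screenshot` and using a placeholder URL:

```python
# Hypothetical usage sketch: capture both presets and flag mobile-only issues.
from seo_screenshot import capture_screenshot

for preset in ("mobile", "desktop"):
    m = capture_screenshot(
        "https://example.com",
        output=f"home-{preset}.png",
        viewport=preset,
    )
    print(f"{preset}: {m['ctas_above_fold']} CTA(s) above the fold, "
          f"{m['dom_element_count']:,} DOM elements")
    if m["has_horizontal_scroll"]:
        print(f"  horizontal scroll detected at {m['viewportWidth']}px")
```

Comparing the two captures side by side is often enough to spot layout regressions, such as a hero image or CTA that is visible on desktop but pushed below the fold on mobile.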