antigravity-seo-kit 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/.agent/agent.md +96 -0
  2. package/.agent/skills/seo/SKILL.md +153 -0
  3. package/.agent/skills/seo/references/cwv-thresholds.md +108 -0
  4. package/.agent/skills/seo/references/eeat-framework.md +214 -0
  5. package/.agent/skills/seo/references/local-schema-types.md +230 -0
  6. package/.agent/skills/seo/references/local-seo-signals.md +218 -0
  7. package/.agent/skills/seo/references/maps-api-endpoints.md +160 -0
  8. package/.agent/skills/seo/references/maps-free-apis.md +176 -0
  9. package/.agent/skills/seo/references/maps-gbp-checklist.md +150 -0
  10. package/.agent/skills/seo/references/maps-geo-grid.md +154 -0
  11. package/.agent/skills/seo/references/quality-gates.md +155 -0
  12. package/.agent/skills/seo/references/schema-types.md +118 -0
  13. package/.agent/skills/seo/schema/templates.json +213 -0
  14. package/.agent/skills/seo/scripts/analyze_visual.py +217 -0
  15. package/.agent/skills/seo/scripts/capture_screenshot.py +181 -0
  16. package/.agent/skills/seo/scripts/fetch_page.py +196 -0
  17. package/.agent/skills/seo/scripts/parse_html.py +201 -0
  18. package/.agent/skills/seo-audit/SKILL.md +278 -0
  19. package/.agent/skills/seo-competitor-pages/SKILL.md +212 -0
  20. package/.agent/skills/seo-content/SKILL.md +230 -0
  21. package/.agent/skills/seo-dataforseo/SKILL.md +418 -0
  22. package/.agent/skills/seo-geo/SKILL.md +305 -0
  23. package/.agent/skills/seo-google/SKILL.md +405 -0
  24. package/.agent/skills/seo-google/assets/templates/cwv-audit-report.md +48 -0
  25. package/.agent/skills/seo-google/assets/templates/gsc-performance-report.md +44 -0
  26. package/.agent/skills/seo-google/assets/templates/indexation-status-report.md +43 -0
  27. package/.agent/skills/seo-google/references/auth-setup.md +154 -0
  28. package/.agent/skills/seo-google/references/ga4-data-api.md +184 -0
  29. package/.agent/skills/seo-google/references/indexing-api.md +107 -0
  30. package/.agent/skills/seo-google/references/keyword-planner-api.md +66 -0
  31. package/.agent/skills/seo-google/references/nlp-api.md +55 -0
  32. package/.agent/skills/seo-google/references/pagespeed-crux-api.md +204 -0
  33. package/.agent/skills/seo-google/references/rate-limits-quotas.md +75 -0
  34. package/.agent/skills/seo-google/references/search-console-api.md +156 -0
  35. package/.agent/skills/seo-google/references/supplementary-apis.md +99 -0
  36. package/.agent/skills/seo-google/references/youtube-api.md +49 -0
  37. package/.agent/skills/seo-google/scripts/crux_history.py +321 -0
  38. package/.agent/skills/seo-google/scripts/ga4_report.py +478 -0
  39. package/.agent/skills/seo-google/scripts/google_auth.py +795 -0
  40. package/.agent/skills/seo-google/scripts/google_report.py +2273 -0
  41. package/.agent/skills/seo-google/scripts/gsc_inspect.py +340 -0
  42. package/.agent/skills/seo-google/scripts/gsc_query.py +378 -0
  43. package/.agent/skills/seo-google/scripts/indexing_notify.py +313 -0
  44. package/.agent/skills/seo-google/scripts/keyword_planner.py +297 -0
  45. package/.agent/skills/seo-google/scripts/nlp_analyze.py +309 -0
  46. package/.agent/skills/seo-google/scripts/pagespeed_check.py +649 -0
  47. package/.agent/skills/seo-google/scripts/youtube_search.py +355 -0
  48. package/.agent/skills/seo-hreflang/SKILL.md +192 -0
  49. package/.agent/skills/seo-image-gen/SKILL.md +211 -0
  50. package/.agent/skills/seo-image-gen/references/cost-tracking.md +47 -0
  51. package/.agent/skills/seo-image-gen/references/gemini-models.md +200 -0
  52. package/.agent/skills/seo-image-gen/references/mcp-tools.md +115 -0
  53. package/.agent/skills/seo-image-gen/references/post-processing.md +192 -0
  54. package/.agent/skills/seo-image-gen/references/presets.md +69 -0
  55. package/.agent/skills/seo-image-gen/references/prompt-engineering.md +411 -0
  56. package/.agent/skills/seo-image-gen/references/seo-image-presets.md +137 -0
  57. package/.agent/skills/seo-image-gen/scripts/batch.py +97 -0
  58. package/.agent/skills/seo-image-gen/scripts/cost_tracker.py +191 -0
  59. package/.agent/skills/seo-image-gen/scripts/edit.py +141 -0
  60. package/.agent/skills/seo-image-gen/scripts/generate.py +149 -0
  61. package/.agent/skills/seo-image-gen/scripts/presets.py +153 -0
  62. package/.agent/skills/seo-image-gen/scripts/setup_mcp.py +151 -0
  63. package/.agent/skills/seo-image-gen/scripts/validate_setup.py +133 -0
  64. package/.agent/skills/seo-images/SKILL.md +176 -0
  65. package/.agent/skills/seo-local/SKILL.md +381 -0
  66. package/.agent/skills/seo-maps/SKILL.md +328 -0
  67. package/.agent/skills/seo-page/SKILL.md +86 -0
  68. package/.agent/skills/seo-plan/SKILL.md +118 -0
  69. package/.agent/skills/seo-plan/assets/agency.md +175 -0
  70. package/.agent/skills/seo-plan/assets/ecommerce.md +167 -0
  71. package/.agent/skills/seo-plan/assets/generic.md +144 -0
  72. package/.agent/skills/seo-plan/assets/local-service.md +160 -0
  73. package/.agent/skills/seo-plan/assets/publisher.md +153 -0
  74. package/.agent/skills/seo-plan/assets/saas.md +135 -0
  75. package/.agent/skills/seo-programmatic/SKILL.md +171 -0
  76. package/.agent/skills/seo-schema/SKILL.md +223 -0
  77. package/.agent/skills/seo-sitemap/SKILL.md +180 -0
  78. package/.agent/skills/seo-technical/SKILL.md +211 -0
  79. package/.agent/workflows/seo-audit.md +17 -0
  80. package/.agent/workflows/seo-competitor-pages.md +12 -0
  81. package/.agent/workflows/seo-content.md +14 -0
  82. package/.agent/workflows/seo-geo.md +12 -0
  83. package/.agent/workflows/seo-google.md +12 -0
  84. package/.agent/workflows/seo-hreflang.md +12 -0
  85. package/.agent/workflows/seo-images.md +13 -0
  86. package/.agent/workflows/seo-local.md +12 -0
  87. package/.agent/workflows/seo-maps.md +11 -0
  88. package/.agent/workflows/seo-page.md +13 -0
  89. package/.agent/workflows/seo-plan.md +13 -0
  90. package/.agent/workflows/seo-programmatic.md +12 -0
  91. package/.agent/workflows/seo-schema.md +11 -0
  92. package/.agent/workflows/seo-sitemap.md +9 -0
  93. package/.agent/workflows/seo-technical.md +18 -0
  94. package/LICENSE +88 -0
  95. package/README.md +122 -0
  96. package/bin/cli.js +117 -0
  97. package/docs/ARCHITECTURE.md +218 -0
  98. package/docs/COMMANDS.md +184 -0
  99. package/docs/INSTALLATION.md +100 -0
  100. package/docs/MCP-INTEGRATION.md +153 -0
  101. package/docs/TROUBLESHOOTING.md +151 -0
  102. package/docs/superpowers/plans/2026-03-13-github-audit-fixes.md +511 -0
  103. package/extensions/banana/README.md +95 -0
  104. package/extensions/banana/docs/BANANA-SETUP.md +86 -0
  105. package/extensions/banana/install.sh +170 -0
  106. package/extensions/banana/references/cost-tracking.md +47 -0
  107. package/extensions/banana/references/gemini-models.md +200 -0
  108. package/extensions/banana/references/mcp-tools.md +115 -0
  109. package/extensions/banana/references/post-processing.md +192 -0
  110. package/extensions/banana/references/presets.md +69 -0
  111. package/extensions/banana/references/prompt-engineering.md +411 -0
  112. package/extensions/banana/references/seo-image-presets.md +137 -0
  113. package/extensions/banana/scripts/batch.py +97 -0
  114. package/extensions/banana/scripts/cost_tracker.py +191 -0
  115. package/extensions/banana/scripts/edit.py +141 -0
  116. package/extensions/banana/scripts/generate.py +149 -0
  117. package/extensions/banana/scripts/presets.py +153 -0
  118. package/extensions/banana/scripts/setup_mcp.py +151 -0
  119. package/extensions/banana/scripts/validate_setup.py +133 -0
  120. package/extensions/banana/uninstall.sh +43 -0
  121. package/extensions/dataforseo/README.md +169 -0
  122. package/extensions/dataforseo/docs/DATAFORSEO-SETUP.md +74 -0
  123. package/extensions/dataforseo/field-config.json +280 -0
  124. package/extensions/dataforseo/install.ps1 +110 -0
  125. package/extensions/dataforseo/install.sh +161 -0
  126. package/extensions/dataforseo/uninstall.ps1 +35 -0
  127. package/extensions/dataforseo/uninstall.sh +39 -0
  128. package/lib/api.js +190 -0
  129. package/lib/fingerprint.js +68 -0
  130. package/lib/installer.js +486 -0
  131. package/lib/utils.js +254 -0
  132. package/package.json +40 -0
  133. package/pyproject.toml +11 -0
  134. package/requirements-google.txt +15 -0
  135. package/requirements.txt +11 -0
@@ -0,0 +1,181 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Capture screenshots of web pages using Playwright.
4
+
5
+ Usage:
6
+ python capture_screenshot.py https://example.com
7
+ python capture_screenshot.py https://example.com --mobile
8
+ python capture_screenshot.py https://example.com --output screenshots/
9
+ """
10
+
11
+ import argparse
12
+ import ipaddress
13
+ import os
14
+ import socket
15
+ import sys
16
+ from urllib.parse import ParseResult, urlparse
17
+
18
+ try:
19
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
20
+ except ImportError:
21
+ print("Error: playwright required. Install with: pip install playwright && playwright install chromium")
22
+ sys.exit(1)
23
+
24
+
25
# Viewport presets in CSS pixels, keyed by the --viewport CLI choice.
# "mobile" additionally gets device_scale_factor=2 in capture_screenshot().
VIEWPORTS = {
    "desktop": {"width": 1920, "height": 1080},
    "laptop": {"width": 1366, "height": 768},
    "tablet": {"width": 768, "height": 1024},
    "mobile": {"width": 375, "height": 812},
}
31
+
32
+
33
def normalize_url(url: str) -> tuple[str, ParseResult]:
    """Ensure *url* has an http(s) scheme and a hostname.

    A bare domain such as "example.com" is promoted to "https://example.com".

    Returns:
        Tuple of (normalized URL string, its ParseResult).

    Raises:
        ValueError: if the scheme is not http/https, or no hostname is present.
    """
    parts = urlparse(url)
    if not parts.scheme:
        # No scheme given: assume https and re-parse so netloc is populated.
        url = f"https://{url}"
        parts = urlparse(url)

    if parts.scheme not in ("http", "https"):
        raise ValueError(f"Invalid URL scheme: {parts.scheme}")
    if not parts.hostname:
        raise ValueError("Invalid URL: missing hostname")

    return url, parts
46
+
47
+
48
def capture_screenshot(
    url: str,
    output_path: str,
    viewport: str = "desktop",
    full_page: bool = False,
    timeout: int = 30000,
) -> dict:
    """
    Capture a screenshot of a web page with headless Chromium.

    Args:
        url: URL to capture (bare domains are normalized to https)
        output_path: Output file path
        viewport: Viewport preset (desktop, laptop, tablet, mobile)
        full_page: Whether to capture full page or just viewport
        timeout: Page load timeout in milliseconds

    Returns:
        Dictionary with keys: url, output, viewport, success, error.
        On failure, "success" is False and "error" holds the reason.
    """
    result = {
        "url": url,
        "output": output_path,
        "viewport": viewport,
        "success": False,
        "error": None,
    }

    if viewport not in VIEWPORTS:
        result["error"] = f"Invalid viewport: {viewport}. Choose from: {list(VIEWPORTS.keys())}"
        return result

    try:
        url, parsed = normalize_url(url)
        result["url"] = url
    except ValueError as e:
        result["error"] = str(e)
        return result

    # SSRF prevention: block private/internal IPs.
    # ValueError is caught too (consistent with fetch_page.py) in case the
    # resolved address cannot be parsed by ipaddress.
    # NOTE(review): only the initial host is checked; redirects performed by
    # the browser are not re-validated.
    try:
        resolved_ip = socket.gethostbyname(parsed.hostname)
        ip = ipaddress.ip_address(resolved_ip)
        if ip.is_private or ip.is_loopback or ip.is_reserved:
            result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved_ip})"
            return result
    except (socket.gaierror, ValueError):
        # Unresolvable host: let the browser surface the DNS error below.
        pass

    vp = VIEWPORTS[viewport]

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    viewport={"width": vp["width"], "height": vp["height"]},
                    # Retina-like density for the mobile preset.
                    device_scale_factor=2 if viewport == "mobile" else 1,
                )
                page = context.new_page()

                # Navigate and wait for network idle
                page.goto(url, wait_until="networkidle", timeout=timeout)

                # Wait a bit more for any lazy-loaded content
                page.wait_for_timeout(1000)

                # Capture screenshot
                page.screenshot(path=output_path, full_page=full_page)

                result["success"] = True
            finally:
                # Always release the browser, even if goto/screenshot raised.
                browser.close()

    except PlaywrightTimeout:
        result["error"] = f"Page load timed out after {timeout}ms"
    except Exception as e:
        result["error"] = str(e)

    return result
126
+
127
+
128
def _within_root(path: str, root: str) -> bool:
    """Return True when *path* equals *root* or lies under it, component-wise."""
    try:
        return os.path.commonpath([path, root]) == root
    except ValueError:
        # Different drives on Windows, or mixed absolute/relative paths.
        return False


def main():
    """CLI entry point: capture one screenshot per requested viewport."""
    parser = argparse.ArgumentParser(description="Capture web page screenshots")
    parser.add_argument("url", help="URL to capture")
    parser.add_argument("--output", "-o", default="screenshots", help="Output directory")
    parser.add_argument("--viewport", "-v", default="desktop", choices=VIEWPORTS.keys())
    parser.add_argument("--all", "-a", action="store_true", help="Capture all viewports")
    parser.add_argument("--full", "-f", action="store_true", help="Capture full page")
    parser.add_argument("--timeout", "-t", type=int, default=30000, help="Timeout in ms")

    args = parser.parse_args()

    # Sanitize output path - prevent directory traversal.
    # A bare startswith() prefix test is bypassable ("/home/user" also
    # prefixes "/home/user2"), so compare whole path components instead.
    output_dir = os.path.realpath(args.output)
    allowed_roots = (
        os.path.realpath(os.getcwd()),
        os.path.realpath(os.path.expanduser("~")),
    )
    if not any(_within_root(output_dir, root) for root in allowed_roots):
        print("Error: Output path must be within current directory or home directory", file=sys.stderr)
        sys.exit(1)

    # Create the validated output directory (not the raw CLI string).
    os.makedirs(output_dir, exist_ok=True)

    try:
        normalized_url, parsed_url = normalize_url(args.url)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Generate filename from URL, e.g. example.com -> example_com
    base_name = parsed_url.netloc.replace(".", "_")

    viewports = VIEWPORTS.keys() if args.all else [args.viewport]

    for viewport in viewports:
        filename = f"{base_name}_{viewport}.png"
        output_path = os.path.join(output_dir, filename)

        print(f"Capturing {viewport} screenshot...")
        result = capture_screenshot(
            normalized_url,
            output_path,
            viewport=viewport,
            full_page=args.full,
            timeout=args.timeout,
        )

        if result["success"]:
            print(f" ✓ Saved to {output_path}")
        else:
            print(f" ✗ Failed: {result['error']}")
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fetch a web page with proper headers and error handling.
4
+
5
+ Usage:
6
+ python fetch_page.py https://example.com
7
+ python fetch_page.py https://example.com --output page.html
8
+ """
9
+
10
+ import argparse
11
+ import ipaddress
12
+ import socket
13
+ import sys
14
+ from typing import Optional
15
+ from urllib.parse import urlparse
16
+
17
+ try:
18
+ import requests
19
+ except ImportError:
20
+ print("Error: requests library required. Install with: pip install requests")
21
+ sys.exit(1)
22
+
23
+
24
+ DEFAULT_USER_AGENT = (
25
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
26
+ "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ClaudeSEO/1.2"
27
+ )
28
+
29
+ # Googlebot UA for prerender/dynamic rendering detection.
30
+ # Prerender services (Prerender.io, Rendertron) serve fully rendered HTML to
31
+ # Googlebot but raw JS shells to other UAs. Comparing response sizes between
32
+ # DEFAULT_USER_AGENT and GOOGLEBOT_USER_AGENT reveals whether a site uses
33
+ # dynamic rendering, a key signal for SPA detection.
34
+ GOOGLEBOT_USER_AGENT = (
35
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
36
+ )
37
+
38
+ DEFAULT_HEADERS = {
39
+ "User-Agent": DEFAULT_USER_AGENT,
40
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
41
+ "Accept-Language": "en-US,en;q=0.5",
42
+ "Accept-Encoding": "gzip, deflate",
43
+ "Connection": "keep-alive",
44
+ }
45
+
46
+
47
def fetch_page(
    url: str,
    timeout: int = 30,
    follow_redirects: bool = True,
    max_redirects: int = 5,
    user_agent: Optional[str] = None,
) -> dict:
    """
    Fetch a web page and return response details.

    Args:
        url: The URL to fetch (scheme defaults to https when missing)
        timeout: Request timeout in seconds
        follow_redirects: Whether to follow redirects
        max_redirects: Maximum number of redirects to follow
        user_agent: Override for the default User-Agent header

    Returns:
        Dictionary with:
        - url: Final URL after redirects
        - status_code: HTTP status code
        - content: Response body
        - headers: Response headers
        - redirect_chain: List of redirect URLs
        - redirect_details: [{"url", "status_code"}] for each redirect hop
        - error: Error message if failed
    """
    result = {
        "url": url,
        "status_code": None,
        "content": None,
        "headers": {},
        "redirect_chain": [],
        "redirect_details": [],
        "error": None,
    }

    # Validate URL; bare domains get an https:// scheme prepended.
    parsed = urlparse(url)
    if not parsed.scheme:
        url = f"https://{url}"
        parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        result["error"] = f"Invalid URL scheme: {parsed.scheme}"
        return result
    if not parsed.hostname:
        # Without this guard, gethostbyname(None) below raises an uncaught
        # TypeError for URLs like "https://".
        result["error"] = "Invalid URL: missing hostname"
        return result

    # SSRF prevention: block private/internal IPs.
    # NOTE(review): only the initial host is checked; redirect targets are
    # not re-resolved, so a redirect to an internal host would slip through.
    try:
        resolved_ip = socket.gethostbyname(parsed.hostname)
        ip = ipaddress.ip_address(resolved_ip)
        if ip.is_private or ip.is_loopback or ip.is_reserved:
            result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved_ip})"
            return result
    except (socket.gaierror, ValueError):
        pass  # DNS resolution failure handled by requests below

    try:
        # Session as a context manager so the connection pool is released.
        with requests.Session() as session:
            session.max_redirects = max_redirects

            headers = dict(DEFAULT_HEADERS)
            if user_agent:
                headers["User-Agent"] = user_agent

            response = session.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=follow_redirects,
            )

            result["url"] = response.url
            result["status_code"] = response.status_code
            result["content"] = response.text
            result["headers"] = dict(response.headers)

            # Track redirect chain with status codes
            if response.history:
                result["redirect_chain"] = [r.url for r in response.history]
                result["redirect_details"] = [
                    {"url": r.url, "status_code": r.status_code}
                    for r in response.history
                ]

    except requests.exceptions.Timeout:
        result["error"] = f"Request timed out after {timeout} seconds"
    except requests.exceptions.TooManyRedirects:
        result["error"] = f"Too many redirects (max {max_redirects})"
    except requests.exceptions.SSLError as e:
        result["error"] = f"SSL error: {e}"
    except requests.exceptions.ConnectionError as e:
        result["error"] = f"Connection error: {e}"
    except requests.exceptions.RequestException as e:
        result["error"] = f"Request failed: {e}"

    return result
142
+
143
+
144
def main():
    """CLI entry point: fetch a URL, print the body, report metadata on stderr."""
    parser = argparse.ArgumentParser(description="Fetch a web page for SEO analysis")
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
    parser.add_argument("--no-redirects", action="store_true", help="Don't follow redirects")
    parser.add_argument("--user-agent", help="Custom User-Agent string")
    parser.add_argument(
        "--googlebot",
        action="store_true",
        help=(
            "Use Googlebot UA to detect dynamic rendering / prerender services. "
            "Compare response size with default UA to identify SPA prerender configuration."
        ),
    )

    args = parser.parse_args()

    # --googlebot takes precedence over an explicit --user-agent.
    ua = GOOGLEBOT_USER_AGENT if args.googlebot else args.user_agent

    result = fetch_page(
        args.url,
        timeout=args.timeout,
        follow_redirects=not args.no_redirects,
        user_agent=ua,
    )

    if result["error"]:
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(result["content"])
        print(f"Saved to {args.output}")
    else:
        print(result["content"])

    # Metadata goes to stderr so stdout stays pipeable HTML.
    print(f"\nURL: {result['url']}", file=sys.stderr)
    print(f"Status: {result['status_code']}", file=sys.stderr)
    if result["redirect_details"]:
        for hop in result["redirect_details"]:
            print(f" {hop['status_code']} -> {hop['url']}", file=sys.stderr)
        print(f" {result['status_code']} -> {result['url']} (final)", file=sys.stderr)
    elif result["redirect_chain"]:
        print(f"Redirects: {' -> '.join(result['redirect_chain'])}", file=sys.stderr)
193
+
194
+
195
+ if __name__ == "__main__":
196
+ main()
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Parse HTML and extract SEO-relevant elements.
4
+
5
+ Usage:
6
+ python parse_html.py page.html
7
+ python parse_html.py --url https://example.com
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ import re
14
+ import sys
15
+ from typing import Optional
16
+ from urllib.parse import urljoin, urlparse
17
+
18
+ try:
19
+ from bs4 import BeautifulSoup
20
+ except ImportError:
21
+ print("Error: beautifulsoup4 required. Install with: pip install beautifulsoup4")
22
+ sys.exit(1)
23
+
24
+ try:
25
+ import lxml # noqa: F401
26
+ _HTML_PARSER = "lxml"
27
+ except ImportError:
28
+ _HTML_PARSER = "html.parser"
29
+
30
+
31
+ def parse_html(html: str, base_url: Optional[str] = None) -> dict:
32
+ """
33
+ Parse HTML and extract SEO-relevant elements.
34
+
35
+ Args:
36
+ html: HTML content to parse
37
+ base_url: Base URL for resolving relative links
38
+
39
+ Returns:
40
+ Dictionary with extracted SEO data
41
+ """
42
+ soup = BeautifulSoup(html, _HTML_PARSER)
43
+
44
+ result = {
45
+ "title": None,
46
+ "meta_description": None,
47
+ "meta_robots": None,
48
+ "canonical": None,
49
+ "h1": [],
50
+ "h2": [],
51
+ "h3": [],
52
+ "images": [],
53
+ "links": {
54
+ "internal": [],
55
+ "external": [],
56
+ },
57
+ "schema": [],
58
+ "open_graph": {},
59
+ "twitter_card": {},
60
+ "word_count": 0,
61
+ "hreflang": [],
62
+ }
63
+
64
+ # Title
65
+ title_tag = soup.find("title")
66
+ if title_tag:
67
+ result["title"] = title_tag.get_text(strip=True)
68
+
69
+ # Meta tags
70
+ for meta in soup.find_all("meta"):
71
+ name = meta.get("name", "").lower()
72
+ property_attr = meta.get("property", "").lower()
73
+ content = meta.get("content", "")
74
+
75
+ if name == "description":
76
+ result["meta_description"] = content
77
+ elif name == "robots":
78
+ result["meta_robots"] = content
79
+
80
+ # Open Graph
81
+ if property_attr.startswith("og:"):
82
+ result["open_graph"][property_attr] = content
83
+
84
+ # Twitter Card
85
+ if name.startswith("twitter:"):
86
+ result["twitter_card"][name] = content
87
+
88
+ # Canonical
89
+ canonical = soup.find("link", rel="canonical")
90
+ if canonical:
91
+ result["canonical"] = canonical.get("href")
92
+
93
+ # Hreflang
94
+ for link in soup.find_all("link", rel="alternate"):
95
+ hreflang = link.get("hreflang")
96
+ if hreflang:
97
+ result["hreflang"].append({
98
+ "lang": hreflang,
99
+ "href": link.get("href"),
100
+ })
101
+
102
+ # Headings
103
+ for tag in ["h1", "h2", "h3"]:
104
+ for heading in soup.find_all(tag):
105
+ text = heading.get_text(strip=True)
106
+ if text:
107
+ result[tag].append(text)
108
+
109
+ # Images
110
+ for img in soup.find_all("img"):
111
+ src = img.get("src", "")
112
+ if base_url and src:
113
+ src = urljoin(base_url, src)
114
+
115
+ result["images"].append({
116
+ "src": src,
117
+ "alt": img.get("alt"),
118
+ "width": img.get("width"),
119
+ "height": img.get("height"),
120
+ "loading": img.get("loading"),
121
+ })
122
+
123
+ # Links
124
+ if base_url:
125
+ base_domain = urlparse(base_url).netloc
126
+
127
+ for a in soup.find_all("a", href=True):
128
+ href = a.get("href", "")
129
+ if not href or href.startswith("#") or href.startswith("javascript:"):
130
+ continue
131
+
132
+ full_url = urljoin(base_url, href)
133
+ parsed = urlparse(full_url)
134
+
135
+ link_data = {
136
+ "href": full_url,
137
+ "text": a.get_text(strip=True)[:100],
138
+ "rel": a.get("rel", []),
139
+ }
140
+
141
+ if parsed.netloc == base_domain:
142
+ result["links"]["internal"].append(link_data)
143
+ else:
144
+ result["links"]["external"].append(link_data)
145
+
146
+ # Schema (JSON-LD)
147
+ for script in soup.find_all("script", type="application/ld+json"):
148
+ try:
149
+ schema_data = json.loads(script.string)
150
+ result["schema"].append(schema_data)
151
+ except (json.JSONDecodeError, TypeError):
152
+ pass
153
+
154
+ # Word count (visible text only)
155
+ for element in soup(["script", "style", "nav", "footer", "header"]):
156
+ element.decompose()
157
+
158
+ text = soup.get_text(separator=" ", strip=True)
159
+ words = re.findall(r"\b\w+\b", text)
160
+ result["word_count"] = len(words)
161
+
162
+ return result
163
+
164
+
165
def main():
    """CLI entry point: read HTML from a file or stdin and report SEO data."""
    parser = argparse.ArgumentParser(description="Parse HTML for SEO analysis")
    parser.add_argument("file", nargs="?", help="HTML file to parse")
    parser.add_argument("--url", "-u", help="Base URL for resolving links")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    if args.file:
        resolved = os.path.realpath(args.file)
        if not os.path.isfile(resolved):
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        with open(resolved, "r", encoding="utf-8") as fh:
            markup = fh.read()
    else:
        # No file argument: consume HTML from stdin (pipe-friendly).
        markup = sys.stdin.read()

    data = parse_html(markup, args.url)

    if args.json:
        print(json.dumps(data, indent=2))
        return

    # Human-readable summary, one "Label: value" line per metric.
    for label, value in (
        ("Title", data["title"]),
        ("Meta Description", data["meta_description"]),
        ("Canonical", data["canonical"]),
        ("H1 Tags", len(data["h1"])),
        ("H2 Tags", len(data["h2"])),
        ("Images", len(data["images"])),
        ("Internal Links", len(data["links"]["internal"])),
        ("External Links", len(data["links"]["external"])),
        ("Schema Blocks", len(data["schema"])),
        ("Word Count", data["word_count"]),
    ):
        print(f"{label}: {value}")
+
199
+
200
+ if __name__ == "__main__":
201
+ main()