antigravity-seo-kit 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/agent.md +96 -0
- package/.agent/skills/seo/SKILL.md +153 -0
- package/.agent/skills/seo/references/cwv-thresholds.md +108 -0
- package/.agent/skills/seo/references/eeat-framework.md +214 -0
- package/.agent/skills/seo/references/local-schema-types.md +230 -0
- package/.agent/skills/seo/references/local-seo-signals.md +218 -0
- package/.agent/skills/seo/references/maps-api-endpoints.md +160 -0
- package/.agent/skills/seo/references/maps-free-apis.md +176 -0
- package/.agent/skills/seo/references/maps-gbp-checklist.md +150 -0
- package/.agent/skills/seo/references/maps-geo-grid.md +154 -0
- package/.agent/skills/seo/references/quality-gates.md +155 -0
- package/.agent/skills/seo/references/schema-types.md +118 -0
- package/.agent/skills/seo/schema/templates.json +213 -0
- package/.agent/skills/seo/scripts/analyze_visual.py +217 -0
- package/.agent/skills/seo/scripts/capture_screenshot.py +181 -0
- package/.agent/skills/seo/scripts/fetch_page.py +196 -0
- package/.agent/skills/seo/scripts/parse_html.py +201 -0
- package/.agent/skills/seo-audit/SKILL.md +278 -0
- package/.agent/skills/seo-competitor-pages/SKILL.md +212 -0
- package/.agent/skills/seo-content/SKILL.md +230 -0
- package/.agent/skills/seo-dataforseo/SKILL.md +418 -0
- package/.agent/skills/seo-geo/SKILL.md +305 -0
- package/.agent/skills/seo-google/SKILL.md +405 -0
- package/.agent/skills/seo-google/assets/templates/cwv-audit-report.md +48 -0
- package/.agent/skills/seo-google/assets/templates/gsc-performance-report.md +44 -0
- package/.agent/skills/seo-google/assets/templates/indexation-status-report.md +43 -0
- package/.agent/skills/seo-google/references/auth-setup.md +154 -0
- package/.agent/skills/seo-google/references/ga4-data-api.md +184 -0
- package/.agent/skills/seo-google/references/indexing-api.md +107 -0
- package/.agent/skills/seo-google/references/keyword-planner-api.md +66 -0
- package/.agent/skills/seo-google/references/nlp-api.md +55 -0
- package/.agent/skills/seo-google/references/pagespeed-crux-api.md +204 -0
- package/.agent/skills/seo-google/references/rate-limits-quotas.md +75 -0
- package/.agent/skills/seo-google/references/search-console-api.md +156 -0
- package/.agent/skills/seo-google/references/supplementary-apis.md +99 -0
- package/.agent/skills/seo-google/references/youtube-api.md +49 -0
- package/.agent/skills/seo-google/scripts/crux_history.py +321 -0
- package/.agent/skills/seo-google/scripts/ga4_report.py +478 -0
- package/.agent/skills/seo-google/scripts/google_auth.py +795 -0
- package/.agent/skills/seo-google/scripts/google_report.py +2273 -0
- package/.agent/skills/seo-google/scripts/gsc_inspect.py +340 -0
- package/.agent/skills/seo-google/scripts/gsc_query.py +378 -0
- package/.agent/skills/seo-google/scripts/indexing_notify.py +313 -0
- package/.agent/skills/seo-google/scripts/keyword_planner.py +297 -0
- package/.agent/skills/seo-google/scripts/nlp_analyze.py +309 -0
- package/.agent/skills/seo-google/scripts/pagespeed_check.py +649 -0
- package/.agent/skills/seo-google/scripts/youtube_search.py +355 -0
- package/.agent/skills/seo-hreflang/SKILL.md +192 -0
- package/.agent/skills/seo-image-gen/SKILL.md +211 -0
- package/.agent/skills/seo-image-gen/references/cost-tracking.md +47 -0
- package/.agent/skills/seo-image-gen/references/gemini-models.md +200 -0
- package/.agent/skills/seo-image-gen/references/mcp-tools.md +115 -0
- package/.agent/skills/seo-image-gen/references/post-processing.md +192 -0
- package/.agent/skills/seo-image-gen/references/presets.md +69 -0
- package/.agent/skills/seo-image-gen/references/prompt-engineering.md +411 -0
- package/.agent/skills/seo-image-gen/references/seo-image-presets.md +137 -0
- package/.agent/skills/seo-image-gen/scripts/batch.py +97 -0
- package/.agent/skills/seo-image-gen/scripts/cost_tracker.py +191 -0
- package/.agent/skills/seo-image-gen/scripts/edit.py +141 -0
- package/.agent/skills/seo-image-gen/scripts/generate.py +149 -0
- package/.agent/skills/seo-image-gen/scripts/presets.py +153 -0
- package/.agent/skills/seo-image-gen/scripts/setup_mcp.py +151 -0
- package/.agent/skills/seo-image-gen/scripts/validate_setup.py +133 -0
- package/.agent/skills/seo-images/SKILL.md +176 -0
- package/.agent/skills/seo-local/SKILL.md +381 -0
- package/.agent/skills/seo-maps/SKILL.md +328 -0
- package/.agent/skills/seo-page/SKILL.md +86 -0
- package/.agent/skills/seo-plan/SKILL.md +118 -0
- package/.agent/skills/seo-plan/assets/agency.md +175 -0
- package/.agent/skills/seo-plan/assets/ecommerce.md +167 -0
- package/.agent/skills/seo-plan/assets/generic.md +144 -0
- package/.agent/skills/seo-plan/assets/local-service.md +160 -0
- package/.agent/skills/seo-plan/assets/publisher.md +153 -0
- package/.agent/skills/seo-plan/assets/saas.md +135 -0
- package/.agent/skills/seo-programmatic/SKILL.md +171 -0
- package/.agent/skills/seo-schema/SKILL.md +223 -0
- package/.agent/skills/seo-sitemap/SKILL.md +180 -0
- package/.agent/skills/seo-technical/SKILL.md +211 -0
- package/.agent/workflows/seo-audit.md +17 -0
- package/.agent/workflows/seo-competitor-pages.md +12 -0
- package/.agent/workflows/seo-content.md +14 -0
- package/.agent/workflows/seo-geo.md +12 -0
- package/.agent/workflows/seo-google.md +12 -0
- package/.agent/workflows/seo-hreflang.md +12 -0
- package/.agent/workflows/seo-images.md +13 -0
- package/.agent/workflows/seo-local.md +12 -0
- package/.agent/workflows/seo-maps.md +11 -0
- package/.agent/workflows/seo-page.md +13 -0
- package/.agent/workflows/seo-plan.md +13 -0
- package/.agent/workflows/seo-programmatic.md +12 -0
- package/.agent/workflows/seo-schema.md +11 -0
- package/.agent/workflows/seo-sitemap.md +9 -0
- package/.agent/workflows/seo-technical.md +18 -0
- package/LICENSE +88 -0
- package/README.md +122 -0
- package/bin/cli.js +117 -0
- package/docs/ARCHITECTURE.md +218 -0
- package/docs/COMMANDS.md +184 -0
- package/docs/INSTALLATION.md +100 -0
- package/docs/MCP-INTEGRATION.md +153 -0
- package/docs/TROUBLESHOOTING.md +151 -0
- package/docs/superpowers/plans/2026-03-13-github-audit-fixes.md +511 -0
- package/extensions/banana/README.md +95 -0
- package/extensions/banana/docs/BANANA-SETUP.md +86 -0
- package/extensions/banana/install.sh +170 -0
- package/extensions/banana/references/cost-tracking.md +47 -0
- package/extensions/banana/references/gemini-models.md +200 -0
- package/extensions/banana/references/mcp-tools.md +115 -0
- package/extensions/banana/references/post-processing.md +192 -0
- package/extensions/banana/references/presets.md +69 -0
- package/extensions/banana/references/prompt-engineering.md +411 -0
- package/extensions/banana/references/seo-image-presets.md +137 -0
- package/extensions/banana/scripts/batch.py +97 -0
- package/extensions/banana/scripts/cost_tracker.py +191 -0
- package/extensions/banana/scripts/edit.py +141 -0
- package/extensions/banana/scripts/generate.py +149 -0
- package/extensions/banana/scripts/presets.py +153 -0
- package/extensions/banana/scripts/setup_mcp.py +151 -0
- package/extensions/banana/scripts/validate_setup.py +133 -0
- package/extensions/banana/uninstall.sh +43 -0
- package/extensions/dataforseo/README.md +169 -0
- package/extensions/dataforseo/docs/DATAFORSEO-SETUP.md +74 -0
- package/extensions/dataforseo/field-config.json +280 -0
- package/extensions/dataforseo/install.ps1 +110 -0
- package/extensions/dataforseo/install.sh +161 -0
- package/extensions/dataforseo/uninstall.ps1 +35 -0
- package/extensions/dataforseo/uninstall.sh +39 -0
- package/lib/api.js +190 -0
- package/lib/fingerprint.js +68 -0
- package/lib/installer.js +486 -0
- package/lib/utils.js +254 -0
- package/package.json +40 -0
- package/pyproject.toml +11 -0
- package/requirements-google.txt +15 -0
- package/requirements.txt +11 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Capture screenshots of web pages using Playwright.

Usage:
    python capture_screenshot.py https://example.com
    python capture_screenshot.py https://example.com --viewport mobile
    python capture_screenshot.py https://example.com --output screenshots/
"""

import argparse
import ipaddress
import os
import socket
import sys
from urllib.parse import ParseResult, urlparse

# Playwright is an optional heavyweight dependency; fail fast with install
# instructions instead of a bare ImportError traceback.
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
except ImportError:
    print("Error: playwright required. Install with: pip install playwright && playwright install chromium")
    sys.exit(1)


# Named viewport presets (CSS pixels), selectable via --viewport or --all.
VIEWPORTS = {
    "desktop": {"width": 1920, "height": 1080},
    "laptop": {"width": 1366, "height": 768},
    "tablet": {"width": 768, "height": 1024},
    "mobile": {"width": 375, "height": 812},
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def normalize_url(url: str) -> tuple[str, ParseResult]:
    """Normalize *url* and return ``(url, parsed_url)``.

    A missing scheme defaults to https. Raises ValueError for non-http(s)
    schemes or when no hostname can be extracted.
    """
    parts = urlparse(url)
    if parts.scheme == "":
        # Bare host like "example.com" -- assume https and re-parse.
        url = "https://" + url
        parts = urlparse(url)

    if parts.scheme != "http" and parts.scheme != "https":
        raise ValueError(f"Invalid URL scheme: {parts.scheme}")
    if not parts.hostname:
        raise ValueError("Invalid URL: missing hostname")
    return url, parts
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def capture_screenshot(
    url: str,
    output_path: str,
    viewport: str = "desktop",
    full_page: bool = False,
    timeout: int = 30000,
) -> dict:
    """
    Capture a screenshot of a web page.

    Args:
        url: URL to capture
        output_path: Output file path
        viewport: Viewport preset (desktop, laptop, tablet, mobile)
        full_page: Whether to capture full page or just viewport
        timeout: Page load timeout in milliseconds

    Returns:
        Dictionary with capture results: url (after normalization), output,
        viewport, success flag, and an error message when success is False.
    """
    result = {
        "url": url,
        "output": output_path,
        "viewport": viewport,
        "success": False,
        "error": None,
    }

    if viewport not in VIEWPORTS:
        result["error"] = f"Invalid viewport: {viewport}. Choose from: {list(VIEWPORTS.keys())}"
        return result

    try:
        url, parsed = normalize_url(url)
        result["url"] = url
    except ValueError as e:
        result["error"] = str(e)
        return result

    # SSRF prevention: block private/internal IPs
    try:
        resolved_ip = socket.gethostbyname(parsed.hostname)
        ip = ipaddress.ip_address(resolved_ip)
        if ip.is_private or ip.is_loopback or ip.is_reserved:
            result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved_ip})"
            return result
    # Also catch ValueError (unparseable address), matching the identical
    # SSRF guard in fetch_page.py; DNS failures surface via Playwright below.
    except (socket.gaierror, ValueError):
        pass

    vp = VIEWPORTS[viewport]

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    viewport={"width": vp["width"], "height": vp["height"]},
                    # Retina-style scaling for mobile captures.
                    device_scale_factor=2 if viewport == "mobile" else 1,
                )
                page = context.new_page()

                # Navigate and wait for network idle
                page.goto(url, wait_until="networkidle", timeout=timeout)

                # Wait a bit more for any lazy-loaded content
                page.wait_for_timeout(1000)

                # Capture screenshot
                page.screenshot(path=output_path, full_page=full_page)

                result["success"] = True
            finally:
                # Close the browser even when navigation or the screenshot
                # raises (the original only closed it on the success path).
                browser.close()

    except PlaywrightTimeout:
        result["error"] = f"Page load timed out after {timeout}ms"
    except Exception as e:
        result["error"] = str(e)

    return result
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _is_within(path: str, root: str) -> bool:
    """Return True when *path* equals *root* or lies underneath it.

    Both arguments must already be absolute, resolved paths.
    """
    try:
        return os.path.commonpath([path, root]) == root
    except ValueError:
        # Raised on Windows when the paths live on different drives.
        return False


def main():
    """CLI entry point: capture one or all viewport screenshots of a URL."""
    parser = argparse.ArgumentParser(description="Capture web page screenshots")
    parser.add_argument("url", help="URL to capture")
    parser.add_argument("--output", "-o", default="screenshots", help="Output directory")
    parser.add_argument("--viewport", "-v", default="desktop", choices=VIEWPORTS.keys())
    parser.add_argument("--all", "-a", action="store_true", help="Capture all viewports")
    parser.add_argument("--full", "-f", action="store_true", help="Capture full page")
    parser.add_argument("--timeout", "-t", type=int, default=30000, help="Timeout in ms")

    args = parser.parse_args()

    # Sanitize output path - prevent directory traversal. A plain
    # str.startswith() prefix test is escapable ("/home/user-evil" starts
    # with "/home/user"), so compare whole path components instead.
    output_dir = os.path.realpath(args.output)
    cwd = os.path.realpath(os.getcwd())
    home = os.path.realpath(os.path.expanduser("~"))
    if not (_is_within(output_dir, cwd) or _is_within(output_dir, home)):
        print("Error: Output path must be within current directory or home directory", file=sys.stderr)
        sys.exit(1)

    # Create output directory
    os.makedirs(args.output, exist_ok=True)

    try:
        normalized_url, parsed_url = normalize_url(args.url)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Generate filename from URL (dots are awkward in file names)
    base_name = parsed_url.netloc.replace(".", "_")

    viewports = VIEWPORTS.keys() if args.all else [args.viewport]

    for viewport in viewports:
        filename = f"{base_name}_{viewport}.png"
        output_path = os.path.join(args.output, filename)

        print(f"Capturing {viewport} screenshot...")
        result = capture_screenshot(
            normalized_url,
            output_path,
            viewport=viewport,
            full_page=args.full,
            timeout=args.timeout,
        )

        if result["success"]:
            print(f" ✓ Saved to {output_path}")
        else:
            print(f" ✗ Failed: {result['error']}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Fetch a web page with proper headers and error handling.

Usage:
    python fetch_page.py https://example.com
    python fetch_page.py https://example.com --output page.html
"""

import argparse
import ipaddress
import socket
import sys
from typing import Optional
from urllib.parse import urlparse

# requests is a third-party dependency; fail fast with install instructions
# instead of a bare ImportError traceback.
try:
    import requests
except ImportError:
    print("Error: requests library required. Install with: pip install requests")
    sys.exit(1)


# Browser-like UA with an explicit "ClaudeSEO" token so the tool remains
# identifiable in server logs while avoiding naive bot-blocking.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ClaudeSEO/1.2"
)

# Googlebot UA for prerender/dynamic rendering detection.
# Prerender services (Prerender.io, Rendertron) serve fully rendered HTML to
# Googlebot but raw JS shells to other UAs. Comparing response sizes between
# DEFAULT_USER_AGENT and GOOGLEBOT_USER_AGENT reveals whether a site uses
# dynamic rendering, a key signal for SPA detection.
GOOGLEBOT_USER_AGENT = (
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
)

# Headers sent with every request; User-Agent may be overridden per call.
DEFAULT_HEADERS = {
    "User-Agent": DEFAULT_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def fetch_page(
    url: str,
    timeout: int = 30,
    follow_redirects: bool = True,
    max_redirects: int = 5,
    user_agent: Optional[str] = None,
) -> dict:
    """
    Fetch a web page and return response details.

    Args:
        url: The URL to fetch
        timeout: Request timeout in seconds
        follow_redirects: Whether to follow redirects
        max_redirects: Maximum number of redirects to follow
        user_agent: Optional User-Agent override (defaults to DEFAULT_USER_AGENT)

    Returns:
        Dictionary with:
        - url: Final URL after redirects
        - status_code: HTTP status code
        - content: Response body
        - headers: Response headers
        - redirect_chain: List of redirect URLs
        - redirect_details: List of {url, status_code} per redirect hop
        - error: Error message if failed
    """
    result = {
        "url": url,
        "status_code": None,
        "content": None,
        "headers": {},
        "redirect_chain": [],
        "redirect_details": [],
        "error": None,
    }

    # Validate URL
    parsed = urlparse(url)
    if not parsed.scheme:
        url = f"https://{url}"
        parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        result["error"] = f"Invalid URL scheme: {parsed.scheme}"
        return result

    if not parsed.hostname:
        # socket.gethostbyname(None) raises TypeError, which would escape
        # the except clause below -- reject hostname-less URLs up front.
        result["error"] = "Invalid URL: missing hostname"
        return result

    # SSRF prevention: block private/internal IPs
    try:
        resolved_ip = socket.gethostbyname(parsed.hostname)
        ip = ipaddress.ip_address(resolved_ip)
        if ip.is_private or ip.is_loopback or ip.is_reserved:
            result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved_ip})"
            return result
    except (socket.gaierror, ValueError):
        pass  # DNS resolution failure handled by requests below

    try:
        # Session as a context manager so the connection pool is released
        # even when the request raises (the original never closed it).
        with requests.Session() as session:
            session.max_redirects = max_redirects

            headers = dict(DEFAULT_HEADERS)
            if user_agent:
                headers["User-Agent"] = user_agent

            response = session.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=follow_redirects,
            )

            result["url"] = response.url
            result["status_code"] = response.status_code
            result["content"] = response.text
            result["headers"] = dict(response.headers)

            # Track redirect chain with status codes
            if response.history:
                result["redirect_chain"] = [r.url for r in response.history]
                result["redirect_details"] = [
                    {"url": r.url, "status_code": r.status_code}
                    for r in response.history
                ]

    # Order matters: SSLError subclasses ConnectionError, which subclasses
    # RequestException -- most specific first.
    except requests.exceptions.Timeout:
        result["error"] = f"Request timed out after {timeout} seconds"
    except requests.exceptions.TooManyRedirects:
        result["error"] = f"Too many redirects (max {max_redirects})"
    except requests.exceptions.SSLError as e:
        result["error"] = f"SSL error: {e}"
    except requests.exceptions.ConnectionError as e:
        result["error"] = f"Connection error: {e}"
    except requests.exceptions.RequestException as e:
        result["error"] = f"Request failed: {e}"

    return result
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def main():
    """Command-line wrapper around fetch_page().

    Writes the body to --output (or stdout) and response metadata to stderr.
    """
    cli = argparse.ArgumentParser(description="Fetch a web page for SEO analysis")
    cli.add_argument("url", help="URL to fetch")
    cli.add_argument("--output", "-o", help="Output file path")
    cli.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
    cli.add_argument("--no-redirects", action="store_true", help="Don't follow redirects")
    cli.add_argument("--user-agent", help="Custom User-Agent string")
    cli.add_argument(
        "--googlebot",
        action="store_true",
        help=(
            "Use Googlebot UA to detect dynamic rendering / prerender services. "
            "Compare response size with default UA to identify SPA prerender configuration."
        ),
    )

    opts = cli.parse_args()

    # --googlebot wins over --user-agent when both are supplied.
    chosen_ua = GOOGLEBOT_USER_AGENT if opts.googlebot else opts.user_agent

    result = fetch_page(
        opts.url,
        timeout=opts.timeout,
        follow_redirects=not opts.no_redirects,
        user_agent=chosen_ua,
    )

    if result["error"]:
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    if not opts.output:
        print(result["content"])
    else:
        with open(opts.output, "w", encoding="utf-8") as fh:
            fh.write(result["content"])
        print(f"Saved to {opts.output}")

    # Print metadata to stderr so it never mixes with the page body on stdout.
    print(f"\nURL: {result['url']}", file=sys.stderr)
    print(f"Status: {result['status_code']}", file=sys.stderr)
    if result["redirect_details"]:
        for hop in result["redirect_details"]:
            print(f" {hop['status_code']} -> {hop['url']}", file=sys.stderr)
        print(f" {result['status_code']} -> {result['url']} (final)", file=sys.stderr)
    elif result["redirect_chain"]:
        print(f"Redirects: {' -> '.join(result['redirect_chain'])}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Parse HTML and extract SEO-relevant elements.

Usage:
    python parse_html.py page.html
    python parse_html.py --url https://example.com
"""

import argparse
import json
import os
import re
import sys
from typing import Optional
from urllib.parse import urljoin, urlparse

# BeautifulSoup is a third-party dependency; fail fast with install
# instructions instead of a bare ImportError traceback.
try:
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: beautifulsoup4 required. Install with: pip install beautifulsoup4")
    sys.exit(1)

# Prefer the faster C-based lxml parser when installed; fall back to the
# stdlib parser so the script still works without the optional dependency.
try:
    import lxml  # noqa: F401
    _HTML_PARSER = "lxml"
except ImportError:
    _HTML_PARSER = "html.parser"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_html(html: str, base_url: Optional[str] = None) -> dict:
    """
    Parse HTML and extract SEO-relevant elements.

    Args:
        html: HTML content to parse
        base_url: Base URL for resolving relative links. When omitted,
            image src values are left unresolved and link classification
            (internal vs external) is skipped entirely.

    Returns:
        Dictionary with extracted SEO data: title, meta description/robots,
        canonical, h1/h2/h3 lists, images, internal/external links, JSON-LD
        schema blocks, Open Graph / Twitter Card tags, hreflang entries,
        and a visible-text word count.
    """
    soup = BeautifulSoup(html, _HTML_PARSER)

    result = {
        "title": None,
        "meta_description": None,
        "meta_robots": None,
        "canonical": None,
        "h1": [],
        "h2": [],
        "h3": [],
        "images": [],
        "links": {
            "internal": [],
            "external": [],
        },
        "schema": [],
        "open_graph": {},
        "twitter_card": {},
        "word_count": 0,
        "hreflang": [],
    }

    # Title
    title_tag = soup.find("title")
    if title_tag:
        result["title"] = title_tag.get_text(strip=True)

    # Meta tags
    for meta in soup.find_all("meta"):
        name = meta.get("name", "").lower()
        property_attr = meta.get("property", "").lower()
        content = meta.get("content", "")

        if name == "description":
            result["meta_description"] = content
        elif name == "robots":
            result["meta_robots"] = content

        # Open Graph
        if property_attr.startswith("og:"):
            result["open_graph"][property_attr] = content

        # Twitter Card
        if name.startswith("twitter:"):
            result["twitter_card"][name] = content

    # Canonical
    canonical = soup.find("link", rel="canonical")
    if canonical:
        result["canonical"] = canonical.get("href")

    # Hreflang
    for link in soup.find_all("link", rel="alternate"):
        hreflang = link.get("hreflang")
        if hreflang:
            result["hreflang"].append({
                "lang": hreflang,
                "href": link.get("href"),
            })

    # Headings
    for tag in ["h1", "h2", "h3"]:
        for heading in soup.find_all(tag):
            text = heading.get_text(strip=True)
            if text:
                result[tag].append(text)

    # Images
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if base_url and src:
            src = urljoin(base_url, src)

        result["images"].append({
            "src": src,
            "alt": img.get("alt"),
            "width": img.get("width"),
            "height": img.get("height"),
            "loading": img.get("loading"),
        })

    # Links -- only when a base URL is available: without one, relative
    # hrefs cannot be resolved (urljoin(None, href) raises) and there is
    # no base_domain to classify internal vs external against.
    if base_url:
        base_domain = urlparse(base_url).netloc

        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            # Skip fragments and javascript: pseudo-links.
            if not href or href.startswith("#") or href.startswith("javascript:"):
                continue

            full_url = urljoin(base_url, href)
            parsed = urlparse(full_url)

            link_data = {
                "href": full_url,
                "text": a.get_text(strip=True)[:100],
                "rel": a.get("rel", []),
            }

            if parsed.netloc == base_domain:
                result["links"]["internal"].append(link_data)
            else:
                result["links"]["external"].append(link_data)

    # Schema (JSON-LD)
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            schema_data = json.loads(script.string)
            result["schema"].append(schema_data)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers empty <script> tags where .string is None.
            pass

    # Word count (visible text only). decompose() mutates `soup`, so this
    # must remain the last extraction step.
    for element in soup(["script", "style", "nav", "footer", "header"]):
        element.decompose()

    text = soup.get_text(separator=" ", strip=True)
    words = re.findall(r"\b\w+\b", text)
    result["word_count"] = len(words)

    return result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def main():
    """CLI entry point: read HTML from a file or stdin and report SEO data."""
    cli = argparse.ArgumentParser(description="Parse HTML for SEO analysis")
    cli.add_argument("file", nargs="?", help="HTML file to parse")
    cli.add_argument("--url", "-u", help="Base URL for resolving links")
    cli.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    opts = cli.parse_args()

    if not opts.file:
        # No file argument: read the document from stdin.
        markup = sys.stdin.read()
    else:
        resolved = os.path.realpath(opts.file)
        if not os.path.isfile(resolved):
            print(f"Error: File not found: {opts.file}", file=sys.stderr)
            sys.exit(1)
        with open(resolved, "r", encoding="utf-8") as fh:
            markup = fh.read()

    data = parse_html(markup, opts.url)

    if not opts.json:
        print(f"Title: {data['title']}")
        print(f"Meta Description: {data['meta_description']}")
        print(f"Canonical: {data['canonical']}")
        print(f"H1 Tags: {len(data['h1'])}")
        print(f"H2 Tags: {len(data['h2'])}")
        print(f"Images: {len(data['images'])}")
        print(f"Internal Links: {len(data['links']['internal'])}")
        print(f"External Links: {len(data['links']['external'])}")
        print(f"Schema Blocks: {len(data['schema'])}")
        print(f"Word Count: {data['word_count']}")
    else:
        print(json.dumps(data, indent=2))


if __name__ == "__main__":
    main()
|