bmad-plus 0.3.3 → 0.4.1
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +12 -56
- package/osint-agent-package/skills/bmad-osint-investigate/osint/SKILL.md +452 -452
- package/osint-agent-package/skills/bmad-osint-investigate/osint/assets/dossier-template.md +116 -116
- package/osint-agent-package/skills/bmad-osint-investigate/osint/references/content-extraction.md +100 -100
- package/osint-agent-package/skills/bmad-osint-investigate/osint/references/platforms.md +130 -130
- package/osint-agent-package/skills/bmad-osint-investigate/osint/references/psychoprofile.md +69 -69
- package/osint-agent-package/skills/bmad-osint-investigate/osint/references/tools.md +281 -281
- package/osint-agent-package/skills/bmad-osint-investigate/osint/scripts/mcp-client.py +136 -136
- package/package.json +1 -1
- package/readme-international/README.de.md +1 -1
- package/readme-international/README.es.md +1 -1
- package/readme-international/README.fr.md +1 -1
- package/tools/cli/commands/install.js +74 -46
- package/tools/cli/i18n.js +501 -0
- package/oveanet-pack/animated-website/DEPLOYMENT.md +0 -104
- package/oveanet-pack/animated-website/README.md +0 -63
- package/oveanet-pack/animated-website/agent/animated-website-agent.md +0 -325
- package/oveanet-pack/animated-website/agent.yaml +0 -63
- package/oveanet-pack/animated-website/templates/animated-website-workflow.md +0 -55
- package/oveanet-pack/seo-audit-360/DEPLOYMENT.md +0 -115
- package/oveanet-pack/seo-audit-360/README.md +0 -66
- package/oveanet-pack/seo-audit-360/SKILL.md +0 -171
- package/oveanet-pack/seo-audit-360/agent/seo-chief.md +0 -294
- package/oveanet-pack/seo-audit-360/agent/seo-judge.md +0 -241
- package/oveanet-pack/seo-audit-360/agent/seo-scout.md +0 -171
- package/oveanet-pack/seo-audit-360/agent.yaml +0 -70
- package/oveanet-pack/seo-audit-360/checklist.md +0 -140
- package/oveanet-pack/seo-audit-360/hooks/seo-check.sh +0 -95
- package/oveanet-pack/seo-audit-360/pagespeed-playbook.md +0 -320
- package/oveanet-pack/seo-audit-360/ref/audit-schema.json +0 -187
- package/oveanet-pack/seo-audit-360/ref/cwv-thresholds.md +0 -87
- package/oveanet-pack/seo-audit-360/ref/eeat-criteria.md +0 -123
- package/oveanet-pack/seo-audit-360/ref/geo-signals.md +0 -167
- package/oveanet-pack/seo-audit-360/ref/hreflang-rules.md +0 -153
- package/oveanet-pack/seo-audit-360/ref/quality-gates.md +0 -133
- package/oveanet-pack/seo-audit-360/ref/schema-catalog.md +0 -91
- package/oveanet-pack/seo-audit-360/ref/schema-templates.json +0 -356
- package/oveanet-pack/seo-audit-360/requirements.txt +0 -14
- package/oveanet-pack/seo-audit-360/scripts/__pycache__/seo_crawl.cpython-314.pyc +0 -0
- package/oveanet-pack/seo-audit-360/scripts/__pycache__/seo_parse.cpython-314.pyc +0 -0
- package/oveanet-pack/seo-audit-360/scripts/install.ps1 +0 -53
- package/oveanet-pack/seo-audit-360/scripts/install.sh +0 -48
- package/oveanet-pack/seo-audit-360/scripts/seo_apis.py +0 -464
- package/oveanet-pack/seo-audit-360/scripts/seo_crawl.py +0 -282
- package/oveanet-pack/seo-audit-360/scripts/seo_fetch.py +0 -231
- package/oveanet-pack/seo-audit-360/scripts/seo_parse.py +0 -255
- package/oveanet-pack/seo-audit-360/scripts/seo_report.py +0 -403
- package/oveanet-pack/seo-audit-360/scripts/seo_screenshot.py +0 -202
- package/oveanet-pack/seo-audit-360/templates/seo-audit-workflow.md +0 -241
- package/oveanet-pack/seo-audit-360/tests/__pycache__/test_crawl.cpython-314-pytest-9.0.2.pyc +0 -0
- package/oveanet-pack/seo-audit-360/tests/__pycache__/test_parse.cpython-314-pytest-9.0.2.pyc +0 -0
- package/oveanet-pack/seo-audit-360/tests/fixtures/sample_page.html +0 -62
- package/oveanet-pack/seo-audit-360/tests/test_apis.py +0 -75
- package/oveanet-pack/seo-audit-360/tests/test_crawl.py +0 -121
- package/oveanet-pack/seo-audit-360/tests/test_fetch.py +0 -70
- package/oveanet-pack/seo-audit-360/tests/test_parse.py +0 -184
- package/oveanet-pack/universal-backup/DEPLOYMENT.md +0 -80
- package/oveanet-pack/universal-backup/README.md +0 -58
- package/oveanet-pack/universal-backup/agent/backup-agent.md +0 -71
- package/oveanet-pack/universal-backup/agent.yaml +0 -45
- package/oveanet-pack/universal-backup/templates/backup-workflow.md +0 -51
--- a/package/oveanet-pack/seo-audit-360/scripts/seo_crawl.py
+++ /dev/null
@@ -1,282 +0,0 @@
-#!/usr/bin/env python3
-"""
-SEO Crawl — Recursive mini-crawler for site structure discovery.
-
-Features:
-- Sitemap.xml parsing for initial page list
-- Recursive link-following with configurable depth
-- Internal link graph construction
-- Orphan page detection
-- robots.txt respect
-
-Author: Laurent Rochetta
-License: MIT
-"""
-
-import argparse
-import json
-import re
-import sys
-import xml.etree.ElementTree as ET
-from collections import defaultdict
-from typing import Optional, Set
-from urllib.parse import urljoin, urlparse
-
-try:
-    import requests
-except ImportError:
-    print("Error: requests library required. Install: pip install requests", file=sys.stderr)
-    sys.exit(1)
-
-USER_AGENT = (
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 BMADSEOEngine/2.0"
-)
-
-
-class SEOCrawler:
-    """Recursive mini-crawler for SEO site structure analysis."""
-
-    def __init__(self, base_url: str, max_depth: int = 2, max_pages: int = 25, timeout: int = 15):
-        self.base_url = base_url.rstrip("/")
-        self.base_domain = urlparse(self.base_url).netloc
-        self.max_depth = max_depth
-        self.max_pages = max_pages
-        self.timeout = timeout
-
-        self.visited: Set[str] = set()
-        self.pages: list = []
-        self.link_graph: dict = defaultdict(set)  # page -> set of linked pages
-        self.sitemap_urls: list = []
-        self.robots_txt: Optional[str] = None
-        self.errors: list = []
-
-    def normalize_url(self, url: str) -> str:
-        """Normalize URL for deduplication."""
-        parsed = urlparse(url)
-        path = parsed.path.rstrip("/") or "/"
-        return f"{parsed.scheme}://{parsed.netloc}{path}"
-
-    def is_internal(self, url: str) -> bool:
-        """Check if URL belongs to the same domain."""
-        return urlparse(url).netloc == self.base_domain
-
-    def fetch(self, url: str) -> Optional[str]:
-        """Fetch a page with error handling."""
-        try:
-            response = requests.get(
-                url,
-                headers={"User-Agent": USER_AGENT},
-                timeout=self.timeout,
-                allow_redirects=True,
-            )
-            if response.status_code == 200 and "text/html" in response.headers.get("content-type", ""):
-                return response.text
-            else:
-                self.pages.append({
-                    "url": url,
-                    "status": response.status_code,
-                    "content_type": response.headers.get("content-type", ""),
-                    "title": None,
-                    "word_count": 0,
-                    "depth": -1,
-                })
-        except requests.RequestException as e:
-            self.errors.append({"url": url, "error": str(e)})
-        return None
-
-    def fetch_robots_txt(self):
-        """Fetch and store robots.txt."""
-        try:
-            response = requests.get(
-                f"{self.base_url}/robots.txt",
-                headers={"User-Agent": USER_AGENT},
-                timeout=self.timeout,
-            )
-            if response.status_code == 200:
-                self.robots_txt = response.text
-        except requests.RequestException:
-            pass
-
-    def parse_sitemap(self):
-        """Discover pages from sitemap.xml."""
-        sitemap_url = f"{self.base_url}/sitemap.xml"
-
-        # Check robots.txt for sitemap reference
-        if self.robots_txt:
-            for line in self.robots_txt.splitlines():
-                if line.strip().lower().startswith("sitemap:"):
-                    sitemap_url = line.split(":", 1)[1].strip()
-                    break
-
-        try:
-            response = requests.get(
-                sitemap_url,
-                headers={"User-Agent": USER_AGENT},
-                timeout=self.timeout,
-            )
-            if response.status_code == 200 and "xml" in response.headers.get("content-type", ""):
-                root = ET.fromstring(response.content)
-                ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
-
-                for url_el in root.findall(".//sm:url/sm:loc", ns):
-                    if url_el.text and self.is_internal(url_el.text):
-                        self.sitemap_urls.append(url_el.text)
-
-                # Handle sitemap index
-                for sitemap_el in root.findall(".//sm:sitemap/sm:loc", ns):
-                    self.sitemap_urls.append(f"[sitemap-index]: {sitemap_el.text}")
-
-        except (requests.RequestException, ET.ParseError):
-            pass
-
-    def extract_links(self, html: str, page_url: str) -> list:
-        """Extract internal links from HTML."""
-        links = []
-        # Simple regex for links (avoids BS4 dependency for crawler)
-        for match in re.finditer(r'href=["\']([^"\']+)["\']', html):
-            href = match.group(1)
-            if href.startswith("#") or href.startswith("javascript:") or href.startswith("mailto:"):
-                continue
-
-            full_url = urljoin(page_url, href)
-            if self.is_internal(full_url):
-                normalized = self.normalize_url(full_url)
-                links.append(normalized)
-                self.link_graph[page_url].add(normalized)
-
-        return links
-
-    def extract_title(self, html: str) -> Optional[str]:
-        """Extract title from HTML."""
-        match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
-        return match.group(1).strip() if match else None
-
-    def count_words(self, html: str) -> int:
-        """Count visible words in HTML."""
-        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r"<[^>]+>", " ", text)
-        words = re.findall(r"\b\w+\b", text)
-        return len(words)
-
-    def crawl(self):
-        """Execute the recursive crawl."""
-        self.fetch_robots_txt()
-        self.parse_sitemap()
-
-        # Start with base URL
-        queue = [(self.base_url, 0)]  # (url, depth)
-
-        while queue and len(self.visited) < self.max_pages:
-            url, depth = queue.pop(0)
-            normalized = self.normalize_url(url)
-
-            if normalized in self.visited:
-                continue
-            if depth > self.max_depth:
-                continue
-
-            self.visited.add(normalized)
-            html = self.fetch(normalized)
-
-            if html:
-                title = self.extract_title(html)
-                word_count = self.count_words(html)
-
-                self.pages.append({
-                    "url": normalized,
-                    "status": 200,
-                    "title": title,
-                    "word_count": word_count,
-                    "depth": depth,
-                })
-
-                # Discover links for next level
-                if depth < self.max_depth:
-                    links = self.extract_links(html, normalized)
-                    for link in links:
-                        if link not in self.visited:
-                            queue.append((link, depth + 1))
-
-    def get_results(self) -> dict:
-        """Return crawl results as dictionary."""
-        # Detect orphan pages (in sitemap but not linked from any crawled page)
-        all_linked = set()
-        for targets in self.link_graph.values():
-            all_linked.update(targets)
-
-        orphans = [url for url in self.sitemap_urls
-                   if isinstance(url, str) and not url.startswith("[") and
-                   self.normalize_url(url) not in all_linked]
-
-        return {
-            "base_url": self.base_url,
-            "pages_crawled": len(self.pages),
-            "max_depth": self.max_depth,
-            "sitemap_urls_found": len([u for u in self.sitemap_urls if not str(u).startswith("[")]),
-            "has_robots_txt": self.robots_txt is not None,
-            "has_sitemap": len(self.sitemap_urls) > 0,
-            "pages": self.pages,
-            "orphan_pages": orphans[:10],
-            "link_graph_summary": {
-                "total_internal_links": sum(len(v) for v in self.link_graph.values()),
-                "avg_links_per_page": round(
-                    sum(len(v) for v in self.link_graph.values()) / max(len(self.link_graph), 1), 1
-                ),
-            },
-            "errors": self.errors,
-        }
-
-
-# ── CLI ────────────────────────────────────────────────────────────
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="SEO Crawl — Recursive mini-crawler (BMAD+ SEO Engine)"
-    )
-    parser.add_argument("url", help="Base URL to crawl")
-    parser.add_argument("--depth", "-d", type=int, default=2, help="Max crawl depth (default: 2)")
-    parser.add_argument("--max", "-m", type=int, default=25, help="Max pages (default: 25)")
-    parser.add_argument("--timeout", "-t", type=int, default=15, help="Per-page timeout (default: 15s)")
-    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
-
-    args = parser.parse_args()
-
-    crawler = SEOCrawler(
-        base_url=args.url,
-        max_depth=args.depth,
-        max_pages=args.max,
-        timeout=args.timeout,
-    )
-
-    print(f"Crawling {args.url} (depth={args.depth}, max={args.max})...", file=sys.stderr)
-    crawler.crawl()
-    results = crawler.get_results()
-
-    if args.json:
-        # Convert sets to lists for JSON serialization
-        print(json.dumps(results, indent=2, ensure_ascii=False, default=list))
-    else:
-        print(f"\n{'='*60}")
-        print(f"Crawl Summary: {results['base_url']}")
-        print(f"{'='*60}")
-        print(f"Pages crawled: {results['pages_crawled']}")
-        print(f"Sitemap URLs: {results['sitemap_urls_found']}")
-        print(f"robots.txt: {'✅' if results['has_robots_txt'] else '❌'}")
-        print(f"Internal links: {results['link_graph_summary']['total_internal_links']}")
-        print(f"Avg links/page: {results['link_graph_summary']['avg_links_per_page']}")
-        print(f"Orphan pages: {len(results['orphan_pages'])}")
-        print(f"Errors: {len(results['errors'])}")
-
-        print(f"\n{'─'*60}")
-        print("Pages:")
-        for page in results["pages"]:
-            status = "✅" if page["status"] == 200 else f"⚠️ {page['status']}"
-            title = (page["title"] or "No title")[:50]
-            print(f"  {status} [{page['depth']}] {title} ({page['word_count']} words)")
-            print(f"      {page['url']}")
-
-
-if __name__ == "__main__":
-    main()
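For reference, the deleted crawler was a standalone script, but its `SEOCrawler` class could also be driven programmatically. A minimal sketch, assuming the file above is saved locally as `seo_crawl.py` (the `seo_crawl` import path is illustrative; the script was shipped as a loose file, not an installable module):

```python
# Illustrative only: assumes seo_crawl.py (removed in 0.4.1) is saved next
# to this snippet, so `seo_crawl` is a hypothetical local module name.
import json

from seo_crawl import SEOCrawler

crawler = SEOCrawler("https://example.com", max_depth=1, max_pages=10)
crawler.crawl()

# default=list mirrors the script's own JSON serialization strategy for
# any set-valued fields.
print(json.dumps(crawler.get_results(), indent=2, default=list))
```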
--- a/package/oveanet-pack/seo-audit-360/scripts/seo_fetch.py
+++ /dev/null
@@ -1,231 +0,0 @@
-#!/usr/bin/env python3
-"""
-SEO Fetch — Secure HTTP page fetcher for SEO analysis.
-
-Features:
-- SSRF protection (blocks private/loopback/reserved IPs)
-- Multi-UA support (standard, Googlebot, GPTBot, ClaudeBot)
-- Redirect chain tracking
-- Cookie handling
-- Configurable timeout
-
-Author: Laurent Rochetta
-License: MIT
-"""
-
-import argparse
-import ipaddress
-import json
-import socket
-import sys
-from typing import Optional
-from urllib.parse import urlparse
-
-try:
-    import requests
-except ImportError:
-    print("Error: requests library required. Install: pip install requests", file=sys.stderr)
-    sys.exit(1)
-
-
-# ── User-Agent Presets ──────────────────────────────────────────────
-
-USER_AGENTS = {
-    "default": (
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 BMADSEOEngine/2.0"
-    ),
-    "googlebot": (
-        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-    ),
-    "gptbot": (
-        "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; "
-        "+https://openai.com/gptbot)"
-    ),
-    "claudebot": (
-        "Mozilla/5.0 (compatible; ClaudeBot/1.0; +https://www.anthropic.com/claudebot)"
-    ),
-    "mobile": (
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
-        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
-    ),
-}
-
-DEFAULT_HEADERS = {
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-    "Accept-Language": "en-US,en;q=0.9,fr;q=0.8",
-    "Accept-Encoding": "gzip, deflate, br",
-    "Connection": "keep-alive",
-    "Cache-Control": "no-cache",
-}
-
-
-# ── Security: SSRF Prevention ──────────────────────────────────────
-
-def is_safe_url(url: str) -> bool:
-    """Block requests to private, loopback, and reserved IP addresses."""
-    parsed = urlparse(url)
-    hostname = parsed.hostname
-
-    if not hostname:
-        return False
-
-    try:
-        resolved_ip = socket.gethostbyname(hostname)
-        ip = ipaddress.ip_address(resolved_ip)
-        if ip.is_private or ip.is_loopback or ip.is_reserved or ip.is_link_local:
-            return False
-    except (socket.gaierror, ValueError):
-        pass  # DNS failure handled by requests
-
-    return True
-
-
-# ── Core Fetcher ───────────────────────────────────────────────────
-
-def fetch_page(
-    url: str,
-    timeout: int = 30,
-    follow_redirects: bool = True,
-    max_redirects: int = 5,
-    user_agent: str = "default",
-) -> dict:
-    """
-    Fetch a web page with security checks and detailed response tracking.
-
-    Returns dict with: url, status_code, content, headers, redirect_chain,
-    content_length, response_time_ms, error
-    """
-    result = {
-        "url": url,
-        "final_url": None,
-        "status_code": None,
-        "content": None,
-        "headers": {},
-        "redirect_chain": [],
-        "content_length": 0,
-        "response_time_ms": 0,
-        "error": None,
-    }
-
-    # Normalize URL
-    parsed = urlparse(url)
-    if not parsed.scheme:
-        url = f"https://{url}"
-        parsed = urlparse(url)
-
-    if parsed.scheme not in ("http", "https"):
-        result["error"] = f"Invalid URL scheme: {parsed.scheme}"
-        return result
-
-    # SSRF check
-    if not is_safe_url(url):
-        resolved = "unknown"
-        try:
-            resolved = socket.gethostbyname(parsed.hostname)
-        except Exception:
-            pass
-        result["error"] = f"Blocked: URL resolves to private/internal IP ({resolved})"
-        return result
-
-    try:
-        session = requests.Session()
-        session.max_redirects = max_redirects
-
-        headers = dict(DEFAULT_HEADERS)
-        ua_string = USER_AGENTS.get(user_agent, user_agent)
-        headers["User-Agent"] = ua_string
-
-        import time
-        start = time.monotonic()
-
-        response = session.get(
-            url,
-            headers=headers,
-            timeout=timeout,
-            allow_redirects=follow_redirects,
-        )
-
-        elapsed_ms = round((time.monotonic() - start) * 1000)
-
-        result["final_url"] = response.url
-        result["status_code"] = response.status_code
-        result["content"] = response.text
-        result["headers"] = dict(response.headers)
-        result["content_length"] = len(response.content)
-        result["response_time_ms"] = elapsed_ms
-
-        if response.history:
-            result["redirect_chain"] = [
-                {"url": r.url, "status": r.status_code}
-                for r in response.history
-            ]
-
-    except requests.exceptions.Timeout:
-        result["error"] = f"Request timed out after {timeout}s"
-    except requests.exceptions.TooManyRedirects:
-        result["error"] = f"Too many redirects (max {max_redirects})"
-    except requests.exceptions.SSLError as e:
-        result["error"] = f"SSL error: {e}"
-    except requests.exceptions.ConnectionError as e:
-        result["error"] = f"Connection error: {e}"
-    except requests.exceptions.RequestException as e:
-        result["error"] = f"Request failed: {e}"
-
-    return result
-
-
-# ── CLI ────────────────────────────────────────────────────────────
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="SEO Fetch — Secure HTTP fetcher for SEO analysis (BMAD+ SEO Engine)"
-    )
-    parser.add_argument("url", help="URL to fetch")
-    parser.add_argument("--output", "-o", help="Save HTML to file")
-    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
-    parser.add_argument("--no-redirects", action="store_true", help="Don't follow redirects")
-    parser.add_argument(
-        "--ua", choices=list(USER_AGENTS.keys()), default="default",
-        help="User-Agent preset (default, googlebot, gptbot, claudebot, mobile)"
-    )
-    parser.add_argument("--json", "-j", action="store_true", help="Output full result as JSON")
-
-    args = parser.parse_args()
-
-    result = fetch_page(
-        args.url,
-        timeout=args.timeout,
-        follow_redirects=not args.no_redirects,
-        user_agent=args.ua,
-    )
-
-    if result["error"]:
-        print(f"Error: {result['error']}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.json:
-        # Output metadata as JSON (without full HTML content for readability)
-        output = {k: v for k, v in result.items() if k != "content"}
-        output["content_preview"] = result["content"][:500] if result["content"] else None
-        print(json.dumps(output, indent=2))
-    elif args.output:
-        with open(args.output, "w", encoding="utf-8") as f:
-            f.write(result["content"])
-        print(f"Saved to {args.output}")
-    else:
-        print(result["content"])
-
-    # Metadata to stderr
-    print(f"\n--- Fetch Summary ---", file=sys.stderr)
-    print(f"Final URL: {result['final_url']}", file=sys.stderr)
-    print(f"Status: {result['status_code']}", file=sys.stderr)
-    print(f"Size: {result['content_length']:,} bytes", file=sys.stderr)
-    print(f"Time: {result['response_time_ms']}ms", file=sys.stderr)
-    if result["redirect_chain"]:
-        chain = " → ".join(r["url"] for r in result["redirect_chain"])
-        print(f"Redirects: {chain}", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
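Similarly, the removed fetcher exposed `fetch_page()` for programmatic use alongside its CLI. A minimal sketch, again assuming a local copy of the deleted file (`seo_fetch` is a hypothetical module name):

```python
# Illustrative only: `seo_fetch` is a hypothetical local module created by
# saving the removed seo_fetch.py next to this snippet.
from seo_fetch import fetch_page

# "googlebot" selects the Googlebot User-Agent preset defined in the script,
# useful for comparing what crawlers see versus the default browser UA.
result = fetch_page("https://example.com", timeout=10, user_agent="googlebot")

if result["error"]:
    print(f"Fetch failed: {result['error']}")
else:
    print(result["status_code"], result["final_url"], f"{result['response_time_ms']}ms")
    for hop in result["redirect_chain"]:
        print(f"  redirect {hop['status']}: {hop['url']}")
```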