bmad-plus 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ {
2
+ "meta": {
3
+ "author": "Laurent Rochetta",
4
+ "engine": "BMAD+ SEO Engine v2.0",
5
+ "schema_org_version": "29.4",
6
+ "last_updated": "2026-03-19"
7
+ },
8
+ "templates": [
9
+ {
10
+ "type": "Organization",
11
+ "use_case": "Company/brand homepage",
12
+ "template": {
13
+ "@context": "https://schema.org",
14
+ "@type": "Organization",
15
+ "name": "[Company Name]",
16
+ "url": "[Website URL]",
17
+ "logo": "[Logo URL]",
18
+ "description": "[Company description]",
19
+ "foundingDate": "[YYYY]",
20
+ "contactPoint": {
21
+ "@type": "ContactPoint",
22
+ "telephone": "[Phone]",
23
+ "contactType": "customer service",
24
+ "availableLanguage": ["English", "French"]
25
+ },
26
+ "sameAs": [
27
+ "[Facebook URL]",
28
+ "[LinkedIn URL]",
29
+ "[Twitter URL]",
30
+ "[YouTube URL]"
31
+ ]
32
+ }
33
+ },
34
+ {
35
+ "type": "LocalBusiness",
36
+ "use_case": "Physical business location",
37
+ "template": {
38
+ "@context": "https://schema.org",
39
+ "@type": "LocalBusiness",
40
+ "name": "[Business Name]",
41
+ "url": "[Website URL]",
42
+ "image": "[Photo URL]",
43
+ "telephone": "[Phone]",
44
+ "priceRange": "[$$]",
45
+ "address": {
46
+ "@type": "PostalAddress",
47
+ "streetAddress": "[Street]",
48
+ "addressLocality": "[City]",
49
+ "addressRegion": "[State/Region]",
50
+ "postalCode": "[ZIP]",
51
+ "addressCountry": "[Country Code]"
52
+ },
53
+ "geo": {
54
+ "@type": "GeoCoordinates",
55
+ "latitude": "[Lat]",
56
+ "longitude": "[Long]"
57
+ },
58
+ "openingHoursSpecification": [
59
+ {
60
+ "@type": "OpeningHoursSpecification",
61
+ "dayOfWeek": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"],
62
+ "opens": "09:00",
63
+ "closes": "18:00"
64
+ }
65
+ ]
66
+ }
67
+ },
68
+ {
69
+ "type": "Article",
70
+ "use_case": "Blog posts, articles",
71
+ "template": {
72
+ "@context": "https://schema.org",
73
+ "@type": "Article",
74
+ "headline": "[Title - max 110 chars]",
75
+ "description": "[Summary]",
76
+ "author": {
77
+ "@type": "Person",
78
+ "name": "[Author Name]",
79
+ "url": "[Author Profile URL]"
80
+ },
81
+ "datePublished": "[YYYY-MM-DD]",
82
+ "dateModified": "[YYYY-MM-DD]",
83
+ "image": "[Featured Image URL]",
84
+ "publisher": {
85
+ "@type": "Organization",
86
+ "name": "[Publisher Name]",
87
+ "logo": {
88
+ "@type": "ImageObject",
89
+ "url": "[Logo URL]"
90
+ }
91
+ },
92
+ "mainEntityOfPage": {
93
+ "@type": "WebPage",
94
+ "@id": "[Page URL]"
95
+ }
96
+ }
97
+ },
98
+ {
99
+ "type": "Product",
100
+ "use_case": "E-commerce product pages",
101
+ "template": {
102
+ "@context": "https://schema.org",
103
+ "@type": "Product",
104
+ "name": "[Product Name]",
105
+ "image": "[Product Image URL]",
106
+ "description": "[Product Description]",
107
+ "sku": "[SKU]",
108
+ "brand": {
109
+ "@type": "Brand",
110
+ "name": "[Brand Name]"
111
+ },
112
+ "offers": {
113
+ "@type": "Offer",
114
+ "url": "[Product URL]",
115
+ "price": "[Price]",
116
+ "priceCurrency": "[Currency Code]",
117
+ "availability": "https://schema.org/InStock",
118
+ "seller": {
119
+ "@type": "Organization",
120
+ "name": "[Seller Name]"
121
+ }
122
+ },
123
+ "aggregateRating": {
124
+ "@type": "AggregateRating",
125
+ "ratingValue": "[4.5]",
126
+ "reviewCount": "[120]"
127
+ }
128
+ }
129
+ },
130
+ {
131
+ "type": "WebSite",
132
+ "use_case": "Site-level with sitelinks search box",
133
+ "template": {
134
+ "@context": "https://schema.org",
135
+ "@type": "WebSite",
136
+ "name": "[Site Name]",
137
+ "url": "[Homepage URL]",
138
+ "potentialAction": {
139
+ "@type": "SearchAction",
140
+ "target": {
141
+ "@type": "EntryPoint",
142
+ "urlTemplate": "[Search URL]?q={search_term_string}"
143
+ },
144
+ "query-input": "required name=search_term_string"
145
+ }
146
+ }
147
+ },
148
+ {
149
+ "type": "BreadcrumbList",
150
+ "use_case": "Navigation breadcrumbs",
151
+ "template": {
152
+ "@context": "https://schema.org",
153
+ "@type": "BreadcrumbList",
154
+ "itemListElement": [
155
+ {
156
+ "@type": "ListItem",
157
+ "position": 1,
158
+ "name": "Home",
159
+ "item": "[Homepage URL]"
160
+ },
161
+ {
162
+ "@type": "ListItem",
163
+ "position": 2,
164
+ "name": "[Category]",
165
+ "item": "[Category URL]"
166
+ },
167
+ {
168
+ "@type": "ListItem",
169
+ "position": 3,
170
+ "name": "[Current Page]"
171
+ }
172
+ ]
173
+ }
174
+ },
175
+ {
176
+ "type": "Person",
177
+ "use_case": "Author/team member profiles (E-E-A-T)",
178
+ "template": {
179
+ "@context": "https://schema.org",
180
+ "@type": "Person",
181
+ "name": "[Full Name]",
182
+ "jobTitle": "[Job Title]",
183
+ "url": "[Profile URL]",
184
+ "image": "[Photo URL]",
185
+ "description": "[Professional bio]",
186
+ "worksFor": {
187
+ "@type": "Organization",
188
+ "name": "[Company Name]"
189
+ },
190
+ "sameAs": [
191
+ "[LinkedIn URL]",
192
+ "[Twitter URL]",
193
+ "[GitHub URL]"
194
+ ]
195
+ }
196
+ },
197
+ {
198
+ "type": "Service",
199
+ "use_case": "Service business pages",
200
+ "template": {
201
+ "@context": "https://schema.org",
202
+ "@type": "Service",
203
+ "name": "[Service Name]",
204
+ "description": "[Service Description]",
205
+ "provider": {
206
+ "@type": "Organization",
207
+ "name": "[Provider Name]",
208
+ "url": "[Provider URL]"
209
+ },
210
+ "areaServed": {
211
+ "@type": "City",
212
+ "name": "[Service Area]"
213
+ },
214
+ "offers": {
215
+ "@type": "Offer",
216
+ "price": "[Starting Price]",
217
+ "priceCurrency": "[Currency]"
218
+ }
219
+ }
220
+ },
221
+ {
222
+ "type": "VideoObject",
223
+ "use_case": "Video content pages",
224
+ "template": {
225
+ "@context": "https://schema.org",
226
+ "@type": "VideoObject",
227
+ "name": "[Video Title]",
228
+ "description": "[Video Description]",
229
+ "thumbnailUrl": "[Thumbnail URL]",
230
+ "uploadDate": "[YYYY-MM-DD]",
231
+ "duration": "[PT1H30M]",
232
+ "contentUrl": "[Video File URL]",
233
+ "embedUrl": "[Embed URL]",
234
+ "publisher": {
235
+ "@type": "Organization",
236
+ "name": "[Publisher]"
237
+ }
238
+ }
239
+ },
240
+ {
241
+ "type": "Event",
242
+ "use_case": "Events and conferences",
243
+ "template": {
244
+ "@context": "https://schema.org",
245
+ "@type": "Event",
246
+ "name": "[Event Name]",
247
+ "description": "[Event Description]",
248
+ "startDate": "[YYYY-MM-DDTHH:MM]",
249
+ "endDate": "[YYYY-MM-DDTHH:MM]",
250
+ "location": {
251
+ "@type": "Place",
252
+ "name": "[Venue Name]",
253
+ "address": {
254
+ "@type": "PostalAddress",
255
+ "streetAddress": "[Street]",
256
+ "addressLocality": "[City]",
257
+ "addressCountry": "[Country]"
258
+ }
259
+ },
260
+ "organizer": {
261
+ "@type": "Organization",
262
+ "name": "[Organizer Name]",
263
+ "url": "[Organizer URL]"
264
+ },
265
+ "offers": {
266
+ "@type": "Offer",
267
+ "price": "[Price]",
268
+ "priceCurrency": "[Currency]",
269
+ "url": "[Ticket URL]",
270
+ "availability": "https://schema.org/InStock"
271
+ }
272
+ }
273
+ },
274
+ {
275
+ "type": "SoftwareApplication",
276
+ "use_case": "SaaS and app pages",
277
+ "template": {
278
+ "@context": "https://schema.org",
279
+ "@type": "SoftwareApplication",
280
+ "name": "[App Name]",
281
+ "description": "[App Description]",
282
+ "applicationCategory": "[Category]",
283
+ "operatingSystem": "[OS]",
284
+ "offers": {
285
+ "@type": "Offer",
286
+ "price": "[Price or 0]",
287
+ "priceCurrency": "[Currency]"
288
+ },
289
+ "aggregateRating": {
290
+ "@type": "AggregateRating",
291
+ "ratingValue": "[Rating]",
292
+ "ratingCount": "[Count]"
293
+ }
294
+ }
295
+ },
296
+ {
297
+ "type": "ProfilePage",
298
+ "use_case": "Author/creator profile pages (E-E-A-T)",
299
+ "template": {
300
+ "@context": "https://schema.org",
301
+ "@type": "ProfilePage",
302
+ "mainEntity": {
303
+ "@type": "Person",
304
+ "name": "[Author Name]",
305
+ "url": "[Profile URL]",
306
+ "description": "[Bio and expertise]",
307
+ "sameAs": ["[LinkedIn]", "[Twitter]"]
308
+ }
309
+ }
310
+ },
311
+ {
312
+ "type": "ProductGroup",
313
+ "use_case": "E-commerce product variants",
314
+ "template": {
315
+ "@context": "https://schema.org",
316
+ "@type": "ProductGroup",
317
+ "name": "[Product Name]",
318
+ "description": "[Group description]",
319
+ "productGroupID": "[Group ID]",
320
+ "variesBy": ["https://schema.org/size", "https://schema.org/color"],
321
+ "hasVariant": [
322
+ {
323
+ "@type": "Product",
324
+ "name": "[Variant Name]",
325
+ "sku": "[SKU]",
326
+ "color": "[Color]",
327
+ "size": "[Size]",
328
+ "offers": {
329
+ "@type": "Offer",
330
+ "price": "[Price]",
331
+ "priceCurrency": "[Currency]",
332
+ "availability": "https://schema.org/InStock"
333
+ }
334
+ }
335
+ ]
336
+ }
337
+ },
338
+ {
339
+ "type": "Certification",
340
+ "use_case": "Product certifications (April 2025)",
341
+ "template": {
342
+ "@context": "https://schema.org",
343
+ "@type": "Product",
344
+ "name": "[Product Name]",
345
+ "hasCertification": {
346
+ "@type": "Certification",
347
+ "certificationIdentification": "[Certification Name]",
348
+ "issuedBy": {
349
+ "@type": "Organization",
350
+ "name": "[Issuing Organization]"
351
+ }
352
+ }
353
+ }
354
+ }
355
+ ]
356
+ }
@@ -0,0 +1,282 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SEO Crawl — Recursive mini-crawler for site structure discovery.
4
+
5
+ Features:
6
+ - Sitemap.xml parsing for initial page list
7
+ - Recursive link-following with configurable depth
8
+ - Internal link graph construction
9
+ - Orphan page detection
10
+ - robots.txt respect
11
+
12
+ Author: Laurent Rochetta
13
+ License: MIT
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import re
19
+ import sys
20
+ import xml.etree.ElementTree as ET
21
+ from collections import defaultdict
22
+ from typing import Optional, Set
23
+ from urllib.parse import urljoin, urlparse
24
+
25
+ try:
26
+ import requests
27
+ except ImportError:
28
+ print("Error: requests library required. Install: pip install requests", file=sys.stderr)
29
+ sys.exit(1)
30
+
31
# User-Agent header value sent with every HTTP request made by the crawler.
# It is a browser-style string whose final token names this tool.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0",
        "Safari/537.36",
        "BMADSEOEngine/2.0",
    )
)
35
+
36
+
37
class SEOCrawler:
    """Breadth-first mini-crawler for SEO site structure analysis.

    Starting from ``base_url`` the crawler follows same-host anchor links up to
    ``max_depth`` levels, visiting at most ``max_pages`` URLs. While crawling it
    records per-page metadata (status, title, word count, depth), builds an
    internal link graph, and collects URLs listed in the site's sitemap so that
    sitemap entries nobody links to can be reported as orphans.

    Note: robots.txt is fetched and stored (and scanned for a ``Sitemap:``
    line), but its allow/disallow rules are not evaluated by this class.

    Attributes:
        base_url: Start URL without a trailing slash.
        base_domain: Lower-cased network location of ``base_url``.
        visited: Normalized URLs that have already been dequeued for fetching.
        pages: One dict per fetched URL (successful or not).
        link_graph: Maps a page URL to the set of internal URLs it links to.
        sitemap_urls: Internal page URLs from the sitemap, plus
            ``"[sitemap-index]: <url>"`` marker strings for nested sitemaps.
        robots_txt: Raw robots.txt body, or ``None`` if it was not retrieved.
        errors: One ``{"url", "error"}`` dict per failed request.
    """

    # Matches the href of <a> and <area> elements only. Other href-bearing tags
    # (<link rel="stylesheet">, icons, canonical, ...) are not navigational
    # links and must not enter the link graph or consume the page budget.
    _HREF_RE = re.compile(
        r'<a(?:rea)?\b[^>]*?\shref\s*=\s*["\']([^"\']+)["\']',
        re.IGNORECASE,
    )

    # href prefixes that never lead to a crawlable page.
    _SKIPPED_PREFIXES = ("#", "javascript:", "mailto:", "tel:", "data:")

    def __init__(self, base_url: str, max_depth: int = 2, max_pages: int = 25, timeout: int = 15):
        """Configure the crawler.

        Args:
            base_url: Site entry point. A value without a scheme
                (e.g. ``"example.com"``) is treated as ``https://``.
            max_depth: Maximum link distance from ``base_url`` to follow.
            max_pages: Maximum number of URLs to visit.
            timeout: Per-request timeout in seconds.
        """
        base_url = base_url.strip()
        # Without a scheme urlparse() yields an empty netloc, which would break
        # every same-domain comparison and make requests reject the URL.
        if "://" not in base_url:
            base_url = f"https://{base_url.lstrip('/')}"

        self.base_url = base_url.rstrip("/")
        self.base_domain = urlparse(self.base_url).netloc.lower()
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout

        self.visited: Set[str] = set()
        self.pages: list = []
        self.link_graph: dict = defaultdict(set)  # page -> set of linked pages
        self.sitemap_urls: list = []
        self.robots_txt: Optional[str] = None
        self.errors: list = []

    def normalize_url(self, url: str) -> str:
        """Return a canonical form of ``url`` used for deduplication.

        The query string and fragment are dropped, a trailing slash is removed
        (the root path stays ``/``) and the host is lower-cased.
        """
        parsed = urlparse(url)
        path = parsed.path.rstrip("/") or "/"
        return f"{parsed.scheme}://{parsed.netloc.lower()}{path}"

    def is_internal(self, url: str) -> bool:
        """Return True when ``url`` has the same host as the base URL (case-insensitive)."""
        return urlparse(url).netloc.lower() == self.base_domain.lower()

    def fetch(self, url: str) -> Optional[str]:
        """Fetch ``url`` and return its HTML, or ``None`` if it is not usable.

        A request failure is appended to ``self.errors``. A response that is not
        a 200 ``text/html`` document is appended to ``self.pages`` with
        ``depth`` set to -1 so it still shows up in the report.
        """
        try:
            response = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
                allow_redirects=True,
            )
        except requests.RequestException as e:
            self.errors.append({"url": url, "error": str(e)})
            return None

        content_type = response.headers.get("content-type", "")
        # Media types are case-insensitive, so compare in lower case.
        if response.status_code == 200 and "text/html" in content_type.lower():
            return response.text

        self.pages.append({
            "url": url,
            "status": response.status_code,
            "content_type": content_type,
            "title": None,
            "word_count": 0,
            "depth": -1,
        })
        return None

    def fetch_robots_txt(self):
        """Fetch ``/robots.txt`` and store its body in ``self.robots_txt``.

        Best effort: a failed request or non-200 status leaves the attribute
        as ``None``.
        """
        try:
            response = requests.get(
                f"{self.base_url}/robots.txt",
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
            )
        except requests.RequestException:
            return  # robots.txt is optional; absence is reported via has_robots_txt
        if response.status_code == 200:
            self.robots_txt = response.text

    def parse_sitemap(self):
        """Discover pages from the site's sitemap.

        Uses the first ``Sitemap:`` line of robots.txt when present, otherwise
        ``<base_url>/sitemap.xml``. Best effort: request and XML parse failures
        are ignored and simply leave ``self.sitemap_urls`` unchanged.
        """
        sitemap_url = f"{self.base_url}/sitemap.xml"

        # Check robots.txt for sitemap reference
        if self.robots_txt:
            for line in self.robots_txt.splitlines():
                if line.strip().lower().startswith("sitemap:"):
                    declared = line.split(":", 1)[1].strip()
                    if declared:
                        sitemap_url = declared
                    break

        try:
            response = requests.get(
                sitemap_url,
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
            )
            content_type = response.headers.get("content-type", "")
            if response.status_code == 200 and "xml" in content_type.lower():
                self._ingest_sitemap_xml(response.content)
        except (requests.RequestException, ET.ParseError):
            pass  # a missing or malformed sitemap is not an error for the crawl

    def _ingest_sitemap_xml(self, content) -> None:
        """Parse sitemap XML and append its entries to ``self.sitemap_urls``.

        Raises:
            xml.etree.ElementTree.ParseError: if ``content`` is not well-formed XML.
        """
        root = ET.fromstring(content)
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

        for url_el in root.findall(".//sm:url/sm:loc", ns):
            # <loc> text is frequently wrapped in newlines/indentation.
            loc = (url_el.text or "").strip()
            if loc and self.is_internal(loc):
                self.sitemap_urls.append(loc)

        # Handle sitemap index: nested sitemaps are recorded as marker strings,
        # not fetched.
        for sitemap_el in root.findall(".//sm:sitemap/sm:loc", ns):
            loc = (sitemap_el.text or "").strip()
            if loc:
                self.sitemap_urls.append(f"[sitemap-index]: {loc}")

    def extract_links(self, html: str, page_url: str) -> list:
        """Return the normalized internal links found in ``html``.

        Only ``<a>``/``<area>`` hrefs are considered. Relative links are
        resolved against ``page_url``. Every returned link is also added to
        ``self.link_graph[page_url]``. The list may contain duplicates.
        """
        links = []
        # Simple regex for links (avoids BS4 dependency for crawler)
        for match in self._HREF_RE.finditer(html):
            href = match.group(1).strip()
            if not href or href.lower().startswith(self._SKIPPED_PREFIXES):
                continue

            full_url = urljoin(page_url, href)
            if self.is_internal(full_url):
                normalized = self.normalize_url(full_url)
                links.append(normalized)
                self.link_graph[page_url].add(normalized)

        return links

    def extract_title(self, html: str) -> Optional[str]:
        """Return the ``<title>`` text with whitespace collapsed, or ``None`` if absent."""
        match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
        if not match:
            return None
        # Titles often span several indented lines; collapse to a single line.
        return " ".join(match.group(1).split())

    def count_words(self, html: str) -> int:
        """Count words in ``html`` after removing script/style blocks and tags."""
        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<[^>]+>", " ", text)
        return len(re.findall(r"\b\w+\b", text))

    def crawl(self):
        """Run the crawl: robots.txt, sitemap, then breadth-first link following."""
        self.fetch_robots_txt()
        self.parse_sitemap()

        # Start with base URL
        queue = [(self.base_url, 0)]  # (url, depth)

        while queue and len(self.visited) < self.max_pages:
            url, depth = queue.pop(0)
            normalized = self.normalize_url(url)

            if normalized in self.visited or depth > self.max_depth:
                continue

            self.visited.add(normalized)
            html = self.fetch(normalized)

            # `is not None` so that an empty 200 text/html body is still recorded
            # as a page instead of silently disappearing from the report.
            if html is None:
                continue

            self.pages.append({
                "url": normalized,
                "status": 200,
                "title": self.extract_title(html),
                "word_count": self.count_words(html),
                "depth": depth,
            })

            # Discover links for next level
            if depth < self.max_depth:
                for link in self.extract_links(html, normalized):
                    if link not in self.visited:
                        queue.append((link, depth + 1))

    def get_results(self) -> dict:
        """Return crawl results as a JSON-serializable dictionary.

        ``orphan_pages`` lists up to 10 sitemap URLs that no crawled page
        links to.
        """
        # Detect orphan pages (in sitemap but not linked from any crawled page)
        all_linked = set()
        for targets in self.link_graph.values():
            all_linked.update(targets)

        # Entries starting with "[" are sitemap-index markers, not page URLs.
        sitemap_pages = [
            url for url in self.sitemap_urls
            if isinstance(url, str) and not url.startswith("[")
        ]
        orphans = [url for url in sitemap_pages if self.normalize_url(url) not in all_linked]

        total_links = sum(len(targets) for targets in self.link_graph.values())

        return {
            "base_url": self.base_url,
            "pages_crawled": len(self.pages),
            "max_depth": self.max_depth,
            "sitemap_urls_found": len([u for u in self.sitemap_urls if not str(u).startswith("[")]),
            "has_robots_txt": self.robots_txt is not None,
            "has_sitemap": len(self.sitemap_urls) > 0,
            "pages": self.pages,
            "orphan_pages": orphans[:10],
            "link_graph_summary": {
                "total_internal_links": total_links,
                # max(..., 1) avoids division by zero when nothing was crawled.
                "avg_links_per_page": round(total_links / max(len(self.link_graph), 1), 1),
            },
            "errors": self.errors,
        }
230
+
231
+
232
+ # ── CLI ────────────────────────────────────────────────────────────
233
+
234
def main():
    """CLI entry point: parse arguments, run the crawl and print the report.

    Progress goes to stderr. The report goes to stdout, either as JSON
    (``--json``) or as a human-readable summary followed by a page list.
    """
    cli = argparse.ArgumentParser(
        description="SEO Crawl — Recursive mini-crawler (BMAD+ SEO Engine)"
    )
    cli.add_argument("url", help="Base URL to crawl")
    cli.add_argument("--depth", "-d", type=int, default=2, help="Max crawl depth (default: 2)")
    cli.add_argument("--max", "-m", type=int, default=25, help="Max pages (default: 25)")
    cli.add_argument("--timeout", "-t", type=int, default=15, help="Per-page timeout (default: 15s)")
    cli.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    opts = cli.parse_args()

    crawler = SEOCrawler(
        base_url=opts.url,
        max_depth=opts.depth,
        max_pages=opts.max,
        timeout=opts.timeout,
    )

    print(f"Crawling {opts.url} (depth={opts.depth}, max={opts.max})...", file=sys.stderr)
    crawler.crawl()
    report = crawler.get_results()

    if args_json := opts.json:
        # default=list converts any set in the payload for JSON serialization
        print(json.dumps(report, indent=2, ensure_ascii=False, default=list))
        return

    graph = report["link_graph_summary"]
    robots_mark = "✅" if report["has_robots_txt"] else "❌"

    summary_lines = [
        f"\n{'='*60}",
        f"Crawl Summary: {report['base_url']}",
        f"{'='*60}",
        f"Pages crawled: {report['pages_crawled']}",
        f"Sitemap URLs: {report['sitemap_urls_found']}",
        f"robots.txt: {robots_mark}",
        f"Internal links: {graph['total_internal_links']}",
        f"Avg links/page: {graph['avg_links_per_page']}",
        f"Orphan pages: {len(report['orphan_pages'])}",
        f"Errors: {len(report['errors'])}",
        f"\n{'─'*60}",
        "Pages:",
    ]
    for line in summary_lines:
        print(line)

    for entry in report["pages"]:
        if entry["status"] == 200:
            status = "✅"
        else:
            status = f"⚠️ {entry['status']}"
        # Titles are truncated to 50 characters to keep each row on one line.
        title = (entry["title"] or "No title")[:50]
        print(f" {status} [{entry['depth']}] {title} ({entry['word_count']} words)")
        print(f" {entry['url']}")


if __name__ == "__main__":
    main()