bmad-plus 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -0
- package/README.md +1 -0
- package/oveanet-pack/seo-audit-360/README.md +59 -53
- package/oveanet-pack/seo-audit-360/agent/seo-chief.md +275 -0
- package/oveanet-pack/seo-audit-360/agent/seo-judge.md +241 -0
- package/oveanet-pack/seo-audit-360/agent/seo-scout.md +171 -0
- package/oveanet-pack/seo-audit-360/agent.yaml +69 -70
- package/oveanet-pack/seo-audit-360/ref/cwv-thresholds.md +87 -0
- package/oveanet-pack/seo-audit-360/ref/eeat-criteria.md +123 -0
- package/oveanet-pack/seo-audit-360/ref/geo-signals.md +167 -0
- package/oveanet-pack/seo-audit-360/ref/quality-gates.md +133 -0
- package/oveanet-pack/seo-audit-360/ref/schema-catalog.md +91 -0
- package/oveanet-pack/seo-audit-360/ref/schema-templates.json +356 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_crawl.py +282 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_fetch.py +231 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_parse.py +255 -0
- package/oveanet-pack/seo-audit-360/scripts/seo_screenshot.py +202 -0
- package/oveanet-pack/seo-audit-360/templates/seo-audit-workflow.md +241 -0
- package/package.json +1 -1
- package/oveanet-pack/seo-audit-360/agent/seo-geo-360-auditor.md +0 -441
- package/oveanet-pack/seo-audit-360/templates/llms.txt +0 -73
- package/oveanet-pack/seo-audit-360/templates/robots.txt +0 -38
- package/oveanet-pack/seo-audit-360/templates/schema-templates.json +0 -116
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
{
|
|
2
|
+
"meta": {
|
|
3
|
+
"author": "Laurent Rochetta",
|
|
4
|
+
"engine": "BMAD+ SEO Engine v2.0",
|
|
5
|
+
"schema_org_version": "29.4",
|
|
6
|
+
"last_updated": "2026-03-19"
|
|
7
|
+
},
|
|
8
|
+
"templates": [
|
|
9
|
+
{
|
|
10
|
+
"type": "Organization",
|
|
11
|
+
"use_case": "Company/brand homepage",
|
|
12
|
+
"template": {
|
|
13
|
+
"@context": "https://schema.org",
|
|
14
|
+
"@type": "Organization",
|
|
15
|
+
"name": "[Company Name]",
|
|
16
|
+
"url": "[Website URL]",
|
|
17
|
+
"logo": "[Logo URL]",
|
|
18
|
+
"description": "[Company description]",
|
|
19
|
+
"foundingDate": "[YYYY]",
|
|
20
|
+
"contactPoint": {
|
|
21
|
+
"@type": "ContactPoint",
|
|
22
|
+
"telephone": "[Phone]",
|
|
23
|
+
"contactType": "customer service",
|
|
24
|
+
"availableLanguage": ["English", "French"]
|
|
25
|
+
},
|
|
26
|
+
"sameAs": [
|
|
27
|
+
"[Facebook URL]",
|
|
28
|
+
"[LinkedIn URL]",
|
|
29
|
+
"[Twitter URL]",
|
|
30
|
+
"[YouTube URL]"
|
|
31
|
+
]
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"type": "LocalBusiness",
|
|
36
|
+
"use_case": "Physical business location",
|
|
37
|
+
"template": {
|
|
38
|
+
"@context": "https://schema.org",
|
|
39
|
+
"@type": "LocalBusiness",
|
|
40
|
+
"name": "[Business Name]",
|
|
41
|
+
"url": "[Website URL]",
|
|
42
|
+
"image": "[Photo URL]",
|
|
43
|
+
"telephone": "[Phone]",
|
|
44
|
+
"priceRange": "[$$]",
|
|
45
|
+
"address": {
|
|
46
|
+
"@type": "PostalAddress",
|
|
47
|
+
"streetAddress": "[Street]",
|
|
48
|
+
"addressLocality": "[City]",
|
|
49
|
+
"addressRegion": "[State/Region]",
|
|
50
|
+
"postalCode": "[ZIP]",
|
|
51
|
+
"addressCountry": "[Country Code]"
|
|
52
|
+
},
|
|
53
|
+
"geo": {
|
|
54
|
+
"@type": "GeoCoordinates",
|
|
55
|
+
"latitude": "[Lat]",
|
|
56
|
+
"longitude": "[Long]"
|
|
57
|
+
},
|
|
58
|
+
"openingHoursSpecification": [
|
|
59
|
+
{
|
|
60
|
+
"@type": "OpeningHoursSpecification",
|
|
61
|
+
"dayOfWeek": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"],
|
|
62
|
+
"opens": "09:00",
|
|
63
|
+
"closes": "18:00"
|
|
64
|
+
}
|
|
65
|
+
]
|
|
66
|
+
}
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"type": "Article",
|
|
70
|
+
"use_case": "Blog posts, articles",
|
|
71
|
+
"template": {
|
|
72
|
+
"@context": "https://schema.org",
|
|
73
|
+
"@type": "Article",
|
|
74
|
+
"headline": "[Title - max 110 chars]",
|
|
75
|
+
"description": "[Summary]",
|
|
76
|
+
"author": {
|
|
77
|
+
"@type": "Person",
|
|
78
|
+
"name": "[Author Name]",
|
|
79
|
+
"url": "[Author Profile URL]"
|
|
80
|
+
},
|
|
81
|
+
"datePublished": "[YYYY-MM-DD]",
|
|
82
|
+
"dateModified": "[YYYY-MM-DD]",
|
|
83
|
+
"image": "[Featured Image URL]",
|
|
84
|
+
"publisher": {
|
|
85
|
+
"@type": "Organization",
|
|
86
|
+
"name": "[Publisher Name]",
|
|
87
|
+
"logo": {
|
|
88
|
+
"@type": "ImageObject",
|
|
89
|
+
"url": "[Logo URL]"
|
|
90
|
+
}
|
|
91
|
+
},
|
|
92
|
+
"mainEntityOfPage": {
|
|
93
|
+
"@type": "WebPage",
|
|
94
|
+
"@id": "[Page URL]"
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
"type": "Product",
|
|
100
|
+
"use_case": "E-commerce product pages",
|
|
101
|
+
"template": {
|
|
102
|
+
"@context": "https://schema.org",
|
|
103
|
+
"@type": "Product",
|
|
104
|
+
"name": "[Product Name]",
|
|
105
|
+
"image": "[Product Image URL]",
|
|
106
|
+
"description": "[Product Description]",
|
|
107
|
+
"sku": "[SKU]",
|
|
108
|
+
"brand": {
|
|
109
|
+
"@type": "Brand",
|
|
110
|
+
"name": "[Brand Name]"
|
|
111
|
+
},
|
|
112
|
+
"offers": {
|
|
113
|
+
"@type": "Offer",
|
|
114
|
+
"url": "[Product URL]",
|
|
115
|
+
"price": "[Price]",
|
|
116
|
+
"priceCurrency": "[Currency Code]",
|
|
117
|
+
"availability": "https://schema.org/InStock",
|
|
118
|
+
"seller": {
|
|
119
|
+
"@type": "Organization",
|
|
120
|
+
"name": "[Seller Name]"
|
|
121
|
+
}
|
|
122
|
+
},
|
|
123
|
+
"aggregateRating": {
|
|
124
|
+
"@type": "AggregateRating",
|
|
125
|
+
"ratingValue": "[4.5]",
|
|
126
|
+
"reviewCount": "[120]"
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
},
|
|
130
|
+
{
|
|
131
|
+
"type": "WebSite",
|
|
132
|
+
"use_case": "Site-level with sitelinks search box",
|
|
133
|
+
"template": {
|
|
134
|
+
"@context": "https://schema.org",
|
|
135
|
+
"@type": "WebSite",
|
|
136
|
+
"name": "[Site Name]",
|
|
137
|
+
"url": "[Homepage URL]",
|
|
138
|
+
"potentialAction": {
|
|
139
|
+
"@type": "SearchAction",
|
|
140
|
+
"target": {
|
|
141
|
+
"@type": "EntryPoint",
|
|
142
|
+
"urlTemplate": "[Search URL]?q={search_term_string}"
|
|
143
|
+
},
|
|
144
|
+
"query-input": "required name=search_term_string"
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
"type": "BreadcrumbList",
|
|
150
|
+
"use_case": "Navigation breadcrumbs",
|
|
151
|
+
"template": {
|
|
152
|
+
"@context": "https://schema.org",
|
|
153
|
+
"@type": "BreadcrumbList",
|
|
154
|
+
"itemListElement": [
|
|
155
|
+
{
|
|
156
|
+
"@type": "ListItem",
|
|
157
|
+
"position": 1,
|
|
158
|
+
"name": "Home",
|
|
159
|
+
"item": "[Homepage URL]"
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
"@type": "ListItem",
|
|
163
|
+
"position": 2,
|
|
164
|
+
"name": "[Category]",
|
|
165
|
+
"item": "[Category URL]"
|
|
166
|
+
},
|
|
167
|
+
{
|
|
168
|
+
"@type": "ListItem",
|
|
169
|
+
"position": 3,
|
|
170
|
+
"name": "[Current Page]"
|
|
171
|
+
}
|
|
172
|
+
]
|
|
173
|
+
}
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
"type": "Person",
|
|
177
|
+
"use_case": "Author/team member profiles (E-E-A-T)",
|
|
178
|
+
"template": {
|
|
179
|
+
"@context": "https://schema.org",
|
|
180
|
+
"@type": "Person",
|
|
181
|
+
"name": "[Full Name]",
|
|
182
|
+
"jobTitle": "[Job Title]",
|
|
183
|
+
"url": "[Profile URL]",
|
|
184
|
+
"image": "[Photo URL]",
|
|
185
|
+
"description": "[Professional bio]",
|
|
186
|
+
"worksFor": {
|
|
187
|
+
"@type": "Organization",
|
|
188
|
+
"name": "[Company Name]"
|
|
189
|
+
},
|
|
190
|
+
"sameAs": [
|
|
191
|
+
"[LinkedIn URL]",
|
|
192
|
+
"[Twitter URL]",
|
|
193
|
+
"[GitHub URL]"
|
|
194
|
+
]
|
|
195
|
+
}
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"type": "Service",
|
|
199
|
+
"use_case": "Service business pages",
|
|
200
|
+
"template": {
|
|
201
|
+
"@context": "https://schema.org",
|
|
202
|
+
"@type": "Service",
|
|
203
|
+
"name": "[Service Name]",
|
|
204
|
+
"description": "[Service Description]",
|
|
205
|
+
"provider": {
|
|
206
|
+
"@type": "Organization",
|
|
207
|
+
"name": "[Provider Name]",
|
|
208
|
+
"url": "[Provider URL]"
|
|
209
|
+
},
|
|
210
|
+
"areaServed": {
|
|
211
|
+
"@type": "City",
|
|
212
|
+
"name": "[Service Area]"
|
|
213
|
+
},
|
|
214
|
+
"offers": {
|
|
215
|
+
"@type": "Offer",
|
|
216
|
+
"price": "[Starting Price]",
|
|
217
|
+
"priceCurrency": "[Currency]"
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
},
|
|
221
|
+
{
|
|
222
|
+
"type": "VideoObject",
|
|
223
|
+
"use_case": "Video content pages",
|
|
224
|
+
"template": {
|
|
225
|
+
"@context": "https://schema.org",
|
|
226
|
+
"@type": "VideoObject",
|
|
227
|
+
"name": "[Video Title]",
|
|
228
|
+
"description": "[Video Description]",
|
|
229
|
+
"thumbnailUrl": "[Thumbnail URL]",
|
|
230
|
+
"uploadDate": "[YYYY-MM-DD]",
|
|
231
|
+
"duration": "[PT1H30M]",
|
|
232
|
+
"contentUrl": "[Video File URL]",
|
|
233
|
+
"embedUrl": "[Embed URL]",
|
|
234
|
+
"publisher": {
|
|
235
|
+
"@type": "Organization",
|
|
236
|
+
"name": "[Publisher]"
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
},
|
|
240
|
+
{
|
|
241
|
+
"type": "Event",
|
|
242
|
+
"use_case": "Events and conferences",
|
|
243
|
+
"template": {
|
|
244
|
+
"@context": "https://schema.org",
|
|
245
|
+
"@type": "Event",
|
|
246
|
+
"name": "[Event Name]",
|
|
247
|
+
"description": "[Event Description]",
|
|
248
|
+
"startDate": "[YYYY-MM-DDTHH:MM]",
|
|
249
|
+
"endDate": "[YYYY-MM-DDTHH:MM]",
|
|
250
|
+
"location": {
|
|
251
|
+
"@type": "Place",
|
|
252
|
+
"name": "[Venue Name]",
|
|
253
|
+
"address": {
|
|
254
|
+
"@type": "PostalAddress",
|
|
255
|
+
"streetAddress": "[Street]",
|
|
256
|
+
"addressLocality": "[City]",
|
|
257
|
+
"addressCountry": "[Country]"
|
|
258
|
+
}
|
|
259
|
+
},
|
|
260
|
+
"organizer": {
|
|
261
|
+
"@type": "Organization",
|
|
262
|
+
"name": "[Organizer Name]",
|
|
263
|
+
"url": "[Organizer URL]"
|
|
264
|
+
},
|
|
265
|
+
"offers": {
|
|
266
|
+
"@type": "Offer",
|
|
267
|
+
"price": "[Price]",
|
|
268
|
+
"priceCurrency": "[Currency]",
|
|
269
|
+
"url": "[Ticket URL]",
|
|
270
|
+
"availability": "https://schema.org/InStock"
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
},
|
|
274
|
+
{
|
|
275
|
+
"type": "SoftwareApplication",
|
|
276
|
+
"use_case": "SaaS and app pages",
|
|
277
|
+
"template": {
|
|
278
|
+
"@context": "https://schema.org",
|
|
279
|
+
"@type": "SoftwareApplication",
|
|
280
|
+
"name": "[App Name]",
|
|
281
|
+
"description": "[App Description]",
|
|
282
|
+
"applicationCategory": "[Category]",
|
|
283
|
+
"operatingSystem": "[OS]",
|
|
284
|
+
"offers": {
|
|
285
|
+
"@type": "Offer",
|
|
286
|
+
"price": "[Price or 0]",
|
|
287
|
+
"priceCurrency": "[Currency]"
|
|
288
|
+
},
|
|
289
|
+
"aggregateRating": {
|
|
290
|
+
"@type": "AggregateRating",
|
|
291
|
+
"ratingValue": "[Rating]",
|
|
292
|
+
"ratingCount": "[Count]"
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
},
|
|
296
|
+
{
|
|
297
|
+
"type": "ProfilePage",
|
|
298
|
+
"use_case": "Author/creator profile pages (E-E-A-T)",
|
|
299
|
+
"template": {
|
|
300
|
+
"@context": "https://schema.org",
|
|
301
|
+
"@type": "ProfilePage",
|
|
302
|
+
"mainEntity": {
|
|
303
|
+
"@type": "Person",
|
|
304
|
+
"name": "[Author Name]",
|
|
305
|
+
"url": "[Profile URL]",
|
|
306
|
+
"description": "[Bio and expertise]",
|
|
307
|
+
"sameAs": ["[LinkedIn]", "[Twitter]"]
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
},
|
|
311
|
+
{
|
|
312
|
+
"type": "ProductGroup",
|
|
313
|
+
"use_case": "E-commerce product variants",
|
|
314
|
+
"template": {
|
|
315
|
+
"@context": "https://schema.org",
|
|
316
|
+
"@type": "ProductGroup",
|
|
317
|
+
"name": "[Product Name]",
|
|
318
|
+
"description": "[Group description]",
|
|
319
|
+
"productGroupID": "[Group ID]",
|
|
320
|
+
"variesBy": ["https://schema.org/size", "https://schema.org/color"],
|
|
321
|
+
"hasVariant": [
|
|
322
|
+
{
|
|
323
|
+
"@type": "Product",
|
|
324
|
+
"name": "[Variant Name]",
|
|
325
|
+
"sku": "[SKU]",
|
|
326
|
+
"color": "[Color]",
|
|
327
|
+
"size": "[Size]",
|
|
328
|
+
"offers": {
|
|
329
|
+
"@type": "Offer",
|
|
330
|
+
"price": "[Price]",
|
|
331
|
+
"priceCurrency": "[Currency]",
|
|
332
|
+
"availability": "https://schema.org/InStock"
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
]
|
|
336
|
+
}
|
|
337
|
+
},
|
|
338
|
+
{
|
|
339
|
+
"type": "Certification",
|
|
340
|
+
"use_case": "Product certifications (April 2025)",
|
|
341
|
+
"template": {
|
|
342
|
+
"@context": "https://schema.org",
|
|
343
|
+
"@type": "Product",
|
|
344
|
+
"name": "[Product Name]",
|
|
345
|
+
"hasCertification": {
|
|
346
|
+
"@type": "Certification",
|
|
347
|
+
"certificationIdentification": "[Certification Name]",
|
|
348
|
+
"issuedBy": {
|
|
349
|
+
"@type": "Organization",
|
|
350
|
+
"name": "[Issuing Organization]"
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
]
|
|
356
|
+
}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SEO Crawl — Recursive mini-crawler for site structure discovery.
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- Sitemap.xml parsing for initial page list
|
|
7
|
+
- Recursive link-following with configurable depth
|
|
8
|
+
- Internal link graph construction
|
|
9
|
+
- Orphan page detection
|
|
10
|
+
- robots.txt respect
|
|
11
|
+
|
|
12
|
+
Author: Laurent Rochetta
|
|
13
|
+
License: MIT
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
import xml.etree.ElementTree as ET
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from typing import Optional, Set
|
|
23
|
+
from urllib.parse import urljoin, urlparse
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
import requests
|
|
27
|
+
except ImportError:
|
|
28
|
+
print("Error: requests library required. Install: pip install requests", file=sys.stderr)
|
|
29
|
+
sys.exit(1)
|
|
30
|
+
|
|
31
|
+
# User-Agent header value sent with every HTTP request made by this module.
# It is assembled from its individual product tokens; the final token names
# this tool ("BMADSEOEngine/2.0").
USER_AGENT = " ".join(
    [
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/131.0.0.0",
        "Safari/537.36",
        "BMADSEOEngine/2.0",
    ]
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class SEOCrawler:
    """Breadth-first mini-crawler for SEO site structure analysis.

    Starting at ``base_url`` the crawler follows same-host links, level by
    level, until ``max_depth`` or ``max_pages`` is reached.  For every HTML
    page it records the title and a word count, and it builds an internal
    link graph that is later compared with the sitemap to find orphan pages.

    NOTE: robots.txt is downloaded and searched for a ``Sitemap:`` line, but
    its Disallow rules are not enforced by this class.
    """

    def __init__(self, base_url: str, max_depth: int = 2, max_pages: int = 25, timeout: int = 15):
        """Store the crawl limits and initialise empty result containers.

        Args:
            base_url: Start URL including scheme; a trailing slash is dropped.
            max_depth: Deepest link level to fetch (the start page is depth 0).
            max_pages: Upper bound on the number of URLs visited.
            timeout: Per-request timeout in seconds, passed to ``requests``.
        """
        self.base_url = base_url.rstrip("/")
        # Host names are case-insensitive, so keep the lowercased form for
        # every same-host comparison.
        self.base_domain = urlparse(self.base_url).netloc.lower()
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout

        self.visited: Set[str] = set()
        self.pages: list = []
        self.link_graph: dict = defaultdict(set)  # page -> set of linked pages
        self.sitemap_urls: list = []
        self.robots_txt: Optional[str] = None
        self.errors: list = []

    def normalize_url(self, url: str) -> str:
        """Return a canonical form of *url* used for deduplication.

        The query string and fragment are dropped, the trailing slash is
        removed (the root path stays ``/``) and the host is lowercased.
        """
        parsed = urlparse(url)
        path = parsed.path.rstrip("/") or "/"
        return f"{parsed.scheme}://{parsed.netloc.lower()}{path}"

    def is_internal(self, url: str) -> bool:
        """Return True when *url* has the same host (ignoring case) as the base URL."""
        return urlparse(url).netloc.lower() == self.base_domain

    def fetch(self, url: str) -> Optional[str]:
        """Download *url* and return its body when it is a 200 HTML response.

        Any other response is recorded in ``self.pages`` with depth ``-1``;
        request failures are recorded in ``self.errors``.  Both cases
        return None.
        """
        try:
            response = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
                allow_redirects=True,
            )
        except requests.RequestException as e:
            self.errors.append({"url": url, "error": str(e)})
            return None

        content_type = response.headers.get("content-type", "")
        # Media types are case-insensitive ("Text/HTML" is valid).
        if response.status_code == 200 and "text/html" in content_type.lower():
            return response.text

        self.pages.append({
            "url": url,
            "status": response.status_code,
            "content_type": content_type,
            "title": None,
            "word_count": 0,
            "depth": -1,
        })
        return None

    def fetch_robots_txt(self):
        """Download ``/robots.txt`` and keep its text in ``self.robots_txt``.

        Best effort: a missing file or a network failure leaves the
        attribute as None.
        """
        try:
            response = requests.get(
                f"{self.base_url}/robots.txt",
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
            )
        except requests.RequestException:
            # Deliberately ignored; has_robots_txt will simply report False.
            return
        if response.status_code == 200:
            self.robots_txt = response.text

    def parse_sitemap(self):
        """Collect same-host page URLs from the site's sitemap.

        The sitemap location is the first ``Sitemap:`` line of robots.txt
        when present, otherwise ``<base_url>/sitemap.xml``.  Entries of a
        sitemap index are recorded with a ``[sitemap-index]: `` prefix and
        are not downloaded.  Network and XML parse errors are ignored.
        """
        sitemap_url = f"{self.base_url}/sitemap.xml"

        # A Sitemap: directive in robots.txt overrides the default location.
        if self.robots_txt:
            for line in self.robots_txt.splitlines():
                if line.strip().lower().startswith("sitemap:"):
                    sitemap_url = line.split(":", 1)[1].strip()
                    break

        try:
            response = requests.get(
                sitemap_url,
                headers={"User-Agent": USER_AGENT},
                timeout=self.timeout,
            )
            content_type = response.headers.get("content-type", "")
            if response.status_code == 200 and "xml" in content_type.lower():
                self.sitemap_urls.extend(self._sitemap_entries(response.content))
        except (requests.RequestException, ET.ParseError):
            # Best effort: an unreachable or malformed sitemap yields no URLs.
            pass

    def _sitemap_entries(self, xml_bytes) -> list:
        """Parse sitemap XML and return the entries to store in ``sitemap_urls``."""
        # NOTE(review): the sitemap is remote input and xml.etree is not
        # hardened against maliciously constructed XML - confirm this is
        # acceptable for the sites being audited.
        root = ET.fromstring(xml_bytes)
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        entries = []

        for loc in root.findall(".//sm:url/sm:loc", ns):
            # <loc> text is frequently wrapped in newlines/indentation.
            location = (loc.text or "").strip()
            if location and self.is_internal(location):
                entries.append(location)

        # Sitemap index: remember the child sitemaps, tagged so they are
        # never mistaken for page URLs.
        for loc in root.findall(".//sm:sitemap/sm:loc", ns):
            location = (loc.text or "").strip()
            if location:
                entries.append(f"[sitemap-index]: {location}")

        return entries

    def extract_links(self, html: str, page_url: str) -> list:
        """Return the distinct internal links found in *html*, in document order.

        Relative links are resolved against *page_url*.  Every internal
        target is also added to ``self.link_graph[page_url]``.
        """
        links = []
        seen = set()
        # Simple regex for links (avoids BS4 dependency for crawler)
        for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
            href = match.group(1).strip()
            if not href or href.startswith("#"):
                continue
            if href.lower().startswith(("javascript:", "mailto:")):
                continue

            full_url = urljoin(page_url, href)
            if not self.is_internal(full_url):
                continue

            normalized = self.normalize_url(full_url)
            self.link_graph[page_url].add(normalized)
            # Report each target once so callers do not queue duplicates.
            if normalized not in seen:
                seen.add(normalized)
                links.append(normalized)

        return links

    def extract_title(self, html: str) -> Optional[str]:
        """Return the text of the first ``<title>`` element, or None if absent.

        HTML entities are decoded and runs of whitespace are collapsed to a
        single space.
        """
        # Local import: the ``html`` parameter shadows the module name here.
        from html import unescape

        match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
        if match is None:
            return None
        return " ".join(unescape(match.group(1)).split())

    def count_words(self, html: str) -> int:
        """Count words in *html* after removing scripts, styles and tags."""
        # Local import: the ``html`` parameter shadows the module name here.
        from html import unescape

        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<[^>]+>", " ", text)
        # Decode entities so that "&amp;" is not counted as the word "amp".
        text = unescape(text)
        return len(re.findall(r"\b\w+\b", text))

    def crawl(self):
        """Run the breadth-first crawl, filling ``pages``, ``link_graph`` and ``errors``."""
        from collections import deque

        self.fetch_robots_txt()
        self.parse_sitemap()

        start = self.normalize_url(self.base_url)
        queue = deque([(start, 0)])  # (url, depth)
        # URLs that were ever queued; prevents the queue from filling up
        # with repeats of the same navigation links.
        queued = {start}

        while queue and len(self.visited) < self.max_pages:
            url, depth = queue.popleft()
            if url in self.visited or depth > self.max_depth:
                continue

            self.visited.add(url)
            html = self.fetch(url)
            if not html:
                continue

            self.pages.append({
                "url": url,
                "status": 200,
                "title": self.extract_title(html),
                "word_count": self.count_words(html),
                "depth": depth,
            })

            # Discover links for the next level
            if depth < self.max_depth:
                for link in self.extract_links(html, url):
                    if link not in queued:
                        queued.add(link)
                        queue.append((link, depth + 1))

    def get_results(self) -> dict:
        """Return the crawl results as a JSON-serialisable dictionary.

        Orphan pages are sitemap URLs that no crawled page links to; at
        most ten are reported.
        """
        all_linked = set()
        for targets in self.link_graph.values():
            all_linked.update(targets)

        orphans = [
            url for url in self.sitemap_urls
            if isinstance(url, str)
            and not url.startswith("[")  # skip "[sitemap-index]: ..." markers
            and self.normalize_url(url) not in all_linked
        ]

        total_links = sum(len(targets) for targets in self.link_graph.values())

        return {
            "base_url": self.base_url,
            "pages_crawled": len(self.pages),
            "max_depth": self.max_depth,
            "sitemap_urls_found": len([u for u in self.sitemap_urls if not str(u).startswith("[")]),
            "has_robots_txt": self.robots_txt is not None,
            "has_sitemap": len(self.sitemap_urls) > 0,
            "pages": self.pages,
            "orphan_pages": orphans[:10],
            "link_graph_summary": {
                "total_internal_links": total_links,
                # max(..., 1) avoids division by zero when nothing was crawled.
                "avg_links_per_page": round(total_links / max(len(self.link_graph), 1), 1),
            },
            "errors": self.errors,
        }
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ── CLI ────────────────────────────────────────────────────────────
|
|
233
|
+
|
|
234
|
+
def main():
    """Command-line entry point: crawl a site and print the results.

    Progress is written to stderr.  The results go to stdout, either as
    JSON (``--json``) or as a human-readable summary followed by one entry
    per page.
    """
    parser = argparse.ArgumentParser(
        description="SEO Crawl — Recursive mini-crawler (BMAD+ SEO Engine)"
    )
    parser.add_argument("url", help="Base URL to crawl")
    parser.add_argument("--depth", "-d", type=int, default=2, help="Max crawl depth (default: 2)")
    parser.add_argument("--max", "-m", type=int, default=25, help="Max pages (default: 25)")
    parser.add_argument("--timeout", "-t", type=int, default=15, help="Per-page timeout (default: 15s)")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    # Accept bare hosts such as "example.com": without a scheme, urlparse()
    # yields an empty netloc, which breaks the crawler's same-host check.
    base_url = args.url.strip()
    if "://" not in base_url:
        base_url = f"https://{base_url}"

    crawler = SEOCrawler(
        base_url=base_url,
        max_depth=args.depth,
        max_pages=args.max,
        timeout=args.timeout,
    )

    print(f"Crawling {base_url} (depth={args.depth}, max={args.max})...", file=sys.stderr)
    crawler.crawl()
    results = crawler.get_results()

    if args.json:
        # default=list converts any set in the results for JSON serialization
        print(json.dumps(results, indent=2, ensure_ascii=False, default=list))
        return

    summary = results["link_graph_summary"]
    print(f"\n{'='*60}")
    print(f"Crawl Summary: {results['base_url']}")
    print(f"{'='*60}")
    print(f"Pages crawled: {results['pages_crawled']}")
    print(f"Sitemap URLs: {results['sitemap_urls_found']}")
    print(f"robots.txt: {'✅' if results['has_robots_txt'] else '❌'}")
    print(f"Internal links: {summary['total_internal_links']}")
    print(f"Avg links/page: {summary['avg_links_per_page']}")
    print(f"Orphan pages: {len(results['orphan_pages'])}")
    print(f"Errors: {len(results['errors'])}")

    print(f"\n{'─'*60}")
    print("Pages:")
    for page in results["pages"]:
        status = "✅" if page["status"] == 200 else f"⚠️ {page['status']}"
        title = (page["title"] or "No title")[:50]
        print(f" {status} [{page['depth']}] {title} ({page['word_count']} words)")
        print(f" {page['url']}")


if __name__ == "__main__":
    main()
|