aeo_cli-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aeo_cli/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """AEO-CLI: Agentic Engine Optimization CLI tool."""
+
+ from importlib.metadata import PackageNotFoundError, version
+
+ try:
+     __version__ = version("aeo-cli")
+ except PackageNotFoundError:
+     __version__ = "0.0.0"
@@ -0,0 +1,564 @@
+ """Core audit orchestration — runs all pillar checks and computes AEO score."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import re
+ from collections.abc import Callable
+ from urllib.parse import urlparse
+ from urllib.robotparser import RobotFileParser
+
+ import httpx
+ from bs4 import BeautifulSoup
+
+ from aeo_cli.core.crawler import CrawlResult, extract_page, extract_pages
+ from aeo_cli.core.discovery import discover_pages
+ from aeo_cli.core.models import (
+     AuditReport,
+     BotAccessResult,
+     ContentReport,
+     DiscoveryResult,
+     LlmsTxtReport,
+     PageAudit,
+     RobotsReport,
+     SchemaOrgResult,
+     SchemaReport,
+     SiteAuditReport,
+ )
+
+ AI_BOTS: list[str] = [
+     "GPTBot",
+     "ChatGPT-User",
+     "Google-Extended",
+     "ClaudeBot",
+     "PerplexityBot",
+     "Amazonbot",
+     "OAI-SearchBot",
+ ]
+
+ DEFAULT_TIMEOUT: int = 15
+
+
+ # ── Pillar 1: Robots.txt ──────────────────────────────────────────────────────
+
+
+ async def check_robots(
+     url: str, client: httpx.AsyncClient
+ ) -> tuple[RobotsReport, str | None]:
+     """Fetch robots.txt and check AI bot access.
+
+     Returns:
+         (report, raw_robots_text) — raw text is provided so discovery can filter URLs.
+     """
+     parsed = urlparse(url)
+     robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+
+     try:
+         resp = await client.get(robots_url, follow_redirects=True)
+         if resp.status_code != 200:
+             return (
+                 RobotsReport(
+                     found=False, detail=f"robots.txt returned HTTP {resp.status_code}"
+                 ),
+                 None,
+             )
+
+         raw_text = resp.text
+         rp = RobotFileParser()
+         rp.parse(raw_text.splitlines())
+
+         bots = []
+         for bot in AI_BOTS:
+             allowed = rp.can_fetch(bot, "/")
+             bots.append(BotAccessResult(
+                 bot=bot,
+                 allowed=allowed,
+                 detail="Allowed" if allowed else "Blocked by robots.txt",
+             ))
+
+         allowed_count = sum(1 for b in bots if b.allowed)
+         return (
+             RobotsReport(
+                 found=True,
+                 bots=bots,
+                 detail=f"{allowed_count}/{len(AI_BOTS)} AI bots allowed",
+             ),
+             raw_text,
+         )
+
+     except httpx.HTTPError as e:
+         return RobotsReport(found=False, detail=f"Failed to fetch robots.txt: {e}"), None
+
+
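For reference, a quick sketch of how the standard-library parser used above answers the per-bot queries; the robots.txt content below is hypothetical and not taken from any audited site.

from urllib.robotparser import RobotFileParser

# Illustrative robots.txt: block GPTBot, allow everyone else.
example_robots = """\
User-agent: GPTBot
Disallow: /

User-agent: *
Allow: /
"""

rp = RobotFileParser()
rp.parse(example_robots.splitlines())
print(rp.can_fetch("GPTBot", "/"))         # False -> reported as "Blocked by robots.txt"
print(rp.can_fetch("PerplexityBot", "/"))  # True  -> reported as "Allowed"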
+ # ── Pillar 2: llms.txt ────────────────────────────────────────────────────────
+
+
+ async def check_llms_txt(url: str, client: httpx.AsyncClient) -> LlmsTxtReport:
+     """Probe /llms.txt and /.well-known/llms.txt."""
+     parsed = urlparse(url)
+     base = f"{parsed.scheme}://{parsed.netloc}"
+     paths = ["/llms.txt", "/.well-known/llms.txt"]
+
+     for path in paths:
+         probe_url = base + path
+         try:
+             resp = await client.get(probe_url, follow_redirects=True)
+             if resp.status_code == 200 and len(resp.text.strip()) > 0:
+                 return LlmsTxtReport(
+                     found=True,
+                     url=probe_url,
+                     detail=f"Found at {probe_url}",
+                 )
+         except httpx.HTTPError:
+             continue
+
+     return LlmsTxtReport(found=False, detail="llms.txt not found")
+
+
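Note that check_llms_txt only confirms a non-empty file exists at one of the two probe paths; it does not parse it. As a rough, hedged illustration of the commonly proposed llms.txt layout (not content from this package):

# Example Site
> One-paragraph summary of what the site offers.

## Docs
- [Getting started](https://example.com/docs/start): setup guide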
+ # ── Pillar 3: Schema.org JSON-LD ──────────────────────────────────────────────
+
+
+ def check_schema_org(html: str) -> SchemaReport:  # noqa: C901
+     """Extract and analyze JSON-LD structured data from HTML."""
+     if not html:
+         return SchemaReport(detail="No HTML to analyze")
+
+     soup = BeautifulSoup(html, "html.parser")
+     ld_scripts = soup.find_all("script", attrs={"type": "application/ld+json"})
+
+     schemas: list[SchemaOrgResult] = []
+     for script in ld_scripts:
+         try:
+             data = json.loads(script.string or "")
+             # Handle both single objects and arrays
+             items = data if isinstance(data, list) else [data]
+             for item in items:
+                 if isinstance(item, dict):
+                     schema_type = item.get("@type", "Unknown")
+                     if isinstance(schema_type, list):
+                         schema_type = ", ".join(schema_type)
+                     props = [k for k in item.keys() if not k.startswith("@")]
+                     schemas.append(SchemaOrgResult(
+                         schema_type=schema_type,
+                         properties=props,
+                     ))
+         except (json.JSONDecodeError, TypeError):
+             continue
+
+     blocks_found = len(schemas)
+     detail = f"{blocks_found} JSON-LD block(s) found" if blocks_found else "No JSON-LD found"
+
+     return SchemaReport(blocks_found=blocks_found, schemas=schemas, detail=detail)
+
+
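A hedged input/output sketch for check_schema_org; the HTML fragment is made up for illustration.

# Hypothetical page fragment containing one JSON-LD block.
example_html = '''
<script type="application/ld+json">
{"@context": "https://schema.org", "@type": "Product",
 "name": "Widget", "description": "A useful widget."}
</script>
'''
report = check_schema_org(example_html)
# Expected: report.blocks_found == 1,
#           report.schemas[0].schema_type == "Product",
#           report.schemas[0].properties == ["name", "description"]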
+ # ── Pillar 4: Content Density ─────────────────────────────────────────────────
+
+
+ def check_content(markdown: str) -> ContentReport:
+     """Analyze markdown content density."""
+     if not markdown:
+         return ContentReport(detail="No content extracted")
+
+     words = markdown.split()
+     word_count = len(words)
+     char_count = len(markdown)
+     has_headings = bool(re.search(r"^#{1,6}\s", markdown, re.MULTILINE))
+     has_lists = bool(re.search(r"^[\s]*[-*+]\s", markdown, re.MULTILINE))
+     has_code_blocks = "```" in markdown
+
+     detail = f"{word_count} words"
+     if has_headings:
+         detail += ", has headings"
+     if has_lists:
+         detail += ", has lists"
+     if has_code_blocks:
+         detail += ", has code blocks"
+
+     return ContentReport(
+         word_count=word_count,
+         char_count=char_count,
+         has_headings=has_headings,
+         has_lists=has_lists,
+         has_code_blocks=has_code_blocks,
+         detail=detail,
+     )
+
+
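A small hypothetical input showing how the structure flags fall out of the regex and substring checks above.

sample_md = "# Title\n\nSome intro text.\n\n- first point\n- second point\n"
report = check_content(sample_md)
# word_count == 11 (whitespace-separated tokens); has_headings and has_lists
# are True; has_code_blocks is False (no ``` fence in the sample).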
+ # ── Scoring ───────────────────────────────────────────────────────────────────
+
+
+ def compute_scores(
+     robots: RobotsReport,
+     llms_txt: LlmsTxtReport,
+     schema_org: SchemaReport,
+     content: ContentReport,
+ ) -> tuple[RobotsReport, LlmsTxtReport, SchemaReport, ContentReport, float]:
+     """Compute scores for each pillar and overall AEO score.
+
+     Scoring weights (revised 2026-02-18):
+         Content (max 40): most impactful — what LLMs actually extract and cite
+         Schema (max 25): structured signals help LLMs understand page entities
+         Robots (max 25): gatekeeper — blocked bots can't crawl at all
+         llms.txt (max 10): forward-looking signal, minimal real impact today
+
+     Rationale: When AI search engines (ChatGPT, Perplexity, Claude) look up
+     products or answer questions, they crawl pages and extract text content.
+     Content quality dominates what gets cited. Schema.org gives structured
+     "cheat sheets" (Product, Article, FAQ). Robots.txt is pass/fail per bot.
+     llms.txt is emerging but not yet weighted by any major AI search engine.
+     """
+     # Robots: max 25 — proportional to bots allowed
+     if robots.found and robots.bots:
+         allowed = sum(1 for b in robots.bots if b.allowed)
+         robots.score = round(25 * allowed / len(robots.bots), 1)
+     else:
+         robots.score = 0
+
+     # llms.txt: max 10
+     llms_txt.score = 10 if llms_txt.found else 0
+
+     # Schema: max 25 — reward high-value types more
+     if schema_org.blocks_found > 0:
+         unique_types = {s.schema_type for s in schema_org.schemas}
+         # Base 8 for having any JSON-LD, +5 per unique type, capped at 25
+         schema_org.score = min(25, 8 + 5 * len(unique_types))
+     else:
+         schema_org.score = 0
+
+     # Content: max 40 — word count tiers + structure bonuses
+     # Higher thresholds reflect that LLMs need substantial content to cite
+     score = 0
+     if content.word_count >= 1500:
+         score = 25
+     elif content.word_count >= 800:
+         score = 20
+     elif content.word_count >= 400:
+         score = 15
+     elif content.word_count >= 150:
+         score = 8
+     if content.has_headings:
+         score += 7  # structure matters a lot for LLM extraction
+     if content.has_lists:
+         score += 5  # lists are highly extractable by LLMs
+     if content.has_code_blocks:
+         score += 3  # relevant for technical content
+     content.score = min(40, score)
+
+     overall = robots.score + llms_txt.score + schema_org.score + content.score
+     return robots, llms_txt, schema_org, content, overall
+
+
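A worked example of the weights, using hypothetical pillar results rather than output from a real audit.

# 5 of 7 AI bots allowed, no llms.txt, 2 unique JSON-LD @type values,
# and a 900-word page with headings and lists:
#   Robots   = round(25 * 5 / 7, 1) = 17.9
#   llms.txt = 0
#   Schema   = min(25, 8 + 5 * 2)   = 18
#   Content  = min(40, 20 + 7 + 5)  = 32
#   Overall  = 17.9 + 0 + 18 + 32   = 67.9 out of 100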
+ # ── Orchestrator ──────────────────────────────────────────────────────────────
+
+
+ def audit_page_content(html: str, markdown: str) -> tuple[SchemaReport, ContentReport]:
+     """Run page-specific checks (schema + content) on pre-crawled data."""
+     return check_schema_org(html), check_content(markdown)
+
+
+ def _page_weight(url: str) -> int:
+     """Return a weight for a page based on URL depth.
+
+     Shallower pages (homepage, top-level sections) are more representative of a
+     site's AEO readiness and therefore receive higher weight in aggregation.
+
+     Returns:
+         3 for depth 0-1 (homepage or single path segment)
+         2 for depth 2
+         1 for depth 3+
+     """
+     path = urlparse(url).path.strip("/")
+     depth = len(path.split("/")) if path else 0
+     if depth <= 1:
+         return 3
+     if depth == 2:
+         return 2
+     return 1
+
+
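Illustrative weights for a few hypothetical URLs, following the depth rule above.

# _page_weight("https://example.com/")           -> 3  (depth 0, homepage)
# _page_weight("https://example.com/pricing")    -> 3  (depth 1)
# _page_weight("https://example.com/docs/start") -> 2  (depth 2)
# _page_weight("https://example.com/a/b/c")      -> 1  (depth 3+)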
+ def aggregate_page_scores(
+     pages: list[PageAudit],
+     robots: RobotsReport,
+     llms_txt: LlmsTxtReport,
+ ) -> tuple[SchemaReport, ContentReport, float]:
+     """Aggregate per-page scores into site-level pillar scores.
+
+     Content (40pts) and Schema (25pts) use weighted averages — shallower pages
+     count more (see ``_page_weight``).
+     Robots (25pts) and llms.txt (10pts) are site-wide, used as-is.
+     Word/char counts remain simple averages.
+     """
+     successful = [p for p in pages if not p.errors or p.content.word_count > 0]
+     if not successful:
+         agg_schema = SchemaReport(detail="No pages audited successfully")
+         agg_content = ContentReport(detail="No pages audited successfully")
+         return agg_schema, agg_content, robots.score + llms_txt.score
+
+     weights = [_page_weight(p.url) for p in successful]
+     total_weight = sum(weights)
+
+     # Aggregate schema: collect all blocks, weighted-average the score
+     all_schemas: list[SchemaOrgResult] = []
+     total_blocks = 0
+     schema_score_sum = 0.0
+     for p, w in zip(successful, weights):
+         all_schemas.extend(p.schema_org.schemas)
+         total_blocks += p.schema_org.blocks_found
+         schema_score_sum += p.schema_org.score * w
+
+     avg_schema_score = round(schema_score_sum / total_weight, 1)
+     agg_schema = SchemaReport(
+         blocks_found=total_blocks,
+         schemas=all_schemas,
+         score=avg_schema_score,
+         detail=(
+             f"{total_blocks} JSON-LD block(s) across {len(successful)} pages"
+             f" (weighted avg score {avg_schema_score})"
+         ),
+     )
+
+     # Aggregate content: weighted-average scores, simple-average metrics
+     content_score_sum = 0.0
+     word_sum = 0
+     char_sum = 0
+     any_headings = False
+     any_lists = False
+     any_code = False
+     for p, w in zip(successful, weights):
+         content_score_sum += p.content.score * w
+         word_sum += p.content.word_count
+         char_sum += p.content.char_count
+         any_headings = any_headings or p.content.has_headings
+         any_lists = any_lists or p.content.has_lists
+         any_code = any_code or p.content.has_code_blocks
+
+     n = len(successful)
+     avg_content_score = round(content_score_sum / total_weight, 1)
+     avg_words = word_sum // n
+     agg_content = ContentReport(
+         word_count=avg_words,
+         char_count=char_sum // n,
+         has_headings=any_headings,
+         has_lists=any_lists,
+         has_code_blocks=any_code,
+         score=avg_content_score,
+         detail=(
+             f"avg {avg_words} words across {n} pages"
+             f" (weighted avg score {avg_content_score})"
+         ),
+     )
+
+     overall = robots.score + llms_txt.score + avg_schema_score + avg_content_score
+     return agg_schema, agg_content, overall
+
+
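A small worked example of the weighted average, with made-up pages: a homepage (weight 3) and a depth-2 page (weight 2).

# Content pillar, hypothetical per-page scores 32 and 15:
#   weighted avg = (32 * 3 + 15 * 2) / (3 + 2) = 126 / 5 = 25.2
# The site-wide robots and llms.txt scores are then added unchanged
# to the aggregated schema and content scores to form the overall score.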
+ async def audit_url(url: str) -> AuditReport:
+     """Run a full AEO audit on a single URL. Returns AuditReport with all pillar scores."""
+     errors: list[str] = []
+
+     async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT, follow_redirects=True) as client:
+         # Run HTTP checks and browser crawl concurrently
+         robots_task = check_robots(url, client)
+         llms_task = check_llms_txt(url, client)
+         crawl_task = extract_page(url)
+
+         robots_result, llms_txt, crawl_result = await asyncio.gather(
+             robots_task, llms_task, crawl_task, return_exceptions=True
+         )
+
+         # Handle exceptions from gather
+         if isinstance(robots_result, BaseException):
+             errors.append(f"Robots check failed: {robots_result}")
+             robots = RobotsReport(found=False, detail="Check failed")
+         else:
+             robots, _raw_robots = robots_result  # destructure tuple
+
+         if isinstance(llms_txt, BaseException):
+             errors.append(f"llms.txt check failed: {llms_txt}")
+             llms_txt = LlmsTxtReport(found=False, detail="Check failed")
+         crawl: CrawlResult | None = None
+         if isinstance(crawl_result, BaseException):
+             errors.append(f"Crawl failed: {crawl_result}")
+         else:
+             crawl = crawl_result
+
+         # Run sync checks on crawl results
+         html = crawl.html if crawl and crawl.success else ""
+         markdown = crawl.markdown if crawl and crawl.success else ""
+
+         if crawl and not crawl.success and crawl.error:
+             errors.append(f"Crawl error: {crawl.error}")
+
+         schema_org = check_schema_org(html)
+         content = check_content(markdown)
+
+         # Compute scores
+         robots, llms_txt, schema_org, content, overall = compute_scores(
+             robots, llms_txt, schema_org, content
+         )
+
+         return AuditReport(
+             url=url,
+             overall_score=overall,
+             robots=robots,
+             llms_txt=llms_txt,
+             schema_org=schema_org,
+             content=content,
+             errors=errors,
+         )
+
+
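audit_url is a coroutine, so callers drive it with asyncio; a minimal usage sketch with a placeholder URL.

import asyncio

report = asyncio.run(audit_url("https://example.com"))
print(report.overall_score, report.robots.detail)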
+ SITE_AUDIT_TIMEOUT: int = 90
+
+
+ async def audit_site(
+     url: str,
+     *,
+     max_pages: int = 10,
+     delay_seconds: float = 1.0,
+     progress_callback: Callable[[str], None] | None = None,
+ ) -> SiteAuditReport:
+     """Run a multi-page AEO audit. Discovers pages via sitemap/spider and aggregates scores."""
+     errors: list[str] = []
+     domain = urlparse(url).netloc
+
+     def _progress(msg: str) -> None:
+         if progress_callback:
+             progress_callback(msg)
+
+     try:
+         return await asyncio.wait_for(
+             _audit_site_inner(url, domain, max_pages, delay_seconds, errors, _progress),
+             timeout=SITE_AUDIT_TIMEOUT,
+         )
+     except asyncio.TimeoutError:
+         errors.append(f"Audit timed out after {SITE_AUDIT_TIMEOUT}s, returning partial results")
+         # Page-level results are lost on timeout; only errors accumulated so far
+         # (via the shared errors list) survive into this fallback report.
+         return SiteAuditReport(
+             url=url,
+             domain=domain,
+             overall_score=0,
+             robots=RobotsReport(found=False, detail="Timed out"),
+             llms_txt=LlmsTxtReport(found=False, detail="Timed out"),
+             schema_org=SchemaReport(detail="Timed out"),
+             content=ContentReport(detail="Timed out"),
+             discovery=DiscoveryResult(method="timeout", detail="Timed out"),
+             errors=errors,
+         )
+
+
+ async def _audit_site_inner(
+     url: str,
+     domain: str,
+     max_pages: int,
+     delay_seconds: float,
+     errors: list[str],
+     progress: Callable[[str], None],
+ ) -> SiteAuditReport:
+     """Inner implementation of audit_site, wrapped with a timeout by the caller."""
+     progress("Running site-wide checks...")
+
+     async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT, follow_redirects=True) as client:
+         # Phase 1: Site-wide checks + seed crawl in parallel
+         robots_task = check_robots(url, client)
+         llms_task = check_llms_txt(url, client)
+         crawl_task = extract_page(url)
+
+         robots_result, llms_txt, seed_crawl = await asyncio.gather(
+             robots_task, llms_task, crawl_task, return_exceptions=True
+         )
+
+         # Unpack results
+         raw_robots: str | None = None
+         if isinstance(robots_result, BaseException):
+             errors.append(f"Robots check failed: {robots_result}")
+             robots = RobotsReport(found=False, detail="Check failed")
+         else:
+             robots, raw_robots = robots_result
+
+         if isinstance(llms_txt, BaseException):
+             errors.append(f"llms.txt check failed: {llms_txt}")
+             llms_txt = LlmsTxtReport(found=False, detail="Check failed")
+
+         seed: CrawlResult | None = None
+         if isinstance(seed_crawl, BaseException):
+             errors.append(f"Seed crawl failed: {seed_crawl}")
+         else:
+             seed = seed_crawl
+
+         # Phase 2: Discover pages
+         progress("Discovering pages...")
+         seed_links = None
+         if seed and seed.success:
+             seed_links = seed.internal_links
+
+         discovery = await discover_pages(
+             url,
+             client,
+             max_pages=max_pages,
+             robots_txt=raw_robots,
+             seed_links=seed_links,
+         )
+
+         # Phase 3: Audit seed page + batch crawl remaining pages
+         pages: list[PageAudit] = []
+
+         # Audit the seed page first
+         if seed and seed.success:
+             schema, content = audit_page_content(seed.html, seed.markdown)
+             # Score the page-level pillar checks
+             _, _, schema, content, _ = compute_scores(
+                 RobotsReport(found=False), LlmsTxtReport(found=False), schema, content
+             )
+             pages.append(PageAudit(url=url, schema_org=schema, content=content))
+         elif seed and not seed.success:
+             errors.append(f"Seed crawl error: {seed.error}")
+
+         # Crawl remaining sampled pages (exclude seed which is already crawled)
+         remaining_urls = [u for u in discovery.urls_sampled if u != url]
+         if remaining_urls:
+             progress(f"Crawling {len(remaining_urls)} additional pages...")
+             crawl_results = await extract_pages(
+                 remaining_urls, delay_seconds=delay_seconds
+             )
+
+             for i, result in enumerate(crawl_results):
+                 progress(f"Auditing page {i + 2}/{len(discovery.urls_sampled)}...")
+                 if result.success:
+                     schema, content = audit_page_content(result.html, result.markdown)
+                     _, _, schema, content, _ = compute_scores(
+                         RobotsReport(found=False), LlmsTxtReport(found=False), schema, content
+                     )
+                     pages.append(PageAudit(url=result.url, schema_org=schema, content=content))
+                 else:
+                     pages.append(PageAudit(
+                         url=result.url,
+                         schema_org=SchemaReport(detail="Crawl failed"),
+                         content=ContentReport(detail="Crawl failed"),
+                         errors=[result.error or "Unknown crawl error"],
+                     ))
+
+         # Phase 4: Compute site-wide robot/llms scores
+         robots, llms_txt, _, _, _ = compute_scores(
+             robots, llms_txt, SchemaReport(), ContentReport()
+         )
+
+         # Phase 5: Aggregate
+         pages_failed = sum(1 for p in pages if p.errors)
+         agg_schema, agg_content, overall = aggregate_page_scores(pages, robots, llms_txt)
+
+         return SiteAuditReport(
+             url=url,
+             domain=domain,
+             overall_score=overall,
+             robots=robots,
+             llms_txt=llms_txt,
+             schema_org=agg_schema,
+             content=agg_content,
+             discovery=discovery,
+             pages=pages,
+             pages_audited=len(pages),
+             pages_failed=pages_failed,
+             errors=errors,
+         )
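audit_site follows the same pattern, with keyword-only paging and an optional progress callback; the arguments below are placeholders.

import asyncio

site_report = asyncio.run(
    audit_site("https://example.com", max_pages=5, progress_callback=print)
)
print(site_report.overall_score, site_report.pages_audited)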
aeo_cli/core/cache.py ADDED
@@ -0,0 +1,34 @@
+ """Simple in-memory cache for robots.txt during site audits."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+
+ from aeo_cli.core.models import RobotsReport
+
+
+ @dataclass
+ class RobotsCache:
+     """Cache robots.txt results to avoid re-fetching for every page in a site audit.
+
+     Keyed by domain (netloc). A single site audit typically only needs one entry,
+     but the design supports multi-domain usage if needed.
+     """
+
+     _store: dict[str, tuple[RobotsReport, str | None]] = field(default_factory=dict)
+
+     def get(self, domain: str) -> tuple[RobotsReport, str | None] | None:
+         """Return cached (RobotsReport, raw_text) or None if not cached."""
+         return self._store.get(domain)
+
+     def set(self, domain: str, report: RobotsReport, raw_text: str | None) -> None:
+         """Cache a robots.txt result for a domain."""
+         self._store[domain] = (report, raw_text)
+
+     def has(self, domain: str) -> bool:
+         """Check if a domain's robots.txt is cached."""
+         return domain in self._store
+
+     def clear(self) -> None:
+         """Clear the cache."""
+         self._store.clear()
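A short usage sketch for the cache; the domain and report values are hypothetical, and the audit module shown above does not import RobotsCache.

cache = RobotsCache()
if not cache.has("example.com"):
    # Placeholder report; in practice this would come from check_robots().
    cache.set("example.com", RobotsReport(found=False, detail="placeholder"), None)
cached = cache.get("example.com")  # -> (RobotsReport, raw_text or None)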