aeo-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aeo_cli/__init__.py +8 -0
- aeo_cli/core/__init__.py +0 -0
- aeo_cli/core/auditor.py +564 -0
- aeo_cli/core/cache.py +34 -0
- aeo_cli/core/crawler.py +188 -0
- aeo_cli/core/discovery.py +251 -0
- aeo_cli/core/generate/__init__.py +15 -0
- aeo_cli/core/generate/compiler.py +133 -0
- aeo_cli/core/generate/llm.py +136 -0
- aeo_cli/core/generate/profiles.py +85 -0
- aeo_cli/core/generate/prompts.py +108 -0
- aeo_cli/core/models.py +223 -0
- aeo_cli/core/retry.py +78 -0
- aeo_cli/formatters/__init__.py +1 -0
- aeo_cli/formatters/ci_summary.py +79 -0
- aeo_cli/formatters/csv.py +59 -0
- aeo_cli/formatters/markdown.py +81 -0
- aeo_cli/main.py +420 -0
- aeo_cli/py.typed +0 -0
- aeo_cli/server.py +62 -0
- aeo_cli-0.2.1.dist-info/METADATA +302 -0
- aeo_cli-0.2.1.dist-info/RECORD +26 -0
- aeo_cli-0.2.1.dist-info/WHEEL +5 -0
- aeo_cli-0.2.1.dist-info/entry_points.txt +2 -0
- aeo_cli-0.2.1.dist-info/licenses/LICENSE +21 -0
- aeo_cli-0.2.1.dist-info/top_level.txt +1 -0
aeo_cli/__init__.py
ADDED
aeo_cli/core/__init__.py
ADDED
File without changes
aeo_cli/core/auditor.py
ADDED
@@ -0,0 +1,564 @@
+"""Core audit orchestration — runs all pillar checks and computes AEO score."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import re
+from collections.abc import Callable
+from urllib.parse import urlparse
+from urllib.robotparser import RobotFileParser
+
+import httpx
+from bs4 import BeautifulSoup
+
+from aeo_cli.core.crawler import CrawlResult, extract_page, extract_pages
+from aeo_cli.core.discovery import discover_pages
+from aeo_cli.core.models import (
+    AuditReport,
+    BotAccessResult,
+    ContentReport,
+    DiscoveryResult,
+    LlmsTxtReport,
+    PageAudit,
+    RobotsReport,
+    SchemaOrgResult,
+    SchemaReport,
+    SiteAuditReport,
+)
+
+AI_BOTS: list[str] = [
+    "GPTBot",
+    "ChatGPT-User",
+    "Google-Extended",
+    "ClaudeBot",
+    "PerplexityBot",
+    "Amazonbot",
+    "OAI-SearchBot",
+]
+
+DEFAULT_TIMEOUT: int = 15
+
+
+# ── Pillar 1: Robots.txt ──────────────────────────────────────────────────────
+
+
+async def check_robots(
+    url: str, client: httpx.AsyncClient
+) -> tuple[RobotsReport, str | None]:
+    """Fetch robots.txt and check AI bot access.
+
+    Returns:
+        (report, raw_robots_text) — raw text is provided so discovery can filter URLs.
+    """
+    parsed = urlparse(url)
+    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+
+    try:
+        resp = await client.get(robots_url, follow_redirects=True)
+        if resp.status_code != 200:
+            return (
+                RobotsReport(
+                    found=False, detail=f"robots.txt returned HTTP {resp.status_code}"
+                ),
+                None,
+            )
+
+        raw_text = resp.text
+        rp = RobotFileParser()
+        rp.parse(raw_text.splitlines())
+
+        bots = []
+        for bot in AI_BOTS:
+            allowed = rp.can_fetch(bot, "/")
+            bots.append(BotAccessResult(
+                bot=bot,
+                allowed=allowed,
+                detail="Allowed" if allowed else "Blocked by robots.txt",
+            ))
+
+        allowed_count = sum(1 for b in bots if b.allowed)
+        return (
+            RobotsReport(
+                found=True,
+                bots=bots,
+                detail=f"{allowed_count}/{len(AI_BOTS)} AI bots allowed",
+            ),
+            raw_text,
+        )
+
+    except httpx.HTTPError as e:
+        return RobotsReport(found=False, detail=f"Failed to fetch robots.txt: {e}"), None
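Outside the CLI, this pillar check can be driven directly. A minimal sketch, assuming aeo-cli is installed and the target host is reachable; the client settings mirror the ones audit_url uses further down:

import asyncio

import httpx

from aeo_cli.core.auditor import check_robots


async def main() -> None:
    # Same client configuration audit_url passes to its HTTP checks
    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
        report, raw_text = await check_robots("https://example.com", client)
        print(report.detail)  # e.g. "7/7 AI bots allowed"
        for bot in report.bots:
            print(f"  {bot.bot}: {bot.detail}")


asyncio.run(main())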
+
+
+# ── Pillar 2: llms.txt ────────────────────────────────────────────────────────
+
+
+async def check_llms_txt(url: str, client: httpx.AsyncClient) -> LlmsTxtReport:
+    """Probe /llms.txt and /.well-known/llms.txt."""
+    parsed = urlparse(url)
+    base = f"{parsed.scheme}://{parsed.netloc}"
+    paths = ["/llms.txt", "/.well-known/llms.txt"]
+
+    for path in paths:
+        probe_url = base + path
+        try:
+            resp = await client.get(probe_url, follow_redirects=True)
+            if resp.status_code == 200 and len(resp.text.strip()) > 0:
+                return LlmsTxtReport(
+                    found=True,
+                    url=probe_url,
+                    detail=f"Found at {probe_url}",
+                )
+        except httpx.HTTPError:
+            continue
+
+    return LlmsTxtReport(found=False, detail="llms.txt not found")
+
+
+# ── Pillar 3: Schema.org JSON-LD ──────────────────────────────────────────────
+
+
+def check_schema_org(html: str) -> SchemaReport:  # noqa: C901
+    """Extract and analyze JSON-LD structured data from HTML."""
+    if not html:
+        return SchemaReport(detail="No HTML to analyze")
+
+    soup = BeautifulSoup(html, "html.parser")
+    ld_scripts = soup.find_all("script", attrs={"type": "application/ld+json"})
+
+    schemas: list[SchemaOrgResult] = []
+    for script in ld_scripts:
+        try:
+            data = json.loads(script.string or "")
+            # Handle both single objects and arrays
+            items = data if isinstance(data, list) else [data]
+            for item in items:
+                if isinstance(item, dict):
+                    schema_type = item.get("@type", "Unknown")
+                    if isinstance(schema_type, list):
+                        schema_type = ", ".join(schema_type)
+                    props = [k for k in item.keys() if not k.startswith("@")]
+                    schemas.append(SchemaOrgResult(
+                        schema_type=schema_type,
+                        properties=props,
+                    ))
+        except (json.JSONDecodeError, TypeError):
+            continue
+
+    blocks_found = len(schemas)
+    detail = f"{blocks_found} JSON-LD block(s) found" if blocks_found else "No JSON-LD found"
+
+    return SchemaReport(blocks_found=blocks_found, schemas=schemas, detail=detail)
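check_schema_org is a pure function of the HTML string, so it can be exercised without crawling anything. A small sketch with an illustrative JSON-LD block:

from aeo_cli.core.auditor import check_schema_org

html = """
<script type="application/ld+json">
{"@context": "https://schema.org", "@type": "Article", "headline": "Hello"}
</script>
"""

report = check_schema_org(html)
print(report.blocks_found)            # 1
print(report.schemas[0].schema_type)  # Article
print(report.schemas[0].properties)   # ['headline'] ("@" keys are excluded)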
+
+
+# ── Pillar 4: Content Density ─────────────────────────────────────────────────
+
+
+def check_content(markdown: str) -> ContentReport:
+    """Analyze markdown content density."""
+    if not markdown:
+        return ContentReport(detail="No content extracted")
+
+    words = markdown.split()
+    word_count = len(words)
+    char_count = len(markdown)
+    has_headings = bool(re.search(r"^#{1,6}\s", markdown, re.MULTILINE))
+    has_lists = bool(re.search(r"^[\s]*[-*+]\s", markdown, re.MULTILINE))
+    has_code_blocks = "```" in markdown
+
+    detail = f"{word_count} words"
+    if has_headings:
+        detail += ", has headings"
+    if has_lists:
+        detail += ", has lists"
+    if has_code_blocks:
+        detail += ", has code blocks"
+
+    return ContentReport(
+        word_count=word_count,
+        char_count=char_count,
+        has_headings=has_headings,
+        has_lists=has_lists,
+        has_code_blocks=has_code_blocks,
+        detail=detail,
+    )
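check_content likewise takes a plain markdown string. Note that word_count counts whitespace-separated tokens, so heading and list markers are included:

from aeo_cli.core.auditor import check_content

md = "# Title\n\nSome intro text here.\n\n- first point\n- second point\n"

report = check_content(md)
print(report.word_count)  # 12
print(report.detail)      # "12 words, has headings, has lists"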
+
+
+# ── Scoring ───────────────────────────────────────────────────────────────────
+
+
+def compute_scores(
+    robots: RobotsReport,
+    llms_txt: LlmsTxtReport,
+    schema_org: SchemaReport,
+    content: ContentReport,
+) -> tuple[RobotsReport, LlmsTxtReport, SchemaReport, ContentReport, float]:
+    """Compute scores for each pillar and overall AEO score.
+
+    Scoring weights (revised 2026-02-18):
+        Content (max 40): most impactful — what LLMs actually extract and cite
+        Schema (max 25): structured signals help LLMs understand page entities
+        Robots (max 25): gatekeeper — blocked bots can't crawl at all
+        llms.txt (max 10): forward-looking signal, minimal real impact today
+
+    Rationale: When AI search engines (ChatGPT, Perplexity, Claude) look up
+    products or answer questions, they crawl pages and extract text content.
+    Content quality dominates what gets cited. Schema.org gives structured
+    "cheat sheets" (Product, Article, FAQ). Robots.txt is pass/fail per bot.
+    llms.txt is emerging but not yet weighted by any major AI search engine.
+    """
+    # Robots: max 25 — proportional to bots allowed
+    if robots.found and robots.bots:
+        allowed = sum(1 for b in robots.bots if b.allowed)
+        robots.score = round(25 * allowed / len(robots.bots), 1)
+    else:
+        robots.score = 0
+
+    # llms.txt: max 10
+    llms_txt.score = 10 if llms_txt.found else 0
+
+    # Schema: max 25 — reward high-value types more
+    if schema_org.blocks_found > 0:
+        unique_types = {s.schema_type for s in schema_org.schemas}
+        # Base 8 for having any JSON-LD, +5 per unique type, capped at 25
+        schema_org.score = min(25, 8 + 5 * len(unique_types))
+    else:
+        schema_org.score = 0
+
+    # Content: max 40 — word count tiers + structure bonuses
+    # Higher thresholds reflect that LLMs need substantial content to cite
+    score = 0
+    if content.word_count >= 1500:
+        score = 25
+    elif content.word_count >= 800:
+        score = 20
+    elif content.word_count >= 400:
+        score = 15
+    elif content.word_count >= 150:
+        score = 8
+    if content.has_headings:
+        score += 7  # structure matters a lot for LLM extraction
+    if content.has_lists:
+        score += 5  # lists are highly extractable by LLMs
+    if content.has_code_blocks:
+        score += 3  # relevant for technical content
+    content.score = min(40, score)
+
+    overall = robots.score + llms_txt.score + schema_org.score + content.score
+    return robots, llms_txt, schema_org, content, overall
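To make the weights concrete, a worked example with hand-built pillar reports (the constructor keywords follow the ones used elsewhere in this file; the numbers are hypothetical):

from aeo_cli.core.auditor import compute_scores
from aeo_cli.core.models import (
    BotAccessResult,
    ContentReport,
    LlmsTxtReport,
    RobotsReport,
    SchemaOrgResult,
    SchemaReport,
)

robots = RobotsReport(
    found=True,
    bots=[
        BotAccessResult(bot="GPTBot", allowed=True, detail="Allowed"),
        BotAccessResult(bot="ClaudeBot", allowed=False, detail="Blocked by robots.txt"),
    ],
)
llms = LlmsTxtReport(found=False)
schema = SchemaReport(
    blocks_found=1,
    schemas=[SchemaOrgResult(schema_type="Article", properties=["headline"])],
)
content = ContentReport(word_count=900, has_headings=True, has_lists=True)

*_, overall = compute_scores(robots, llms, schema, content)
# robots: 25 * 1/2 = 12.5    llms.txt: 0
# schema: 8 + 5 * 1 = 13     content: 20 + 7 + 5 = 32
print(overall)  # 57.5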
+
+
+# ── Orchestrator ──────────────────────────────────────────────────────────────
+
+
+def audit_page_content(html: str, markdown: str) -> tuple[SchemaReport, ContentReport]:
+    """Run page-specific checks (schema + content) on pre-crawled data."""
+    return check_schema_org(html), check_content(markdown)
+
+
+def _page_weight(url: str) -> int:
+    """Return a weight for a page based on URL depth.
+
+    Shallower pages (homepage, top-level sections) are more representative of a
+    site's AEO readiness and therefore receive higher weight in aggregation.
+
+    Returns:
+        3 for depth 0-1 (homepage or single path segment)
+        2 for depth 2
+        1 for depth 3+
+    """
+    path = urlparse(url).path.strip("/")
+    depth = len(path.split("/")) if path else 0
+    if depth <= 1:
+        return 3
+    if depth == 2:
+        return 2
+    return 1
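For illustration, the depth-to-weight mapping works out as follows:

from aeo_cli.core.auditor import _page_weight

print(_page_weight("https://example.com/"))          # 3 (depth 0, homepage)
print(_page_weight("https://example.com/docs"))      # 3 (depth 1)
print(_page_weight("https://example.com/docs/api"))  # 2 (depth 2)
print(_page_weight("https://example.com/a/b/c"))     # 1 (depth 3+)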
+
+
+def aggregate_page_scores(
+    pages: list[PageAudit],
+    robots: RobotsReport,
+    llms_txt: LlmsTxtReport,
+) -> tuple[SchemaReport, ContentReport, float]:
+    """Aggregate per-page scores into site-level pillar scores.
+
+    Content (40pts) and Schema (25pts) use weighted averages — shallower pages
+    count more (see ``_page_weight``).
+    Robots (25pts) and llms.txt (10pts) are site-wide, used as-is.
+    Word/char counts remain simple averages.
+    """
+    successful = [p for p in pages if not p.errors or p.content.word_count > 0]
+    if not successful:
+        agg_schema = SchemaReport(detail="No pages audited successfully")
+        agg_content = ContentReport(detail="No pages audited successfully")
+        return agg_schema, agg_content, robots.score + llms_txt.score
+
+    weights = [_page_weight(p.url) for p in successful]
+    total_weight = sum(weights)
+
+    # Aggregate schema: collect all blocks, weighted-average the score
+    all_schemas: list[SchemaOrgResult] = []
+    total_blocks = 0
+    schema_score_sum = 0.0
+    for p, w in zip(successful, weights):
+        all_schemas.extend(p.schema_org.schemas)
+        total_blocks += p.schema_org.blocks_found
+        schema_score_sum += p.schema_org.score * w
+
+    avg_schema_score = round(schema_score_sum / total_weight, 1)
+    agg_schema = SchemaReport(
+        blocks_found=total_blocks,
+        schemas=all_schemas,
+        score=avg_schema_score,
+        detail=(
+            f"{total_blocks} JSON-LD block(s) across {len(successful)} pages"
+            f" (weighted avg score {avg_schema_score})"
+        ),
+    )
+
+    # Aggregate content: weighted-average scores, simple-average metrics
+    content_score_sum = 0.0
+    word_sum = 0
+    char_sum = 0
+    any_headings = False
+    any_lists = False
+    any_code = False
+    for p, w in zip(successful, weights):
+        content_score_sum += p.content.score * w
+        word_sum += p.content.word_count
+        char_sum += p.content.char_count
+        any_headings = any_headings or p.content.has_headings
+        any_lists = any_lists or p.content.has_lists
+        any_code = any_code or p.content.has_code_blocks
+
+    n = len(successful)
+    avg_content_score = round(content_score_sum / total_weight, 1)
+    avg_words = word_sum // n
+    agg_content = ContentReport(
+        word_count=avg_words,
+        char_count=char_sum // n,
+        has_headings=any_headings,
+        has_lists=any_lists,
+        has_code_blocks=any_code,
+        score=avg_content_score,
+        detail=(
+            f"avg {avg_words} words across {n} pages"
+            f" (weighted avg score {avg_content_score})"
+        ),
+    )
+
+    overall = robots.score + llms_txt.score + avg_schema_score + avg_content_score
+    return agg_schema, agg_content, overall
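A sketch of the weighted aggregation with two hand-built page audits. The pillar scores are set directly rather than via compute_scores, purely to keep the arithmetic visible; the field names match the constructors above:

from aeo_cli.core.auditor import aggregate_page_scores
from aeo_cli.core.models import (
    ContentReport,
    LlmsTxtReport,
    PageAudit,
    RobotsReport,
    SchemaReport,
)

pages = [
    PageAudit(
        url="https://example.com/",          # depth 0 -> weight 3
        schema_org=SchemaReport(score=13),
        content=ContentReport(word_count=900, score=32),
    ),
    PageAudit(
        url="https://example.com/docs/api",  # depth 2 -> weight 2
        schema_org=SchemaReport(score=8),
        content=ContentReport(word_count=300, score=15),
    ),
]
robots = RobotsReport(found=True)
robots.score = 25
llms = LlmsTxtReport(found=False)
llms.score = 0

schema, content, overall = aggregate_page_scores(pages, robots, llms)
# schema:  (13*3 + 8*2) / 5 = 11.0
# content: (32*3 + 15*2) / 5 = 25.2
print(overall)  # 25 + 0 + 11.0 + 25.2 = 61.2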
+
+
+async def audit_url(url: str) -> AuditReport:
+    """Run a full AEO audit on a single URL. Returns AuditReport with all pillar scores."""
+    errors: list[str] = []
+
+    async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT, follow_redirects=True) as client:
+        # Run HTTP checks and browser crawl concurrently
+        robots_task = check_robots(url, client)
+        llms_task = check_llms_txt(url, client)
+        crawl_task = extract_page(url)
+
+        robots_result, llms_txt, crawl_result = await asyncio.gather(
+            robots_task, llms_task, crawl_task, return_exceptions=True
+        )
+
+        # Handle exceptions from gather
+        if isinstance(robots_result, BaseException):
+            errors.append(f"Robots check failed: {robots_result}")
+            robots = RobotsReport(found=False, detail="Check failed")
+        else:
+            robots, _raw_robots = robots_result  # destructure tuple
+
+        if isinstance(llms_txt, BaseException):
+            errors.append(f"llms.txt check failed: {llms_txt}")
+            llms_txt = LlmsTxtReport(found=False, detail="Check failed")
+        crawl: CrawlResult | None = None
+        if isinstance(crawl_result, BaseException):
+            errors.append(f"Crawl failed: {crawl_result}")
+        else:
+            crawl = crawl_result
+
+        # Run sync checks on crawl results
+        html = crawl.html if crawl and crawl.success else ""
+        markdown = crawl.markdown if crawl and crawl.success else ""
+
+        if crawl and not crawl.success and crawl.error:
+            errors.append(f"Crawl error: {crawl.error}")
+
+        schema_org = check_schema_org(html)
+        content = check_content(markdown)
+
+        # Compute scores
+        robots, llms_txt, schema_org, content, overall = compute_scores(
+            robots, llms_txt, schema_org, content
+        )
+
+        return AuditReport(
+            url=url,
+            overall_score=overall,
+            robots=robots,
+            llms_txt=llms_txt,
+            schema_org=schema_org,
+            content=content,
+            errors=errors,
+        )
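End to end, the single-page audit is a plain awaitable. A sketch of driving it from a script; this needs network access plus whatever browser machinery extract_page relies on:

import asyncio

from aeo_cli.core.auditor import audit_url

report = asyncio.run(audit_url("https://example.com"))
print(f"AEO score: {report.overall_score}/100")
print(f"  robots:   {report.robots.detail}")
print(f"  llms.txt: {report.llms_txt.detail}")
print(f"  schema:   {report.schema_org.detail}")
print(f"  content:  {report.content.detail}")
for err in report.errors:
    print(f"  warning: {err}")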
+
+
+SITE_AUDIT_TIMEOUT: int = 90
+
+
+async def audit_site(
+    url: str,
+    *,
+    max_pages: int = 10,
+    delay_seconds: float = 1.0,
+    progress_callback: Callable[[str], None] | None = None,
+) -> SiteAuditReport:
+    """Run a multi-page AEO audit. Discovers pages via sitemap/spider and aggregates scores."""
+    errors: list[str] = []
+    domain = urlparse(url).netloc
+
+    def _progress(msg: str) -> None:
+        if progress_callback:
+            progress_callback(msg)
+
+    try:
+        return await asyncio.wait_for(
+            _audit_site_inner(url, domain, max_pages, delay_seconds, errors, _progress),
+            timeout=SITE_AUDIT_TIMEOUT,
+        )
+    except asyncio.TimeoutError:
+        errors.append(f"Audit timed out after {SITE_AUDIT_TIMEOUT}s, returning partial results")
+        # Return whatever we have — the inner function stores partial results
+        return SiteAuditReport(
+            url=url,
+            domain=domain,
+            overall_score=0,
+            robots=RobotsReport(found=False, detail="Timed out"),
+            llms_txt=LlmsTxtReport(found=False, detail="Timed out"),
+            schema_org=SchemaReport(detail="Timed out"),
+            content=ContentReport(detail="Timed out"),
+            discovery=DiscoveryResult(method="timeout", detail="Timed out"),
+            errors=errors,
+        )
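The multi-page variant is driven the same way; progress_callback accepts any str consumer, so plain print works for a quick look:

import asyncio

from aeo_cli.core.auditor import audit_site

report = asyncio.run(
    audit_site("https://example.com", max_pages=5, progress_callback=print)
)
print(f"{report.domain}: {report.overall_score}/100")
print(f"pages audited: {report.pages_audited}, failed: {report.pages_failed}")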
+
+
+async def _audit_site_inner(
+    url: str,
+    domain: str,
+    max_pages: int,
+    delay_seconds: float,
+    errors: list[str],
+    progress: Callable[[str], None],
+) -> SiteAuditReport:
+    """Inner implementation of audit_site, wrapped with a timeout by the caller."""
+    progress("Running site-wide checks...")
+
+    async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT, follow_redirects=True) as client:
+        # Phase 1: Site-wide checks + seed crawl in parallel
+        robots_task = check_robots(url, client)
+        llms_task = check_llms_txt(url, client)
+        crawl_task = extract_page(url)
+
+        robots_result, llms_txt, seed_crawl = await asyncio.gather(
+            robots_task, llms_task, crawl_task, return_exceptions=True
+        )
+
+        # Unpack results
+        raw_robots: str | None = None
+        if isinstance(robots_result, BaseException):
+            errors.append(f"Robots check failed: {robots_result}")
+            robots = RobotsReport(found=False, detail="Check failed")
+        else:
+            robots, raw_robots = robots_result
+
+        if isinstance(llms_txt, BaseException):
+            errors.append(f"llms.txt check failed: {llms_txt}")
+            llms_txt = LlmsTxtReport(found=False, detail="Check failed")
+
+        seed: CrawlResult | None = None
+        if isinstance(seed_crawl, BaseException):
+            errors.append(f"Seed crawl failed: {seed_crawl}")
+        else:
+            seed = seed_crawl
+
+        # Phase 2: Discover pages
+        progress("Discovering pages...")
+        seed_links = None
+        if seed and seed.success:
+            seed_links = seed.internal_links
+
+        discovery = await discover_pages(
+            url,
+            client,
+            max_pages=max_pages,
+            robots_txt=raw_robots,
+            seed_links=seed_links,
+        )
+
+        # Phase 3: Audit seed page + batch crawl remaining pages
+        pages: list[PageAudit] = []
+
+        # Audit the seed page first
+        if seed and seed.success:
+            schema, content = audit_page_content(seed.html, seed.markdown)
+            # Score the page-level pillar checks
+            _, _, schema, content, _ = compute_scores(
+                RobotsReport(found=False), LlmsTxtReport(found=False), schema, content
+            )
+            pages.append(PageAudit(url=url, schema_org=schema, content=content))
+        elif seed and not seed.success:
+            errors.append(f"Seed crawl error: {seed.error}")
+
+        # Crawl remaining sampled pages (exclude seed which is already crawled)
+        remaining_urls = [u for u in discovery.urls_sampled if u != url]
+        if remaining_urls:
+            progress(f"Crawling {len(remaining_urls)} additional pages...")
+            crawl_results = await extract_pages(
+                remaining_urls, delay_seconds=delay_seconds
+            )
+
+            for i, result in enumerate(crawl_results):
+                progress(f"Auditing page {i + 2}/{len(discovery.urls_sampled)}...")
+                if result.success:
+                    schema, content = audit_page_content(result.html, result.markdown)
+                    _, _, schema, content, _ = compute_scores(
+                        RobotsReport(found=False), LlmsTxtReport(found=False), schema, content
+                    )
+                    pages.append(PageAudit(url=result.url, schema_org=schema, content=content))
+                else:
+                    pages.append(PageAudit(
+                        url=result.url,
+                        schema_org=SchemaReport(detail="Crawl failed"),
+                        content=ContentReport(detail="Crawl failed"),
+                        errors=[result.error or "Unknown crawl error"],
+                    ))
+
+        # Phase 4: Compute site-wide robot/llms scores
+        robots, llms_txt, _, _, _ = compute_scores(
+            robots, llms_txt, SchemaReport(), ContentReport()
+        )
+
+        # Phase 5: Aggregate
+        pages_failed = sum(1 for p in pages if p.errors)
+        agg_schema, agg_content, overall = aggregate_page_scores(pages, robots, llms_txt)
+
+        return SiteAuditReport(
+            url=url,
+            domain=domain,
+            overall_score=overall,
+            robots=robots,
+            llms_txt=llms_txt,
+            schema_org=agg_schema,
+            content=agg_content,
+            discovery=discovery,
+            pages=pages,
+            pages_audited=len(pages),
+            pages_failed=pages_failed,
+            errors=errors,
+        )
aeo_cli/core/cache.py
ADDED
@@ -0,0 +1,34 @@
+"""Simple in-memory cache for robots.txt during site audits."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from aeo_cli.core.models import RobotsReport
+
+
+@dataclass
+class RobotsCache:
+    """Cache robots.txt results to avoid re-fetching for every page in a site audit.
+
+    Keyed by domain (netloc). A single site audit typically only needs one entry,
+    but the design supports multi-domain usage if needed.
+    """
+
+    _store: dict[str, tuple[RobotsReport, str | None]] = field(default_factory=dict)
+
+    def get(self, domain: str) -> tuple[RobotsReport, str | None] | None:
+        """Return cached (RobotsReport, raw_text) or None if not cached."""
+        return self._store.get(domain)
+
+    def set(self, domain: str, report: RobotsReport, raw_text: str | None) -> None:
+        """Cache a robots.txt result for a domain."""
+        self._store[domain] = (report, raw_text)
+
+    def has(self, domain: str) -> bool:
+        """Check if a domain's robots.txt is cached."""
+        return domain in self._store
+
+    def clear(self) -> None:
+        """Clear the cache."""
+        self._store.clear()
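A short sketch of the intended usage, with a stub standing in for a real check_robots result:

from aeo_cli.core.cache import RobotsCache
from aeo_cli.core.models import RobotsReport

cache = RobotsCache()

if not cache.has("example.com"):
    # In a real site audit this entry would come from check_robots()
    stub = RobotsReport(found=True, detail="stub")
    cache.set("example.com", stub, "User-agent: *\nAllow: /")

cached = cache.get("example.com")
if cached is not None:
    report, raw_text = cached
    print(report.detail)
    print(raw_text)

cache.clear()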