@agentimization/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,2507 @@
1
+ // node_modules/@agentimization/shared/dist/types.js
2
+ import { z } from "zod";
3
// Every status value a single check result may carry (validated by CheckResultSchema).
var CHECK_STATUSES = ["pass", "warn", "fail", "skip", "info"];
// Category identifiers used to group checks; also the default `categories`
// list in DEFAULT_CONFIG.
var CHECK_CATEGORIES = [
  "content-discoverability",
  "markdown-availability",
  "content-structure",
  "page-size",
  "url-stability",
  "authentication",
  "geo-signals",
  "agent-protocols"
];
14
// zod schema for the object returned by a single check's `run`.
// `details`, `suggestion`, `score` and `metadata` are optional; when present,
// `score` must lie in [0, 1].
var CheckResultSchema = z.object({
  id: z.string(),
  name: z.string(),
  category: z.enum(CHECK_CATEGORIES),
  status: z.enum(CHECK_STATUSES),
  message: z.string(),
  details: z.string().optional(),
  suggestion: z.string().optional(),
  score: z.number().min(0).max(1).optional(),
  metadata: z.record(z.unknown()).optional()
});
25
// zod schema for a whole audit report: the list of check results plus
// aggregate counts and per-category scores. Overall and category scores are
// constrained to 0..100.
var AuditResultSchema = z.object({
  url: z.string(),
  timestamp: z.string(),
  overall_score: z.number().min(0).max(100),
  grade: z.enum(["A+", "A", "B", "C", "D", "F"]),
  checks: z.array(CheckResultSchema),
  // Counts of checks by outcome.
  summary: z.object({
    total: z.number(),
    passed: z.number(),
    warned: z.number(),
    failed: z.number(),
    skipped: z.number()
  }),
  // Record keyed by string — presumably the category id; verify against the producer.
  categories: z.record(z.object({
    score: z.number().min(0).max(100),
    checks: z.number(),
    passed: z.number()
  })),
  latency_ms: z.number()
});
45
// Fallback configuration used when a caller's config omits a value.
var DEFAULT_CONFIG = {
  // NOTE(review): presumably the number of pages sampled per audit — not used in the visible code.
  sampleSize: 10,
  // Request timeout in milliseconds (passed to setTimeout by the fetch helpers).
  timeout: 1e4,
  // Number of URLs fetchMany requests per batch.
  concurrency: 5,
  // Sent as the User-Agent header by the fetch helpers.
  userAgent: "Agentimization/0.1 (GEO Audit; +https://github.com/antlio/agentimization)",
  // Copy of the category list so callers cannot mutate CHECK_CATEGORIES through it.
  categories: [...CHECK_CATEGORIES],
  // No-op event callback.
  onEvent: () => {
  }
};
54
+
55
+ // src/checks/content-discoverability.ts
56
// Check: is an llms.txt available for the audited site/project?
var llmsTxtExists = {
  id: "llms-txt-exists",
  name: "llms.txt Exists",
  category: "content-discoverability",
  description: "Checks if llms.txt is present at the site root",
  weight: 1,
  /**
   * Passes when `ctx.llmsTxt` is truthy, fails otherwise.
   * @param {object} ctx - audit context; reads `llmsTxt`, `mode` and `baseUrl.origin`.
   * @returns {Promise<object>} check result.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-exists",
      name: "llms.txt Exists",
      category: "content-discoverability"
    };
    if (!ctx.llmsTxt) {
      return {
        ...header,
        status: "fail",
        message: "No llms.txt found at site root",
        suggestion: "Create a /llms.txt file that describes your site for AI agents. See https://llmstxt.org for the specification."
      };
    }
    // Local audits have no meaningful origin, so the wording differs.
    let message;
    if (ctx.mode === "local") {
      message = "llms.txt found in project root";
    } else {
      message = `llms.txt found at ${ctx.baseUrl.origin}/llms.txt`;
    }
    return { ...header, status: "pass", message };
  }
};
82
// Check: does llms.txt contain the structural elements of the proposed format?
var llmsTxtValid = {
  id: "llms-txt-valid",
  name: "llms.txt Valid Structure",
  category: "content-discoverability",
  description: "Checks if llms.txt follows the proposed structure (H1, blockquote, heading-delimited link sections)",
  weight: 0.8,
  /**
   * Looks for an H1 line, a blockquote line, a `##` heading line and at least
   * one markdown link. No issues -> pass, one issue -> warn, more -> fail.
   * @param {object} ctx - audit context; reads `llmsTxt`.
   * @returns {Promise<object>} check result.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-valid",
      name: "llms.txt Valid Structure",
      category: "content-discoverability"
    };
    const text = ctx.llmsTxt;
    if (!text) {
      return { ...header, status: "skip", message: "Skipped \u2014 no llms.txt found" };
    }
    const rows = text.split("\n");
    const anyRowMatches = (pattern) => rows.some((row) => pattern.test(row));
    // [issue label, requirement satisfied?] — order determines message order.
    const requirements = [
      ["Missing H1 title", anyRowMatches(/^#\s+/)],
      ["Missing blockquote description", anyRowMatches(/^>\s+/)],
      ["Missing ## section headings", anyRowMatches(/^##\s+/)],
      ["No markdown links found", /\[.+\]\(.+\)/.test(text)]
    ];
    const issues = requirements.filter(([, satisfied]) => !satisfied).map(([label]) => label);
    if (issues.length > 0) {
      return {
        ...header,
        status: issues.length <= 1 ? "warn" : "fail",
        message: `llms.txt structure issues: ${issues.join(", ")}`,
        suggestion: "Follow the llms.txt spec: start with # Title, > Description, then ## Sections with [links](url)."
      };
    }
    return {
      ...header,
      status: "pass",
      message: "llms.txt follows the proposed structure (H1, blockquote, heading-delimited link sections)"
    };
  }
};
127
// Check: is llms.txt at or below 50,000 characters?
var llmsTxtSize = {
  id: "llms-txt-size",
  name: "llms.txt Size",
  category: "content-discoverability",
  description: "Checks if llms.txt is under the 50,000 character threshold",
  weight: 0.5,
  /**
   * Compares `ctx.llmsTxt.length` (UTF-16 code units) against 50,000.
   * @param {object} ctx - audit context; reads `llmsTxt`.
   * @returns {Promise<object>} check result; `metadata.size` holds the length.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-size",
      name: "llms.txt Size",
      category: "content-discoverability"
    };
    if (!ctx.llmsTxt) {
      return { ...header, status: "skip", message: "Skipped \u2014 no llms.txt found" };
    }
    const size = ctx.llmsTxt.length;
    const formatted = size.toLocaleString();
    if (size > 5e4) {
      return {
        ...header,
        status: "warn",
        message: `llms.txt is ${formatted} characters (over 50,000 threshold)`,
        suggestion: "Consider splitting into llms.txt (summary) and llms-full.txt (complete reference).",
        metadata: { size }
      };
    }
    return {
      ...header,
      status: "pass",
      message: `llms.txt is ${formatted} characters (under 50,000 threshold)`,
      metadata: { size }
    };
  }
};
165
// Check: how well do the links in llms.txt and the sitemap URLs overlap?
var llmsTxtFreshness = {
  id: "llms-txt-freshness",
  name: "llms.txt Coverage",
  category: "content-discoverability",
  description: "Checks how many sitemap pages are referenced in llms.txt",
  weight: 0.7,
  /**
   * Normalises both link sets to lower-cased paths (trailing slashes and
   * .md/.mdx/.markdown extensions removed) and reports two percentages:
   * coverage (sitemap pages linked from llms.txt) and freshness (llms.txt
   * links present in the sitemap).
   * @param {object} ctx - audit context; reads `llmsTxt`, `sitemapUrls`, `baseUrl`.
   * @returns {Promise<object>} check result.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-freshness",
      name: "llms.txt Coverage",
      category: "content-discoverability"
    };
    if (!ctx.llmsTxt) {
      return { ...header, status: "skip", message: "Skipped \u2014 no llms.txt found" };
    }
    if (ctx.sitemapUrls.length === 0) {
      return { ...header, status: "info", message: "No sitemap found to compare coverage against" };
    }

    // Hosts count as the same site when identical or when their last two labels agree.
    const lastTwoLabels = (labels) => labels.slice(-2).join(".");
    const sameSite = (hostA, hostB) => {
      if (hostA === hostB) return true;
      const labelsA = hostA.split(".");
      const labelsB = hostB.split(".");
      if (labelsA.length < 2 || labelsB.length < 2) return false;
      return lastTwoLabels(labelsA) === lastTwoLabels(labelsB);
    };

    // Comparable key for a URL, or null when it is unparsable or off-site.
    const toKey = (raw) => {
      try {
        const parsed = new URL(raw, ctx.baseUrl.origin);
        if (!sameSite(parsed.hostname, ctx.baseUrl.hostname)) return null;
        const { pathname } = parsed;
        const withoutSlash = pathname.length > 1 ? pathname.replace(/\/+$/, "") : pathname;
        return withoutSlash.replace(/\.(md|mdx|markdown)$/i, "").toLowerCase();
      } catch {
        return null;
      }
    };
    const toKeySet = (rawUrls) => {
      const keys = /* @__PURE__ */ new Set();
      for (const raw of rawUrls) {
        const key = toKey(raw);
        if (key) keys.add(key);
      }
      return keys;
    };

    const linkTargets = Array.from(ctx.llmsTxt.matchAll(/\[.+?\]\(([^)]+)\)/g), (hit) => hit[1]);
    const llmsKeys = toKeySet(linkTargets);
    const sitemapKeys = toKeySet(ctx.sitemapUrls);
    if (sitemapKeys.size === 0 || llmsKeys.size === 0) {
      return {
        ...header,
        status: "skip",
        message: "Not enough same-origin pages to compare llms.txt and sitemap"
      };
    }

    let llmsInSitemap = 0;
    for (const key of llmsKeys) {
      if (sitemapKeys.has(key)) llmsInSitemap += 1;
    }
    let sitemapInLlms = 0;
    for (const key of sitemapKeys) {
      if (llmsKeys.has(key)) sitemapInLlms += 1;
    }
    const freshnessPct = Math.round(llmsInSitemap / llmsKeys.size * 100);
    const coveragePct = Math.round(sitemapInLlms / sitemapKeys.size * 100);
    const message = `llms.txt covers ${coveragePct}% of ${sitemapKeys.size} sitemap pages; ${freshnessPct}% of llms.txt links resolve in sitemap`;
    const counts = { coveragePct, freshnessPct, llmsCount: llmsKeys.size, sitemapCount: sitemapKeys.size };
    if (coveragePct >= 70 && freshnessPct >= 90) {
      return { ...header, status: "pass", message, metadata: counts };
    }

    const missingFromLlms = sitemapKeys.size - sitemapInLlms;
    const staleInLlms = llmsKeys.size - llmsInSitemap;
    const missingNote = missingFromLlms > 0 ? ` \xB7 ${missingFromLlms} sitemap pages not in llms.txt` : "";
    const staleNote = staleInLlms > 0 ? ` \xB7 ${staleInLlms} llms.txt links not in sitemap` : "";
    const acceptable = coveragePct >= 40 || freshnessPct >= 70;
    return {
      ...header,
      status: acceptable ? "warn" : "fail",
      message: `${message}${missingNote}${staleNote}`,
      suggestion: coveragePct < freshnessPct ? "Add missing sitemap pages to llms.txt to improve AI agent discoverability." : "Some llms.txt links aren't in the sitemap \u2014 they may be stale or your sitemap may be incomplete.",
      metadata: { ...counts, missingFromLlms, staleInLlms }
    };
  }
};
264
// Check: do the same-origin links listed in llms.txt respond successfully?
var llmsTxtLinksResolve = {
  id: "llms-txt-links-resolve",
  name: "llms.txt Links Resolve",
  category: "content-discoverability",
  description: "Checks if links in llms.txt return 200 OK",
  weight: 0.8,
  requiresNetwork: true,
  /**
   * Sends a HEAD request to up to 10 same-origin links from llms.txt and
   * counts those answering with a 2xx/3xx status (redirects are followed).
   * Each request is aborted after DEFAULT_CONFIG.timeout milliseconds so one
   * unresponsive server cannot stall the whole audit; an aborted or failed
   * request counts as a broken link.
   * @param {object} ctx - audit context; reads `llmsTxt` and `baseUrl.origin`.
   * @returns {Promise<object>} check result with `resolved`/`sampled`/`total` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-links-resolve",
      name: "llms.txt Links Resolve",
      category: "content-discoverability"
    };
    if (!ctx.llmsTxt) {
      return { ...header, status: "skip", message: "Skipped \u2014 no llms.txt found" };
    }
    const urls = [];
    for (const hit of ctx.llmsTxt.matchAll(/\[.+?\]\(([^)]+)\)/g)) {
      try {
        const target = new URL(hit[1], ctx.baseUrl.origin);
        if (target.origin === ctx.baseUrl.origin) {
          urls.push(target.href);
        }
      } catch {
        // Unparsable link targets are not candidates for probing.
      }
    }
    if (urls.length === 0) {
      return { ...header, status: "info", message: "No same-origin links found in llms.txt" };
    }

    // One HEAD probe with the same timeout/User-Agent defaults the file's fetch helpers use.
    const probe = async (url) => {
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), DEFAULT_CONFIG.timeout);
      try {
        const resp = await fetch(url, {
          method: "HEAD",
          redirect: "follow",
          headers: { "User-Agent": DEFAULT_CONFIG.userAgent },
          signal: controller.signal
        });
        return { url, status: resp.status };
      } finally {
        clearTimeout(timer);
      }
    };

    const sampled = urls.slice(0, 10);
    const outcomes = await Promise.allSettled(sampled.map(probe));
    const resolved = outcomes.filter(
      (o) => o.status === "fulfilled" && o.value.status >= 200 && o.value.status < 400
    ).length;
    const metadata = { resolved, sampled: sampled.length, total: urls.length };
    if (resolved === sampled.length) {
      return {
        ...header,
        status: "pass",
        message: `All ${resolved} sampled same-origin links resolve (${urls.length} total links)`,
        metadata
      };
    }
    return {
      ...header,
      status: "fail",
      message: `${resolved}/${sampled.length} sampled links resolve \u2014 ${sampled.length - resolved} broken`,
      suggestion: "Fix broken links in llms.txt. AI agents will fail to fetch these pages.",
      metadata
    };
  }
};
333
// Check: what share of llms.txt links point at markdown documents?
var llmsTxtLinksMarkdown = {
  id: "llms-txt-links-markdown",
  name: "llms.txt Links Markdown",
  category: "content-discoverability",
  description: "Checks how many links in llms.txt point to .md URLs (or markdown-able paths)",
  weight: 0.6,
  /**
   * Counts links whose path ends in .md, .mdx or .markdown.
   * >= 80% -> pass, >= 30% -> warn, otherwise fail.
   * @param {object} ctx - audit context; reads `llmsTxt` and `baseUrl.origin`.
   * @returns {Promise<object>} check result with `mdLinks`/`total`/`pct` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "llms-txt-links-markdown",
      name: "llms.txt Links Markdown",
      category: "content-discoverability"
    };
    if (!ctx.llmsTxt) {
      return { ...header, status: "skip", message: "Skipped \u2014 no llms.txt found" };
    }
    const urls = Array.from(ctx.llmsTxt.matchAll(/\[.+?\]\(([^)]+)\)/g), (hit) => hit[1]);
    if (urls.length === 0) {
      return { ...header, status: "info", message: "No links found in llms.txt" };
    }
    const markdownExt = /\.(md|mdx|markdown)$/i;
    const isMd = (u) => {
      try {
        // Test the pathname only, so query strings and fragments do not hide the extension.
        return markdownExt.test(new URL(u, ctx.baseUrl.origin).pathname);
      } catch {
        // Unparsable target: fall back to the raw text, accepting the same
        // three extensions as the parsed path above.
        return markdownExt.test(u);
      }
    };
    const mdLinks = urls.filter(isMd).length;
    const pct = Math.round(mdLinks / urls.length * 100);
    const metadata = { mdLinks, total: urls.length, pct };
    const ratio = `${mdLinks}/${urls.length} llms.txt links point to .md URLs (${pct}%)`;
    if (pct >= 80) {
      return { ...header, status: "pass", message: ratio, metadata };
    }
    if (pct >= 30) {
      return {
        ...header,
        status: "warn",
        message: ratio,
        suggestion: "Point llms.txt links to .md URLs (or markdown-able paths) so agents fetch parseable content directly instead of HTML they have to scrape.",
        metadata
      };
    }
    return {
      ...header,
      status: "fail",
      message: `Only ${ratio}`,
      suggestion: "Most llms.txt links are HTML-only. Serve a markdown version at .md URLs and link to those \u2014 agents get cleaner content and fewer parse failures.",
      metadata
    };
  }
};
407
// Check: was a sitemap.xml found?
var sitemapExists = {
  id: "sitemap-exists",
  name: "Sitemap Exists",
  category: "content-discoverability",
  description: "Checks if sitemap.xml is present",
  weight: 0.7,
  /**
   * Passes when `ctx.sitemapXml` is truthy and reports how many URLs were
   * parsed from it (`ctx.sitemapUrls.length`).
   * @param {object} ctx - audit context; reads `sitemapXml` and `sitemapUrls`.
   * @returns {Promise<object>} check result.
   */
  run: async (ctx) => {
    const header = {
      id: "sitemap-exists",
      name: "Sitemap Exists",
      category: "content-discoverability"
    };
    if (!ctx.sitemapXml) {
      return {
        ...header,
        status: "fail",
        message: "No sitemap.xml found",
        suggestion: "Create a /sitemap.xml to help AI agents discover all pages on your site."
      };
    }
    const urlCount = ctx.sitemapUrls.length;
    return {
      ...header,
      status: "pass",
      message: `sitemap.xml found with ${urlCount} URLs`,
      metadata: { urlCount }
    };
  }
};
434
// Check: does robots.txt explicitly block (or allow) known AI crawlers?
var robotsTxtAgentRules = {
  id: "robots-txt-agent-rules",
  name: "robots.txt AI Agent Rules",
  category: "content-discoverability",
  description: "Checks if robots.txt has specific rules for AI agents/crawlers",
  weight: 0.6,
  /**
   * Parses robots.txt into user-agent groups and records which known AI
   * agents have a site-wide `Disallow: /` (blocked) or an `Allow: /...` rule
   * (allowed). Rules under the `*` wildcard are not attributed to any agent.
   *
   * Parsing details:
   * - user-agent tokens are compared case-insensitively;
   * - consecutive `User-agent` lines form one group, so a rule applies to
   *   every agent named in it;
   * - `#` comments and surrounding whitespace are ignored;
   * - each agent is reported at most once per list, using the spelling from
   *   the known-agent list.
   * @param {object} ctx - audit context; reads `robotsTxt`.
   * @returns {Promise<object>} check result with `blocked`/`allowed` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "robots-txt-agent-rules",
      name: "robots.txt AI Agent Rules",
      category: "content-discoverability"
    };
    if (!ctx.robotsTxt) {
      return {
        ...header,
        status: "warn",
        message: "No robots.txt found",
        suggestion: "Create a /robots.txt. Without it, AI agents may not know what they're allowed to crawl."
      };
    }
    const aiAgents = [
      "GPTBot",
      "ChatGPT-User",
      "Claude-Web",
      "ClaudeBot",
      "Anthropic",
      "Google-Extended",
      "PerplexityBot",
      "Bytespider",
      "CCBot",
      "cohere-ai"
    ];
    const canonicalByLower = new Map(aiAgents.map((agent) => [agent.toLowerCase(), agent]));
    const blocked = [];
    const allowed = [];
    const addOnce = (list, agents) => {
      for (const agent of agents) {
        if (!list.includes(agent)) list.push(agent);
      }
    };

    let groupAgents = []; // known AI agents named by the current group
    let groupHasRules = false; // true once the current group has seen an allow/disallow line
    for (const rawLine of ctx.robotsTxt.split(/\r?\n/)) {
      const line = rawLine.replace(/#.*$/, "").trim();
      if (line === "") continue;

      const agentMatch = line.match(/^User-agent\s*:\s*(.+)$/i);
      if (agentMatch) {
        // A User-agent line following rules starts a new group; otherwise it
        // extends the list of agents the upcoming rules apply to.
        if (groupHasRules) {
          groupAgents = [];
          groupHasRules = false;
        }
        const known = canonicalByLower.get(agentMatch[1].trim().toLowerCase());
        if (known) groupAgents.push(known);
        continue;
      }

      const ruleMatch = line.match(/^(allow|disallow)\s*:\s*(.*)$/i);
      if (!ruleMatch) continue; // other records (Sitemap, Crawl-delay, ...) do not end a group
      groupHasRules = true;
      const directive = ruleMatch[1].toLowerCase();
      const path = ruleMatch[2].trim();
      if (directive === "disallow" && path === "/") {
        addOnce(blocked, groupAgents);
      } else if (directive === "allow" && path.startsWith("/")) {
        addOnce(allowed, groupAgents);
      }
    }

    if (blocked.length > 0) {
      return {
        ...header,
        status: "warn",
        message: `AI agents blocked: ${blocked.join(", ")}`,
        suggestion: "Consider allowing AI agents to crawl your site for better GEO visibility. Blocked agents can't index your content for AI-powered search.",
        metadata: { blocked, allowed }
      };
    }
    return {
      ...header,
      status: "pass",
      message: `robots.txt present. No AI agents explicitly blocked.${allowed.length > 0 ? ` Explicitly allowed: ${allowed.join(", ")}` : ""}`,
      metadata: { blocked, allowed }
    };
  }
};
504
// All checks in the "content-discoverability" category, collected in one list.
var contentDiscoverabilityChecks = [
  llmsTxtExists,
  llmsTxtValid,
  llmsTxtSize,
  llmsTxtFreshness,
  llmsTxtLinksResolve,
  llmsTxtLinksMarkdown,
  sitemapExists,
  robotsTxtAgentRules
];
514
+
515
+ // src/utils/fetch.ts
516
/**
 * Builds the request headers used by fetchPage.
 * @param {object} config - may carry `userAgent`; the default is used when it is null/undefined.
 * @returns {object} headers with `User-Agent` and a broad `Accept` list.
 */
var makeHeaders = (config) => {
  const userAgent = config.userAgent ?? DEFAULT_CONFIG.userAgent;
  return {
    "User-Agent": userAgent,
    Accept: "text/html,application/xhtml+xml,text/markdown,text/plain,*/*"
  };
};
520
/**
 * Fetches a URL (following redirects) and returns its body and response data.
 * The request is aborted after `config.timeout` ms (default
 * DEFAULT_CONFIG.timeout). Network errors and aborts reject; callers that
 * want a null result instead use fetchText.
 * @param {string} url - address to request.
 * @param {object} [config] - optional `timeout` and `userAgent`.
 * @returns {Promise<{url: string, html: string, statusCode: number, headers: object, fetchTime: number}>}
 */
var fetchPage = async (url, config = {}) => {
  const timeoutMs = config.timeout ?? DEFAULT_CONFIG.timeout;
  const startedAt = Date.now();
  const aborter = new AbortController();
  const timeoutId = setTimeout(() => aborter.abort(), timeoutMs);
  try {
    const requestInit = {
      headers: makeHeaders(config),
      signal: aborter.signal,
      redirect: "follow"
    };
    const response = await fetch(url, requestInit);
    // The body is read while the timer is still armed, so a stalled body also times out.
    const html = await response.text();
    const headers = {};
    response.headers.forEach((value, key) => {
      headers[key] = value;
    });
    const fetchTime = Date.now() - startedAt;
    return { url, html, statusCode: response.status, headers, fetchTime };
  } finally {
    clearTimeout(timeoutId);
  }
};
547
/**
 * Non-throwing wrapper around fetchPage.
 * @param {string} url - address to request.
 * @param {object} [config] - forwarded to fetchPage.
 * @returns {Promise<{text: string, statusCode: number, headers: object} | null>}
 *   null when fetchPage rejects for any reason.
 */
var fetchText = async (url, config = {}) => {
  let page;
  try {
    page = await fetchPage(url, config);
  } catch {
    return null;
  }
  const { html: text, statusCode, headers } = page;
  return { text, statusCode, headers };
};
555
/**
 * Requests a URL with a caller-chosen Accept header.
 * @param {string} url - address to request.
 * @param {string} accept - value for the Accept header.
 * @param {object} [config] - optional `timeout` (ms) and `userAgent`.
 * @returns {Promise<{text: string, statusCode: number, contentType: string} | null>}
 *   null on any network error or timeout; `contentType` is "" when the
 *   response has no content-type header.
 */
var fetchWithContentNegotiation = async (url, accept, config = {}) => {
  const timeoutMs = config.timeout ?? DEFAULT_CONFIG.timeout;
  const aborter = new AbortController();
  const timeoutId = setTimeout(() => aborter.abort(), timeoutMs);
  const requestHeaders = {
    "User-Agent": config.userAgent ?? DEFAULT_CONFIG.userAgent,
    Accept: accept
  };
  try {
    const response = await fetch(url, {
      headers: requestHeaders,
      signal: aborter.signal,
      redirect: "follow"
    });
    const text = await response.text();
    const contentType = response.headers.get("content-type") ?? "";
    return { text, statusCode: response.status, contentType };
  } catch {
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
};
580
/**
 * Fetches many URLs in consecutive batches of `config.concurrency`
 * (default DEFAULT_CONFIG.concurrency); each batch runs in parallel.
 * URLs whose fetch rejects are omitted from the result.
 * @param {string[]} urls - addresses to request.
 * @param {object} [config] - forwarded to fetchPage; `concurrency` sets the batch size.
 * @returns {Promise<object[]>} successful fetchPage results in request order.
 */
var fetchMany = async (urls, config = {}) => {
  const batchSize = config.concurrency ?? DEFAULT_CONFIG.concurrency;
  const pages = [];
  let offset = 0;
  while (offset < urls.length) {
    const batch = urls.slice(offset, offset + batchSize);
    const outcomes = await Promise.allSettled(batch.map((url) => fetchPage(url, config)));
    for (const outcome of outcomes) {
      if (outcome.status !== "fulfilled") continue;
      pages.push(outcome.value);
    }
    offset += batchSize;
  }
  return pages;
};
596
+
597
+ // src/checks/markdown-availability.ts
598
// Check: do sampled pages have a markdown twin at `<path>.md`?
var markdownUrlSupport = {
  id: "markdown-url-support",
  name: "Markdown URL Support",
  category: "markdown-availability",
  description: "Checks if pages serve markdown when .md is appended to the URL",
  weight: 0.8,
  requiresNetwork: true,
  /**
   * For up to 10 sampled pages, requests the page path with `.md` appended
   * and counts responses with status 200 and more than 50 characters of body.
   * All supported -> pass, some -> warn, none -> fail.
   *
   * The `.md` suffix is applied to the URL *path* only: query string and
   * fragment are preserved, and a site-root URL maps to `/index.md` rather
   * than having `.md` glued onto the hostname.
   * @param {object} ctx - audit context; reads `sampledPages[].url`.
   * @returns {Promise<object>} check result with `supported`/`total` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "markdown-url-support",
      name: "Markdown URL Support",
      category: "markdown-availability"
    };
    const pages = ctx.sampledPages.slice(0, 10);
    if (pages.length === 0) {
      return { ...header, status: "skip", message: "No pages sampled" };
    }

    const toMarkdownUrl = (pageUrl) => {
      try {
        const parsed = new URL(pageUrl);
        const trimmedPath = parsed.pathname.replace(/\/+$/, "");
        // NOTE(review): `/index.md` is the chosen convention for the site root — confirm against target sites.
        parsed.pathname = trimmedPath === "" ? "/index.md" : `${trimmedPath}.md`;
        return parsed.href;
      } catch {
        // Not an absolute URL: keep the plain string transformation.
        return pageUrl.replace(/\/?$/, ".md");
      }
    };

    let supported = 0;
    for (const page of pages) {
      const result = await fetchWithContentNegotiation(toMarkdownUrl(page.url), "text/markdown");
      if (result && result.statusCode === 200 && result.text.length > 50) {
        supported++;
      }
    }
    const pct = Math.round(supported / pages.length * 100);
    const message = `${supported}/${pages.length} sampled pages support .md URLs (${pct}%)`;
    const metadata = { supported, total: pages.length };
    if (supported === pages.length) {
      return { ...header, status: "pass", message, metadata };
    }
    return {
      ...header,
      status: supported > 0 ? "warn" : "fail",
      message,
      suggestion: "Serve markdown versions of pages at {url}.md \u2014 this makes content easily consumable by AI agents without HTML parsing.",
      metadata
    };
  }
};
646
// Check: do sampled pages answer `Accept: text/markdown` with a text format?
var contentNegotiation = {
  id: "content-negotiation",
  name: "Content Negotiation",
  category: "markdown-availability",
  description: "Checks if pages serve markdown when Accept: text/markdown is sent",
  weight: 0.7,
  requiresNetwork: true,
  /**
   * Requests up to 10 sampled pages with `Accept: text/markdown` and counts
   * 200 responses whose content type contains text/markdown or text/plain.
   * All supported -> pass, some -> warn, none -> info.
   * @param {object} ctx - audit context; reads `sampledPages[].url`.
   * @returns {Promise<object>} check result with `supported`/`total` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "content-negotiation",
      name: "Content Negotiation",
      category: "markdown-availability"
    };
    const pages = ctx.sampledPages.slice(0, 10);
    const total = pages.length;
    if (total === 0) {
      return { ...header, status: "skip", message: "No pages sampled" };
    }
    const servesText = (reply) => {
      if (!reply || reply.statusCode !== 200) return false;
      return reply.contentType.includes("text/markdown") || reply.contentType.includes("text/plain");
    };
    let supported = 0;
    // Requests are issued one at a time, in sample order.
    for (const { url } of pages) {
      const reply = await fetchWithContentNegotiation(url, "text/markdown");
      if (servesText(reply)) supported += 1;
    }
    const pct = Math.round(supported / total * 100);
    const message = `${supported}/${total} sampled pages support content negotiation (${pct}%)`;
    const metadata = { supported, total };
    if (supported === total) {
      return { ...header, status: "pass", message, metadata };
    }
    return {
      ...header,
      status: supported > 0 ? "warn" : "info",
      message,
      suggestion: "Implement content negotiation: when an AI agent sends Accept: text/markdown, respond with a markdown version of the page.",
      metadata
    };
  }
};
693
// Check: does each page's markdown version carry the same words as its HTML?
var markdownContentParity = {
  id: "markdown-content-parity",
  name: "Markdown Content Parity",
  category: "markdown-availability",
  description: "Checks if markdown versions contain equivalent content to HTML versions",
  weight: 0.6,
  requiresNetwork: true,
  /**
   * For up to 10 sampled pages that have a `markdown` version, computes the
   * share of distinct HTML words (longer than 3 characters) absent from the
   * markdown, then averages that share over the pages.
   * avg <= 5% -> pass, <= 15% -> warn, otherwise fail.
   *
   * Words are runs of letters/digits, so punctuation and markdown markup
   * (`**bold**`, `[text](url)`, trailing commas, ...) do not make otherwise
   * identical words look different.
   * @param {object} ctx - audit context; reads `sampledPages[].html` and `.markdown`.
   * @returns {Promise<object>} check result with `checked`/`avgMissing` metadata.
   */
  run: async (ctx) => {
    const header = {
      id: "markdown-content-parity",
      name: "Markdown Content Parity",
      category: "markdown-availability"
    };
    const pagesWithMarkdown = ctx.sampledPages.slice(0, 10).filter((p) => p.markdown);
    if (pagesWithMarkdown.length === 0) {
      return { ...header, status: "skip", message: "No pages with markdown versions found" };
    }

    // Distinct lower-cased alphanumeric words longer than 3 characters.
    const significantWords = (text) => new Set(
      (text.toLowerCase().match(/[\p{L}\p{N}]+/gu) ?? []).filter((w) => w.length > 3)
    );
    // Visible text of an HTML document: scripts, styles, tags and entities removed.
    const visibleText = (html) => html
      .replace(/<script[\s\S]*?<\/script>/gi, "")
      .replace(/<style[\s\S]*?<\/style>/gi, "")
      .replace(/<[^>]+>/g, " ")
      .replace(/&#?\w+;/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    let totalMissing = 0;
    for (const page of pagesWithMarkdown) {
      const htmlWords = significantWords(visibleText(page.html));
      if (htmlWords.size === 0) continue; // nothing to miss: contributes 0%
      const mdWords = significantWords(page.markdown);
      let missing = 0;
      for (const word of htmlWords) {
        if (!mdWords.has(word)) missing += 1;
      }
      totalMissing += missing / htmlWords.size * 100;
    }
    const checked = pagesWithMarkdown.length;
    const avgMissing = Math.round(totalMissing / checked);
    if (avgMissing <= 5) {
      return {
        ...header,
        status: "pass",
        message: `All ${checked} pages have equivalent markdown and HTML content (avg ${avgMissing}% missing)`,
        metadata: { checked, avgMissing }
      };
    }
    return {
      ...header,
      status: avgMissing <= 15 ? "warn" : "fail",
      message: `Markdown versions are missing ~${avgMissing}% of HTML content on average`,
      suggestion: "Ensure markdown versions include all meaningful content from the HTML page. Missing content means AI agents get an incomplete picture.",
      metadata: { checked, avgMissing }
    };
  }
};
746
// All checks in the "markdown-availability" category, collected in one list.
var markdownAvailabilityChecks = [
  markdownUrlSupport,
  contentNegotiation,
  markdownContentParity
];
751
+
752
+ // src/utils/html.ts
753
/**
 * Reduces an HTML string to its text: <script> and <style> elements are
 * dropped, remaining tags become spaces, whitespace runs collapse to one
 * space and the result is trimmed. Entities are left as written.
 * @param {string} html - markup to strip.
 * @returns {string} plain text.
 */
var stripHtml = (html) => {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
};
754
/**
 * Collects the href of every `<a ...>` tag, resolved against `baseUrl`.
 * Hrefs that cannot be resolved to a URL are skipped.
 * @param {string} html - markup to scan.
 * @param {string|URL} baseUrl - base for relative hrefs.
 * @returns {string[]} absolute URLs in document order (duplicates kept).
 */
var extractLinks = (html, baseUrl) => {
  const resolveHref = (href) => {
    try {
      return new URL(href, baseUrl).href;
    } catch {
      return null;
    }
  };
  const found = [];
  for (const [, href] of html.matchAll(/<a[^>]+href=["']([^"']+)["']/gi)) {
    const absolute = resolveHref(href);
    if (absolute !== null) found.push(absolute);
  }
  return found;
};
767
/**
 * Reads `<meta>` tags into a `{ nameOrProperty: content }` map. Tags are
 * recognised in both attribute orders; the content-first pass runs second,
 * so it wins when the same key is found by both passes.
 * @param {string} html - markup to scan.
 * @returns {object} map of meta name/property to content.
 */
var extractMetaTags = (html) => {
  const tags = {};
  const passes = [
    // name/property before content: capture 1 is the key, 2 the value.
    { pattern: /<meta[^>]+(?:name|property)=["']([^"']+)["'][^>]+content=["']([^"']+)["']/gi, keyAt: 1, valueAt: 2 },
    // content before name/property: capture 2 is the key, 1 the value.
    { pattern: /<meta[^>]+content=["']([^"']+)["'][^>]+(?:name|property)=["']([^"']+)["']/gi, keyAt: 2, valueAt: 1 }
  ];
  for (const { pattern, keyAt, valueAt } of passes) {
    for (const hit of html.matchAll(pattern)) {
      tags[hit[keyAt]] = hit[valueAt];
    }
  }
  return tags;
};
780
/**
 * Parses every `<script type="application/ld+json">` block in the markup.
 * Blocks whose content is not valid JSON are skipped.
 * @param {string} html - markup to scan.
 * @returns {Array} parsed JSON-LD values in document order.
 */
var extractJsonLd = (html) => {
  const scriptPattern = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const documents = [];
  for (const [, payload] of html.matchAll(scriptPattern)) {
    try {
      documents.push(JSON.parse(payload));
    } catch {
      // Malformed JSON-LD is ignored.
    }
  }
  return documents;
};
792
/**
 * Lists the h1–h6 headings of a document.
 * @param {string} html - markup to scan.
 * @returns {{level: number, text: string}[]} headings in document order,
 *   with inner markup removed from `text` via stripHtml.
 */
var extractHeadings = (html) => {
  // \1 forces the closing tag to carry the same level as the opening tag.
  const headingPattern = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
  return Array.from(html.matchAll(headingPattern), ([, level, inner]) => ({
    level: parseInt(level, 10),
    text: stripHtml(inner).trim()
  }));
};
804
/**
 * Heuristic: does the raw HTML carry more than 100 characters of text once
 * scripts, styles and tags are removed?
 * @param {string} html - markup as delivered by the server.
 * @returns {boolean} true when the stripped text is longer than 100 characters.
 */
var hasServerRenderedContent = (html) => {
  const MIN_TEXT_LENGTH = 100;
  const markupOnly = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "");
  return stripHtml(markupOnly).length > MIN_TEXT_LENGTH;
};
809
/**
 * Estimates where the main content starts, as a fraction (0..1) of the
 * document length. Landmarks are tried in priority order: <main>, <article>,
 * id="content", id="main", a class containing "content", role="main", then
 * the first <p>.
 * @param {string} html - markup to scan.
 * @returns {number} offset of the first matching landmark divided by the
 *   document length, or 0.5 when no landmark is found.
 */
var findContentStartPosition = (html) => {
  const landmarks = [
    /<main[\s>]/i,
    /<article[\s>]/i,
    /id=["']content["']/i,
    /id=["']main["']/i,
    /class=["'][^"']*content[^"']*["']/i,
    /role=["']main["']/i,
    // Last resort: the first paragraph.
    /<p[\s>]/i
  ];
  for (const landmark of landmarks) {
    const offset = html.search(landmark);
    if (offset < 0) continue;
    return offset / html.length;
  }
  return 0.5;
};
830
/**
 * Finds triple-backtick code fences in markdown text.
 * A fence opens on a line starting with ``` (an optional word directly after
 * it is the language) and closes on a line that is exactly ``` after
 * trimming. A fence still open at the end of the text is reported with
 * `closed: false`.
 * @param {string} markdown - text to scan.
 * @returns {{lang: string, closed: boolean}[]} fences in document order.
 */
var extractCodeFences = (markdown) => {
  const found = [];
  // null while outside a fence; otherwise the language of the open fence.
  let openLang = null;
  for (const row of markdown.split("\n")) {
    if (openLang === null) {
      const opener = /^```(\w*)/.exec(row);
      if (opener) openLang = opener[1] ?? "";
      continue;
    }
    if (row.trim() === "```") {
      found.push({ lang: openLang, closed: true });
      openLang = null;
    }
  }
  if (openLang !== null) {
    found.push({ lang: openLang, closed: false });
  }
  return found;
};
851
/**
 * Extracts the URLs listed in `<loc>` elements of a sitemap document.
 *
 * Sitemap files are XML, so characters such as `&` appear entity-escaped
 * (`&amp;`); the five predefined XML entities are decoded so the returned
 * strings are usable URLs. Locations wrapped in `<![CDATA[...]]>` are also
 * accepted and returned verbatim.
 * @param {string} xml - sitemap document text.
 * @returns {string[]} trimmed `<loc>` values in document order.
 */
var parseSitemapUrls = (xml) => {
  const entities = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  // A single pass, so "&amp;lt;" decodes to "&lt;" and is not decoded twice.
  const decodeEntities = (text) => text.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => entities[name]);
  const locPattern = /<loc>\s*(?:<!\[CDATA\[([\s\S]*?)\]\]>|([^<]+?))\s*<\/loc>/gi;
  const urls = [];
  for (const [, cdata, plain] of xml.matchAll(locPattern)) {
    urls.push(cdata !== undefined ? cdata.trim() : decodeEntities(plain).trim());
  }
  return urls;
};
860
+
861
+ // src/checks/page-size.ts
862
// Upper bound (in characters) on a page's stripped text before the HTML
// page-size check warns.
var MAX_HTML_CHARS = 5e4;
// NOTE(review): presumably the matching bound for markdown versions — not used in the visible code.
var MAX_MD_CHARS = 5e4;
864
// Check: is page content present in the HTML as delivered (no JS execution)?
var renderingStrategy = {
  id: "rendering-strategy",
  name: "Rendering Strategy",
  category: "page-size",
  description: "Checks if pages contain server-rendered content (vs client-side only)",
  weight: 1,
  /**
   * Applies hasServerRenderedContent to up to 10 sampled pages.
   * All pages have content -> pass; more than half lack it -> fail;
   * otherwise warn.
   * @param {object} ctx - audit context; reads `sampledPages[].html`.
   * @returns {Promise<object>} check result.
   */
  run: async (ctx) => {
    const header = {
      id: "rendering-strategy",
      name: "Rendering Strategy",
      category: "page-size"
    };
    const pages = ctx.sampledPages.slice(0, 10);
    const total = pages.length;
    if (total === 0) {
      return { ...header, status: "skip", message: "No pages sampled" };
    }
    const ssrCount = pages.filter((page) => hasServerRenderedContent(page.html)).length;
    const csrCount = total - ssrCount;
    if (csrCount === 0) {
      return {
        ...header,
        status: "pass",
        message: `All ${total} sampled pages contain server-rendered content`
      };
    }
    const mostlyClientSide = csrCount > total / 2;
    return {
      ...header,
      status: mostlyClientSide ? "fail" : "warn",
      message: `${csrCount}/${total} pages appear to be client-side rendered only`,
      suggestion: "AI agents and crawlers can't execute JavaScript. Use SSR, SSG, or pre-rendering to ensure your content is in the initial HTML response."
    };
  }
};
905
/**
 * "Page Size (HTML)" check.
 *
 * For the first 10 sampled pages, measures the length of `stripHtml(page.html)`
 * and compares it with MAX_HTML_CHARS. Also reports the median text size and
 * the average "boilerplate" percentage (share of the HTML that is not text).
 * Status: "skip" with no pages, "pass" when no page exceeds the limit,
 * otherwise "warn".
 */
var pageSizeHtml = {
  id: "page-size-html",
  name: "Page Size (HTML)",
  category: "page-size",
  description: "Checks if HTML pages convert to under 50K characters of text content",
  weight: 0.6,
  run: async (ctx) => {
    const identity = {
      id: "page-size-html",
      name: "Page Size (HTML)",
      category: "page-size"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    if (sample.length === 0) {
      return { ...identity, status: "skip", message: "No pages sampled" };
    }
    const textSizes = [];
    let boilerplateTotal = 0;
    let oversized = 0;
    for (const { html } of sample) {
      const textLength = stripHtml(html).length;
      textSizes.push(textLength);
      // Math.max guards against dividing by zero for an empty document.
      boilerplateTotal += Math.round((1 - textLength / Math.max(html.length, 1)) * 100);
      if (textLength > MAX_HTML_CHARS) {
        oversized += 1;
      }
    }
    textSizes.sort((a, b) => a - b);
    // Upper median for even-sized samples.
    const median = textSizes[Math.floor(textSizes.length / 2)];
    const avgBoilerplate = Math.round(boilerplateTotal / sample.length);
    const limitK = (MAX_HTML_CHARS / 1e3).toFixed(0);
    if (oversized === 0) {
      return {
        ...identity,
        status: "pass",
        message: `All ${sample.length} sampled pages convert under ${limitK}K chars (median ${(median / 1e3).toFixed(0)}K, ${avgBoilerplate}% boilerplate)`,
        metadata: { median, avgBoilerplate }
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `${oversized}/${sample.length} pages exceed ${limitK}K chars of content`,
      suggestion: "Large pages may be truncated by AI agents. Consider splitting into smaller, focused pages or providing a table of contents.",
      metadata: { overLimit: oversized, median, avgBoilerplate }
    };
  }
};
951
/**
 * "Page Size (Markdown)" check.
 *
 * Looks at up to 10 sampled pages that have a truthy `markdown` field and
 * compares each markdown length with MAX_MD_CHARS.
 * Status: "skip" when no page has markdown, "pass" when none exceeds the
 * limit, otherwise "warn". Both "pass" and "warn" results carry size
 * metadata, so consumers get the same numbers regardless of the outcome.
 */
var pageSizeMarkdown = {
  id: "page-size-markdown",
  name: "Page Size (Markdown)",
  category: "page-size",
  description: "Checks if markdown versions are under 50K characters",
  weight: 0.5,
  run: async (ctx) => {
    const identity = {
      id: "page-size-markdown",
      name: "Page Size (Markdown)",
      category: "page-size"
    };
    const pagesWithMd = ctx.sampledPages.filter((p) => p.markdown).slice(0, 10);
    if (pagesWithMd.length === 0) {
      return { ...identity, status: "skip", message: "No markdown versions available" };
    }
    const sizes = pagesWithMd.map((p) => p.markdown.length).sort((a, b) => a - b);
    const overLimit = sizes.filter((size) => size > MAX_MD_CHARS).length;
    // Upper median for even-sized samples; the list is sorted ascending.
    const median = sizes[Math.floor(sizes.length / 2)];
    const max = sizes[sizes.length - 1];
    const limitK = (MAX_MD_CHARS / 1e3).toFixed(0);
    if (overLimit === 0) {
      return {
        ...identity,
        status: "pass",
        message: `All ${pagesWithMd.length} pages under ${limitK}K chars (median ${(median / 1e3).toFixed(0)}K, max ${(max / 1e3).toFixed(0)}K)`,
        metadata: { median, max }
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `${overLimit}/${pagesWithMd.length} markdown pages exceed ${limitK}K chars`,
      suggestion: "Split large markdown pages into smaller sections to avoid AI agent context window truncation.",
      // Mirrors the metadata shape of the HTML page-size warning.
      metadata: { overLimit, median, max }
    };
  }
};
992
/**
 * "Content Start Position" check.
 *
 * Calls `findContentStartPosition(page.html)` for the first 10 sampled pages
 * and treats a value of at most 0.1 as an early start. The median value is
 * reported as a percentage.
 * Status: "skip" with no pages, "pass" when every page starts early,
 * otherwise "warn".
 */
var contentStartPosition = {
  id: "content-start-position",
  name: "Content Start Position",
  category: "page-size",
  description: "Checks if main content starts within the first 10% of the HTML",
  weight: 0.5,
  run: async (ctx) => {
    const identity = {
      id: "content-start-position",
      name: "Content Start Position",
      category: "page-size"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const total = sample.length;
    if (total === 0) {
      return { ...identity, status: "skip", message: "No pages sampled" };
    }
    const ratios = sample.map((entry) => findContentStartPosition(entry.html));
    const earlyCount = ratios.filter((ratio) => ratio <= 0.1).length;
    const ordered = [...ratios].sort((a, b) => a - b);
    // Upper median for even-sized samples, expressed as a whole percentage.
    const medianPct = Math.round(ordered[Math.floor(ordered.length / 2)] * 100);
    if (earlyCount === total) {
      return {
        ...identity,
        status: "pass",
        message: `Content starts within first 10% on all ${total} sampled pages (median ${medianPct}%)`,
        metadata: { medianPct }
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `Content starts late on ${total - earlyCount}/${total} pages (median ${medianPct}%)`,
      suggestion: "Move main content higher in the HTML. AI agents may waste context window tokens on navigation, headers, and boilerplate before reaching actual content.",
      metadata: { medianPct, earlyStart: earlyCount }
    };
  }
};
1038
// All checks whose category is "page-size", collected into one list.
var pageSizeChecks = [renderingStrategy, pageSizeHtml, pageSizeMarkdown, contentStartPosition];
1044
+
1045
+ // src/checks/content-structure.ts
1046
/**
 * "Markdown Code Fence Validity" check.
 *
 * When at least one sampled page has markdown, every fence returned by
 * `extractCodeFences(page.markdown)` must be closed: "pass" if none is
 * unclosed, otherwise "fail".
 *
 * When no page has markdown, the check falls back to counting complete
 * `<code>…</code>` elements in the HTML of the first 10 sampled pages:
 * "info" if there are none, otherwise "pass". The page count in that message
 * is the number of pages actually inspected.
 */
var markdownCodeFenceValidity = {
  id: "markdown-code-fence-validity",
  name: "Markdown Code Fence Validity",
  category: "content-structure",
  description: "Checks if all code fences in markdown content are properly closed",
  weight: 0.5,
  run: async (ctx) => {
    const identity = {
      id: "markdown-code-fence-validity",
      name: "Markdown Code Fence Validity",
      category: "content-structure"
    };
    const pagesWithMd = ctx.sampledPages.filter((p) => p.markdown);
    if (pagesWithMd.length === 0) {
      // HTML fallback: the pattern only matches elements that have a closing
      // tag, so every counted block is closed by construction.
      const htmlPages = ctx.sampledPages.slice(0, 10);
      let codeBlocks = 0;
      for (const page of htmlPages) {
        codeBlocks += (page.html.match(/<code[\s\S]*?<\/code>/gi) ?? []).length;
      }
      if (codeBlocks === 0) {
        return { ...identity, status: "info", message: "No code blocks detected" };
      }
      return {
        ...identity,
        status: "pass",
        message: `All ${codeBlocks} code blocks properly closed across ${htmlPages.length} pages`
      };
    }
    let totalFences = 0;
    let unclosed = 0;
    for (const page of pagesWithMd) {
      const fences = extractCodeFences(page.markdown);
      totalFences += fences.length;
      unclosed += fences.filter((f) => !f.closed).length;
    }
    if (unclosed === 0) {
      return {
        ...identity,
        status: "pass",
        message: `All ${totalFences} code fences properly closed across ${pagesWithMd.length} pages`
      };
    }
    return {
      ...identity,
      status: "fail",
      message: `${unclosed} unclosed code fence(s) found across ${pagesWithMd.length} pages`,
      suggestion: "Unclosed code fences cause AI agents to misparse content. Ensure every ``` has a matching closing ```."
    };
  }
};
1105
/**
 * "Section Header Quality" check.
 *
 * A page among the first 10 sampled pages counts as good when
 * `extractHeadings(page.html)` yields at least one heading, exactly one of
 * them has level 1, and no heading is more than one level deeper than the
 * heading before it.
 * Status: "skip" with no pages, "pass" when every page is good,
 * otherwise "warn".
 */
var sectionHeaderQuality = {
  id: "section-header-quality",
  name: "Section Header Quality",
  category: "content-structure",
  description: "Checks if pages have a logical heading hierarchy (H1 \u2192 H2 \u2192 H3, no skips)",
  weight: 0.6,
  run: async (ctx) => {
    const identity = {
      id: "section-header-quality",
      name: "Section Header Quality",
      category: "content-structure"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const total = sample.length;
    if (total === 0) {
      return { ...identity, status: "skip", message: "No pages sampled" };
    }
    // True when the heading list satisfies all three hierarchy rules.
    const hasSoundHierarchy = (headings) => {
      if (headings.length === 0) {
        return false;
      }
      const topLevelCount = headings.filter((h) => h.level === 1).length;
      if (topLevelCount !== 1) {
        return false;
      }
      return headings.every(
        (heading, index) => index === 0 || !(heading.level > headings[index - 1].level + 1)
      );
    };
    const goodPages = sample.filter((entry) => hasSoundHierarchy(extractHeadings(entry.html))).length;
    if (goodPages === total) {
      return {
        ...identity,
        status: "pass",
        message: `All ${total} pages have proper heading hierarchy`
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `${total - goodPages}/${total} pages have heading hierarchy issues (skipped levels, missing H1, or multiple H1s)`,
      suggestion: "Use a single H1 per page and don't skip heading levels (e.g., H1 \u2192 H3). AI agents use heading hierarchy to understand page structure and section boundaries."
    };
  }
};
1165
/**
 * "Tabbed Content Serialization" check.
 *
 * Among the first 10 sampled pages, a page is "tabbed" when its HTML matches
 * one of the tab markers (role="tabpanel", data-tab, tab-content,
 * tabs-container). A tabbed page is flagged when its HTML contains more than
 * two hidden markers (`hidden`, `display: none`, aria-hidden="true").
 * Status: "info" with no tabbed pages, "pass" when no tabbed page is flagged,
 * otherwise "warn".
 */
var tabbedContentSerialization = {
  id: "tabbed-content-serialization",
  name: "Tabbed Content Serialization",
  category: "content-structure",
  description: "Checks if tabbed/accordion content is properly serialized in HTML",
  weight: 0.4,
  run: async (ctx) => {
    const identity = {
      id: "tabbed-content-serialization",
      name: "Tabbed Content Serialization",
      category: "content-structure"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const tabbed = sample.filter(
      (entry) => /role=["']tabpanel["']|data-tab|tab-content|tabs-container/i.test(entry.html)
    );
    const flagged = tabbed.filter((entry) => {
      const hiddenHits = entry.html.match(/hidden|display:\s*none|aria-hidden=["']true["']/gi);
      return (hiddenHits ?? []).length > 2;
    });
    if (tabbed.length === 0) {
      return {
        ...identity,
        status: "info",
        message: `No tabbed content detected across ${sample.length} sampled pages`
      };
    }
    if (flagged.length === 0) {
      return {
        ...identity,
        status: "pass",
        message: `${tabbed.length} pages with tabbed content \u2014 all tabs are server-rendered`
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `${flagged.length}/${tabbed.length} tabbed pages have hidden content that AI agents may miss`,
      suggestion: "Serialize all tab content in the HTML even if it's visually hidden. AI agents can't click tabs \u2014 they only see the initial HTML."
    };
  }
};
1213
// All checks whose category is "content-structure", collected into one list.
var contentStructureChecks = [markdownCodeFenceValidity, sectionHeaderQuality, tabbedContentSerialization];
1218
+
1219
+ // src/checks/url-stability.ts
1220
/**
 * "HTTP Status Codes" check.
 *
 * Requests two URLs under `ctx.baseUrl.origin` that should not exist and
 * inspects the final status code after redirects.
 * Status:
 *  - "skip" when no probe produced a response (e.g. the network is down),
 *    because a missing answer says nothing about soft 404s,
 *  - "pass" when every answered probe returned 404 or 410,
 *  - "warn" otherwise.
 */
var httpStatusCodes = {
  id: "http-status-codes",
  name: "HTTP Status Codes",
  category: "url-stability",
  description: "Checks if the site returns proper 404 for bad URLs (vs soft 404s)",
  weight: 0.6,
  requiresNetwork: true,
  run: async (ctx) => {
    const identity = {
      id: "http-status-codes",
      name: "HTTP Status Codes",
      category: "url-stability"
    };
    const origin = ctx.baseUrl.origin;
    const probeUrls = [
      // The timestamp keeps this path from ever matching a cached or real page.
      `${origin}/this-page-definitely-does-not-exist-${Date.now()}`,
      `${origin}/404-test-agentimization-audit`
    ];
    // Fetches one probe URL and resolves to its status code.
    const probe = async (url) => {
      const resp = await fetch(url, { method: "GET", redirect: "follow" });
      // Only the status matters; discard the body so the connection is released.
      // A failure to cancel must not turn an answered probe into a failed one.
      await resp.body?.cancel().catch(() => {
      });
      return resp.status;
    };
    // The probes are independent, so run them concurrently.
    const outcomes = await Promise.allSettled(probeUrls.map(probe));
    const statuses = outcomes.filter((outcome) => outcome.status === "fulfilled").map((outcome) => outcome.value);
    if (statuses.length === 0) {
      return {
        ...identity,
        status: "skip",
        message: "Could not reach the site to test how it responds to non-existent URLs",
        metadata: { probed: probeUrls.length, statuses }
      };
    }
    const allMissing = statuses.every((code) => code === 404 || code === 410);
    if (allMissing) {
      return {
        ...identity,
        status: "pass",
        message: `Site returns proper error codes for non-existent URLs (${statuses.length} probe(s))`,
        metadata: { probed: probeUrls.length, statuses }
      };
    }
    return {
      ...identity,
      status: "warn",
      message: "Site may be returning soft 404s (200 status for non-existent pages)",
      suggestion: "Return proper 404 status codes for non-existent pages. Soft 404s confuse AI agents and waste their context on error pages.",
      metadata: { probed: probeUrls.length, statuses }
    };
  }
};
1259
/**
 * "Redirect Behavior" check.
 *
 * Counts how many of the first 10 sampled pages carry a truthy `location`
 * entry in `page.headers`.
 * Status: "pass" when none do, otherwise "warn".
 */
var redirectBehavior = {
  id: "redirect-behavior",
  name: "Redirect Behavior",
  category: "url-stability",
  description: "Checks if sampled pages have clean redirect behavior (no excessive chains)",
  weight: 0.4,
  requiresNetwork: true,
  run: async (ctx) => {
    const identity = {
      id: "redirect-behavior",
      name: "Redirect Behavior",
      category: "url-stability"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const redirectedCount = sample.filter((entry) => entry.headers["location"]).length;
    if (redirectedCount > 0) {
      return {
        ...identity,
        status: "warn",
        message: `${redirectedCount}/${sample.length} sampled pages involve redirects`,
        suggestion: "Minimize redirects. Each redirect adds latency for AI agents and some agents may not follow redirect chains properly."
      };
    }
    return {
      ...identity,
      status: "pass",
      message: `No redirects detected across ${sample.length} sampled pages`
    };
  }
};
1293
/**
 * "Cache Header Hygiene" check.
 *
 * A page among the first 10 sampled pages is considered cache-friendly when
 * `page.headers` has a truthy `cache-control`, `etag`, or `last-modified`.
 * Status: "skip" with no pages (nothing was inspected), "pass" when every
 * page is cache-friendly, otherwise "warn". The counts in the messages are
 * the number of pages actually inspected.
 */
var cacheHeaderHygiene = {
  id: "cache-header-hygiene",
  name: "Cache Header Hygiene",
  category: "url-stability",
  description: "Checks if pages have appropriate cache headers for AI agent crawling",
  weight: 0.4,
  requiresNetwork: true,
  run: async (ctx) => {
    const identity = {
      id: "cache-header-hygiene",
      name: "Cache Header Hygiene",
      category: "url-stability"
    };
    const pages = ctx.sampledPages.slice(0, 10);
    if (pages.length === 0) {
      // Without this guard an empty sample would be reported as a pass.
      return { ...identity, status: "skip", message: "No pages sampled" };
    }
    const cacheHeaderNames = ["cache-control", "etag", "last-modified"];
    const withCacheHeaders = pages.filter(
      (page) => cacheHeaderNames.some((headerName) => !!page.headers[headerName])
    ).length;
    if (withCacheHeaders === pages.length) {
      return {
        ...identity,
        status: "pass",
        message: `All ${pages.length} endpoints have appropriate cache headers`
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `${pages.length - withCacheHeaders}/${pages.length} pages missing cache headers`,
      suggestion: "Add Cache-Control, ETag, or Last-Modified headers. This helps AI agents efficiently re-crawl your content without re-fetching unchanged pages."
    };
  }
};
1330
// All checks whose category is "url-stability", collected into one list.
var urlStabilityChecks = [httpStatusCodes, redirectBehavior, cacheHeaderHygiene];
1335
+
1336
+ // src/checks/authentication.ts
1337
/**
 * "Auth Gate Detection" check.
 *
 * A page among the first 10 sampled pages is treated as gated when its
 * status code is 401 or 403, or when its HTML is shorter than 5000
 * characters and mentions a login / sign-in / authenticate keyword.
 * "sign in" is matched as a whole word so that ordinary words such as
 * "designing" or "assigning" on a short page are not mistaken for a login
 * wall.
 * Status: "pass" when no page is gated, otherwise "fail" with the gated URLs
 * in `metadata.gatedUrls`.
 */
var authGateDetection = {
  id: "auth-gate-detection",
  name: "Auth Gate Detection",
  category: "authentication",
  description: "Checks if pages are publicly accessible without authentication",
  weight: 0.9,
  requiresNetwork: true,
  run: async (ctx) => {
    const identity = {
      id: "auth-gate-detection",
      name: "Auth Gate Detection",
      category: "authentication"
    };
    const AUTH_KEYWORDS = /login|\bsign.?in\b|authenticate/i;
    // Status codes are checked first, so the HTML heuristic only runs for
    // pages that answered with some other status.
    const isGated = (page) => page.statusCode === 401 || page.statusCode === 403 || page.html.length < 5e3 && AUTH_KEYWORDS.test(page.html);
    const pages = ctx.sampledPages.slice(0, 10);
    const gatedUrls = pages.filter(isGated).map((page) => page.url);
    if (gatedUrls.length === 0) {
      return {
        ...identity,
        status: "pass",
        message: `All ${pages.length} sampled pages are publicly accessible`
      };
    }
    return {
      ...identity,
      status: "fail",
      message: `${gatedUrls.length}/${pages.length} pages are behind an auth gate`,
      suggestion: "AI agents can't authenticate. Move content you want discoverable to public pages, or provide an alternative access method (API, llms.txt summary).",
      metadata: { gatedUrls }
    };
  }
};
1375
/**
 * "Auth Alternative Access" check.
 *
 * Considers the first 10 sampled pages whose status code is 401 or 403.
 * Status: "pass" when there are none; "warn" when some exist but
 * `ctx.llmsTxt` is longer than 100 characters; otherwise "fail".
 */
var authAlternativeAccess = {
  id: "auth-alternative-access",
  name: "Auth Alternative Access",
  category: "authentication",
  description: "Checks if gated content has alternative access paths for AI agents",
  weight: 0.5,
  requiresNetwork: true,
  run: async (ctx) => {
    const identity = {
      id: "auth-alternative-access",
      name: "Auth Alternative Access",
      category: "authentication"
    };
    const gatedCount = ctx.sampledPages.slice(0, 10).filter((entry) => [401, 403].includes(entry.statusCode)).length;
    if (gatedCount === 0) {
      return {
        ...identity,
        status: "pass",
        message: "All docs pages are publicly accessible; no alternative access paths needed"
      };
    }
    if (ctx.llmsTxt && ctx.llmsTxt.length > 100) {
      return {
        ...identity,
        status: "warn",
        message: `${gatedCount} pages are gated but llms.txt provides alternative content summary`,
        suggestion: "Consider expanding llms.txt to cover all gated content with sufficient detail for AI agents."
      };
    }
    return {
      ...identity,
      status: "fail",
      message: `${gatedCount} pages are gated with no alternative access for AI agents`,
      suggestion: "Provide a public llms.txt or API endpoint that summarizes gated content. AI agents need some way to understand what's behind the auth wall."
    };
  }
};
1417
// All checks whose category is "authentication", collected into one list.
var authenticationChecks = [authGateDetection, authAlternativeAccess];
1421
+
1422
+ // src/checks/geo-signals.ts
1423
/**
 * Guesses which web framework produced the given pages.
 *
 * Pages are examined in order. For each page the `x-powered-by` header is
 * checked first, then a fixed sequence of HTML fingerprints. The first match
 * wins.
 *
 * @param {Array<{headers: object, html: string}>} pages - Pages to inspect.
 * @returns {string|null} "next", "nuxt", "sveltekit", "astro", "wordpress",
 *   "remix", or null when nothing matched.
 */
var detectFramework = (pages) => {
  // [substring of the lower-cased x-powered-by header, framework id]
  const headerFingerprints = [
    ["next.js", "next"],
    ["nuxt", "nuxt"]
  ];
  // [predicate over the page HTML, framework id], evaluated top to bottom.
  const markupFingerprints = [
    [(html) => /\/_next\/static\//.test(html) || /<script[^>]+id="__NEXT_DATA__"/.test(html), "next"],
    [(html) => /__NUXT__\s*=/.test(html) || /\/_nuxt\//.test(html), "nuxt"],
    [(html) => /data-sveltekit-/.test(html), "sveltekit"],
    [(html) => /<meta\s+name="generator"\s+content="Astro/i.test(html), "astro"],
    [(html) => /<meta\s+name="generator"\s+content="WordPress/i.test(html), "wordpress"],
    [(html) => /\/build\/_assets\/.*\.js/.test(html) && /window\.__remixContext/.test(html), "remix"]
  ];
  for (const entry of pages) {
    const poweredBy = (entry.headers["x-powered-by"] ?? "").toLowerCase();
    const headerHit = headerFingerprints.find(([needle]) => poweredBy.includes(needle));
    if (headerHit) {
      return headerHit[1];
    }
    const markup = entry.html;
    const markupHit = markupFingerprints.find(([matches]) => matches(markup));
    if (markupHit) {
      return markupHit[1];
    }
  }
  return null;
};
1438
// Structured-data documentation link for each framework id that
// detectFramework can return.
var FRAMEWORK_DOCS = {
  next: "https://nextjs.org/docs/app/guides/json-ld",
  nuxt: "https://nuxt.com/modules/schema-org",
  sveltekit: "https://kit.svelte.dev/docs/seo#manual-setup-structured-data",
  astro: "https://docs.astro.build/en/guides/integrations-guide/sitemap/#structured-data",
  remix: "https://remix.run/docs/en/main/route/meta",
  wordpress: "https://yoast.com/structured-data-with-schema-org-the-ultimate-guide/"
};
/**
 * Builds the " See: <url>" suffix appended to structured-data suggestions.
 *
 * @param {string|null} fw - Framework id, or a falsy value when unknown.
 * @returns {string} Suffix pointing at the framework's docs when FRAMEWORK_DOCS
 *   has an entry for `fw`, otherwise at the schema.org getting-started guide.
 */
var frameworkHint = (fw) => {
  const docsUrl = fw && FRAMEWORK_DOCS[fw] ? FRAMEWORK_DOCS[fw] : "https://schema.org/docs/gs.html";
  return ` See: ${docsUrl}`;
};
1450
/**
 * "Structured Data Coverage" check.
 *
 * Counts how many of the first 10 sampled pages yield at least one item from
 * `extractJsonLd(page.html)` and collects the distinct `@type` values found.
 * `@type` may be a single string or an array of strings; both forms are
 * collected.
 * Status: "skip" with no pages (nothing was inspected), "pass" when every
 * page has structured data, "warn" when only some do, "fail" when none do.
 * Non-passing results append a framework-specific documentation link.
 */
var structuredDataCoverage = {
  id: "structured-data-coverage",
  name: "Structured Data Coverage",
  category: "geo-signals",
  description: "Checks for schema.org / JSON-LD structured data on pages",
  weight: 0.8,
  run: async (ctx) => {
    const identity = {
      id: "structured-data-coverage",
      name: "Structured Data Coverage",
      category: "geo-signals"
    };
    const pages = ctx.sampledPages.slice(0, 10);
    if (pages.length === 0) {
      // Without this guard an empty sample would be reported as a pass.
      return { ...identity, status: "skip", message: "No pages sampled" };
    }
    let withStructuredData = 0;
    const types = [];
    for (const page of pages) {
      const jsonLd = extractJsonLd(page.html);
      if (jsonLd.length === 0) continue;
      withStructuredData++;
      for (const item of jsonLd) {
        const declared = item?.["@type"];
        // JSON-LD allows several types per node, expressed as an array.
        const candidates = Array.isArray(declared) ? declared : [declared];
        for (const candidate of candidates) {
          if (typeof candidate === "string") types.push(candidate);
        }
      }
    }
    const uniqueTypes = [...new Set(types)];
    if (withStructuredData === pages.length) {
      return {
        ...identity,
        status: "pass",
        message: `All ${pages.length} pages have structured data. Types: ${uniqueTypes.join(", ") || "detected"}`,
        metadata: { withStructuredData, types: uniqueTypes }
      };
    }
    const fw = detectFramework(pages);
    if (withStructuredData > 0) {
      return {
        ...identity,
        status: "warn",
        message: `${withStructuredData}/${pages.length} pages have structured data${uniqueTypes.length > 0 ? ` (${uniqueTypes.join(", ")})` : ""}`,
        suggestion: `Add JSON-LD structured data (schema.org) to all pages. This helps generative engines understand your content type, authorship, and relationships.${frameworkHint(fw)}`,
        metadata: { withStructuredData, types: uniqueTypes, framework: fw }
      };
    }
    return {
      ...identity,
      status: "fail",
      message: "No structured data (JSON-LD / schema.org) found on any sampled page",
      suggestion: `Add JSON-LD structured data to your pages. At minimum use Article, WebPage, or Organization schema. This is a strong GEO signal \u2014 generative engines use it to decide what to cite.${frameworkHint(fw)}`,
      metadata: { framework: fw }
    };
  }
};
1504
/**
 * "Citation Worthiness" check.
 *
 * Tests the HTML of the first 10 sampled pages against five patterns
 * (statistics, definitions, block quotes, tables, ordered lists). A page is
 * citable when at least one pattern matches; per-pattern page counts are
 * returned as metadata.
 * Status: "pass" when at least 70% of pages are citable, "warn" when some
 * are, "fail" when none are.
 */
var citationWorthiness = {
  id: "citation-worthiness",
  name: "Citation Worthiness",
  category: "geo-signals",
  description: "Checks for content signals that make pages citable by AI (stats, data points, quotes, definitions)",
  weight: 0.7,
  run: async (ctx) => {
    const identity = {
      id: "citation-worthiness",
      name: "Citation Worthiness",
      category: "geo-signals"
    };
    // [metadata counter, pattern that marks a page as carrying that signal]
    const detectors = [
      ["withStats", /\d+%|\d+x|[0-9,]+\s+(users|customers|companies|downloads)/i],
      ["withDefinitions", /<dfn|<dt|"is a |"refers to |"means /i],
      ["withQuotes", /<blockquote/i],
      ["withTables", /<table[\s\S]*?<\/table>/i],
      ["withLists", /<ol[\s\S]*?<\/ol>/i]
    ];
    const signals = {
      withStats: 0,
      withDefinitions: 0,
      withQuotes: 0,
      withLists: 0,
      withTables: 0
    };
    const sample = ctx.sampledPages.slice(0, 10);
    let citablePages = 0;
    for (const entry of sample) {
      const matched = detectors.filter(([, pattern]) => pattern.test(entry.html));
      for (const [counter] of matched) {
        signals[counter] += 1;
      }
      if (matched.length > 0) {
        citablePages += 1;
      }
    }
    if (citablePages >= sample.length * 0.7) {
      return {
        ...identity,
        status: "pass",
        message: `${citablePages}/${sample.length} pages contain citable content (stats, definitions, tables, structured data)`,
        metadata: signals
      };
    }
    return {
      ...identity,
      status: citablePages > 0 ? "warn" : "fail",
      message: `Only ${citablePages}/${sample.length} pages contain citable content signals`,
      suggestion: "Add concrete data points, statistics, definitions, and structured information. Generative engines prefer citing content with specific, verifiable claims over vague prose.",
      metadata: signals
    };
  }
};
1566
/**
 * "Topical Authority Signals" check.
 *
 * For the first 10 sampled pages, counts the internal links among
 * `extractLinks(page.html, ctx.baseUrl.origin)`. When `ctx.mode` is "local" a
 * link is internal if it does not start with http:// or https://; otherwise
 * it is internal if it parses as a URL with the same origin as `ctx.baseUrl`.
 * Status: "pass" when the rounded average is at least 5 and at least 70% of
 * pages have 3 or more internal links; otherwise "warn" when the average is
 * at least 2, else "fail".
 */
var topicalAuthoritySignals = {
  id: "topical-authority-signals",
  name: "Topical Authority Signals",
  category: "geo-signals",
  description: "Checks internal linking depth and content clustering as authority signals",
  weight: 0.6,
  run: async (ctx) => {
    const identity = {
      id: "topical-authority-signals",
      name: "Topical Authority Signals",
      category: "geo-signals"
    };
    // Decides whether one extracted link points back into the audited site.
    const isInternal = (link) => {
      if (ctx.mode === "local") {
        return !link.startsWith("http://") && !link.startsWith("https://");
      }
      try {
        return new URL(link).origin === ctx.baseUrl.origin;
      } catch {
        // Not parseable as an absolute URL.
        return false;
      }
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const internalPerPage = sample.map(
      (entry) => extractLinks(entry.html, ctx.baseUrl.origin).filter((link) => isInternal(link)).length
    );
    const totalInternalLinks = internalPerPage.reduce((sum, count) => sum + count, 0);
    const pagesWithGoodLinking = internalPerPage.filter((count) => count >= 3).length;
    const avgLinks = sample.length > 0 ? Math.round(totalInternalLinks / sample.length) : 0;
    if (avgLinks >= 5 && pagesWithGoodLinking >= sample.length * 0.7) {
      return {
        ...identity,
        status: "pass",
        message: `Strong internal linking: avg ${avgLinks} internal links/page, ${pagesWithGoodLinking}/${sample.length} pages well-connected`,
        metadata: { avgLinks, pagesWithGoodLinking }
      };
    }
    return {
      ...identity,
      status: avgLinks >= 2 ? "warn" : "fail",
      message: `Weak internal linking: avg ${avgLinks} internal links/page`,
      suggestion: "Increase internal linking between related pages. Generative engines use link density and clustering to assess topical authority \u2014 well-linked content is more likely to be cited.",
      metadata: { avgLinks, pagesWithGoodLinking }
    };
  }
};
1610
/**
 * "Content Freshness" check.
 *
 * A page among the first 10 sampled pages has a date signal when any of these
 * holds: a truthy `last-modified` response header; a truthy
 * `article:published_time`, `article:modified_time`, `date`, or
 * `last-modified` entry from `extractMetaTags`; or a JSON-LD item with
 * `datePublished` or `dateModified`.
 * Status: "pass" when at least 80% of pages have a signal, otherwise "warn".
 */
var contentFreshness = {
  id: "content-freshness",
  name: "Content Freshness",
  category: "geo-signals",
  description: "Checks for date signals (last-modified headers, published dates, updated dates)",
  weight: 0.5,
  run: async (ctx) => {
    const identity = {
      id: "content-freshness",
      name: "Content Freshness",
      category: "geo-signals"
    };
    const dateMetaKeys = ["article:published_time", "article:modified_time", "date", "last-modified"];
    // All three sources are evaluated for every page before combining them.
    const hasDateSignal = (entry) => {
      const fromHeader = !!entry.headers["last-modified"];
      const meta = extractMetaTags(entry.html);
      const fromMeta = dateMetaKeys.some((key) => !!meta[key]);
      const fromJsonLd = extractJsonLd(entry.html).some(
        (node) => !!node?.datePublished || !!node?.dateModified
      );
      return fromHeader || fromMeta || fromJsonLd;
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const withDateSignals = sample.filter((entry) => hasDateSignal(entry)).length;
    if (withDateSignals >= sample.length * 0.8) {
      return {
        ...identity,
        status: "pass",
        message: `${withDateSignals}/${sample.length} pages have date/freshness signals`,
        metadata: { withDateSignals }
      };
    }
    return {
      ...identity,
      status: "warn",
      message: `Only ${withDateSignals}/${sample.length} pages have date/freshness signals`,
      suggestion: "Add Last-Modified headers, article:modified_time meta tags, or dateModified in JSON-LD. Generative engines favor fresh content and use date signals to assess relevance.",
      metadata: { withDateSignals }
    };
  }
};
1653
/**
 * "E-E-A-T Signals" check.
 *
 * For the first 10 sampled pages, counts two things per page:
 *  - author attribution: an `author` / `article:author` meta tag, a JSON-LD
 *    item with `author`, or an element with an "author" class or rel="author";
 *  - expertise markers: a credential keyword in the HTML, or an extracted
 *    link containing "about", "team", or "author".
 * The score is (author pages + expertise pages) / (2 × pages).
 * Status: "pass" at 0.6 or above, "warn" at 0.3 or above, otherwise "info".
 */
var eeatSignals = {
  id: "eeat-signals",
  name: "E-E-A-T Signals",
  category: "geo-signals",
  description: "Checks for author attribution, expertise markers, and trust signals",
  weight: 0.6,
  run: async (ctx) => {
    const identity = {
      id: "eeat-signals",
      name: "E-E-A-T Signals",
      category: "geo-signals"
    };
    const sample = ctx.sampledPages.slice(0, 10);
    let withAuthor = 0;
    let withExpertise = 0;
    for (const entry of sample) {
      const markup = entry.html;
      const meta = extractMetaTags(markup);
      const jsonLd = extractJsonLd(markup);
      const authorSources = [
        !!meta["author"] || !!meta["article:author"],
        jsonLd.some((node) => !!node?.author),
        /class=["'][^"']*author[^"']*["']|rel=["']author["']/i.test(markup)
      ];
      if (authorSources.some(Boolean)) {
        withAuthor += 1;
      }
      const mentionsCredentials = /Ph\.?D|M\.?D|CPA|certified|licensed|expert|specialist/i.test(markup);
      const linksToPeoplePage = extractLinks(markup, ctx.baseUrl.origin).some(
        (link) => /about|team|author/i.test(link)
      );
      if (mentionsCredentials || linksToPeoplePage) {
        withExpertise += 1;
      }
    }
    const total = sample.length;
    const score = (withAuthor + withExpertise) / (total * 2);
    if (score >= 0.6) {
      return {
        ...identity,
        status: "pass",
        message: `Good E-E-A-T: ${withAuthor}/${total} pages with author attribution, ${withExpertise}/${total} with expertise markers`,
        metadata: { withAuthor, withExpertise }
      };
    }
    return {
      ...identity,
      status: score >= 0.3 ? "warn" : "info",
      message: `Weak E-E-A-T: ${withAuthor}/${total} author attributions, ${withExpertise}/${total} expertise markers`,
      suggestion: "Add author names and credentials to content. Link to about/team pages. Generative engines assess source credibility through E-E-A-T signals when deciding what to cite.",
      metadata: { withAuthor, withExpertise }
    };
  }
};
1699
/**
 * "FAQ / Q&A Schema" check.
 *
 * A page among the first 10 sampled pages counts when a JSON-LD item has
 * `@type` "FAQPage" or "QAPage", or when the HTML contains a `<details>` /
 * `<summary>` element or a class/id containing "faq".
 * Status: "pass" when at least one page counts, otherwise "info".
 */
var faqSchema = {
  id: "faq-schema",
  name: "FAQ / Q&A Schema",
  category: "geo-signals",
  description: "Checks for FAQ or Q&A structured data \u2014 highly cited by generative engines",
  weight: 0.5,
  run: async (ctx) => {
    const identity = {
      id: "faq-schema",
      name: "FAQ / Q&A Schema",
      category: "geo-signals"
    };
    const faqTypes = ["FAQPage", "QAPage"];
    // Both sources are evaluated for every page before combining them.
    const looksLikeFaq = (entry) => {
      const viaSchema = extractJsonLd(entry.html).some((node) => faqTypes.includes(node?.["@type"]));
      const viaMarkup = /<details|<summary|class=["'][^"']*faq[^"']*["']|id=["'][^"']*faq[^"']*["']/i.test(entry.html);
      return viaSchema || viaMarkup;
    };
    const sample = ctx.sampledPages.slice(0, 10);
    const withFaq = sample.filter((entry) => looksLikeFaq(entry)).length;
    if (withFaq === 0) {
      return {
        ...identity,
        status: "info",
        message: "No FAQ/Q&A schema or FAQ-like content detected",
        suggestion: "Add FAQPage schema or Q&A sections to relevant pages. FAQ-formatted content is heavily cited by generative engines because it directly maps to user questions.",
        metadata: { withFaq }
      };
    }
    return {
      ...identity,
      status: "pass",
      message: `${withFaq}/${sample.length} pages contain FAQ/Q&A content`,
      metadata: { withFaq }
    };
  }
};
1738
/**
 * GEO check: verifies that up to ten sampled pages declare a canonical URL
 * and that the canonical URL points back at the page itself.
 *
 * - "pass": every sampled page has a self-referencing canonical.
 * - "warn": no canonicals at all, or only some pages have (self-referencing) ones.
 * - "skip": there were no sampled pages, so nothing could be verified.
 */
var canonicalUrlConsistency = {
  id: "canonical-url-consistency",
  name: "Canonical URL Consistency",
  category: "geo-signals",
  description: "Checks if pages have consistent canonical URLs",
  weight: 0.5,
  run: async (ctx) => {
    // Returns the href of the first <link rel="canonical"> tag, whatever the
    // attribute order, or undefined when the page has none.
    const findCanonicalHref = (html) => {
      for (const [tag] of html.matchAll(/<link\b[^>]*>/gi)) {
        if (!/\srel\s*=\s*["']canonical["']/i.test(tag)) continue;
        const href = tag.match(/\shref\s*=\s*["']([^"']+)["']/i);
        if (href) return href[1];
      }
      return void 0;
    };
    const stripTrailingSlash = (href) => href.replace(/\/$/, "");

    const pages = ctx.sampledPages.slice(0, 10);
    if (pages.length === 0) {
      // Without this guard the "all pages" comparison below is vacuously true.
      return {
        id: "canonical-url-consistency",
        name: "Canonical URL Consistency",
        category: "geo-signals",
        status: "skip",
        message: "No sampled pages to check for canonical URLs"
      };
    }

    let withCanonical = 0;
    let selfReferencing = 0;
    for (const page of pages) {
      const href = findCanonicalHref(page.html);
      if (href === void 0) continue;
      withCanonical++;
      try {
        // A relative canonical is relative to the page that declares it.
        const canonical = new URL(href, page.url).href;
        const pageUrl = new URL(page.url).href;
        // Tolerate a trailing-slash difference in either direction.
        if (stripTrailingSlash(canonical) === stripTrailingSlash(pageUrl)) {
          selfReferencing++;
        }
      } catch {
        // Unparseable URL: the page still counts as having a canonical tag,
        // just not a self-referencing one.
      }
    }

    if (withCanonical === pages.length && selfReferencing === pages.length) {
      return {
        id: "canonical-url-consistency",
        name: "Canonical URL Consistency",
        category: "geo-signals",
        status: "pass",
        message: `All ${pages.length} pages have self-referencing canonical URLs`
      };
    }
    if (withCanonical === 0) {
      return {
        id: "canonical-url-consistency",
        name: "Canonical URL Consistency",
        category: "geo-signals",
        status: "warn",
        message: "No canonical URLs found on sampled pages",
        suggestion: 'Add <link rel="canonical"> to every page. This prevents duplicate content issues when AI agents discover the same page through different URLs.'
      };
    }
    return {
      id: "canonical-url-consistency",
      name: "Canonical URL Consistency",
      category: "geo-signals",
      status: "warn",
      message: `${withCanonical}/${pages.length} pages have canonical URLs (${selfReferencing} self-referencing)`,
      suggestion: "Ensure every page has a self-referencing canonical URL to avoid confusing AI agents about the authoritative version.",
      metadata: { withCanonical, selfReferencing }
    };
  }
};
1792
// Every check in the "geo-signals" category, listed in the order it is registered.
var geoSignalChecks = Array.of(
  structuredDataCoverage,
  citationWorthiness,
  topicalAuthoritySignals,
  contentFreshness,
  eeatSignals,
  faqSchema,
  canonicalUrlConsistency
);
1801
+
1802
+ // src/checks/agent-protocols.ts
1803
/**
 * Agent-protocol check: validates the MCP server card text held in
 * `ctx.mcpServerCard`.
 *
 * - "fail": no card was found.
 * - "warn": the card is not valid JSON, is not a JSON object, or has neither
 *   a string `name` nor a string `description`.
 * - "pass": otherwise; the message reports the name and the number of tools
 *   found in `tools` or `capabilities.tools`.
 */
var mcpServerCard = {
  id: "mcp-server-card",
  name: "MCP Server Card",
  category: "agent-protocols",
  description: "Checks for a Model Context Protocol server card at .well-known/mcp/server-card.json",
  weight: 0.8,
  run: async (ctx) => {
    const base = {
      id: "mcp-server-card",
      name: "MCP Server Card",
      category: "agent-protocols"
    };
    if (!ctx.mcpServerCard) {
      return {
        ...base,
        status: "fail",
        message: "No MCP server card found at /.well-known/mcp/server-card.json",
        suggestion: "Add a server-card.json at /.well-known/mcp/server-card.json describing your MCP server's tools, authentication, and capabilities. This lets AI agents discover what they can do with your site before connecting."
      };
    }

    let card;
    try {
      card = JSON.parse(ctx.mcpServerCard);
    } catch {
      return {
        ...base,
        status: "warn",
        message: "MCP server card found but contains invalid JSON",
        suggestion: "Fix the JSON syntax in your /.well-known/mcp/server-card.json file."
      };
    }

    // Valid JSON that is not an object (null, a number, an array, ...) is a
    // different problem from a syntax error, so report it as such.
    if (card === null || typeof card !== "object" || Array.isArray(card)) {
      return {
        ...base,
        status: "warn",
        message: "MCP server card found but is not a JSON object",
        suggestion: "Make your /.well-known/mcp/server-card.json a JSON object with at minimum a 'name' and 'description'."
      };
    }

    const hasName = typeof card.name === "string";
    const hasDescription = typeof card.description === "string";
    if (!hasName && !hasDescription) {
      return {
        ...base,
        status: "warn",
        message: "MCP server card found but missing name and description fields",
        suggestion: "Add at minimum a 'name' and 'description' to your server card so agents understand what your MCP server does.",
        metadata: { fields: Object.keys(card) }
      };
    }

    let toolCount = 0;
    if (Array.isArray(card.tools)) {
      toolCount = card.tools.length;
    } else if (Array.isArray(card.capabilities?.tools)) {
      toolCount = card.capabilities.tools.length;
    }
    const toolSummary = toolCount > 0 ? ` with ${toolCount} tool${toolCount === 1 ? "" : "s"}` : "";
    return {
      ...base,
      status: "pass",
      message: `MCP server card found: "${hasName ? card.name : "unnamed"}"${toolSummary}`,
      metadata: { name: card.name, toolCount, hasDescription }
    };
  }
};
1857
/**
 * Agent-protocol check: validates the API catalog text held in `ctx.apiCatalog`.
 *
 * JSON catalogs are accepted in two shapes:
 * - a top-level `apis` array (the shape this check has always recognised), or
 * - an RFC 9727 Linkset document: a top-level `linkset` array whose entries
 *   either list APIs under an `item` relation or describe one API each.
 *
 * Non-JSON content still passes when it looks like a link list (contains
 * "http" or "<"); anything else is reported as unparseable.
 */
var apiCatalog = {
  id: "api-catalog",
  name: "API Catalog (RFC 9727)",
  category: "agent-protocols",
  description: "Checks for an API catalog at .well-known/api-catalog per RFC 9727",
  weight: 0.5,
  run: async (ctx) => {
    const base = {
      id: "api-catalog",
      name: "API Catalog (RFC 9727)",
      category: "agent-protocols"
    };

    // Counts the API entries in a parsed catalog; 0 when no known shape matches.
    const countApis = (catalog) => {
      if (Array.isArray(catalog?.apis) && catalog.apis.length > 0) {
        return catalog.apis.length;
      }
      if (!Array.isArray(catalog?.linkset)) return 0;
      return catalog.linkset.reduce((total, context) => {
        if (context === null || typeof context !== "object") return total;
        // A context with an "item" relation enumerates the APIs; otherwise
        // the context object itself is treated as one API description.
        return total + (Array.isArray(context.item) ? context.item.length : 1);
      }, 0);
    };

    if (!ctx.apiCatalog) {
      return {
        ...base,
        status: "info",
        message: "No API catalog found at /.well-known/api-catalog",
        suggestion: "If your site exposes APIs, add an api-catalog at /.well-known/api-catalog (RFC 9727). This gives agents a single location to discover all your APIs, their specs, docs, and status endpoints."
      };
    }

    let catalog;
    try {
      catalog = JSON.parse(ctx.apiCatalog);
    } catch {
      const hasLinks = ctx.apiCatalog.includes("http") || ctx.apiCatalog.includes("<");
      if (hasLinks) {
        return {
          ...base,
          status: "pass",
          message: "API catalog found (non-JSON format)"
        };
      }
      return {
        ...base,
        status: "warn",
        message: "API catalog found but could not parse content",
        suggestion: "Ensure your api-catalog returns valid JSON or CoRE Link Format per RFC 9727."
      };
    }

    const apiCount = countApis(catalog);
    if (apiCount === 0) {
      return {
        ...base,
        status: "warn",
        message: "API catalog found but contains no API entries",
        suggestion: "Add API entries to your api-catalog with links to specs, documentation, and status endpoints."
      };
    }
    return {
      ...base,
      status: "pass",
      message: `API catalog found with ${apiCount} API${apiCount === 1 ? "" : "s"} listed`,
      metadata: { apiCount }
    };
  }
};
1917
/**
 * Agent-protocol check: scans robots.txt for AI usage declarations.
 *
 * Two things are collected from the raw robots.txt text:
 * - "signals": which of the ai-train / ai-input / content-signals keywords appear;
 * - "namedBots": which well-known AI crawler names are mentioned.
 *
 * Any signal => "pass"; no signal but at least two named bots => "warn";
 * everything else (including a missing robots.txt) => "info".
 */
var contentSignals = {
  id: "content-signals",
  name: "Content Signals (AI Usage Declarations)",
  category: "agent-protocols",
  description: "Checks for AI-specific content signals in robots.txt (ai-train, ai-input, search directives)",
  weight: 0.6,
  run: async (ctx) => {
    const shared = {
      id: "content-signals",
      name: "Content Signals (AI Usage Declarations)",
      category: "agent-protocols"
    };
    const robots = ctx.robotsTxt;
    if (!robots) {
      return {
        ...shared,
        status: "info",
        message: "No robots.txt found \u2014 cannot check for content signals",
        suggestion: "Add a robots.txt with Content Signals directives to declare how AI agents may use your content (ai-train, ai-input, search)."
      };
    }

    // Keyword patterns are matched against the original (case-preserved) text.
    const signalPatterns = [
      ["ai-train", /ai[-_]?train/i],
      ["ai-input", /ai[-_]?input/i],
      ["content-signals", /content[-_]?signals?/i]
    ];
    const signals = signalPatterns
      .filter(([, pattern]) => pattern.test(robots))
      .map(([label]) => label);

    // Crawler names are matched as lowercase substrings.
    const lowered = robots.toLowerCase();
    const knownBots = [
      ["GPTBot", ["gptbot"]],
      ["ClaudeBot", ["claudebot", "claude-web"]],
      ["Google-Extended", ["google-extended"]],
      ["PerplexityBot", ["perplexitybot"]],
      ["CCBot", ["ccbot"]],
      ["Bytespider", ["bytespider"]]
    ];
    const namedBots = knownBots
      .filter(([, needles]) => needles.some((needle) => lowered.includes(needle)))
      .map(([label]) => label);

    if (signals.length > 0) {
      const botNote = namedBots.length > 0 ? `. AI bot rules for: ${namedBots.join(", ")}` : "";
      return {
        ...shared,
        status: "pass",
        message: `Content signals found: ${signals.join(", ")}${botNote}`,
        metadata: { signals, namedBots }
      };
    }
    if (namedBots.length >= 2) {
      return {
        ...shared,
        status: "warn",
        message: `No content signals directives, but has AI bot rules for: ${namedBots.join(", ")}`,
        suggestion: "Consider adding Content Signals directives (ai-train, ai-input) for more granular control over how AI agents use your content, beyond simple allow/disallow.",
        metadata: { signals, namedBots }
      };
    }
    return {
      ...shared,
      status: "info",
      message: "No AI-specific content signals or bot directives found in robots.txt",
      suggestion: "Add Content Signals directives to robots.txt to explicitly declare how AI agents may use your content. This gives you granular control over training, citations, and search indexing."
    };
  }
};
1984
/**
 * Agent-protocol check: inspects the `link` response header of up to ten
 * sampled pages and collects the relation types it declares.
 *
 * - "info": no sampled page carries a Link header.
 * - "pass": at least one `alternate` or `describedby` relation was found.
 * - "warn": Link headers exist but none of those relations are present.
 *
 * Relation types are read from both quoted (`rel="alternate"`) and unquoted
 * (`rel=alternate`) parameters and compared case-insensitively.
 */
var linkHeaders = {
  id: "link-headers",
  name: "Link Headers (RFC 8288)",
  category: "agent-protocols",
  description: "Checks for Link HTTP headers that help agents discover related resources",
  weight: 0.4,
  requiresNetwork: true,
  run: async (ctx) => {
    // A rel parameter always follows a ";" separator; its value is either a
    // quoted string (possibly several space-separated types) or a bare token.
    const relParam = /;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,"]+))/gi;

    const pages = ctx.sampledPages.slice(0, 10);
    let withLinkHeaders = 0;
    const relTypes = new Set();
    for (const page of pages) {
      const raw = page.headers["link"];
      if (!raw) continue;
      withLinkHeaders++;
      // Some HTTP clients expose repeated headers as an array; flatten it.
      const linkHeader = Array.isArray(raw) ? raw.join(", ") : String(raw);
      for (const match of linkHeader.matchAll(relParam)) {
        const value = match[1] ?? match[2];
        for (const rel of value.split(/\s+/)) {
          if (rel) relTypes.add(rel.toLowerCase());
        }
      }
    }

    if (withLinkHeaders === 0) {
      return {
        id: "link-headers",
        name: "Link Headers (RFC 8288)",
        category: "agent-protocols",
        status: "info",
        message: "No Link HTTP headers found on sampled pages",
        suggestion: "Add Link headers (RFC 8288) to responses with rel=alternate, rel=canonical, or rel=describedby. This lets AI agents discover alternative representations (markdown, JSON) without parsing HTML."
      };
    }

    const agentUseful = relTypes.has("alternate") || relTypes.has("describedby");
    if (agentUseful) {
      return {
        id: "link-headers",
        name: "Link Headers (RFC 8288)",
        category: "agent-protocols",
        status: "pass",
        message: `Link headers found on ${withLinkHeaders}/${pages.length} pages (rel types: ${[...relTypes].join(", ")})`,
        metadata: { withLinkHeaders, relTypes: [...relTypes] }
      };
    }
    return {
      id: "link-headers",
      name: "Link Headers (RFC 8288)",
      category: "agent-protocols",
      status: "warn",
      message: `Link headers found on ${withLinkHeaders}/${pages.length} pages but missing agent-useful rel types`,
      suggestion: "Add rel=alternate (to point to markdown/JSON versions) and rel=describedby Link headers. These help AI agents find the best representation of your content.",
      metadata: { withLinkHeaders, relTypes: [...relTypes] }
    };
  }
};
2042
/**
 * Agent-protocol check: validates the agent skills index text held in
 * `ctx.agentSkillsIndex`.
 *
 * The index may be a bare JSON array or an object with a `skills` array.
 * Missing index => "info"; unparseable or empty index => "warn";
 * at least one skill => "pass".
 */
var agentSkillsIndex = {
  id: "agent-skills-index",
  name: "Agent Skills Index",
  category: "agent-protocols",
  description: "Checks for an agent skills index at .well-known/agent-skills/index.json",
  weight: 0.4,
  run: async (ctx) => {
    const shared = {
      id: "agent-skills-index",
      name: "Agent Skills Index",
      category: "agent-protocols"
    };
    const raw = ctx.agentSkillsIndex;
    if (!raw) {
      return {
        ...shared,
        status: "info",
        message: "No agent skills index found at /.well-known/agent-skills/index.json",
        suggestion: "Add an agent-skills/index.json at /.well-known/ to declare what capabilities agents can use on your site. This is an emerging standard for agentic web interactions."
      };
    }
    try {
      const parsed = JSON.parse(raw);
      let skillList = [];
      if (Array.isArray(parsed)) {
        skillList = parsed;
      } else if (Array.isArray(parsed.skills)) {
        skillList = parsed.skills;
      }
      const skillCount = skillList.length;
      if (skillCount === 0) {
        return {
          ...shared,
          status: "warn",
          message: "Agent skills index found but contains no skills",
          suggestion: "Add skill definitions to your agent-skills index describing what actions agents can perform on your site."
        };
      }
      const plural = skillCount === 1 ? "" : "s";
      return {
        ...shared,
        status: "pass",
        message: `Agent skills index found with ${skillCount} skill${plural} declared`,
        metadata: { skillCount }
      };
    } catch {
      // Reached for malformed JSON and for any error raised while reading the parsed value.
      return {
        ...shared,
        status: "warn",
        message: "Agent skills index found but contains invalid JSON",
        suggestion: "Fix the JSON syntax in your /.well-known/agent-skills/index.json file."
      };
    }
  }
};
2092
+ var agentsMd = {
2093
+ id: "agents-md",
2094
+ name: "AGENTS.md",
2095
+ category: "agent-protocols",
2096
+ description: "Checks for an AGENTS.md or AGENT.md file that guides coding agents on how to work with the project",
2097
+ weight: 0.7,
2098
+ run: async (ctx) => {
2099
+ if (ctx.mode === "remote") {
2100
+ return {
2101
+ id: "agents-md",
2102
+ name: "AGENTS.md",
2103
+ category: "agent-protocols",
2104
+ status: "skip",
2105
+ message: "AGENTS.md is a repo-root file (run on a local path to check it)"
2106
+ };
2107
+ }
2108
+ if (!ctx.agentsMd) {
2109
+ return {
2110
+ id: "agents-md",
2111
+ name: "AGENTS.md",
2112
+ category: "agent-protocols",
2113
+ status: "fail",
2114
+ message: "No AGENTS.md or AGENT.md found",
2115
+ suggestion: "Add an AGENTS.md at the project root. This is the universal agent configuration file \u2014 a README for AI coding agents. Include build/test commands, architecture overview, conventions, and any gotchas. Used by 60k+ open-source projects."
2116
+ };
2117
+ }
2118
+ const content = ctx.agentsMd;
2119
+ const lines = content.split("\n").filter((l) => l.trim().length > 0);
2120
+ const headings = content.match(/^#{1,3}\s+.+$/gm) ?? [];
2121
+ const hasBuildInfo = /\b(build|compile|install|setup)\b/i.test(content);
2122
+ const hasTestInfo = /\b(test|spec|jest|vitest|pytest|cargo test)\b/i.test(content);
2123
+ const hasArchInfo = /\b(architecture|structure|directory|folder|module|package)\b/i.test(content);
2124
+ const hasConventions = /\b(convention|style|pattern|rule|guideline|lint)\b/i.test(content);
2125
+ const hasCodeBlocks = /```/.test(content);
2126
+ const signals = [
2127
+ hasBuildInfo && "build",
2128
+ hasTestInfo && "test",
2129
+ hasArchInfo && "architecture",
2130
+ hasConventions && "conventions",
2131
+ hasCodeBlocks && "code examples"
2132
+ ].filter(Boolean);
2133
+ if (lines.length < 5) {
2134
+ return {
2135
+ id: "agents-md",
2136
+ name: "AGENTS.md",
2137
+ category: "agent-protocols",
2138
+ status: "warn",
2139
+ message: `AGENTS.md found but very short (${lines.length} lines)`,
2140
+ suggestion: "Expand your AGENTS.md with build/test commands, architecture overview, code conventions, and common gotchas. The more context you give coding agents, the better they'll work with your project.",
2141
+ metadata: { lines: lines.length, headings: headings.length }
2142
+ };
2143
+ }
2144
+ if (signals.length < 2) {
2145
+ return {
2146
+ id: "agents-md",
2147
+ name: "AGENTS.md",
2148
+ category: "agent-protocols",
2149
+ status: "warn",
2150
+ message: `AGENTS.md found (${lines.length} lines, ${headings.length} sections) but missing key info`,
2151
+ suggestion: `Your AGENTS.md covers ${signals.length > 0 ? signals.join(", ") : "limited topics"}. Consider adding: ${[!hasBuildInfo && "build commands", !hasTestInfo && "test instructions", !hasArchInfo && "architecture overview", !hasConventions && "code conventions"].filter(Boolean).join(", ")}.`,
2152
+ metadata: { lines: lines.length, headings: headings.length, signals }
2153
+ };
2154
+ }
2155
+ return {
2156
+ id: "agents-md",
2157
+ name: "AGENTS.md",
2158
+ category: "agent-protocols",
2159
+ status: "pass",
2160
+ message: `AGENTS.md found (${lines.length} lines, ${headings.length} sections) covering: ${signals.join(", ")}`,
2161
+ metadata: { lines: lines.length, headings: headings.length, signals }
2162
+ };
2163
+ }
2164
+ };
2165
// Every check in the "agent-protocols" category, listed in the order it is registered.
var agentProtocolChecks = Array.of(
  mcpServerCard,
  apiCatalog,
  contentSignals,
  linkHeaders,
  agentSkillsIndex,
  agentsMd
);
2173
+
2174
+ // src/checks/index.ts
2175
// Flat list of every registered check, built by concatenating the
// per-category lists in this fixed order.
var ALL_CHECKS = [].concat(
  contentDiscoverabilityChecks,
  markdownAvailabilityChecks,
  pageSizeChecks,
  contentStructureChecks,
  urlStabilityChecks,
  authenticationChecks,
  geoSignalChecks,
  agentProtocolChecks
);
2185
+
2186
+ // src/utils/local.ts
2187
+ import { readFileSync, readdirSync, existsSync } from "fs";
2188
+ import { join, relative, extname } from "path";
2189
/**
 * Recursively collects the paths of files under `dir` whose lowercased
 * extension is in `extensions`.
 *
 * Entries whose name starts with "." and the directories "node_modules" and
 * "dist" are ignored. Recursion stops once `maxDepth` reaches zero, and any
 * filesystem error simply ends the walk of the current directory.
 *
 * @param {string} dir - Directory to scan.
 * @param {Set<string>} extensions - Extensions to keep, including the dot.
 * @param {number} [maxDepth=10] - Remaining directory levels to descend.
 * @returns {string[]} Matching file paths, in directory-listing order.
 */
var walkDir = (dir, extensions, maxDepth = 10) => {
  if (maxDepth <= 0) return [];
  const isIgnored = (name) => name.startsWith(".") || name === "node_modules" || name === "dist";
  const collected = [];
  try {
    readdirSync(dir, { withFileTypes: true })
      .filter((entry) => !isIgnored(entry.name))
      .forEach((entry) => {
        const entryPath = join(dir, entry.name);
        if (entry.isDirectory()) {
          collected.push(...walkDir(entryPath, extensions, maxDepth - 1));
          return;
        }
        const wanted = entry.isFile() && extensions.has(extname(entry.name).toLowerCase());
        if (wanted) collected.push(entryPath);
      });
  } catch {
    // Best-effort: an unreadable directory contributes whatever was gathered so far.
  }
  return collected;
};
2209
/**
 * Reads a file as UTF-8 text when it exists.
 *
 * @param {string} path - File path to read.
 * @returns {string | undefined} The file contents, or undefined when the path
 *   does not exist or reading it fails.
 */
var readIfExists = (path) => {
  let contents = void 0;
  try {
    contents = existsSync(path) ? readFileSync(path, "utf-8") : void 0;
  } catch {
    // Best-effort read: any failure is reported as "no content".
    contents = void 0;
  }
  return contents;
};
2218
/**
 * Builds the audit context for a local directory without any network access.
 *
 * Well-known files (robots.txt, llms.txt, llms-full.txt, sitemap.xml, the
 * `.well-known` agent files, AGENTS.md / AGENT.md) are read when present.
 * HTML and Markdown files found under `dirPath` are sampled (HTML first, up
 * to `config.sampleSize`), and Markdown files are wrapped into minimal HTML so
 * HTML-based checks can run on them.
 *
 * Sampled files that cannot be read are skipped rather than aborting the
 * audit, in line with the best-effort reads used everywhere else here.
 *
 * @param {string} dirPath - Directory to audit.
 * @param {{ sampleSize: number }} config - Audit configuration; only `sampleSize` is read here.
 * @returns {object} The context object consumed by the checks (mode "local").
 */
var buildLocalContext = (dirPath, config) => {
  const baseUrl = new URL(`file://${dirPath}`);
  const readFromRoot = (...segments) => readIfExists(join(dirPath, ...segments));

  const robotsTxt = readFromRoot("robots.txt");
  const llmsTxt = readFromRoot("llms.txt");
  const llmsFullTxt = readFromRoot("llms-full.txt");
  const sitemapXml = readFromRoot("sitemap.xml");
  const mcpServerCard2 = readFromRoot(".well-known", "mcp", "server-card.json");
  const apiCatalog2 = readFromRoot(".well-known", "api-catalog");
  const agentSkillsIndex2 = readFromRoot(".well-known", "agent-skills", "index.json");
  const agentsMd2 = readFromRoot("AGENTS.md") ?? readFromRoot("AGENT.md");

  const sitemapUrls = sitemapXml ? parseSitemapUrls(sitemapXml) : [];
  if (!sitemapXml && robotsTxt) {
    // Fall back to the first "Sitemap:" line in robots.txt, but only when it
    // names a local path; remote sitemap URLs are not fetched in local mode.
    const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
    if (sitemapMatch) {
      const sitemapPath = sitemapMatch[1].trim();
      const localSitemapPath = sitemapPath.startsWith("http") ? null : join(dirPath, sitemapPath.replace(/^\//, ""));
      if (localSitemapPath) {
        const altSitemap = readIfExists(localSitemapPath);
        if (altSitemap) {
          sitemapUrls.push(...parseSitemapUrls(altSitemap));
        }
      }
    }
  }

  const htmlFiles = walkDir(dirPath, new Set([".html", ".htm"]));
  const mdFiles = walkDir(dirPath, new Set([".md", ".mdx"]));
  const sampled = [...htmlFiles, ...mdFiles].slice(0, config.sampleSize);

  const sampledPages = [];
  for (const filePath of sampled) {
    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch {
      // Unreadable file (removed since the walk, permissions, ...): skip it.
      continue;
    }
    const ext = extname(filePath).toLowerCase();
    const isMarkdown = ext === ".md" || ext === ".mdx";
    sampledPages.push({
      url: `file://${filePath}`,
      html: isMarkdown ? wrapMarkdownAsHtml(content, relative(dirPath, filePath)) : content,
      statusCode: 200,
      headers: {},
      markdown: isMarkdown ? content : void 0,
      fetchTime: 0
    });
  }

  return {
    mode: "local",
    targetUrl: dirPath,
    baseUrl,
    sitemapUrls,
    sampledPages,
    robotsTxt,
    llmsTxt,
    llmsFullTxt,
    sitemapXml,
    allUrls: sampled.map((filePath) => `file://${filePath}`),
    mcpServerCard: mcpServerCard2,
    apiCatalog: apiCatalog2,
    agentSkillsIndex: agentSkillsIndex2,
    agentsMd: agentsMd2
  };
};
2278
/**
 * Converts Markdown into a rough HTML document so HTML-based checks can run
 * on Markdown files. This is a line-oriented approximation, not a full
 * Markdown parser.
 *
 * Fenced code blocks are lifted out first and restored at the end, so their
 * contents are never mistaken for headings, lists or paragraphs, and both the
 * code and the title are HTML-escaped so they cannot inject markup.
 *
 * @param {string} md - Markdown source.
 * @param {string} title - Text placed in the document's <title>.
 * @returns {string} A complete HTML document string.
 */
var wrapMarkdownAsHtml = (md, title) => {
  const escapeHtml = (text) =>
    String(text).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

  // Swap each fenced block for an HTML-comment placeholder. The placeholder
  // starts with "<", so the paragraph rule below leaves it alone.
  const codeBlocks = [];
  let html = md.replace(/```(\w*)\n([\s\S]*?)```/g, (_match, lang, code) => {
    const index = codeBlocks.push(`<pre><code class="language-${lang}">${escapeHtml(code)}</code></pre>`) - 1;
    return `<!--md-code-block-${index}-->`;
  });

  // Longest heading marker first, so "###" is not consumed as "#".
  for (let level = 6; level >= 1; level--) {
    html = html.replace(new RegExp(`^#{${level}}\\s+(.+)$`, "gm"), `<h${level}>$1</h${level}>`);
  }
  html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2">$1</a>');
  html = html.replace(/^>\s+(.+)$/gm, "<blockquote>$1</blockquote>");
  html = html.replace(/`([^`]+)`/g, "<code>$1</code>");
  html = html.replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
  html = html.replace(/^[-*]\s+(.+)$/gm, "<li>$1</li>");
  html = html.replace(/^\d+\.\s+(.+)$/gm, "<li>$1</li>");
  // Any remaining line that does not already start with a tag is a paragraph.
  html = html.replace(/^([^<\n].+)$/gm, "<p>$1</p>");

  // Restore code blocks with a function replacement so "$" sequences in the
  // code are inserted literally.
  html = html.replace(/<!--md-code-block-(\d+)-->/g, (match, index) => codeBlocks[Number(index)] ?? match);

  return `<!DOCTYPE html><html><head><title>${escapeHtml(title)}</title></head><body><main>${html}</main></body></html>`;
};
2296
+
2297
+ // src/index.ts
2298
/**
 * Maps a 0-100 score to a letter grade.
 *
 * @param {number} score - Overall score.
 * @returns {"A+" | "A" | "B" | "C" | "D" | "F"} The first grade whose minimum
 *   score is met, or "F" when none is.
 */
var computeGrade = (score) => {
  // Minimum score for each grade, highest first.
  const gradeFloors = [
    [95, "A+"],
    [85, "A"],
    [70, "B"],
    [55, "C"],
    [40, "D"]
  ];
  for (const [floor, grade] of gradeFloors) {
    if (score >= floor) return grade;
  }
  return "F";
};
2306
/**
 * Builds the audit context for a remote site.
 *
 * Steps:
 * 1. Fetch the well-known text resources from the site origin in parallel;
 *    a resource is kept only when its fetch fulfilled with status 200.
 * 2. Collect sitemap URLs from /sitemap.xml, or from the first "Sitemap:"
 *    entry in robots.txt when /sitemap.xml is unavailable.
 * 3. Choose the pages to sample: a uniform random sample of the sitemap, or,
 *    without a sitemap, same-origin links found on the target page. The
 *    target URL is always included and the list never exceeds
 *    `config.sampleSize` (with a minimum of the target itself).
 * 4. Fetch the sampled pages and, for each, try content negotiation for a
 *    Markdown representation.
 *
 * @param {string} targetUrl - URL being audited.
 * @param {object} config - Audit configuration (`sampleSize`, `onEvent`, plus
 *   whatever the fetch helpers read from it).
 * @returns {Promise<object>} The context object consumed by the checks (mode "remote").
 */
var buildRemoteContext = async (targetUrl, config) => {
  const emit = config.onEvent ?? (() => {
  });
  emit({ type: "phase", phase: "fetching" });
  const baseUrl = new URL(targetUrl);
  const origin = baseUrl.origin;

  const wellKnownPaths = [
    "/robots.txt",
    "/llms.txt",
    "/llms-full.txt",
    "/sitemap.xml",
    "/.well-known/mcp/server-card.json",
    "/.well-known/api-catalog",
    "/.well-known/agent-skills/index.json"
  ];
  const settled = await Promise.allSettled(
    wellKnownPaths.map((path) => fetchText(`${origin}${path}`, config))
  );
  const textOf = (result) =>
    result.status === "fulfilled" && result.value?.statusCode === 200 ? result.value.text : void 0;
  const [
    robotsTxt,
    llmsTxt,
    llmsFullTxt,
    sitemapXml,
    mcpServerCard2,
    apiCatalog2,
    agentSkillsIndex2
  ] = settled.map(textOf);
  // AGENTS.md is a repository file and is never fetched from a remote site.
  const agentsMd2 = void 0;

  const sitemapUrls = sitemapXml ? parseSitemapUrls(sitemapXml) : [];
  if (!sitemapXml && robotsTxt) {
    const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
    if (sitemapMatch) {
      try {
        const altSitemap = await fetchText(sitemapMatch[1].trim(), config);
        if (altSitemap?.statusCode === 200) {
          sitemapUrls.push(...parseSitemapUrls(altSitemap.text));
        }
      } catch {
        // The robots.txt sitemap is an optional fallback: a bad URL or a
        // failed fetch must not abort the audit.
      }
    }
  }

  const sampleSize = Math.max(config.sampleSize, 1);
  let pagesToSample = [];
  if (sitemapUrls.length > 0) {
    // Fisher-Yates shuffle: an unbiased sample, unlike sorting with a random comparator.
    const shuffled = [...sitemapUrls];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    pagesToSample = shuffled.slice(0, sampleSize);
  } else {
    const mainPage = await fetchPage(targetUrl, config);
    const links = new Set();
    for (const [, href] of (mainPage?.html ?? "").matchAll(/<a[^>]+href=["']([^"'#]+)["']/gi)) {
      try {
        // Relative links are relative to the page they appear on, not the origin.
        const resolved = new URL(href, baseUrl);
        if (resolved.origin === origin && resolved.href !== baseUrl.href) {
          links.add(resolved.href);
        }
      } catch {
        // Ignore hrefs that are not valid URLs.
      }
    }
    pagesToSample = [targetUrl, ...[...links].slice(0, sampleSize - 1)];
  }
  if (!pagesToSample.includes(targetUrl)) {
    // Always audit the target itself, without growing past the sample size.
    pagesToSample = [targetUrl, ...pagesToSample].slice(0, sampleSize);
  }

  const sampledPages = await fetchMany(pagesToSample, config);
  emit({ type: "context-ready", pageCount: sampledPages.length });
  for (const page of sampledPages) {
    const mdResult = await fetchWithContentNegotiation(page.url, "text/markdown", config);
    const contentType = mdResult?.contentType ?? "";
    if (mdResult && mdResult.statusCode === 200 && (contentType.includes("text/markdown") || contentType.includes("text/plain"))) {
      page.markdown = mdResult.text;
    }
  }

  return {
    mode: "remote",
    targetUrl,
    baseUrl,
    sitemapUrls,
    sampledPages,
    robotsTxt,
    llmsTxt,
    llmsFullTxt,
    sitemapXml,
    allUrls: pagesToSample,
    mcpServerCard: mcpServerCard2,
    apiCatalog: apiCatalog2,
    agentSkillsIndex: agentSkillsIndex2,
    agentsMd: agentsMd2
  };
};
2395
/**
 * Returns a shallow copy of `obj` without the own enumerable properties whose
 * value is `undefined`. Other falsy values (null, 0, "", false) are kept.
 *
 * @param {object} obj - Source object; not modified.
 * @returns {object} New object holding only the defined properties.
 */
var stripUndefined = (obj) => {
  const defined = {};
  Object.entries(obj).forEach(([key, value]) => {
    if (value === void 0) return;
    defined[key] = value;
  });
  return defined;
};
2404
/**
 * Runs the selected checks against a prepared context and assembles the
 * audit result.
 *
 * Checks are filtered by `config.categories` (when fewer categories than
 * exist are requested), and checks flagged `requiresNetwork` are dropped in
 * local mode. A check that throws is recorded once as "skip" with the error
 * message. Listener (`onEvent`) calls are made outside the try/catch, so a
 * throwing listener can no longer cause a successful check to be recorded a
 * second time as skipped.
 *
 * Scoring: "pass" = 1, "warn" = 0.5, "fail" = 0, weighted by each check's
 * `weight` (0.5 when unknown); "skip" and "info" results are not scored. The
 * overall score is capped at 50 when the "auth-gate-detection" check fails.
 *
 * @param {object} ctx - Context from buildLocalContext / buildRemoteContext.
 * @param {object} config - Full audit configuration.
 * @param {number} start - Epoch milliseconds at which the audit began.
 * @returns {Promise<object>} The audit result (score, grade, checks, summary, categories, latency).
 */
var runAudit = async (ctx, config, start) => {
  const emit = config.onEvent ?? (() => {
  });
  emit({ type: "phase", phase: "checking" });

  const allCategoryCount = CHECK_CATEGORIES.length;
  let checks = config.categories.length < allCategoryCount ? ALL_CHECKS.filter((c) => config.categories.includes(c.category)) : ALL_CHECKS;
  if (ctx.mode === "local") {
    checks = checks.filter((c) => !c.requiresNetwork);
  }

  const results = [];
  for (const check of checks) {
    emit({ type: "check-start", check: { id: check.id, name: check.name, category: check.category } });
    let result;
    try {
      result = await check.run(ctx);
    } catch (error) {
      result = {
        id: check.id,
        name: check.name,
        category: check.category,
        status: "skip",
        message: `Check failed: ${error instanceof Error ? error.message : "Unknown error"}`
      };
    }
    // Exactly one result and one completion event per check.
    results.push(result);
    emit({ type: "check-complete", result });
  }

  emit({ type: "phase", phase: "scoring" });
  const countWithStatus = (...statuses) => results.filter((r) => statuses.includes(r.status)).length;
  const passed = countWithStatus("pass");
  const warned = countWithStatus("warn");
  const failed = countWithStatus("fail");
  const skipped = countWithStatus("skip", "info");
  const total = results.length;

  const checkWeights = Object.fromEntries(checks.map((c) => [c.id, c.weight]));
  // Weighted 0-100 score over the scorable results of a subset; 0 when none are scorable.
  const weightedScore = (subset) => {
    let weightedSum = 0;
    let totalWeight = 0;
    for (const result of subset) {
      if (result.status === "skip" || result.status === "info") continue;
      const weight = checkWeights[result.id] ?? 0.5;
      const score = result.status === "pass" ? 1 : result.status === "warn" ? 0.5 : 0;
      weightedSum += score * weight;
      totalWeight += weight;
    }
    return totalWeight > 0 ? Math.round(weightedSum / totalWeight * 100) : 0;
  };

  let overallScore = weightedScore(results);
  const authGate = results.find((r) => r.id === "auth-gate-detection");
  if (authGate?.status === "fail" && overallScore > 50) {
    overallScore = 50;
  }

  const categories = {};
  const activeCategories = config.categories.filter(
    (cat) => results.some((r) => r.category === cat)
  );
  for (const cat of activeCategories) {
    const catResults = results.filter((r) => r.category === cat);
    categories[cat] = {
      score: weightedScore(catResults),
      checks: catResults.length,
      passed: catResults.filter((r) => r.status === "pass").length
    };
  }

  return {
    url: ctx.targetUrl,
    timestamp: new Date().toISOString(),
    overall_score: overallScore,
    grade: computeGrade(overallScore),
    checks: results,
    summary: { total, passed, warned, failed, skipped },
    categories,
    latency_ms: Date.now() - start
  };
};
2486
/**
 * Audits a remote site: merges the caller's defined options over the
 * defaults, builds the remote context, then runs the checks.
 *
 * @param {string} targetUrl - URL to audit.
 * @param {object} [config={}] - Partial configuration; undefined values are ignored.
 * @returns {Promise<object>} The audit result produced by runAudit.
 */
var audit = async (targetUrl, config = {}) => {
  const startedAt = Date.now();
  const settings = Object.assign({}, DEFAULT_CONFIG, stripUndefined(config));
  const context = await buildRemoteContext(targetUrl, settings);
  return runAudit(context, settings, startedAt);
};
2492
/**
 * Audits a local directory: merges the caller's defined options over the
 * defaults, builds the local context (emitting the "fetching" phase and
 * "context-ready" events around it), then runs the checks.
 *
 * @param {string} dirPath - Directory to audit.
 * @param {object} [config={}] - Partial configuration; undefined values are ignored.
 * @returns {Promise<object>} The audit result produced by runAudit.
 */
var auditLocal = async (dirPath, config = {}) => {
  const startedAt = Date.now();
  const settings = Object.assign({}, DEFAULT_CONFIG, stripUndefined(config));
  const notify = settings.onEvent ?? (() => {
  });
  notify({ type: "phase", phase: "fetching" });
  const context = buildLocalContext(dirPath, settings);
  notify({ type: "context-ready", pageCount: context.sampledPages.length });
  return runAudit(context, settings, startedAt);
};
2502
+ export {
2503
+ ALL_CHECKS,
2504
+ audit,
2505
+ auditLocal,
2506
+ buildLocalContext
2507
+ };