@agentimization/core 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/index.js +416 -167
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,14 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://www.npmjs.com/package/agentimization)
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
╰───────────────────────────────────────────────╯
|
|
12
|
-
```
|
|
5
|
+
<p align="center">
|
|
6
|
+
<picture>
|
|
7
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/antlio/agentimization/main/assets/hero-loading-dark.svg">
|
|
8
|
+
<img src="https://raw.githubusercontent.com/antlio/agentimization/main/assets/hero-loading-light.svg" alt="agentimization" width="620">
|
|
9
|
+
</picture>
|
|
10
|
+
</p>
|
|
13
11
|
|
|
14
12
|
geo audit for agent-ready websites and projects.
|
|
15
13
|
|
package/dist/index.js
CHANGED
|
@@ -52,6 +52,142 @@ var DEFAULT_CONFIG = {
|
|
|
52
52
|
}
|
|
53
53
|
};
|
|
54
54
|
|
|
55
|
+
// src/utils/html.ts
|
|
56
|
+
/**
 * Reduce an HTML string to plain text.
 *
 * Steps, in order: drop <script> elements, drop <style> elements, turn every
 * remaining tag into a single space, collapse whitespace runs, trim the ends.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The text left after the markup is removed.
 */
var stripHtml = (html) => {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, "");
  // Tags become spaces (not empty strings) so adjacent words do not fuse.
  const tagsAsSpaces = withoutStyles.replace(/<[^>]+>/g, " ");
  return tagsAsSpaces.replace(/\s+/g, " ").trim();
};
|
|
57
|
+
/**
 * Collect the destinations of inline markdown links (`[text](destination)`).
 *
 * Compared with a bare capture of everything between the parentheses, this
 * also normalizes the destination so callers get something usable as a URL:
 *  - surrounding whitespace is trimmed,
 *  - an optional quoted link title (`[a](url "title")`) is removed,
 *  - an angle-bracket wrapper (`[a](<url>)`) is removed,
 *  - destinations that end up empty are skipped.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {string[]} Link destinations in document order (not resolved, not deduplicated).
 */
var extractMarkdownLinks = (markdown) => {
  const links = [];
  const linkRegex = /\[.+?\]\(([^)]+)\)/g;
  for (const match of markdown.matchAll(linkRegex)) {
    const destination = match[1]
      .trim()
      // Drop a trailing "title" / 'title' so it is not treated as part of the URL.
      .replace(/\s+(?:"[^"]*"|'[^']*')$/, "")
      // Unwrap the <destination> form.
      .replace(/^<(.*)>$/, "$1");
    if (destination) {
      links.push(destination);
    }
  }
  return links;
};
|
|
66
|
+
/**
 * Collect the href of every quoted `<a ... href="...">` occurrence in `html`,
 * resolved against `baseUrl`.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string|URL} baseUrl - Base used to resolve relative hrefs.
 * @returns {string[]} Absolute URLs in document order; hrefs that cannot be
 *   parsed as URLs are left out.
 */
var extractLinks = (html, baseUrl) => {
  const anchorHref = /<a[^>]+href=["']([^"']+)["']/gi;
  const absolute = [];
  for (const [, href] of html.matchAll(anchorHref)) {
    try {
      absolute.push(new URL(href, baseUrl).href);
    } catch {
      // Best effort: an href the URL parser rejects is simply skipped.
    }
  }
  return absolute;
};
|
|
79
|
+
/**
 * Build a map of `<meta>` tag values keyed by lower-cased `name`/`property`.
 *
 * Two passes are made because the attributes may appear in either order:
 * first `name|property` followed by `content`, then `content` followed by
 * `name|property`. Entries from the second pass overwrite the first.
 *
 * Attribute values are matched per quote style, so a double-quoted value may
 * contain an apostrophe (`content="It's great"`) and a single-quoted value
 * may contain double quotes, without being cut short. Empty values are not
 * matched.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {Object<string, string>} Map from meta key to its content.
 */
var extractMetaTags = (html) => {
  const meta = {};
  // Groups: 1/2 = key (double/single quoted), 3/4 = content (double/single quoted).
  const keyThenContent = /<meta[^>]+(?:name|property)=(?:"([^"]+)"|'([^']+)')[^>]+content=(?:"([^"]+)"|'([^']+)')/gi;
  for (const m of html.matchAll(keyThenContent)) {
    meta[(m[1] ?? m[2]).toLowerCase()] = m[3] ?? m[4];
  }
  // Groups: 1/2 = content (double/single quoted), 3/4 = key (double/single quoted).
  const contentThenKey = /<meta[^>]+content=(?:"([^"]+)"|'([^']+)')[^>]+(?:name|property)=(?:"([^"]+)"|'([^']+)')/gi;
  for (const m of html.matchAll(contentThenKey)) {
    meta[(m[3] ?? m[4]).toLowerCase()] = m[1] ?? m[2];
  }
  return meta;
};
|
|
92
|
+
/**
 * Parse every `<script type="application/ld+json">` payload found in `html`.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {Array<*>} Parsed JSON values in document order; payloads that are
 *   not valid JSON are left out.
 */
var extractJsonLd = (html) => {
  const ldScript = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const parsed = [];
  for (const [, payload] of html.matchAll(ldScript)) {
    try {
      parsed.push(JSON.parse(payload));
    } catch {
      // Best effort: a malformed JSON-LD block is ignored.
    }
  }
  return parsed;
};
|
|
104
|
+
/**
 * Read the value of a quoted attribute from a tag's attribute text.
 *
 * The attribute name must stand on its own: it may not be the tail of a
 * longer name such as `data-src`, `xlink:href` or `ng.src`. The name is
 * matched case-insensitively and is escaped before being placed into the
 * pattern, so regex metacharacters in `name` are treated literally.
 *
 * @param {string} attrs - The text between the tag name and the closing `>`.
 * @param {string} name - Attribute name to look up.
 * @returns {string|undefined} The attribute value (possibly empty), or
 *   `undefined` when no quoted attribute of that name is present.
 */
var readAttr = (attrs, name) => {
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // The lookbehind rejects a preceding word character, ':', '.' or '-', so
  // "src" does not match inside "data-src".
  const re = new RegExp(`(?<![\\w:.-])${escapedName}=(?:"([^"]*)"|'([^']*)')`, "i");
  const m = attrs.match(re);
  if (!m) return void 0;
  return m[1] ?? m[2];
};
|
|
110
|
+
/**
 * List the `<img>` tags in `html` that carry a quoted `src` attribute.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {Array<{src: string, alt: (string|undefined)}>} One entry per
 *   image in document order; `alt` is `undefined` when the tag has no quoted
 *   `alt` attribute. Tags without a `src` are left out.
 */
var extractImages = (html) => {
  const found = [];
  for (const [, attrText] of html.matchAll(/<img\b([^>]*)>/gi)) {
    const src = readAttr(attrText, "src");
    if (src !== void 0) {
      found.push({ src, alt: readAttr(attrText, "alt") });
    }
  }
  return found;
};
|
|
122
|
+
/**
 * List the `<h1>`–`<h6>` headings in `html`.
 *
 * A heading is only matched when its closing tag has the same level as its
 * opening tag.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {Array<{level: number, text: string}>} Headings in document
 *   order, with `text` being the heading's inner markup passed through
 *   `stripHtml`.
 */
var extractHeadings = (html) => {
  const headingTag = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
  return Array.from(html.matchAll(headingTag), ([, depth, inner]) => ({
    level: parseInt(depth, 10),
    text: stripHtml(inner).trim()
  }));
};
|
|
134
|
+
/**
 * Decide whether `html` already contains a meaningful amount of text.
 *
 * Script and style elements are removed, the rest is reduced to text with
 * `stripHtml`, and the result must be longer than 100 characters.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {boolean} `true` when more than 100 characters of text remain.
 */
var hasServerRenderedContent = (html) => {
  const MIN_TEXT_LENGTH = 100;
  const scriptless = html.replace(/<script[\s\S]*?<\/script>/gi, "");
  const markupOnly = scriptless.replace(/<style[\s\S]*?<\/style>/gi, "");
  return stripHtml(markupOnly).length > MIN_TEXT_LENGTH;
};
|
|
139
|
+
/**
 * Estimate where the main content begins, as a fraction of the document.
 *
 * Markers are tried in priority order (not by position): the first pattern
 * that occurs anywhere in `html` decides the result. The first `<p>` tag is
 * the last-resort marker.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {number} Offset of the chosen marker divided by `html.length`,
 *   or 0.5 when no marker is found.
 */
var findContentStartPosition = (html) => {
  const prioritizedMarkers = [
    /<main[\s>]/i,
    /<article[\s>]/i,
    /id=["']content["']/i,
    /id=["']main["']/i,
    /class=["'][^"']*content[^"']*["']/i,
    /role=["']main["']/i,
    /<p[\s>]/i
  ];
  for (const pattern of prioritizedMarkers) {
    const offset = html.search(pattern);
    if (offset !== -1) {
      return offset / html.length;
    }
  }
  return 0.5;
};
|
|
160
|
+
/**
 * Scan markdown for triple-backtick code fences.
 *
 * A fence opens on a line that starts with ``` (optionally followed by a
 * word-character language tag) and closes on a line that is exactly ```
 * after trimming. Lines inside an open fence that are not a bare ``` are
 * ignored, so fences do not nest.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {Array<{lang: string, closed: boolean}>} One entry per fence in
 *   document order; a fence still open at end of input is reported with
 *   `closed: false`.
 */
var extractCodeFences = (markdown) => {
  const fences = [];
  // null means "currently outside a fence"; a string is the open fence's language.
  let openLang = null;
  for (const line of markdown.split("\n")) {
    if (openLang === null) {
      const opener = /^```(\w*)/.exec(line);
      if (opener) {
        openLang = opener[1] ?? "";
      }
      continue;
    }
    if (line.trim() === "```") {
      fences.push({ lang: openLang, closed: true });
      openLang = null;
    }
  }
  if (openLang !== null) {
    fences.push({ lang: openLang, closed: false });
  }
  return fences;
};
|
|
181
|
+
/**
 * Extract the `<loc>` values from sitemap (or sitemap index) XML.
 *
 * Values are returned as usable URLs rather than raw XML text:
 *  - the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 *    `&apos;`) are decoded, so `?a=1&amp;b=2` becomes `?a=1&b=2`;
 *  - a `<![CDATA[...]]>` wrapper is accepted and its content taken verbatim;
 *  - surrounding whitespace is trimmed and empty values are skipped.
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} Location values in document order.
 */
var parseSitemapUrls = (xml) => {
  const XML_ENTITIES = { "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&apos;": "'" };
  const urls = [];
  // Group 1: CDATA content. Group 2: plain character data.
  const regex = /<loc>\s*(?:<!\[CDATA\[([\s\S]*?)\]\]>|([^<]+))\s*<\/loc>/gi;
  for (const match of xml.matchAll(regex)) {
    const value = match[1] !== void 0
      ? match[1]
      // Single-pass replacement, so "&amp;lt;" decodes to "&lt;" and no further.
      : match[2].replace(/&(?:amp|lt|gt|quot|apos);/g, (entity) => XML_ENTITIES[entity]);
    const trimmed = value.trim();
    if (trimmed) {
      urls.push(trimmed);
    }
  }
  return urls;
};
|
|
190
|
+
|
|
55
191
|
// src/checks/content-discoverability.ts
|
|
56
192
|
var llmsTxtExists = {
|
|
57
193
|
id: "llms-txt-exists",
|
|
@@ -92,7 +228,7 @@ var llmsTxtValid = {
|
|
|
92
228
|
name: "llms.txt Valid Structure",
|
|
93
229
|
category: "content-discoverability",
|
|
94
230
|
status: "skip",
|
|
95
|
-
message: "Skipped
|
|
231
|
+
message: "Skipped: no llms.txt found"
|
|
96
232
|
};
|
|
97
233
|
}
|
|
98
234
|
const issues = [];
|
|
@@ -137,7 +273,7 @@ var llmsTxtSize = {
|
|
|
137
273
|
name: "llms.txt Size",
|
|
138
274
|
category: "content-discoverability",
|
|
139
275
|
status: "skip",
|
|
140
|
-
message: "Skipped
|
|
276
|
+
message: "Skipped: no llms.txt found"
|
|
141
277
|
};
|
|
142
278
|
}
|
|
143
279
|
const size = ctx.llmsTxt.length;
|
|
@@ -175,7 +311,7 @@ var llmsTxtFreshness = {
|
|
|
175
311
|
name: "llms.txt Coverage",
|
|
176
312
|
category: "content-discoverability",
|
|
177
313
|
status: "skip",
|
|
178
|
-
message: "Skipped
|
|
314
|
+
message: "Skipped: no llms.txt found"
|
|
179
315
|
};
|
|
180
316
|
}
|
|
181
317
|
if (ctx.sitemapUrls.length === 0) {
|
|
@@ -205,11 +341,9 @@ var llmsTxtFreshness = {
|
|
|
205
341
|
return null;
|
|
206
342
|
}
|
|
207
343
|
};
|
|
208
|
-
const linkRegex = /\[.+?\]\(([^)]+)\)/g;
|
|
209
344
|
const llmsKeys = /* @__PURE__ */ new Set();
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
const k = keyFor(match[1]);
|
|
345
|
+
for (const link of extractMarkdownLinks(ctx.llmsTxt)) {
|
|
346
|
+
const k = keyFor(link);
|
|
213
347
|
if (k) llmsKeys.add(k);
|
|
214
348
|
}
|
|
215
349
|
const sitemapKeys = /* @__PURE__ */ new Set();
|
|
@@ -249,7 +383,7 @@ var llmsTxtFreshness = {
|
|
|
249
383
|
category: "content-discoverability",
|
|
250
384
|
status: coveragePct >= 40 || freshnessPct >= 70 ? "warn" : "fail",
|
|
251
385
|
message: `${message}${missingFromLlms > 0 ? ` \xB7 ${missingFromLlms} sitemap pages not in llms.txt` : ""}${staleInLlms > 0 ? ` \xB7 ${staleInLlms} llms.txt links not in sitemap` : ""}`,
|
|
252
|
-
suggestion: coveragePct < freshnessPct ? "Add missing sitemap pages to llms.txt to improve AI agent discoverability." : "Some llms.txt links aren't in the sitemap
|
|
386
|
+
suggestion: coveragePct < freshnessPct ? "Add missing sitemap pages to llms.txt to improve AI agent discoverability." : "Some llms.txt links aren't in the sitemap. They may be stale or your sitemap may be incomplete.",
|
|
253
387
|
metadata: {
|
|
254
388
|
coveragePct,
|
|
255
389
|
freshnessPct,
|
|
@@ -275,15 +409,13 @@ var llmsTxtLinksResolve = {
|
|
|
275
409
|
name: "llms.txt Links Resolve",
|
|
276
410
|
category: "content-discoverability",
|
|
277
411
|
status: "skip",
|
|
278
|
-
message: "Skipped
|
|
412
|
+
message: "Skipped: no llms.txt found"
|
|
279
413
|
};
|
|
280
414
|
}
|
|
281
|
-
const linkRegex = /\[.+?\]\(([^)]+)\)/g;
|
|
282
415
|
const urls = [];
|
|
283
|
-
|
|
284
|
-
while ((match = linkRegex.exec(ctx.llmsTxt)) !== null) {
|
|
416
|
+
for (const link of extractMarkdownLinks(ctx.llmsTxt)) {
|
|
285
417
|
try {
|
|
286
|
-
const resolved2 = new URL(
|
|
418
|
+
const resolved2 = new URL(link, ctx.baseUrl.origin);
|
|
287
419
|
if (resolved2.origin === ctx.baseUrl.origin) {
|
|
288
420
|
urls.push(resolved2.href);
|
|
289
421
|
}
|
|
@@ -324,7 +456,7 @@ var llmsTxtLinksResolve = {
|
|
|
324
456
|
name: "llms.txt Links Resolve",
|
|
325
457
|
category: "content-discoverability",
|
|
326
458
|
status: "fail",
|
|
327
|
-
message: `${resolved}/${sampled.length} sampled links resolve
|
|
459
|
+
message: `${resolved}/${sampled.length} sampled links resolve, ${sampled.length - resolved} broken`,
|
|
328
460
|
suggestion: "Fix broken links in llms.txt. AI agents will fail to fetch these pages.",
|
|
329
461
|
metadata: { resolved, sampled: sampled.length, total: urls.length }
|
|
330
462
|
};
|
|
@@ -343,15 +475,10 @@ var llmsTxtLinksMarkdown = {
|
|
|
343
475
|
name: "llms.txt Links Markdown",
|
|
344
476
|
category: "content-discoverability",
|
|
345
477
|
status: "skip",
|
|
346
|
-
message: "Skipped
|
|
478
|
+
message: "Skipped: no llms.txt found"
|
|
347
479
|
};
|
|
348
480
|
}
|
|
349
|
-
const
|
|
350
|
-
const urls = [];
|
|
351
|
-
let m;
|
|
352
|
-
while ((m = linkRegex.exec(ctx.llmsTxt)) !== null) {
|
|
353
|
-
urls.push(m[1]);
|
|
354
|
-
}
|
|
481
|
+
const urls = extractMarkdownLinks(ctx.llmsTxt);
|
|
355
482
|
if (urls.length === 0) {
|
|
356
483
|
return {
|
|
357
484
|
id: "llms-txt-links-markdown",
|
|
@@ -399,7 +526,7 @@ var llmsTxtLinksMarkdown = {
|
|
|
399
526
|
category: "content-discoverability",
|
|
400
527
|
status: "fail",
|
|
401
528
|
message: `Only ${mdLinks}/${urls.length} llms.txt links point to .md URLs (${pct}%)`,
|
|
402
|
-
suggestion: "Most llms.txt links are HTML-only. Serve a markdown version at .md URLs and link to those
|
|
529
|
+
suggestion: "Most llms.txt links are HTML-only. Serve a markdown version at .md URLs and link to those, so agents get cleaner content and fewer parse failures.",
|
|
403
530
|
metadata: { mdLinks, total: urls.length, pct }
|
|
404
531
|
};
|
|
405
532
|
}
|
|
@@ -501,6 +628,176 @@ var robotsTxtAgentRules = {
|
|
|
501
628
|
};
|
|
502
629
|
}
|
|
503
630
|
};
|
|
631
|
+
/**
 * Check: is an llms-full.txt present?
 *
 * `run` reads `ctx.llmsFullTxt`; when it is truthy the check passes and the
 * message names where the file was found (`ctx.mode === "local"` vs. the
 * origin of `ctx.baseUrl`). Otherwise the result is informational only,
 * since the file is optional.
 */
var llmsFullExists = {
  id: "llms-full-exists",
  name: "llms-full.txt Exists",
  category: "content-discoverability",
  description: "Checks if llms-full.txt (the complete-content variant) is present at the site root",
  weight: 0.4,
  run: async (ctx) => {
    // Fields shared by every result this check can return.
    const identity = {
      id: "llms-full-exists",
      name: "llms-full.txt Exists",
      category: "content-discoverability"
    };
    if (!ctx.llmsFullTxt) {
      return {
        ...identity,
        status: "info",
        message: "No llms-full.txt found (optional)",
        suggestion: "If your llms.txt is large or you want agents to get full content in one fetch, add a /llms-full.txt containing the concatenated markdown of your docs."
      };
    }
    let foundMessage;
    if (ctx.mode === "local") {
      foundMessage = "llms-full.txt found in project root";
    } else {
      foundMessage = `llms-full.txt found at ${ctx.baseUrl.origin}/llms-full.txt`;
    }
    return { ...identity, status: "pass", message: foundMessage };
  }
};
|
|
657
|
+
/**
 * Check: does llms-full.txt look like structured markdown?
 *
 * `run` skips when `ctx.llmsFullTxt` is falsy. Otherwise it passes when the
 * text has at least one level 1–3 ATX heading (`#`, `##` or `###` at the
 * start of a line) and is longer than 600 characters, and warns if either
 * condition fails. A missing heading is reported in preference to a short
 * body.
 */
var llmsFullValid = {
  id: "llms-full-valid",
  name: "llms-full.txt Valid Structure",
  category: "content-discoverability",
  description: "Checks if llms-full.txt has recognizable markdown structure (headings, content)",
  weight: 0.4,
  run: async (ctx) => {
    // Fields shared by every result this check can return.
    const identity = {
      id: "llms-full-valid",
      name: "llms-full.txt Valid Structure",
      category: "content-discoverability"
    };
    const text = ctx.llmsFullTxt;
    if (!text) {
      return { ...identity, status: "skip", message: "Skipped: no llms-full.txt found" };
    }
    const lacksHeadings = !/^#{1,3}\s+/m.test(text);
    const tooShort = !(text.length > 600);
    if (lacksHeadings || tooShort) {
      const problem = lacksHeadings ? "has no markdown headings" : "has little content";
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt found but ${problem}`,
        suggestion: "llms-full.txt should contain the full markdown content of your docs, with headings, so agents can parse it."
      };
    }
    return {
      ...identity,
      status: "pass",
      message: "llms-full.txt has recognizable markdown structure"
    };
  }
};
|
|
694
|
+
/**
 * Check: is llms-full.txt a plausible size?
 *
 * `run` skips when `ctx.llmsFullTxt` is falsy. Otherwise it measures the
 * text's `length` and passes when it lies within 10,000–5,000,000
 * (inclusive); outside that range it warns, with wording that depends on
 * whether the file is too small or too large. The measured size is always
 * reported in `metadata.size`.
 */
var llmsFullSize = {
  id: "llms-full-size",
  name: "llms-full.txt Size",
  category: "content-discoverability",
  description: "Checks if llms-full.txt size is within the expected range (substantial but not excessive)",
  weight: 0.3,
  run: async (ctx) => {
    // Fields shared by every result this check can return.
    const identity = {
      id: "llms-full-size",
      name: "llms-full.txt Size",
      category: "content-discoverability"
    };
    if (!ctx.llmsFullTxt) {
      return { ...identity, status: "skip", message: "Skipped: no llms-full.txt found" };
    }
    const LOWER_BOUND = 1e4;
    const UPPER_BOUND = 5e6;
    const size = ctx.llmsFullTxt.length;
    const shown = size.toLocaleString();
    if (size < LOWER_BOUND) {
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt is only ${shown} characters, smaller than expected for a full-content file`,
        suggestion: "llms-full.txt should contain your complete documentation. If it's this small, llms.txt alone may be enough.",
        metadata: { size }
      };
    }
    if (size > UPPER_BOUND) {
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt is ${shown} characters, large enough to overflow agent context windows`,
        suggestion: "Consider trimming llms-full.txt or splitting content so agents can fetch what fits their context window.",
        metadata: { size }
      };
    }
    return {
      ...identity,
      status: "pass",
      message: `llms-full.txt is ${shown} characters (within expected range)`,
      metadata: { size }
    };
  }
};
|
|
734
|
+
/**
 * Check: do the same-origin links in llms-full.txt resolve?
 *
 * `run` skips when `ctx.llmsFullTxt` is falsy. Otherwise it takes the links
 * returned by `extractMarkdownLinks`, resolves each against
 * `ctx.baseUrl.origin`, keeps those on the same origin, and sends a HEAD
 * request to the first 10. A link counts as resolved when the request
 * settles with a status in the 200–399 range; a rejected request (network
 * error, timeout) counts as broken.
 *
 * Each probe is bounded by `DEFAULT_CONFIG.timeout` (milliseconds, when that
 * value is a finite number) so that one unresponsive URL cannot stall the
 * whole audit.
 */
var llmsFullLinksResolve = {
  id: "llms-full-links-resolve",
  name: "llms-full.txt Links Resolve",
  category: "content-discoverability",
  description: "Checks if links in llms-full.txt return 200 OK",
  weight: 0.4,
  requiresNetwork: true,
  run: async (ctx) => {
    if (!ctx.llmsFullTxt) {
      return {
        id: "llms-full-links-resolve",
        name: "llms-full.txt Links Resolve",
        category: "content-discoverability",
        status: "skip",
        message: "Skipped: no llms-full.txt found"
      };
    }
    const urls = [];
    for (const link of extractMarkdownLinks(ctx.llmsFullTxt)) {
      try {
        const resolved2 = new URL(link, ctx.baseUrl.origin);
        if (resolved2.origin === ctx.baseUrl.origin) {
          urls.push(resolved2.href);
        }
      } catch {
        // Best effort: a link the URL parser rejects is not probed.
      }
    }
    if (urls.length === 0) {
      return {
        id: "llms-full-links-resolve",
        name: "llms-full.txt Links Resolve",
        category: "content-discoverability",
        status: "info",
        message: "No same-origin links found in llms-full.txt"
      };
    }
    // Only a sample is probed to keep the number of requests bounded.
    const sampled = urls.slice(0, 10);
    const timeoutMs = DEFAULT_CONFIG.timeout;
    const results = await Promise.allSettled(
      sampled.map(async (url) => {
        const resp = await fetch(url, {
          method: "HEAD",
          redirect: "follow",
          // A timed-out probe rejects and is therefore counted as broken.
          signal: Number.isFinite(timeoutMs) ? AbortSignal.timeout(timeoutMs) : void 0
        });
        return { url, status: resp.status };
      })
    );
    const resolved = results.filter(
      (r) => r.status === "fulfilled" && r.value.status >= 200 && r.value.status < 400
    ).length;
    if (resolved === sampled.length) {
      return {
        id: "llms-full-links-resolve",
        name: "llms-full.txt Links Resolve",
        category: "content-discoverability",
        status: "pass",
        message: `All ${resolved} sampled same-origin links resolve (${urls.length} total links)`,
        metadata: { resolved, sampled: sampled.length, total: urls.length }
      };
    }
    return {
      id: "llms-full-links-resolve",
      name: "llms-full.txt Links Resolve",
      category: "content-discoverability",
      status: "fail",
      message: `${resolved}/${sampled.length} sampled links resolve, ${sampled.length - resolved} broken`,
      suggestion: "Fix broken links in llms-full.txt. AI agents will fail to fetch these pages.",
      metadata: { resolved, sampled: sampled.length, total: urls.length }
    };
  }
};
|
|
504
801
|
var contentDiscoverabilityChecks = [
|
|
505
802
|
llmsTxtExists,
|
|
506
803
|
llmsTxtValid,
|
|
@@ -508,23 +805,31 @@ var contentDiscoverabilityChecks = [
|
|
|
508
805
|
llmsTxtFreshness,
|
|
509
806
|
llmsTxtLinksResolve,
|
|
510
807
|
llmsTxtLinksMarkdown,
|
|
808
|
+
llmsFullExists,
|
|
809
|
+
llmsFullValid,
|
|
810
|
+
llmsFullSize,
|
|
811
|
+
llmsFullLinksResolve,
|
|
511
812
|
sitemapExists,
|
|
512
813
|
robotsTxtAgentRules
|
|
513
814
|
];
|
|
514
815
|
|
|
515
816
|
// src/utils/fetch.ts
|
|
516
|
-
var
|
|
817
|
+
// User-Agent string sent when a request should present itself as a desktop browser.
var BROWSER_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/**
 * Build the request headers for a page fetch.
 *
 * @param {{userAgent?: string}} config - Fetch configuration; only read when
 *   `asBrowser` is false.
 * @param {boolean} [asBrowser=false] - When true, send the browser
 *   User-Agent and a browser-style Accept header; otherwise send the
 *   configured (or default) User-Agent and an Accept header that also lists
 *   markdown and plain text.
 * @returns {{"User-Agent": string, Accept: string}} Header map.
 */
var makeHeaders = (config, asBrowser = false) => {
  if (asBrowser) {
    return {
      "User-Agent": BROWSER_UA,
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    };
  }
  const agent = config.userAgent ?? DEFAULT_CONFIG.userAgent;
  return {
    "User-Agent": agent,
    Accept: "text/html,application/xhtml+xml,text/markdown,text/plain,*/*"
  };
};
|
|
825
|
+
var fetchPage = async (url, config = {}, asBrowser = false) => {
|
|
521
826
|
const timeout = config.timeout ?? DEFAULT_CONFIG.timeout;
|
|
522
827
|
const start = Date.now();
|
|
523
828
|
const controller = new AbortController();
|
|
524
829
|
const timer = setTimeout(() => controller.abort(), timeout);
|
|
525
830
|
try {
|
|
526
831
|
const response = await fetch(url, {
|
|
527
|
-
headers: makeHeaders(config),
|
|
832
|
+
headers: makeHeaders(config, asBrowser),
|
|
528
833
|
signal: controller.signal,
|
|
529
834
|
redirect: "follow"
|
|
530
835
|
});
|
|
@@ -577,13 +882,13 @@ var fetchWithContentNegotiation = async (url, accept, config = {}) => {
|
|
|
577
882
|
clearTimeout(timer);
|
|
578
883
|
}
|
|
579
884
|
};
|
|
580
|
-
var fetchMany = async (urls, config = {}) => {
|
|
885
|
+
var fetchMany = async (urls, config = {}, asBrowser = false) => {
|
|
581
886
|
const concurrency = config.concurrency ?? DEFAULT_CONFIG.concurrency;
|
|
582
887
|
const results = [];
|
|
583
888
|
for (let i = 0; i < urls.length; i += concurrency) {
|
|
584
889
|
const chunk = urls.slice(i, i + concurrency);
|
|
585
890
|
const chunkResults = await Promise.allSettled(
|
|
586
|
-
chunk.map((url) => fetchPage(url, config))
|
|
891
|
+
chunk.map((url) => fetchPage(url, config, asBrowser))
|
|
587
892
|
);
|
|
588
893
|
for (const result of chunkResults) {
|
|
589
894
|
if (result.status === "fulfilled") {
|
|
@@ -749,133 +1054,6 @@ var markdownAvailabilityChecks = [
|
|
|
749
1054
|
markdownContentParity
|
|
750
1055
|
];
|
|
751
1056
|
|
|
752
|
-
// src/utils/html.ts
|
|
753
|
-
var stripHtml = (html) => html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
754
|
-
var extractLinks = (html, baseUrl) => {
|
|
755
|
-
const links = [];
|
|
756
|
-
const linkRegex = /<a[^>]+href=["']([^"']+)["']/gi;
|
|
757
|
-
let match;
|
|
758
|
-
while ((match = linkRegex.exec(html)) !== null) {
|
|
759
|
-
try {
|
|
760
|
-
const resolved = new URL(match[1], baseUrl).href;
|
|
761
|
-
links.push(resolved);
|
|
762
|
-
} catch {
|
|
763
|
-
}
|
|
764
|
-
}
|
|
765
|
-
return links;
|
|
766
|
-
};
|
|
767
|
-
var extractMetaTags = (html) => {
|
|
768
|
-
const meta = {};
|
|
769
|
-
const metaRegex = /<meta[^>]+(?:name|property)=["']([^"']+)["'][^>]+content=["']([^"']+)["']/gi;
|
|
770
|
-
let match;
|
|
771
|
-
while ((match = metaRegex.exec(html)) !== null) {
|
|
772
|
-
meta[match[1].toLowerCase()] = match[2];
|
|
773
|
-
}
|
|
774
|
-
const metaRegex2 = /<meta[^>]+content=["']([^"']+)["'][^>]+(?:name|property)=["']([^"']+)["']/gi;
|
|
775
|
-
while ((match = metaRegex2.exec(html)) !== null) {
|
|
776
|
-
meta[match[2].toLowerCase()] = match[1];
|
|
777
|
-
}
|
|
778
|
-
return meta;
|
|
779
|
-
};
|
|
780
|
-
var extractJsonLd = (html) => {
|
|
781
|
-
const results = [];
|
|
782
|
-
const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
|
783
|
-
let match;
|
|
784
|
-
while ((match = regex.exec(html)) !== null) {
|
|
785
|
-
try {
|
|
786
|
-
results.push(JSON.parse(match[1]));
|
|
787
|
-
} catch {
|
|
788
|
-
}
|
|
789
|
-
}
|
|
790
|
-
return results;
|
|
791
|
-
};
|
|
792
|
-
var readAttr = (attrs, name) => {
|
|
793
|
-
const re = new RegExp(`\\b${name}=(?:"([^"]*)"|'([^']*)')`, "i");
|
|
794
|
-
const m = attrs.match(re);
|
|
795
|
-
if (!m) return void 0;
|
|
796
|
-
return m[1] ?? m[2];
|
|
797
|
-
};
|
|
798
|
-
var extractImages = (html) => {
|
|
799
|
-
const images = [];
|
|
800
|
-
const imgRegex = /<img\b([^>]*)>/gi;
|
|
801
|
-
let match;
|
|
802
|
-
while ((match = imgRegex.exec(html)) !== null) {
|
|
803
|
-
const attrs = match[1];
|
|
804
|
-
const src = readAttr(attrs, "src");
|
|
805
|
-
if (src === void 0) continue;
|
|
806
|
-
images.push({ src, alt: readAttr(attrs, "alt") });
|
|
807
|
-
}
|
|
808
|
-
return images;
|
|
809
|
-
};
|
|
810
|
-
var extractHeadings = (html) => {
|
|
811
|
-
const headings = [];
|
|
812
|
-
const regex = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
|
|
813
|
-
let match;
|
|
814
|
-
while ((match = regex.exec(html)) !== null) {
|
|
815
|
-
headings.push({
|
|
816
|
-
level: parseInt(match[1], 10),
|
|
817
|
-
text: stripHtml(match[2]).trim()
|
|
818
|
-
});
|
|
819
|
-
}
|
|
820
|
-
return headings;
|
|
821
|
-
};
|
|
822
|
-
var hasServerRenderedContent = (html) => {
|
|
823
|
-
const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
824
|
-
const textContent = stripHtml(withoutScripts);
|
|
825
|
-
return textContent.length > 100;
|
|
826
|
-
};
|
|
827
|
-
var findContentStartPosition = (html) => {
|
|
828
|
-
const markers = [
|
|
829
|
-
/<main[\s>]/i,
|
|
830
|
-
/<article[\s>]/i,
|
|
831
|
-
/id=["']content["']/i,
|
|
832
|
-
/id=["']main["']/i,
|
|
833
|
-
/class=["'][^"']*content[^"']*["']/i,
|
|
834
|
-
/role=["']main["']/i
|
|
835
|
-
];
|
|
836
|
-
for (const marker of markers) {
|
|
837
|
-
const match = html.search(marker);
|
|
838
|
-
if (match >= 0) {
|
|
839
|
-
return match / html.length;
|
|
840
|
-
}
|
|
841
|
-
}
|
|
842
|
-
const firstP = html.search(/<p[\s>]/i);
|
|
843
|
-
if (firstP >= 0) {
|
|
844
|
-
return firstP / html.length;
|
|
845
|
-
}
|
|
846
|
-
return 0.5;
|
|
847
|
-
};
|
|
848
|
-
var extractCodeFences = (markdown) => {
|
|
849
|
-
const fences = [];
|
|
850
|
-
const lines = markdown.split("\n");
|
|
851
|
-
let inFence = false;
|
|
852
|
-
let currentLang = "";
|
|
853
|
-
for (const line of lines) {
|
|
854
|
-
const openMatch = line.match(/^```(\w*)/);
|
|
855
|
-
if (openMatch && !inFence) {
|
|
856
|
-
inFence = true;
|
|
857
|
-
currentLang = openMatch[1] ?? "";
|
|
858
|
-
} else if (line.trim() === "```" && inFence) {
|
|
859
|
-
fences.push({ lang: currentLang, closed: true });
|
|
860
|
-
inFence = false;
|
|
861
|
-
currentLang = "";
|
|
862
|
-
}
|
|
863
|
-
}
|
|
864
|
-
if (inFence) {
|
|
865
|
-
fences.push({ lang: currentLang, closed: false });
|
|
866
|
-
}
|
|
867
|
-
return fences;
|
|
868
|
-
};
|
|
869
|
-
var parseSitemapUrls = (xml) => {
|
|
870
|
-
const urls = [];
|
|
871
|
-
const regex = /<loc>([^<]+)<\/loc>/gi;
|
|
872
|
-
let match;
|
|
873
|
-
while ((match = regex.exec(xml)) !== null) {
|
|
874
|
-
urls.push(match[1].trim());
|
|
875
|
-
}
|
|
876
|
-
return urls;
|
|
877
|
-
};
|
|
878
|
-
|
|
879
1057
|
// src/checks/page-size.ts
|
|
880
1058
|
var MAX_HTML_CHARS = 5e4;
|
|
881
1059
|
var MAX_MD_CHARS = 5e4;
|
|
@@ -2205,6 +2383,56 @@ var mcpServerCard = {
|
|
|
2205
2383
|
}
|
|
2206
2384
|
}
|
|
2207
2385
|
};
|
|
2386
|
+
/**
 * Check: does the MCP server card list at least one tool?
 *
 * `run` skips when `ctx.mcpServerCard` is falsy or is not valid JSON.
 * Otherwise the tool list is taken from `card.tools`, or failing that from
 * `card.capabilities.tools`; only arrays are counted. A card that parses to
 * something other than an object (for example the JSON literal `null`) is
 * treated as exposing no tools rather than raising an error.
 */
var mcpToolCount = {
  id: "mcp-tool-count",
  name: "MCP Tool Count",
  category: "agent-protocols",
  description: "Checks that the MCP server card exposes at least one tool",
  weight: 0.4,
  run: async (ctx) => {
    if (!ctx.mcpServerCard) {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "skip",
        message: "Skipped: no MCP server card found"
      };
    }
    let card;
    try {
      card = JSON.parse(ctx.mcpServerCard);
    } catch {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "skip",
        message: "Skipped: MCP server card is invalid JSON"
      };
    }
    // Optional chaining guards against `card` being null; the top-level list
    // takes precedence over the one nested under `capabilities`.
    const tools = [card?.tools, card?.capabilities?.tools].find((candidate) => Array.isArray(candidate));
    const toolCount = tools ? tools.length : 0;
    if (toolCount > 0) {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "pass",
        message: `MCP server exposes ${toolCount} tool${toolCount === 1 ? "" : "s"}`,
        metadata: { toolCount }
      };
    }
    return {
      id: "mcp-tool-count",
      name: "MCP Tool Count",
      category: "agent-protocols",
      status: "warn",
      message: "MCP server card found but exposes no tools",
      suggestion: "List your MCP server's tools in the server card so agents know what actions are available before connecting.",
      metadata: { toolCount }
    };
  }
};
|
|
2208
2436
|
var apiCatalog = {
|
|
2209
2437
|
id: "api-catalog",
|
|
2210
2438
|
name: "API Catalog (RFC 9727)",
|
|
@@ -2278,7 +2506,7 @@ var contentSignals = {
|
|
|
2278
2506
|
name: "Content Signals (AI Usage Declarations)",
|
|
2279
2507
|
category: "agent-protocols",
|
|
2280
2508
|
status: "info",
|
|
2281
|
-
message: "No robots.txt found
|
|
2509
|
+
message: "No robots.txt found, cannot check for content signals",
|
|
2282
2510
|
suggestion: "Add a robots.txt with Content Signals directives to declare how AI agents may use your content (ai-train, ai-input, search)."
|
|
2283
2511
|
};
|
|
2284
2512
|
}
|
|
@@ -2463,7 +2691,7 @@ var agentsMd = {
|
|
|
2463
2691
|
category: "agent-protocols",
|
|
2464
2692
|
status: "fail",
|
|
2465
2693
|
message: "No AGENTS.md or AGENT.md found",
|
|
2466
|
-
suggestion: "Add an AGENTS.md at the project root. This is the universal agent configuration file
|
|
2694
|
+
suggestion: "Add an AGENTS.md at the project root. This is the universal agent configuration file, a README for AI coding agents. Include build/test commands, architecture overview, conventions, and any gotchas. Used by 60k+ open-source projects."
|
|
2467
2695
|
};
|
|
2468
2696
|
}
|
|
2469
2697
|
const content = ctx.agentsMd;
|
|
@@ -2515,6 +2743,7 @@ var agentsMd = {
|
|
|
2515
2743
|
};
|
|
2516
2744
|
var agentProtocolChecks = [
|
|
2517
2745
|
mcpServerCard,
|
|
2746
|
+
mcpToolCount,
|
|
2518
2747
|
apiCatalog,
|
|
2519
2748
|
contentSignals,
|
|
2520
2749
|
linkHeaders,
|
|
@@ -2699,7 +2928,7 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
2699
2928
|
const apiCatalog2 = apiCatalogResult.status === "fulfilled" && apiCatalogResult.value?.statusCode === 200 ? apiCatalogResult.value.text : void 0;
|
|
2700
2929
|
const agentSkillsIndex2 = agentSkillsResult.status === "fulfilled" && agentSkillsResult.value?.statusCode === 200 ? agentSkillsResult.value.text : void 0;
|
|
2701
2930
|
const agentsMd2 = void 0;
|
|
2702
|
-
|
|
2931
|
+
let sitemapUrls = sitemapXml ? parseSitemapUrls(sitemapXml) : [];
|
|
2703
2932
|
if (!sitemapXml && robotsTxt) {
|
|
2704
2933
|
const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
|
|
2705
2934
|
if (sitemapMatch) {
|
|
@@ -2709,10 +2938,28 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
2709
2938
|
}
|
|
2710
2939
|
}
|
|
2711
2940
|
}
|
|
2941
|
+
const isSitemapIndex = (sitemapXml ?? "").includes("<sitemapindex");
|
|
2942
|
+
if (isSitemapIndex && sitemapUrls.length > 0) {
|
|
2943
|
+
const nested = await Promise.allSettled(
|
|
2944
|
+
sitemapUrls.slice(0, 20).map((u) => fetchText(u, config))
|
|
2945
|
+
);
|
|
2946
|
+
sitemapUrls = nested.flatMap(
|
|
2947
|
+
(r) => r.status === "fulfilled" && r.value?.statusCode === 200 ? parseSitemapUrls(r.value.text) : []
|
|
2948
|
+
);
|
|
2949
|
+
}
|
|
2712
2950
|
let pagesToSample = [];
|
|
2713
2951
|
if (sitemapUrls.length > 0) {
|
|
2714
|
-
const
|
|
2715
|
-
|
|
2952
|
+
const pathPrefix = baseUrl.pathname.replace(/\/+$/, "");
|
|
2953
|
+
const scoped = pathPrefix.length > 1 ? sitemapUrls.filter((u) => {
|
|
2954
|
+
try {
|
|
2955
|
+
return new URL(u).pathname.startsWith(pathPrefix);
|
|
2956
|
+
} catch {
|
|
2957
|
+
return false;
|
|
2958
|
+
}
|
|
2959
|
+
}) : sitemapUrls;
|
|
2960
|
+
const pool = scoped.length > 0 ? scoped : sitemapUrls;
|
|
2961
|
+
const step = Math.max(1, Math.floor(pool.length / config.sampleSize));
|
|
2962
|
+
pagesToSample = pool.filter((_, i) => i % step === 0).slice(0, config.sampleSize);
|
|
2716
2963
|
} else {
|
|
2717
2964
|
const mainPage = await fetchPage(targetUrl, config);
|
|
2718
2965
|
const linkRegex = /<a[^>]+href=["']([^"'#]+)["']/gi;
|
|
@@ -2732,14 +2979,16 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
2732
2979
|
if (!pagesToSample.includes(targetUrl)) {
|
|
2733
2980
|
pagesToSample.unshift(targetUrl);
|
|
2734
2981
|
}
|
|
2735
|
-
const sampledPages = await fetchMany(pagesToSample, config);
|
|
2982
|
+
const sampledPages = await fetchMany(pagesToSample, config, true);
|
|
2736
2983
|
emit({ type: "context-ready", pageCount: sampledPages.length });
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2984
|
+
await Promise.allSettled(
|
|
2985
|
+
sampledPages.map(async (page) => {
|
|
2986
|
+
const mdResult = await fetchWithContentNegotiation(page.url, "text/markdown", config);
|
|
2987
|
+
if (mdResult && mdResult.statusCode === 200 && (mdResult.contentType.includes("text/markdown") || mdResult.contentType.includes("text/plain"))) {
|
|
2988
|
+
page.markdown = mdResult.text;
|
|
2989
|
+
}
|
|
2990
|
+
})
|
|
2991
|
+
);
|
|
2743
2992
|
return {
|
|
2744
2993
|
mode: "remote",
|
|
2745
2994
|
targetUrl,
|