agentimization 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/index.js +414 -165
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,14 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://www.npmjs.com/package/agentimization)
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
╰───────────────────────────────────────────────╯
|
|
12
|
-
```
|
|
5
|
+
<p align="center">
|
|
6
|
+
<picture>
|
|
7
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/antlio/agentimization/main/assets/hero-loading-dark.svg">
|
|
8
|
+
<img src="https://raw.githubusercontent.com/antlio/agentimization/main/assets/hero-loading-light.svg" alt="agentimization" width="620">
|
|
9
|
+
</picture>
|
|
10
|
+
</p>
|
|
13
11
|
|
|
14
12
|
geo audit for agent-ready websites and projects.
|
|
15
13
|
|
package/dist/index.js
CHANGED
|
@@ -4105,6 +4105,140 @@ var DEFAULT_CONFIG = {
|
|
|
4105
4105
|
onEvent: () => {
|
|
4106
4106
|
}
|
|
4107
4107
|
};
|
|
4108
|
+
/**
 * Reduces an HTML string to its visible text.
 *
 * Script and style elements are removed together with their contents, every
 * remaining tag is replaced by a space, runs of whitespace are collapsed to a
 * single space and the result is trimmed.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Whitespace-normalised text content.
 */
var stripHtml = (html) => html
  // Closing tags may legally carry whitespace before ">" (e.g. "</script >");
  // accept that so the element body does not leak into the visible text.
  .replace(/<script\b[\s\S]*?<\/script\b[^>]*>/gi, "")
  .replace(/<style\b[\s\S]*?<\/style\b[^>]*>/gi, "")
  .replace(/<[^>]+>/g, " ")
  .replace(/\s+/g, " ")
  .trim();
|
|
4109
|
+
/**
 * Collects the destinations of inline markdown links (`[text](destination)`).
 *
 * An optional quoted link title (`[text](url "Title")`) and any whitespace
 * around the destination are stripped, so callers receive a value that can be
 * passed straight to `new URL()`. Links whose destination is empty after
 * stripping are skipped.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {string[]} Link destinations in document order.
 */
var extractMarkdownLinks = (markdown) => {
  const links = [];
  for (const match of markdown.matchAll(/\[.+?\]\(([^)]+)\)/g)) {
    const destination = match[1]
      // Drop a trailing `"title"` / `'title'`; it is not part of the URL.
      .replace(/\s+(?:"[^"]*"|'[^']*')\s*$/, "")
      .trim();
    if (destination !== "") {
      links.push(destination);
    }
  }
  return links;
};
|
|
4118
|
+
/**
 * Extracts the `href` of every `<a>` element and resolves it against `baseUrl`.
 *
 * Only real anchor tags are considered: the tag name must be exactly `a`, and
 * `href` must be its own attribute (so `<article …>`, `<abbr …>` and
 * `data-href="…"` are ignored). Double- and single-quoted values are matched
 * separately so a value may contain the other quote character.
 *
 * @param {string} html - HTML markup to scan.
 * @param {string|URL} baseUrl - Base used to resolve relative hrefs.
 * @returns {string[]} Absolute URLs in document order.
 */
var extractLinks = (html, baseUrl) => {
  const links = [];
  const anchorHref = /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')/gi;
  for (const match of html.matchAll(anchorHref)) {
    const href = match[1] ?? match[2];
    try {
      links.push(new URL(href, baseUrl).href);
    } catch {
      // Best effort: an href that cannot be parsed as a URL is skipped.
    }
  }
  return links;
};
|
|
4131
|
+
/**
 * Builds a map of `<meta>` tags keyed by their lower-cased `name` / `property`.
 *
 * Two passes are made: first tags written as `name|property … content`, then
 * tags written as `content … name|property`. A later match overwrites an
 * earlier one with the same key. Quoted values are matched per quote style,
 * so a double-quoted value may contain apostrophes (and vice versa) without
 * being truncated.
 *
 * @param {string} html - HTML markup to scan.
 * @returns {Object<string, string>} Meta key → content value.
 */
var extractMetaTags = (html) => {
  const meta = {};
  const keyThenContent = /<meta[^>]+(?:name|property)=(?:"([^"]+)"|'([^']+)')[^>]+content=(?:"([^"]+)"|'([^']+)')/gi;
  for (const m of html.matchAll(keyThenContent)) {
    meta[(m[1] ?? m[2]).toLowerCase()] = m[3] ?? m[4];
  }
  const contentThenKey = /<meta[^>]+content=(?:"([^"]+)"|'([^']+)')[^>]+(?:name|property)=(?:"([^"]+)"|'([^']+)')/gi;
  for (const m of html.matchAll(contentThenKey)) {
    meta[(m[3] ?? m[4]).toLowerCase()] = m[1] ?? m[2];
  }
  return meta;
};
|
|
4144
|
+
/**
 * Parses every `<script type="application/ld+json">` payload in the markup.
 *
 * @param {string} html - HTML markup to scan.
 * @returns {Array<*>} One parsed JSON value per script whose body is valid
 *   JSON, in document order; scripts with unparseable bodies are omitted.
 */
var extractJsonLd = (html) => {
  const scriptPattern = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const parsed = [];
  for (const [, payload] of html.matchAll(scriptPattern)) {
    try {
      parsed.push(JSON.parse(payload));
    } catch {
      // Invalid JSON-LD is ignored on purpose.
    }
  }
  return parsed;
};
|
|
4156
|
+
/**
 * Reads one quoted attribute value out of a tag's attribute text.
 *
 * The attribute name must stand on its own: it has to appear at the start of
 * the text or directly after whitespace or a closing quote, so asking for
 * `src` does not pick up `data-src`. Matching is case-insensitive and
 * whitespace around `=` is tolerated.
 *
 * @param {string} attrs - Attribute portion of a tag (everything after the tag name).
 * @param {string} name - Attribute name to look up.
 * @returns {string|undefined} The attribute value (possibly empty), or
 *   `undefined` when the attribute is absent or unquoted.
 */
var readAttr = (attrs, name) => {
  // Escape the name so regex metacharacters in it are matched literally.
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`(?:^|[\\s"'])${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i");
  const m = attrs.match(re);
  if (!m) return void 0;
  return m[1] ?? m[2];
};
|
|
4162
|
+
/**
 * Lists the `<img>` elements in the markup that carry a quoted `src`.
 *
 * @param {string} html - HTML markup to scan.
 * @returns {Array<{src: string, alt: (string|undefined)}>} One entry per
 *   image in document order; `alt` is `undefined` when `readAttr` finds no
 *   `alt` attribute.
 */
var extractImages = (html) => {
  const found = [];
  for (const [, attributeText] of html.matchAll(/<img\b([^>]*)>/gi)) {
    const src = readAttr(attributeText, "src");
    if (src !== void 0) {
      found.push({ src, alt: readAttr(attributeText, "alt") });
    }
  }
  return found;
};
|
|
4174
|
+
/**
 * Collects `<h1>`–`<h6>` headings with their level and visible text.
 *
 * @param {string} html - HTML markup to scan.
 * @returns {Array<{level: number, text: string}>} Headings in document order;
 *   `text` is the heading's inner markup passed through `stripHtml`.
 */
var extractHeadings = (html) => {
  // The \1 backreference requires the closing tag level to match the opening one.
  const headingPattern = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
  return Array.from(html.matchAll(headingPattern), ([, level, inner]) => ({
    level: parseInt(level, 10),
    text: stripHtml(inner).trim()
  }));
};
|
|
4186
|
+
/**
 * Reports whether the markup contains a meaningful amount of visible text
 * once script and style elements have been removed.
 *
 * @param {string} html - HTML markup to inspect.
 * @returns {boolean} `true` when more than 100 characters of text remain.
 */
var hasServerRenderedContent = (html) => {
  const MIN_VISIBLE_CHARS = 100;
  const nonContentElements = [/<script[\s\S]*?<\/script>/gi, /<style[\s\S]*?<\/style>/gi];
  const markup = nonContentElements.reduce((remaining, pattern) => remaining.replace(pattern, ""), html);
  return stripHtml(markup).length > MIN_VISIBLE_CHARS;
};
|
|
4191
|
+
/**
 * Estimates how far into the document the main content begins.
 *
 * Patterns are tried in priority order (not by position): `<main>`,
 * `<article>`, `id="content"`, `id="main"`, a class containing "content",
 * `role="main"`, and finally the first `<p>`. The first pattern that occurs
 * anywhere in the markup decides the result.
 *
 * @param {string} html - HTML markup to inspect.
 * @returns {number} Offset of the match divided by the markup length, or
 *   `0.5` when none of the patterns occurs.
 */
var findContentStartPosition = (html) => {
  const candidates = [
    /<main[\s>]/i,
    /<article[\s>]/i,
    /id=["']content["']/i,
    /id=["']main["']/i,
    /class=["'][^"']*content[^"']*["']/i,
    /role=["']main["']/i,
    // Last resort: the first paragraph.
    /<p[\s>]/i
  ];
  for (const pattern of candidates) {
    const offset = html.search(pattern);
    if (offset !== -1) {
      return offset / html.length;
    }
  }
  return 0.5;
};
|
|
4212
|
+
/**
 * Scans markdown for triple-backtick code fences.
 *
 * A line starting with ``` opens a fence when none is open; a line that is
 * exactly ``` (ignoring surrounding whitespace) closes the open fence. The
 * language tag is the run of word characters plus `+`, `#`, `.` and `-`
 * right after the opening backticks, so tags such as `c++`, `objective-c`
 * and `f#` are kept whole.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {Array<{lang: string, closed: boolean}>} One entry per fence in
 *   document order; a fence still open at end of input is reported with
 *   `closed: false`.
 */
var extractCodeFences = (markdown) => {
  const fences = [];
  let inFence = false;
  let currentLang = "";
  for (const line of markdown.split("\n")) {
    const openMatch = line.match(/^```([\w+#.-]*)/);
    if (openMatch && !inFence) {
      inFence = true;
      currentLang = openMatch[1] ?? "";
    } else if (line.trim() === "```" && inFence) {
      fences.push({ lang: currentLang, closed: true });
      inFence = false;
      currentLang = "";
    }
  }
  if (inFence) {
    fences.push({ lang: currentLang, closed: false });
  }
  return fences;
};
|
|
4233
|
+
/**
 * Extracts the `<loc>` values from sitemap (or sitemap index) XML.
 *
 * Plain values have the five predefined XML entities decoded (`&amp;`,
 * `&lt;`, `&gt;`, `&quot;`, `&apos;`), so an entity-escaped URL is returned
 * in usable form. Values wrapped in `<![CDATA[…]]>` are returned verbatim.
 * Surrounding whitespace is trimmed in both cases.
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} URLs in document order.
 */
var parseSitemapUrls = (xml) => {
  // "&amp;" is decoded last so "&amp;lt;" becomes "&lt;" rather than "<".
  const decodeEntities = (value) => value
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, "&");
  const urls = [];
  const locPattern = /<loc>\s*(?:<!\[CDATA\[([\s\S]*?)\]\]>|([^<]+))\s*<\/loc>/gi;
  for (const [, cdata, text] of xml.matchAll(locPattern)) {
    urls.push((cdata ?? decodeEntities(text)).trim());
  }
  return urls;
};
|
|
4108
4242
|
var llmsTxtExists = {
|
|
4109
4243
|
id: "llms-txt-exists",
|
|
4110
4244
|
name: "llms.txt Exists",
|
|
@@ -4144,7 +4278,7 @@ var llmsTxtValid = {
|
|
|
4144
4278
|
name: "llms.txt Valid Structure",
|
|
4145
4279
|
category: "content-discoverability",
|
|
4146
4280
|
status: "skip",
|
|
4147
|
-
message: "Skipped
|
|
4281
|
+
message: "Skipped: no llms.txt found"
|
|
4148
4282
|
};
|
|
4149
4283
|
}
|
|
4150
4284
|
const issues = [];
|
|
@@ -4189,7 +4323,7 @@ var llmsTxtSize = {
|
|
|
4189
4323
|
name: "llms.txt Size",
|
|
4190
4324
|
category: "content-discoverability",
|
|
4191
4325
|
status: "skip",
|
|
4192
|
-
message: "Skipped
|
|
4326
|
+
message: "Skipped: no llms.txt found"
|
|
4193
4327
|
};
|
|
4194
4328
|
}
|
|
4195
4329
|
const size = ctx.llmsTxt.length;
|
|
@@ -4227,7 +4361,7 @@ var llmsTxtFreshness = {
|
|
|
4227
4361
|
name: "llms.txt Coverage",
|
|
4228
4362
|
category: "content-discoverability",
|
|
4229
4363
|
status: "skip",
|
|
4230
|
-
message: "Skipped
|
|
4364
|
+
message: "Skipped: no llms.txt found"
|
|
4231
4365
|
};
|
|
4232
4366
|
}
|
|
4233
4367
|
if (ctx.sitemapUrls.length === 0) {
|
|
@@ -4257,11 +4391,9 @@ var llmsTxtFreshness = {
|
|
|
4257
4391
|
return null;
|
|
4258
4392
|
}
|
|
4259
4393
|
};
|
|
4260
|
-
const linkRegex = /\[.+?\]\(([^)]+)\)/g;
|
|
4261
4394
|
const llmsKeys = /* @__PURE__ */ new Set();
|
|
4262
|
-
|
|
4263
|
-
|
|
4264
|
-
const k = keyFor(match[1]);
|
|
4395
|
+
for (const link of extractMarkdownLinks(ctx.llmsTxt)) {
|
|
4396
|
+
const k = keyFor(link);
|
|
4265
4397
|
if (k) llmsKeys.add(k);
|
|
4266
4398
|
}
|
|
4267
4399
|
const sitemapKeys = /* @__PURE__ */ new Set();
|
|
@@ -4301,7 +4433,7 @@ var llmsTxtFreshness = {
|
|
|
4301
4433
|
category: "content-discoverability",
|
|
4302
4434
|
status: coveragePct >= 40 || freshnessPct >= 70 ? "warn" : "fail",
|
|
4303
4435
|
message: `${message}${missingFromLlms > 0 ? ` \xB7 ${missingFromLlms} sitemap pages not in llms.txt` : ""}${staleInLlms > 0 ? ` \xB7 ${staleInLlms} llms.txt links not in sitemap` : ""}`,
|
|
4304
|
-
suggestion: coveragePct < freshnessPct ? "Add missing sitemap pages to llms.txt to improve AI agent discoverability." : "Some llms.txt links aren't in the sitemap
|
|
4436
|
+
suggestion: coveragePct < freshnessPct ? "Add missing sitemap pages to llms.txt to improve AI agent discoverability." : "Some llms.txt links aren't in the sitemap. They may be stale or your sitemap may be incomplete.",
|
|
4305
4437
|
metadata: {
|
|
4306
4438
|
coveragePct,
|
|
4307
4439
|
freshnessPct,
|
|
@@ -4327,15 +4459,13 @@ var llmsTxtLinksResolve = {
|
|
|
4327
4459
|
name: "llms.txt Links Resolve",
|
|
4328
4460
|
category: "content-discoverability",
|
|
4329
4461
|
status: "skip",
|
|
4330
|
-
message: "Skipped
|
|
4462
|
+
message: "Skipped: no llms.txt found"
|
|
4331
4463
|
};
|
|
4332
4464
|
}
|
|
4333
|
-
const linkRegex = /\[.+?\]\(([^)]+)\)/g;
|
|
4334
4465
|
const urls = [];
|
|
4335
|
-
|
|
4336
|
-
while ((match = linkRegex.exec(ctx.llmsTxt)) !== null) {
|
|
4466
|
+
for (const link of extractMarkdownLinks(ctx.llmsTxt)) {
|
|
4337
4467
|
try {
|
|
4338
|
-
const resolved2 = new URL(
|
|
4468
|
+
const resolved2 = new URL(link, ctx.baseUrl.origin);
|
|
4339
4469
|
if (resolved2.origin === ctx.baseUrl.origin) {
|
|
4340
4470
|
urls.push(resolved2.href);
|
|
4341
4471
|
}
|
|
@@ -4376,7 +4506,7 @@ var llmsTxtLinksResolve = {
|
|
|
4376
4506
|
name: "llms.txt Links Resolve",
|
|
4377
4507
|
category: "content-discoverability",
|
|
4378
4508
|
status: "fail",
|
|
4379
|
-
message: `${resolved}/${sampled.length} sampled links resolve
|
|
4509
|
+
message: `${resolved}/${sampled.length} sampled links resolve, ${sampled.length - resolved} broken`,
|
|
4380
4510
|
suggestion: "Fix broken links in llms.txt. AI agents will fail to fetch these pages.",
|
|
4381
4511
|
metadata: { resolved, sampled: sampled.length, total: urls.length }
|
|
4382
4512
|
};
|
|
@@ -4395,15 +4525,10 @@ var llmsTxtLinksMarkdown = {
|
|
|
4395
4525
|
name: "llms.txt Links Markdown",
|
|
4396
4526
|
category: "content-discoverability",
|
|
4397
4527
|
status: "skip",
|
|
4398
|
-
message: "Skipped
|
|
4528
|
+
message: "Skipped: no llms.txt found"
|
|
4399
4529
|
};
|
|
4400
4530
|
}
|
|
4401
|
-
const
|
|
4402
|
-
const urls = [];
|
|
4403
|
-
let m;
|
|
4404
|
-
while ((m = linkRegex.exec(ctx.llmsTxt)) !== null) {
|
|
4405
|
-
urls.push(m[1]);
|
|
4406
|
-
}
|
|
4531
|
+
const urls = extractMarkdownLinks(ctx.llmsTxt);
|
|
4407
4532
|
if (urls.length === 0) {
|
|
4408
4533
|
return {
|
|
4409
4534
|
id: "llms-txt-links-markdown",
|
|
@@ -4451,7 +4576,7 @@ var llmsTxtLinksMarkdown = {
|
|
|
4451
4576
|
category: "content-discoverability",
|
|
4452
4577
|
status: "fail",
|
|
4453
4578
|
message: `Only ${mdLinks}/${urls.length} llms.txt links point to .md URLs (${pct}%)`,
|
|
4454
|
-
suggestion: "Most llms.txt links are HTML-only. Serve a markdown version at .md URLs and link to those
|
|
4579
|
+
suggestion: "Most llms.txt links are HTML-only. Serve a markdown version at .md URLs and link to those, so agents get cleaner content and fewer parse failures.",
|
|
4455
4580
|
metadata: { mdLinks, total: urls.length, pct }
|
|
4456
4581
|
};
|
|
4457
4582
|
}
|
|
@@ -4553,6 +4678,176 @@ var robotsTxtAgentRules = {
|
|
|
4553
4678
|
};
|
|
4554
4679
|
}
|
|
4555
4680
|
};
|
|
4681
|
+
/**
 * Check: is an llms-full.txt present?
 *
 * Passes when `ctx.llmsFullTxt` is truthy; otherwise reports an "info"
 * result, with a suggestion describing when adding the file is worthwhile.
 */
var llmsFullExists = {
  id: "llms-full-exists",
  name: "llms-full.txt Exists",
  category: "content-discoverability",
  description: "Checks if llms-full.txt (the complete-content variant) is present at the site root",
  weight: 0.4,
  run: async (ctx) => {
    // Fields shared by every result this check can produce.
    const identity = {
      id: "llms-full-exists",
      name: "llms-full.txt Exists",
      category: "content-discoverability"
    };
    if (!ctx.llmsFullTxt) {
      return {
        ...identity,
        status: "info",
        message: "No llms-full.txt found (optional)",
        suggestion: "If your llms.txt is large or you want agents to get full content in one fetch, add a /llms-full.txt containing the concatenated markdown of your docs."
      };
    }
    let message;
    if (ctx.mode === "local") {
      message = "llms-full.txt found in project root";
    } else {
      message = `llms-full.txt found at ${ctx.baseUrl.origin}/llms-full.txt`;
    }
    return { ...identity, status: "pass", message };
  }
};
|
|
4707
|
+
/**
 * Check: does llms-full.txt look like structured markdown?
 *
 * Skipped when `ctx.llmsFullTxt` is falsy. Passes when the text contains at
 * least one level 1–3 ATX heading and is longer than 600 characters;
 * otherwise warns, naming the first criterion that failed.
 */
var llmsFullValid = {
  id: "llms-full-valid",
  name: "llms-full.txt Valid Structure",
  category: "content-discoverability",
  description: "Checks if llms-full.txt has recognizable markdown structure (headings, content)",
  weight: 0.4,
  run: async (ctx) => {
    // Fields shared by every result this check can produce.
    const identity = {
      id: "llms-full-valid",
      name: "llms-full.txt Valid Structure",
      category: "content-discoverability"
    };
    const text = ctx.llmsFullTxt;
    if (!text) {
      return { ...identity, status: "skip", message: "Skipped: no llms-full.txt found" };
    }
    const hasHeadings = /^#{1,3}\s+/m.test(text);
    const hasProse = text.length > 600;
    if (!hasHeadings || !hasProse) {
      // A missing heading takes precedence over a short body in the message.
      const reason = hasHeadings ? "has little content" : "has no markdown headings";
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt found but ${reason}`,
        suggestion: "llms-full.txt should contain the full markdown content of your docs, with headings, so agents can parse it."
      };
    }
    return {
      ...identity,
      status: "pass",
      message: "llms-full.txt has recognizable markdown structure"
    };
  }
};
|
|
4744
|
+
/**
 * Check: is llms-full.txt a plausible size?
 *
 * Skipped when `ctx.llmsFullTxt` is falsy. Passes when the text length is
 * between 10,000 and 5,000,000 characters inclusive; warns when it is
 * smaller or larger. The character count is reported in `metadata.size`.
 */
var llmsFullSize = {
  id: "llms-full-size",
  name: "llms-full.txt Size",
  category: "content-discoverability",
  description: "Checks if llms-full.txt size is within the expected range (substantial but not excessive)",
  weight: 0.3,
  run: async (ctx) => {
    // Fields shared by every result this check can produce.
    const identity = {
      id: "llms-full-size",
      name: "llms-full.txt Size",
      category: "content-discoverability"
    };
    if (!ctx.llmsFullTxt) {
      return { ...identity, status: "skip", message: "Skipped: no llms-full.txt found" };
    }
    const MIN = 1e4;
    const MAX = 5e6;
    const size = ctx.llmsFullTxt.length;
    const formattedSize = size.toLocaleString();
    if (size < MIN) {
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt is only ${formattedSize} characters, smaller than expected for a full-content file`,
        suggestion: "llms-full.txt should contain your complete documentation. If it's this small, llms.txt alone may be enough.",
        metadata: { size }
      };
    }
    if (size > MAX) {
      return {
        ...identity,
        status: "warn",
        message: `llms-full.txt is ${formattedSize} characters, large enough to overflow agent context windows`,
        suggestion: "Consider trimming llms-full.txt or splitting content so agents can fetch what fits their context window.",
        metadata: { size }
      };
    }
    return {
      ...identity,
      status: "pass",
      message: `llms-full.txt is ${formattedSize} characters (within expected range)`,
      metadata: { size }
    };
  }
};
|
|
4784
|
+
/**
 * Check: do the same-origin links in llms-full.txt resolve?
 *
 * Skipped when `ctx.llmsFullTxt` is falsy. Markdown links are resolved
 * against `ctx.baseUrl.origin`; only links on that origin are kept. Up to the
 * first ten are requested with HEAD (following redirects) and counted as
 * resolving when the final status is in the 200–399 range. A request that
 * rejects counts as broken.
 */
var llmsFullLinksResolve = {
  id: "llms-full-links-resolve",
  name: "llms-full.txt Links Resolve",
  category: "content-discoverability",
  description: "Checks if links in llms-full.txt return 200 OK",
  weight: 0.4,
  requiresNetwork: true,
  run: async (ctx) => {
    // Fields shared by every result this check can produce.
    const identity = {
      id: "llms-full-links-resolve",
      name: "llms-full.txt Links Resolve",
      category: "content-discoverability"
    };
    if (!ctx.llmsFullTxt) {
      return { ...identity, status: "skip", message: "Skipped: no llms-full.txt found" };
    }
    const urls = [];
    extractMarkdownLinks(ctx.llmsFullTxt).forEach((link) => {
      try {
        const absolute = new URL(link, ctx.baseUrl.origin);
        if (absolute.origin !== ctx.baseUrl.origin) return;
        urls.push(absolute.href);
      } catch {
        // Links that cannot be resolved to a URL are ignored.
      }
    });
    if (urls.length === 0) {
      return { ...identity, status: "info", message: "No same-origin links found in llms-full.txt" };
    }
    const sampled = urls.slice(0, 10);
    const probe = async (url) => {
      const resp = await fetch(url, { method: "HEAD", redirect: "follow" });
      return { url, status: resp.status };
    };
    const outcomes = await Promise.allSettled(sampled.map((url) => probe(url)));
    let resolved = 0;
    for (const outcome of outcomes) {
      if (outcome.status === "fulfilled" && outcome.value.status >= 200 && outcome.value.status < 400) {
        resolved += 1;
      }
    }
    const metadata = { resolved, sampled: sampled.length, total: urls.length };
    if (resolved !== sampled.length) {
      return {
        ...identity,
        status: "fail",
        message: `${resolved}/${sampled.length} sampled links resolve, ${sampled.length - resolved} broken`,
        suggestion: "Fix broken links in llms-full.txt. AI agents will fail to fetch these pages.",
        metadata
      };
    }
    return {
      ...identity,
      status: "pass",
      message: `All ${resolved} sampled same-origin links resolve (${urls.length} total links)`,
      metadata
    };
  }
};
|
|
4556
4851
|
var contentDiscoverabilityChecks = [
|
|
4557
4852
|
llmsTxtExists,
|
|
4558
4853
|
llmsTxtValid,
|
|
@@ -4560,21 +4855,29 @@ var contentDiscoverabilityChecks = [
|
|
|
4560
4855
|
llmsTxtFreshness,
|
|
4561
4856
|
llmsTxtLinksResolve,
|
|
4562
4857
|
llmsTxtLinksMarkdown,
|
|
4858
|
+
llmsFullExists,
|
|
4859
|
+
llmsFullValid,
|
|
4860
|
+
llmsFullSize,
|
|
4861
|
+
llmsFullLinksResolve,
|
|
4563
4862
|
sitemapExists,
|
|
4564
4863
|
robotsTxtAgentRules
|
|
4565
4864
|
];
|
|
4566
|
-
var
|
|
4865
|
+
// User-Agent string sent when a request should look like it comes from a desktop Chrome browser.
var BROWSER_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/**
 * Builds the request headers for a page fetch.
 *
 * @param {{userAgent?: string}} config - Fetch configuration; `userAgent`
 *   falls back to `DEFAULT_CONFIG.userAgent` when nullish.
 * @param {boolean} [asBrowser=false] - When true, send browser-like headers
 *   (`BROWSER_UA` and an HTML-oriented Accept) and ignore `config.userAgent`.
 * @returns {{"User-Agent": string, Accept: string}} Header map.
 */
var makeHeaders = (config, asBrowser = false) => {
  if (asBrowser) {
    return {
      "User-Agent": BROWSER_UA,
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    };
  }
  return {
    "User-Agent": config.userAgent ?? DEFAULT_CONFIG.userAgent,
    Accept: "text/html,application/xhtml+xml,text/markdown,text/plain,*/*"
  };
};
|
|
4873
|
+
var fetchPage = async (url, config = {}, asBrowser = false) => {
|
|
4571
4874
|
const timeout = config.timeout ?? DEFAULT_CONFIG.timeout;
|
|
4572
4875
|
const start = Date.now();
|
|
4573
4876
|
const controller = new AbortController();
|
|
4574
4877
|
const timer = setTimeout(() => controller.abort(), timeout);
|
|
4575
4878
|
try {
|
|
4576
4879
|
const response = await fetch(url, {
|
|
4577
|
-
headers: makeHeaders(config),
|
|
4880
|
+
headers: makeHeaders(config, asBrowser),
|
|
4578
4881
|
signal: controller.signal,
|
|
4579
4882
|
redirect: "follow"
|
|
4580
4883
|
});
|
|
@@ -4627,13 +4930,13 @@ var fetchWithContentNegotiation = async (url, accept, config = {}) => {
|
|
|
4627
4930
|
clearTimeout(timer);
|
|
4628
4931
|
}
|
|
4629
4932
|
};
|
|
4630
|
-
var fetchMany = async (urls, config = {}) => {
|
|
4933
|
+
var fetchMany = async (urls, config = {}, asBrowser = false) => {
|
|
4631
4934
|
const concurrency = config.concurrency ?? DEFAULT_CONFIG.concurrency;
|
|
4632
4935
|
const results = [];
|
|
4633
4936
|
for (let i = 0; i < urls.length; i += concurrency) {
|
|
4634
4937
|
const chunk = urls.slice(i, i + concurrency);
|
|
4635
4938
|
const chunkResults = await Promise.allSettled(
|
|
4636
|
-
chunk.map((url) => fetchPage(url, config))
|
|
4939
|
+
chunk.map((url) => fetchPage(url, config, asBrowser))
|
|
4637
4940
|
);
|
|
4638
4941
|
for (const result of chunkResults) {
|
|
4639
4942
|
if (result.status === "fulfilled") {
|
|
@@ -4796,131 +5099,6 @@ var markdownAvailabilityChecks = [
|
|
|
4796
5099
|
contentNegotiation,
|
|
4797
5100
|
markdownContentParity
|
|
4798
5101
|
];
|
|
4799
|
-
var stripHtml = (html) => html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
4800
|
-
var extractLinks = (html, baseUrl) => {
|
|
4801
|
-
const links = [];
|
|
4802
|
-
const linkRegex = /<a[^>]+href=["']([^"']+)["']/gi;
|
|
4803
|
-
let match;
|
|
4804
|
-
while ((match = linkRegex.exec(html)) !== null) {
|
|
4805
|
-
try {
|
|
4806
|
-
const resolved = new URL(match[1], baseUrl).href;
|
|
4807
|
-
links.push(resolved);
|
|
4808
|
-
} catch {
|
|
4809
|
-
}
|
|
4810
|
-
}
|
|
4811
|
-
return links;
|
|
4812
|
-
};
|
|
4813
|
-
var extractMetaTags = (html) => {
|
|
4814
|
-
const meta = {};
|
|
4815
|
-
const metaRegex = /<meta[^>]+(?:name|property)=["']([^"']+)["'][^>]+content=["']([^"']+)["']/gi;
|
|
4816
|
-
let match;
|
|
4817
|
-
while ((match = metaRegex.exec(html)) !== null) {
|
|
4818
|
-
meta[match[1].toLowerCase()] = match[2];
|
|
4819
|
-
}
|
|
4820
|
-
const metaRegex2 = /<meta[^>]+content=["']([^"']+)["'][^>]+(?:name|property)=["']([^"']+)["']/gi;
|
|
4821
|
-
while ((match = metaRegex2.exec(html)) !== null) {
|
|
4822
|
-
meta[match[2].toLowerCase()] = match[1];
|
|
4823
|
-
}
|
|
4824
|
-
return meta;
|
|
4825
|
-
};
|
|
4826
|
-
var extractJsonLd = (html) => {
|
|
4827
|
-
const results = [];
|
|
4828
|
-
const regex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
|
4829
|
-
let match;
|
|
4830
|
-
while ((match = regex.exec(html)) !== null) {
|
|
4831
|
-
try {
|
|
4832
|
-
results.push(JSON.parse(match[1]));
|
|
4833
|
-
} catch {
|
|
4834
|
-
}
|
|
4835
|
-
}
|
|
4836
|
-
return results;
|
|
4837
|
-
};
|
|
4838
|
-
var readAttr = (attrs, name) => {
|
|
4839
|
-
const re = new RegExp(`\\b${name}=(?:"([^"]*)"|'([^']*)')`, "i");
|
|
4840
|
-
const m = attrs.match(re);
|
|
4841
|
-
if (!m) return void 0;
|
|
4842
|
-
return m[1] ?? m[2];
|
|
4843
|
-
};
|
|
4844
|
-
var extractImages = (html) => {
|
|
4845
|
-
const images = [];
|
|
4846
|
-
const imgRegex = /<img\b([^>]*)>/gi;
|
|
4847
|
-
let match;
|
|
4848
|
-
while ((match = imgRegex.exec(html)) !== null) {
|
|
4849
|
-
const attrs = match[1];
|
|
4850
|
-
const src = readAttr(attrs, "src");
|
|
4851
|
-
if (src === void 0) continue;
|
|
4852
|
-
images.push({ src, alt: readAttr(attrs, "alt") });
|
|
4853
|
-
}
|
|
4854
|
-
return images;
|
|
4855
|
-
};
|
|
4856
|
-
var extractHeadings = (html) => {
|
|
4857
|
-
const headings = [];
|
|
4858
|
-
const regex = /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi;
|
|
4859
|
-
let match;
|
|
4860
|
-
while ((match = regex.exec(html)) !== null) {
|
|
4861
|
-
headings.push({
|
|
4862
|
-
level: parseInt(match[1], 10),
|
|
4863
|
-
text: stripHtml(match[2]).trim()
|
|
4864
|
-
});
|
|
4865
|
-
}
|
|
4866
|
-
return headings;
|
|
4867
|
-
};
|
|
4868
|
-
var hasServerRenderedContent = (html) => {
|
|
4869
|
-
const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
4870
|
-
const textContent = stripHtml(withoutScripts);
|
|
4871
|
-
return textContent.length > 100;
|
|
4872
|
-
};
|
|
4873
|
-
var findContentStartPosition = (html) => {
|
|
4874
|
-
const markers = [
|
|
4875
|
-
/<main[\s>]/i,
|
|
4876
|
-
/<article[\s>]/i,
|
|
4877
|
-
/id=["']content["']/i,
|
|
4878
|
-
/id=["']main["']/i,
|
|
4879
|
-
/class=["'][^"']*content[^"']*["']/i,
|
|
4880
|
-
/role=["']main["']/i
|
|
4881
|
-
];
|
|
4882
|
-
for (const marker of markers) {
|
|
4883
|
-
const match = html.search(marker);
|
|
4884
|
-
if (match >= 0) {
|
|
4885
|
-
return match / html.length;
|
|
4886
|
-
}
|
|
4887
|
-
}
|
|
4888
|
-
const firstP = html.search(/<p[\s>]/i);
|
|
4889
|
-
if (firstP >= 0) {
|
|
4890
|
-
return firstP / html.length;
|
|
4891
|
-
}
|
|
4892
|
-
return 0.5;
|
|
4893
|
-
};
|
|
4894
|
-
var extractCodeFences = (markdown) => {
|
|
4895
|
-
const fences = [];
|
|
4896
|
-
const lines = markdown.split("\n");
|
|
4897
|
-
let inFence = false;
|
|
4898
|
-
let currentLang = "";
|
|
4899
|
-
for (const line of lines) {
|
|
4900
|
-
const openMatch = line.match(/^```(\w*)/);
|
|
4901
|
-
if (openMatch && !inFence) {
|
|
4902
|
-
inFence = true;
|
|
4903
|
-
currentLang = openMatch[1] ?? "";
|
|
4904
|
-
} else if (line.trim() === "```" && inFence) {
|
|
4905
|
-
fences.push({ lang: currentLang, closed: true });
|
|
4906
|
-
inFence = false;
|
|
4907
|
-
currentLang = "";
|
|
4908
|
-
}
|
|
4909
|
-
}
|
|
4910
|
-
if (inFence) {
|
|
4911
|
-
fences.push({ lang: currentLang, closed: false });
|
|
4912
|
-
}
|
|
4913
|
-
return fences;
|
|
4914
|
-
};
|
|
4915
|
-
var parseSitemapUrls = (xml) => {
|
|
4916
|
-
const urls = [];
|
|
4917
|
-
const regex = /<loc>([^<]+)<\/loc>/gi;
|
|
4918
|
-
let match;
|
|
4919
|
-
while ((match = regex.exec(xml)) !== null) {
|
|
4920
|
-
urls.push(match[1].trim());
|
|
4921
|
-
}
|
|
4922
|
-
return urls;
|
|
4923
|
-
};
|
|
4924
5102
|
var MAX_HTML_CHARS = 5e4;
|
|
4925
5103
|
var MAX_MD_CHARS = 5e4;
|
|
4926
5104
|
var renderingStrategy = {
|
|
@@ -6239,6 +6417,56 @@ var mcpServerCard = {
|
|
|
6239
6417
|
}
|
|
6240
6418
|
}
|
|
6241
6419
|
};
|
|
6420
|
+
/**
 * Check: does the MCP server card list at least one tool?
 *
 * Skipped when `ctx.mcpServerCard` is falsy or is not parseable JSON. Tools
 * are read from a top-level `tools` array or, failing that, from
 * `capabilities.tools`. A card that parses to something other than an object
 * (for example the JSON literal `null`) is treated as exposing no tools
 * rather than throwing.
 */
var mcpToolCount = {
  id: "mcp-tool-count",
  name: "MCP Tool Count",
  category: "agent-protocols",
  description: "Checks that the MCP server card exposes at least one tool",
  weight: 0.4,
  run: async (ctx) => {
    if (!ctx.mcpServerCard) {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "skip",
        message: "Skipped: no MCP server card found"
      };
    }
    let card;
    try {
      card = JSON.parse(ctx.mcpServerCard);
    } catch {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "skip",
        message: "Skipped: MCP server card is invalid JSON"
      };
    }
    // JSON.parse can legitimately return null or a primitive; optional
    // chaining keeps those from raising a TypeError on property access.
    const topLevelTools = card?.tools;
    const capabilityTools = card?.capabilities?.tools;
    const toolCount = Array.isArray(topLevelTools) ? topLevelTools.length : Array.isArray(capabilityTools) ? capabilityTools.length : 0;
    if (toolCount > 0) {
      return {
        id: "mcp-tool-count",
        name: "MCP Tool Count",
        category: "agent-protocols",
        status: "pass",
        message: `MCP server exposes ${toolCount} tool${toolCount === 1 ? "" : "s"}`,
        metadata: { toolCount }
      };
    }
    return {
      id: "mcp-tool-count",
      name: "MCP Tool Count",
      category: "agent-protocols",
      status: "warn",
      message: "MCP server card found but exposes no tools",
      suggestion: "List your MCP server's tools in the server card so agents know what actions are available before connecting.",
      metadata: { toolCount }
    };
  }
};
|
|
6242
6470
|
var apiCatalog = {
|
|
6243
6471
|
id: "api-catalog",
|
|
6244
6472
|
name: "API Catalog (RFC 9727)",
|
|
@@ -6312,7 +6540,7 @@ var contentSignals = {
|
|
|
6312
6540
|
name: "Content Signals (AI Usage Declarations)",
|
|
6313
6541
|
category: "agent-protocols",
|
|
6314
6542
|
status: "info",
|
|
6315
|
-
message: "No robots.txt found
|
|
6543
|
+
message: "No robots.txt found, cannot check for content signals",
|
|
6316
6544
|
suggestion: "Add a robots.txt with Content Signals directives to declare how AI agents may use your content (ai-train, ai-input, search)."
|
|
6317
6545
|
};
|
|
6318
6546
|
}
|
|
@@ -6497,7 +6725,7 @@ var agentsMd = {
|
|
|
6497
6725
|
category: "agent-protocols",
|
|
6498
6726
|
status: "fail",
|
|
6499
6727
|
message: "No AGENTS.md or AGENT.md found",
|
|
6500
|
-
suggestion: "Add an AGENTS.md at the project root. This is the universal agent configuration file
|
|
6728
|
+
suggestion: "Add an AGENTS.md at the project root. This is the universal agent configuration file, a README for AI coding agents. Include build/test commands, architecture overview, conventions, and any gotchas. Used by 60k+ open-source projects."
|
|
6501
6729
|
};
|
|
6502
6730
|
}
|
|
6503
6731
|
const content = ctx.agentsMd;
|
|
@@ -6549,6 +6777,7 @@ var agentsMd = {
|
|
|
6549
6777
|
};
|
|
6550
6778
|
var agentProtocolChecks = [
|
|
6551
6779
|
mcpServerCard,
|
|
6780
|
+
mcpToolCount,
|
|
6552
6781
|
apiCatalog,
|
|
6553
6782
|
contentSignals,
|
|
6554
6783
|
linkHeaders,
|
|
@@ -6725,7 +6954,7 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
6725
6954
|
const apiCatalog2 = apiCatalogResult.status === "fulfilled" && apiCatalogResult.value?.statusCode === 200 ? apiCatalogResult.value.text : void 0;
|
|
6726
6955
|
const agentSkillsIndex2 = agentSkillsResult.status === "fulfilled" && agentSkillsResult.value?.statusCode === 200 ? agentSkillsResult.value.text : void 0;
|
|
6727
6956
|
const agentsMd2 = void 0;
|
|
6728
|
-
|
|
6957
|
+
let sitemapUrls = sitemapXml ? parseSitemapUrls(sitemapXml) : [];
|
|
6729
6958
|
if (!sitemapXml && robotsTxt) {
|
|
6730
6959
|
const sitemapMatch = robotsTxt.match(/Sitemap:\s*(.+)/i);
|
|
6731
6960
|
if (sitemapMatch) {
|
|
@@ -6735,10 +6964,28 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
6735
6964
|
}
|
|
6736
6965
|
}
|
|
6737
6966
|
}
|
|
6967
|
+
const isSitemapIndex = (sitemapXml ?? "").includes("<sitemapindex");
|
|
6968
|
+
if (isSitemapIndex && sitemapUrls.length > 0) {
|
|
6969
|
+
const nested = await Promise.allSettled(
|
|
6970
|
+
sitemapUrls.slice(0, 20).map((u) => fetchText(u, config))
|
|
6971
|
+
);
|
|
6972
|
+
sitemapUrls = nested.flatMap(
|
|
6973
|
+
(r) => r.status === "fulfilled" && r.value?.statusCode === 200 ? parseSitemapUrls(r.value.text) : []
|
|
6974
|
+
);
|
|
6975
|
+
}
|
|
6738
6976
|
let pagesToSample = [];
|
|
6739
6977
|
if (sitemapUrls.length > 0) {
|
|
6740
|
-
const
|
|
6741
|
-
|
|
6978
|
+
const pathPrefix = baseUrl.pathname.replace(/\/+$/, "");
|
|
6979
|
+
const scoped = pathPrefix.length > 1 ? sitemapUrls.filter((u) => {
|
|
6980
|
+
try {
|
|
6981
|
+
return new URL(u).pathname.startsWith(pathPrefix);
|
|
6982
|
+
} catch {
|
|
6983
|
+
return false;
|
|
6984
|
+
}
|
|
6985
|
+
}) : sitemapUrls;
|
|
6986
|
+
const pool = scoped.length > 0 ? scoped : sitemapUrls;
|
|
6987
|
+
const step = Math.max(1, Math.floor(pool.length / config.sampleSize));
|
|
6988
|
+
pagesToSample = pool.filter((_, i) => i % step === 0).slice(0, config.sampleSize);
|
|
6742
6989
|
} else {
|
|
6743
6990
|
const mainPage = await fetchPage(targetUrl, config);
|
|
6744
6991
|
const linkRegex = /<a[^>]+href=["']([^"'#]+)["']/gi;
|
|
@@ -6758,14 +7005,16 @@ var buildRemoteContext = async (targetUrl, config) => {
|
|
|
6758
7005
|
if (!pagesToSample.includes(targetUrl)) {
|
|
6759
7006
|
pagesToSample.unshift(targetUrl);
|
|
6760
7007
|
}
|
|
6761
|
-
const sampledPages = await fetchMany(pagesToSample, config);
|
|
7008
|
+
const sampledPages = await fetchMany(pagesToSample, config, true);
|
|
6762
7009
|
emit({ type: "context-ready", pageCount: sampledPages.length });
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
7010
|
+
await Promise.allSettled(
|
|
7011
|
+
sampledPages.map(async (page) => {
|
|
7012
|
+
const mdResult = await fetchWithContentNegotiation(page.url, "text/markdown", config);
|
|
7013
|
+
if (mdResult && mdResult.statusCode === 200 && (mdResult.contentType.includes("text/markdown") || mdResult.contentType.includes("text/plain"))) {
|
|
7014
|
+
page.markdown = mdResult.text;
|
|
7015
|
+
}
|
|
7016
|
+
})
|
|
7017
|
+
);
|
|
6769
7018
|
return {
|
|
6770
7019
|
mode: "remote",
|
|
6771
7020
|
targetUrl,
|