agent-readiness 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +142 -0
- package/bin/agent-ready.mjs +296 -0
- package/lib/core.mjs +971 -0
- package/lib/fix.mjs +564 -0
- package/lib/github.mjs +51 -0
- package/lib/report.mjs +57 -0
- package/package.json +54 -0
package/lib/core.mjs
ADDED
|
@@ -0,0 +1,971 @@
|
|
|
1
|
+
import { writeFile, mkdir, readFile } from 'node:fs/promises'
|
|
2
|
+
import { join } from 'node:path'
|
|
3
|
+
import { parse } from 'node-html-parser'
|
|
4
|
+
|
|
5
|
+
const UA = 'agent-ready/0.4 (+https://github.com/VeldinS/agent-ready)'
|
|
6
|
+
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
// Fetching / crawling
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
/**
 * Best-effort GET that resolves to the response text, or null on any failure.
 *
 * Returns null for non-2xx responses, for a declared content-length above
 * `maxBytes`, and for any thrown error (including aborts). When the streamed body
 * exceeds `maxBytes`, reading stops and the text decoded so far is returned.
 *
 * @param {string} url - URL to fetch.
 * @param {object} [options]
 * @param {number} [options.timeout=8000] - Per-request timeout in milliseconds.
 * @param {number} [options.maxBytes=5242880] - Byte cap for the response body.
 * @param {AbortSignal} [options.signal] - Optional caller signal; aborting it aborts this request.
 * @returns {Promise<string|null>}
 */
async function tryFetch(url, { timeout = 8000, maxBytes = 5 * 1024 * 1024, signal } = {}) {
  const controller = new AbortController()
  const abortRequest = () => controller.abort()
  const timer = setTimeout(abortRequest, timeout)
  // Chain the caller's signal onto ours so one deadline can cancel every in-flight request.
  if (signal) {
    if (signal.aborted) abortRequest()
    else signal.addEventListener('abort', abortRequest, { once: true })
  }
  try {
    const response = await fetch(url, { headers: { 'user-agent': UA }, redirect: 'follow', signal: controller.signal })
    if (!response.ok) return null
    const declaredLength = Number(response.headers.get('content-length'))
    if (Number.isFinite(declaredLength) && declaredLength > maxBytes) return null // refuse oversized before reading
    const body = response.body
    if (!body || typeof body.getReader !== 'function') return await response.text()

    // Read chunk by chunk so an oversized or endless body cannot exhaust memory.
    const reader = body.getReader()
    const decoder = new TextDecoder('utf-8')
    let text = ''
    let byteCount = 0
    let chunk = await reader.read()
    while (!chunk.done) {
      byteCount += chunk.value.byteLength
      if (byteCount > maxBytes) {
        try {
          await reader.cancel()
        } catch {
          /* ignore */
        }
        abortRequest()
        break // keep the truncated head we already have
      }
      text += decoder.decode(chunk.value, { stream: true })
      chunk = await reader.read()
    }
    return text + decoder.decode()
  } catch {
    return null
  } finally {
    clearTimeout(timer)
    if (signal) signal.removeEventListener('abort', abortRequest)
  }
}
|
|
56
|
+
|
|
57
|
+
// A SPA / catch-all that 200s every path serves index.html for /llms.txt etc. — don't count HTML as a real file.
|
|
58
|
+
/**
 * Reports whether a response body starts like an HTML document.
 *
 * Leading whitespace is skipped first; `trimStart()` also removes a leading
 * byte-order mark (U+FEFF), so a BOM-prefixed document is still recognised.
 *
 * @param {string|null|undefined} s - Response body.
 * @returns {boolean} true when the text begins with `<!doctype html` or an `<html` tag.
 */
function looksLikeHtml(s) {
  if (!s) return false
  const head = s.trimStart()
  return /^(<!doctype html|<html[\s>])/i.test(head)
}
|
|
62
|
+
|
|
63
|
+
// Run async tasks with a concurrency cap (keeps multi-page crawls polite + fast).
|
|
64
|
+
/**
 * Maps `fn` over `items` with at most `limit` calls in flight at once.
 *
 * Results are stored at the index of their input, so output order matches
 * input order regardless of completion order. A rejection from `fn` rejects
 * the returned promise.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit - Maximum number of concurrent workers (at least 1 is used for non-empty input).
 * @param {(item: any, index: number) => any} fn - Async or sync mapper.
 * @returns {Promise<Array>} Results aligned with `items`.
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length)
  const workerCount = items.length === 0 ? 0 : Math.max(1, Math.min(limit, items.length))
  let cursor = 0
  // Each worker repeatedly claims the next unclaimed index until none remain.
  const runWorker = async () => {
    while (cursor < items.length) {
      const index = cursor
      cursor += 1
      results[index] = await fn(items[index], index)
    }
  }
  const workers = []
  for (let w = 0; w < workerCount; w++) workers.push(runWorker())
  await Promise.all(workers)
  return results
}
|
|
77
|
+
|
|
78
|
+
/**
 * Decodes XML character escapes in a string.
 *
 * Handles the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) plus decimal (`&#38;`) and hexadecimal (`&#x26;`) character
 * references. Decoding is done in a single pass so already-decoded output is
 * never decoded a second time (`&amp;lt;` becomes `&lt;`, not `<`).
 * Unrecognised or out-of-range references are left untouched.
 *
 * @param {string} s - Text taken from an XML document (e.g. a sitemap `<loc>`).
 * @returns {string} The text with escapes replaced by their characters.
 */
function decodeXml(s) {
  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }
  return s.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, ref) => {
    if (ref[0] !== '#') return named[ref]
    const codePoint = ref[1] === 'x' ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10)
    // String.fromCodePoint throws on values beyond U+10FFFF; keep the raw text instead.
    if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return whole
    return String.fromCodePoint(codePoint)
  })
}
|
|
87
|
+
|
|
88
|
+
/**
 * Extracts the contents of every `<loc>` element from sitemap XML.
 *
 * Only `<loc>` values made of non-whitespace, non-`<` characters are matched;
 * each value is passed through `decodeXml`.
 *
 * @param {string} xml - Raw sitemap or sitemap-index document.
 * @returns {string[]} Location values in document order.
 */
function parseSitemapLocs(xml) {
  const matches = xml.matchAll(/<loc>\s*([^<\s]+)\s*<\/loc>/gi)
  return Array.from(matches, (match) => decodeXml(match[1].trim()))
}
|
|
93
|
+
|
|
94
|
+
// URLs ending in one of these extensions (optionally followed by a query or
// fragment) are assets or feeds rather than HTML pages, and are never listed.
const NON_PAGE = /\.(jpg|jpeg|png|gif|svg|webp|ico|css|js|mjs|json|xml|pdf|zip|gz|rss|atom|mp4|mp3|woff2?|ttf)(\?|#|$)/i
// Query-parameter names that are stripped when normalising URLs.
const TRACKING = /^(ref|ref_|utm_[a-z]+|fbclid|gclid|mc_cid|mc_eid|igshid|_hsenc|_hsmi)$/i
// Titles that signal an error/permission page we should not list as real content.
const ERROR_TITLE = /\b(40[0-9]|not found|page not found|permission error|access denied|forbidden|do not have permission|error)\b/i
|
|
98
|
+
|
|
99
|
+
/**
 * Reports whether two absolute URLs share the same host (hostname plus port).
 *
 * @param {string} a - First absolute URL.
 * @param {string} b - Second absolute URL.
 * @returns {boolean} false when the hosts differ or either value cannot be parsed as a URL.
 */
function sameHost(a, b) {
  let firstHost
  let secondHost
  try {
    firstHost = new URL(a).host
    secondHost = new URL(b).host
  } catch {
    return false
  }
  return firstHost === secondHost
}
|
|
106
|
+
|
|
107
|
+
// Strip tracking params + fragment so ref/utm-tagged duplicates collapse to one entry.
|
|
108
|
+
/**
 * Normalises a URL by removing its fragment and any query parameters whose
 * name matches `TRACKING`.
 *
 * @param {string} raw - Absolute URL.
 * @returns {string} The normalised URL, or `raw` unchanged when it cannot be parsed.
 */
function cleanUrl(raw) {
  let parsed
  try {
    parsed = new URL(raw)
  } catch {
    return raw
  }
  parsed.hash = ''
  // Snapshot the keys first: deleting while iterating the live view would skip entries.
  const trackingKeys = Array.from(parsed.searchParams.keys()).filter((key) => TRACKING.test(key))
  for (const key of trackingKeys) parsed.searchParams.delete(key)
  return parsed.toString()
}
|
|
118
|
+
|
|
119
|
+
// Collect candidate page URLs from a sitemap. Bounded BFS so nested sitemap-index
|
|
120
|
+
// files (and CDN-hosted children) still yield pages, without runaway crawling.
|
|
121
|
+
/**
 * Gathers up to `cap` page URLs from a sitemap document, following sitemap-index
 * children breadth-first.
 *
 * At most 5 child sitemaps are fetched in total (3 at a time). Page URLs must be
 * http(s), must not match `NON_PAGE`, must share `base`'s host when `base` is set,
 * and are de-duplicated after `cleanUrl` normalisation.
 *
 * @param {string} base - Site origin used for the same-host filter (falsy disables the filter).
 * @param {string|null} sitemapRaw - Raw XML of the root sitemap.
 * @param {number} [cap=25] - Maximum number of page URLs to return.
 * @param {AbortSignal} [signal] - Passed through to child sitemap fetches.
 * @returns {Promise<string[]>} Normalised page URLs in discovery order.
 */
async function collectSitemapUrls(base, sitemapRaw, cap = 25, signal) {
  if (!sitemapRaw) return []

  const MAX_SITEMAP_FETCH = 5
  const pages = []
  const seen = new Set()
  const pending = [sitemapRaw]
  let fetchCount = 0

  const considerPage = (candidate) => {
    const isHttp = /^https?:\/\//i.test(candidate)
    if (!isHttp || NON_PAGE.test(candidate)) return
    if (base && !sameHost(candidate, base)) return
    const normalized = cleanUrl(candidate)
    if (seen.has(normalized)) return
    seen.add(normalized)
    pages.push(normalized)
  }

  while (pending.length > 0 && pages.length < cap) {
    const doc = pending.shift()

    if (!/<sitemapindex/i.test(doc)) {
      // Plain urlset: every <loc> is a candidate page.
      for (const loc of parseSitemapLocs(doc)) {
        considerPage(loc)
        if (pages.length >= cap) break
      }
      continue
    }

    // children are other sitemaps — may legitimately live on another host (CDN).
    const remainingBudget = MAX_SITEMAP_FETCH - fetchCount
    const childUrls = parseSitemapLocs(doc).slice(0, remainingBudget)
    const childDocs = await mapLimit(childUrls, 3, (childUrl) => {
      fetchCount++
      return tryFetch(childUrl, { timeout: 7000, signal })
    })
    for (const childDoc of childDocs) {
      if (childDoc) pending.push(childDoc)
    }
  }

  return pages
}
|
|
155
|
+
|
|
156
|
+
/**
 * Pulls the title, meta description and first `<h1>` text out of an HTML page.
 *
 * The description is the first non-empty `content` of a `<meta name="description">`
 * (name compared case-insensitively). All values are whitespace-collapsed via `clean`.
 *
 * @param {string} html - Page markup.
 * @returns {{title: string, description: string, h1: string}} Empty strings for
 *   anything missing, and for all three fields if extraction throws.
 */
function extractPageMeta(html) {
  try {
    const root = parse(html)
    const textOf = (selector) => clean(root.querySelector(selector)?.text || '')

    const title = textOf('title')

    let description = ''
    for (const meta of root.querySelectorAll('meta')) {
      const metaName = (meta.getAttribute('name') || '').toLowerCase()
      if (metaName !== 'description') continue
      description = clean(meta.getAttribute('content') || '')
      if (description) break
    }

    return { title, description, h1: textOf('h1') }
  } catch {
    return { title: '', description: '', h1: '' }
  }
}
|
|
173
|
+
|
|
174
|
+
/**
 * Returns the path plus query string of a URL, dropping origin and fragment.
 *
 * @param {string} url - Absolute URL, or a relative one resolvable against `base`.
 * @param {string} [base] - Base used to resolve relative input; falsy means none.
 * @returns {string} e.g. `/docs?page=2`; the input unchanged when it cannot be parsed.
 */
function pathOf(url, base) {
  let parsed
  try {
    parsed = new URL(url, base || undefined)
  } catch {
    return url
  }
  const pathname = parsed.pathname || '/'
  const query = parsed.search || ''
  return pathname + query
}
|
|
182
|
+
|
|
183
|
+
// Same-origin links from the home page — fallback page source when there's no sitemap.
|
|
184
|
+
/**
 * Collects up to `cap` distinct same-host page URLs linked from the home page.
 *
 * Accepted hrefs are absolute http(s) URLs on `base`'s host, root-relative paths
 * (`/about`), and protocol-relative URLs (`//host/path`) that resolve to `base`'s
 * host. Fragment-only, `mailto:`, `tel:`, `javascript:` and document-relative
 * hrefs are skipped, as are asset URLs (`NON_PAGE`) and the home page itself.
 *
 * @param {string} home - Home page HTML.
 * @param {string} base - Site origin, e.g. `https://example.com` (no trailing slash).
 * @param {number} [cap=20] - Maximum number of links to return.
 * @returns {string[]} Normalised URLs in document order; empty if the HTML cannot be parsed.
 */
function sameOriginLinks(home, base, cap = 20) {
  let root
  try {
    root = parse(home || '')
  } catch {
    return []
  }
  const out = []
  const seen = new Set()
  for (const a of root.querySelectorAll('a[href]')) {
    let h = (a.getAttribute('href') || '').trim()
    if (!h || /^(#|mailto:|tel:|javascript:)/i.test(h)) continue
    if (h.startsWith('//')) {
      // Protocol-relative: resolve it to an absolute URL first. Treating it as a
      // root-relative path would produce `base + '//other.host/x'`, a bogus
      // same-origin URL pointing at nothing.
      try {
        h = new URL(h, base).href
      } catch {
        continue
      }
    }
    let url
    if (/^https?:\/\//i.test(h)) {
      if (!sameHost(h, base)) continue
      url = h
    } else if (h.startsWith('/')) {
      url = base + h
    } else {
      continue
    }
    url = cleanUrl(url)
    if (NON_PAGE.test(url) || seen.has(url) || url === base || url === base + '/') continue
    seen.add(url)
    out.push(url)
    if (out.length >= cap) break
  }
  return out
}
|
|
213
|
+
|
|
214
|
+
// Fetch each discovered page and derive a one-line summary for llms.txt.
|
|
215
|
+
/**
 * Fetches the first `fetchCap` URLs (8 at a time) and builds a summary record for each.
 *
 * A page is dropped when it cannot be fetched or when its resolved title matches
 * `ERROR_TITLE`. The title falls back from `<title>` to `<h1>` to the URL path;
 * the summary falls back from the meta description to `<h1>`.
 *
 * @param {string[]} urls - Candidate page URLs.
 * @param {string} base - Site origin used when deriving each page's path.
 * @param {number} [fetchCap=18] - Maximum number of pages to fetch.
 * @param {AbortSignal} [signal] - Passed through to every fetch.
 * @returns {Promise<Array<{url: string, path: string, title: string, summary: string}>>}
 */
async function fetchPageSummaries(urls, base, fetchCap = 18, signal) {
  const summarize = async (url) => {
    const html = await tryFetch(url, { timeout: 7000, signal })
    if (!html) return null

    const { title: rawTitle, description, h1 } = extractPageMeta(html)
    const path = pathOf(url, base)
    const title = oneLine(rawTitle || h1 || path)
    if (ERROR_TITLE.test(title)) return null // skip 404/permission/error pages

    return { url, path, title, summary: oneLine(description || h1 || '') }
  }

  const results = await mapLimit(urls.slice(0, fetchCap), 8, summarize)
  return results.filter((entry) => Boolean(entry))
}
|
|
232
|
+
|
|
233
|
+
// True when the text is non-empty and contains a <urlset>, <sitemapindex> or <?xml marker.
const isXmlSitemap = (s) => Boolean(s) && /(<urlset|<sitemapindex|<\?xml)/i.test(s)
|
|
234
|
+
|
|
235
|
+
/**
 * Crawls a site: fetches the target page, the origin's llms.txt / robots.txt /
 * sitemap.xml, and summaries of discovered pages.
 *
 * Page discovery uses the sitemap when one is found (at /sitemap.xml, or via up
 * to three `Sitemap:` directives in robots.txt) and falls back to same-host
 * links on the fetched page otherwise.
 *
 * @param {string} target - URL or bare host; `https://` is assumed when no http(s) scheme is given.
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - Cancels every request made by the crawl.
 * @returns {Promise<{base: string, home: string, llmsTxt: string|null, robots: string|null, sitemap: string|null, pages: Array}>}
 * @throws {Error} When the target page itself cannot be fetched.
 */
export async function crawl(target, { signal } = {}) {
  const hasScheme = /^https?:\/\//i.test(target)
  const parsed = new URL(hasScheme ? target : 'https://' + target)
  const base = parsed.origin // site-root files (llms/robots/sitemap) live at the ORIGIN, not the scanned path
  const pageUrl = parsed.href

  const home = await tryFetch(pageUrl, { signal })
  if (home == null) throw new Error(`Could not fetch ${pageUrl}`)

  const rootFiles = ['/llms.txt', '/robots.txt', '/sitemap.xml']
  const [llmsRaw, robotsRaw, sitemapRaw] = await Promise.all(rootFiles.map((file) => tryFetch(base + file, { signal })))

  // Reject SPA catch-all HTML masquerading as these files so the score stays honest.
  const llmsTxt = looksLikeHtml(llmsRaw) ? null : llmsRaw
  const robots = looksLikeHtml(robotsRaw) ? null : robotsRaw
  let sitemap = isXmlSitemap(sitemapRaw) ? sitemapRaw : null

  // Honor a "Sitemap:" directive in robots.txt when the conventional path is absent.
  if (!sitemap && robots) {
    const declared = Array.from(robots.matchAll(/^\s*sitemap:\s*(\S+)/gim), (match) => match[1])
    for (const sitemapUrl of declared.slice(0, 3)) {
      const candidate = await tryFetch(sitemapUrl, { timeout: 7000, signal })
      if (!isXmlSitemap(candidate)) continue
      sitemap = candidate
      break
    }
  }

  const sitemapUrls = await collectSitemapUrls(base, sitemap, 25, signal)
  // no sitemap → fall back to home links
  const urls = sitemapUrls.length === 0 ? sameOriginLinks(home, base) : sitemapUrls
  const pages = await fetchPageSummaries(urls, base, 18, signal)
  return { base, home, llmsTxt, robots, sitemap, pages }
}
|
|
266
|
+
|
|
267
|
+
/**
 * Builds a crawl result from a local HTML file instead of a live site.
 *
 * @param {string} path - Path of the HTML file, read as UTF-8.
 * @returns {Promise<{base: string, home: string, llmsTxt: null, robots: null, sitemap: null, pages: null}>}
 *   The file contents as `home`, an empty `base`, and null for everything that needs a network crawl.
 */
export async function crawlLocal(path) {
  const html = await readFile(path, 'utf8')
  return {
    base: '',
    home: html,
    llmsTxt: null,
    robots: null,
    sitemap: null,
    pages: null
  }
}
|
|
271
|
+
|
|
272
|
+
// ---------------------------------------------------------------------------
|
|
273
|
+
// Small string helpers
|
|
274
|
+
// ---------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
/**
 * Converts a value to a string, collapses every whitespace run to one space and
 * trims both ends. `null` and `undefined` become the empty string.
 *
 * @param {*} s - Value to normalise.
 * @returns {string}
 */
function clean(s) {
  const text = s == null ? '' : String(s)
  return text.replace(/\s+/g, ' ').trim()
}
|
|
281
|
+
|
|
282
|
+
/**
 * Flattens text to a single line (via `clean`) and truncates it to at most
 * `max` characters, ending a truncated result with an ellipsis character.
 *
 * @param {*} s - Text to flatten.
 * @param {number} [max=200] - Maximum length of the result, ellipsis included.
 * @returns {string}
 */
function oneLine(s, max = 200) {
  const text = clean(s)
  if (text.length <= max) return text
  const head = text.slice(0, max - 1).trimEnd()
  return head + '…'
}
|
|
286
|
+
|
|
287
|
+
/**
 * Turns arbitrary text into a lowercase, hyphen-separated identifier of at most
 * 64 characters. Runs of characters outside `a-z0-9` become a single hyphen and
 * leading/trailing hyphens are removed before truncation.
 *
 * @param {*} s - Source text.
 * @returns {string} The slug, or `'tool'` when nothing usable remains.
 */
function kebab(s) {
  const lowered = String(s || '').toLowerCase()
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-')
  const trimmed = hyphenated.replace(/^-+|-+$/g, '')
  const slug = trimmed.slice(0, 64)
  return slug === '' ? 'tool' : slug
}
|
|
295
|
+
|
|
296
|
+
/**
 * Returns the final non-empty `/`-separated segment of a path, ignoring any
 * query string or fragment.
 *
 * @param {*} p - Path or URL-like string.
 * @returns {string} The last segment, or `''` when there is none.
 */
function lastSegment(p) {
  const [pathOnly] = String(p || '').split(/[?#]/)
  const segments = pathOnly.split('/').filter((part) => part !== '')
  return segments.length === 0 ? '' : segments[segments.length - 1]
}
|
|
303
|
+
|
|
304
|
+
/**
 * Reserves a name that is not yet in `used`, appending `-2`, `-3`, … to `base`
 * until a free one is found.
 *
 * @param {string} base - Preferred name.
 * @param {Set<string>} used - Names already taken; the chosen name is added to it.
 * @returns {string} The reserved name.
 */
function uniqName(base, used) {
  let candidate = base
  for (let suffix = 2; used.has(candidate); suffix++) {
    candidate = `${base}-${suffix}`
  }
  used.add(candidate)
  return candidate
}
|
|
311
|
+
|
|
312
|
+
/**
 * Makes a URL absolute when that is possible with only an origin to go on.
 *
 * - Absolute http(s) URLs and falsy values are returned unchanged.
 * - Protocol-relative URLs (`//host/path`) take the scheme of `base`.
 * - Root-relative paths (`/path`) are appended to `base`.
 * - Anything else (document-relative paths, other schemes) is returned unchanged,
 *   as is every input when `base` is falsy.
 *
 * @param {string} url - URL or path to absolutise.
 * @param {string} base - Site origin without a trailing slash, e.g. `https://example.com`.
 * @returns {string} The absolutised URL, or the input when it cannot be absolutised.
 */
function abs(url, base) {
  if (!url) return url
  if (/^https?:\/\//i.test(url)) return url
  if (!base) return url
  if (url.startsWith('//')) {
    // `//host/path` names another host; appending it to base would yield
    // `https://site//host/path`, which points at the wrong server.
    try {
      return new URL(base).protocol + url
    } catch {
      return url
    }
  }
  if (url.startsWith('/')) return base + url
  return url
}
|
|
318
|
+
|
|
319
|
+
// A link safe to emit into llms.txt: absolutised, with dangerous schemes rejected.
|
|
320
|
+
/**
 * Absolutises a link with `abs` and rejects it when it is empty or uses a
 * `javascript:`, `data:`, `vbscript:` or `file:` scheme (leading whitespace ignored).
 *
 * @param {string} url - Link target taken from crawled markup.
 * @param {string} base - Site origin passed to `abs`.
 * @returns {string|null} The absolutised link, or null when it must not be emitted.
 */
function safeHref(url, base) {
  const resolved = abs(url, base)
  if (!resolved) return null
  const hasDangerousScheme = /^\s*(javascript|data|vbscript|file):/i.test(resolved)
  return hasDangerousScheme ? null : resolved
}
|
|
326
|
+
|
|
327
|
+
// Escape Markdown link-significant characters in crawled page titles/summaries so a
|
|
328
|
+
// hostile title like "x](http://evil)" can't hijack the generated link.
|
|
329
|
+
/**
 * Flattens and truncates text with `oneLine`, then backslash-escapes the
 * characters `[ ] ( ) \` and the backtick.
 *
 * @param {*} s - Untrusted text (page title, summary, form note).
 * @param {number} [max=200] - Length cap applied before escaping.
 * @returns {string}
 */
function mdText(s, max = 200) {
  const flattened = oneLine(s, max)
  // '\\$&' is a literal backslash followed by the matched character.
  return flattened.replace(/[[\]()\\`]/g, '\\$&')
}
|
|
332
|
+
|
|
333
|
+
/**
 * Escapes text for safe inclusion in HTML element content or a double-quoted
 * attribute value.
 *
 * `&`, `<`, `>` and `"` are replaced by their entity references in a single
 * pass, so the ampersands introduced by the escaping are never escaped again.
 * `null` and `undefined` become the empty string; other values are stringified.
 *
 * @param {*} s - Value to escape.
 * @returns {string} HTML-escaped text.
 */
function escHtml(s) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;' }
  return String(s == null ? '' : s).replace(/[&<>"]/g, (ch) => entities[ch])
}
|
|
340
|
+
|
|
341
|
+
// ---------------------------------------------------------------------------
|
|
342
|
+
// Extraction (analyze) — node-html-parser backed
|
|
343
|
+
// ---------------------------------------------------------------------------
|
|
344
|
+
|
|
345
|
+
const SKIP_INPUT_TYPES = new Set(['hidden', 'submit', 'button', 'image', 'reset'])
|
|
346
|
+
|
|
347
|
+
/**
 * Maps an HTML control type to a schema type name.
 *
 * @param {*} rawType - Control type such as `'number'`, `'checkbox'` or `'text'` (case-insensitive).
 * @returns {'number'|'boolean'|'string'} `'number'` for number/range, `'boolean'` for checkbox, otherwise `'string'`.
 */
function schemaType(rawType) {
  switch (String(rawType || '').toLowerCase()) {
    case 'number':
    case 'range':
      return 'number'
    case 'checkbox':
      return 'boolean'
    default:
      return 'string'
  }
}
|
|
353
|
+
|
|
354
|
+
/**
 * Finds the label text for a form control.
 *
 * Looks first for a `<label for="…">` matching the control's id, then for an
 * enclosing `<label>`. Any error raised while querying is swallowed.
 *
 * @param {object} root - Parsed document exposing `querySelector`.
 * @param {object} el - Control element exposing `getAttribute` and, optionally, `closest`.
 * @returns {string} Whitespace-normalised label text, or `''` when none is found.
 */
function findLabel(root, el) {
  try {
    const id = el.getAttribute('id')
    const explicit = id ? root.querySelector(`label[for="${id}"]`) : null
    if (explicit) return clean(explicit.text)

    const wrapper = el.closest && el.closest('label')
    if (wrapper) return clean(wrapper.text)
  } catch {
    /* selectors with odd ids can throw — ignore */
  }
  return ''
}
|
|
368
|
+
|
|
369
|
+
/**
 * Describes every `<form>` in the document together with its named controls.
 *
 * Controls without a `name`, and inputs whose type is in `SKIP_INPUT_TYPES`, are
 * left out. A `<select>` additionally lists its non-empty option values (the
 * option text is used when an option has no `value` attribute).
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @returns {Array<{action: string, method: string, inputs: Array<object>}>}
 *   `action` defaults to `'/'` and `method` to `'GET'` (always upper-cased).
 */
function extractForms(root) {
  // Returns the descriptor for one control, or null when the control is skipped.
  const describeControl = (el) => {
    const name = el.getAttribute('name')
    if (!name) return null

    const tag = (el.rawTagName || '').toLowerCase()
    let rawType
    if (tag === 'textarea' || tag === 'select') rawType = tag
    else rawType = (el.getAttribute('type') || 'text').toLowerCase()
    if (SKIP_INPUT_TYPES.has(rawType)) return null

    let options
    if (tag === 'select') {
      const values = el.querySelectorAll('option').map((option) => option.getAttribute('value') ?? clean(option.text))
      options = values.filter((value) => value !== '' && value != null)
    }

    return {
      name,
      type: rawType,
      schemaType: schemaType(rawType),
      required: el.hasAttribute('required'),
      placeholder: el.getAttribute('placeholder') || '',
      label: findLabel(root, el),
      options
    }
  }

  const forms = []
  for (const form of root.querySelectorAll('form')) {
    const action = form.getAttribute('action') || '/'
    const method = (form.getAttribute('method') || 'get').toUpperCase()
    const inputs = []
    for (const el of form.querySelectorAll('input, select, textarea')) {
      const described = describeControl(el)
      if (described) inputs.push(described)
    }
    forms.push({ action, method, inputs })
  }
  return forms
}
|
|
402
|
+
|
|
403
|
+
/**
 * Decides whether an extracted form is a search form.
 *
 * Only GET forms qualify. Such a form counts as search when its action contains
 * "search", or when any input has type `search` or is named `q`, `query`, `s`
 * or `search` (case-insensitive).
 *
 * @param {{method: string, action: string, inputs: Array<{type: string, name: string}>}} f
 * @returns {boolean}
 */
function isSearchForm(f) {
  if (f.method !== 'GET') return false
  if (/search/i.test(f.action || '')) return true

  const searchNames = ['q', 'query', 's', 'search']
  for (const input of f.inputs) {
    if (input.type === 'search') return true
    if (searchNames.includes((input.name || '').toLowerCase())) return true
  }
  return false
}
|
|
408
|
+
|
|
409
|
+
// Intent table for inferring WebMCP tools from JS-driven buttons/links (no <form>).
|
|
410
|
+
// Each entry describes one inferable action:
//   intent      — key used to de-duplicate against intents already covered by real forms
//   name        — tool name given to the inferred action
//   test        — predicate run against a lower-cased, short button/link label
//   description — human-readable purpose of the tool
//   props       — input properties of the tool
//   required    — names of the mandatory properties
//   readOnly    — whether the action only reads data
//   method      — HTTP method recorded for the action
// Entries are tried in order; the first matching intent wins for a given label.
const INTENTS = [
  {
    intent: 'search',
    name: 'search-site',
    test: (label) => /\bsearch\b/.test(label), // bare "find" matched "Find a store" etc. — too noisy
    description: 'Search this site and return matching results.',
    props: { query: { type: 'string', description: 'Search query' } },
    required: ['query'],
    readOnly: true,
    method: 'GET'
  },
  {
    intent: 'login',
    name: 'log-in',
    test: (label) => /\blog[\s-]?in\b|\bsign[\s-]?in\b|\bsign on\b/.test(label),
    description: 'Log in to an existing account.',
    props: {
      email: { type: 'string', description: 'Account email' },
      password: { type: 'string', description: 'Account password' }
    },
    required: ['email', 'password'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'signup',
    name: 'sign-up',
    test: (label) => /\bsign[\s-]?up\b|\bregister\b|\bcreate (an )?account\b|\bget started\b|\bjoin free\b|\bstart (your )?free\b/.test(label),
    description: 'Create a new account.',
    props: {
      email: { type: 'string', description: 'Email address' },
      password: { type: 'string', description: 'Chosen password' }
    },
    required: ['email'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'add-to-cart',
    name: 'add-to-cart',
    test: (label) => /\badd to (cart|bag|basket)\b|\bbuy now\b/.test(label),
    description: 'Add a product to the shopping cart.',
    props: {
      productId: { type: 'string', description: 'ID of the product to add' },
      quantity: { type: 'number', description: 'Quantity', default: 1 }
    },
    required: ['productId'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'checkout',
    name: 'checkout',
    test: (label) => /\bcheckout\b|\bplace order\b|\bcomplete (order|purchase)\b|\bpay now\b/.test(label),
    description: 'Proceed to checkout and place the order.',
    props: {},
    required: [],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'contact',
    name: 'contact',
    test: (label) => /\bcontact( us)?\b|\bget in touch\b|\btalk to (us|sales)\b|\bbook a (demo|call)\b|\brequest a demo\b/.test(label),
    description: 'Send a contact or sales enquiry.',
    props: {
      name: { type: 'string' },
      email: { type: 'string' },
      message: { type: 'string', description: 'Your message' }
    },
    required: ['email', 'message'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'subscribe',
    name: 'subscribe',
    test: (label) => /\bsubscribe\b|\bnewsletter\b|\bmailing list\b|\bnotify me\b|\bstay (in the loop|updated)\b/.test(label),
    description: 'Subscribe an email address to the newsletter.',
    props: { email: { type: 'string', description: 'Email address to subscribe' } },
    required: ['email'],
    readOnly: false,
    method: 'POST'
  }
]
|
|
482
|
+
|
|
483
|
+
/**
 * Works out which intents are already covered by real `<form>` elements.
 *
 * - A form with a password field covers `signup` when it has two or more
 *   password fields or its action/field text mentions registering; otherwise `login`.
 * - A form accepted by `isSearchForm` covers `search`.
 * - A password-less form with exactly one email field and at most two inputs covers `subscribe`.
 *
 * @param {Array<{action: string, inputs: Array<object>}>} forms - Output of `extractForms`.
 * @returns {Set<string>} Covered intent keys.
 */
function formIntentSet(forms) {
  const intents = new Set()
  for (const form of forms) {
    const passwordFields = form.inputs.filter((input) => input.type === 'password')
    const hasPassword = passwordFields.length > 0

    if (hasPassword) {
      // A password form covers ONE auth intent — mark the one it actually is so the
      // complementary action (e.g. a separate "Sign up" link) stays inferable.
      const fieldText = form.inputs.map((input) => `${input.name} ${input.label} ${input.placeholder}`).join(' ')
      const hay = (form.action + ' ' + fieldText).toLowerCase()
      const looksSignup = passwordFields.length >= 2 || /register|sign[\s-]?up|create[\s-]?account|create your account|\bjoin\b/.test(hay)
      intents.add(looksSignup ? 'signup' : 'login')
    }

    if (isSearchForm(form)) intents.add('search')

    const emailCount = form.inputs.filter((input) => input.type === 'email').length
    if (!hasPassword && emailCount === 1 && form.inputs.length <= 2) intents.add('subscribe')
  }
  return intents
}
|
|
500
|
+
|
|
501
|
+
/**
 * Reports whether an element has a `<form>` ancestor according to its `closest` method.
 *
 * @param {object} el - Element that may expose `closest(selector)`.
 * @returns {boolean} false when there is no such ancestor, no `closest` method, or the lookup throws.
 */
function insideForm(el) {
  try {
    if (!el.closest) return false
    return Boolean(el.closest('form'))
  } catch {
    return false
  }
}
|
|
508
|
+
|
|
509
|
+
/**
 * Infers up to 8 actions from links and buttons that sit outside any `<form>`.
 *
 * A candidate's label is its visible text plus its aria-label; labels longer
 * than five words are ignored. Each label is tested against `INTENTS` in order,
 * skipping intents already covered by a real form (`formIntentSet`) or already
 * emitted. At most one action is produced per intent.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {Array<object>} forms - Output of `extractForms` for the same document.
 * @returns {Array<object>} Actions with `name`, `intent`, `description`, `props`,
 *   `required`, `readOnly`, `endpoint` (the link href when it is absolute http(s)
 *   or starts with `/`, else null) and `method`.
 */
function inferActions(root, forms) {
  const covered = formIntentSet(forms)

  // Elements inside a <form> are already represented by that form's tool — skip them.
  const candidates = []
  for (const link of root.querySelectorAll('a[href]')) {
    if (insideForm(link)) continue
    candidates.push({
      text: clean(link.text),
      href: link.getAttribute('href'),
      aria: link.getAttribute('aria-label'),
      id: link.getAttribute('id')
    })
  }
  for (const control of root.querySelectorAll('button, input[type=submit], input[type=button], [role=button]')) {
    if (insideForm(control)) continue
    const isInput = (control.rawTagName || '').toLowerCase() === 'input'
    const rawText = isInput ? control.getAttribute('value') || '' : clean(control.text)
    candidates.push({
      text: clean(rawText),
      href: null,
      aria: control.getAttribute('aria-label'),
      id: control.getAttribute('id')
    })
  }

  const actions = []
  const usedIntents = new Set()
  for (const candidate of candidates) {
    if (actions.length >= 8) break

    // Match only on SHORT, button-like text + aria-label. Matching href/id (or long
    // headline text) turned article links like ".../instant-checkout" into fake tools.
    const label = clean([candidate.text, candidate.aria].filter(Boolean).join(' '))
    if (!label) continue
    const wordCount = label.split(/\s+/).filter(Boolean).length
    if (wordCount > 5) continue

    const hay = label.toLowerCase()
    const matched = INTENTS.find(
      (intent) => !covered.has(intent.intent) && !usedIntents.has(intent.intent) && intent.test(hay)
    )
    if (!matched) continue

    usedIntents.add(matched.intent)
    const hasUsableHref = candidate.href && /^(https?:\/\/|\/)/i.test(candidate.href)
    actions.push({
      name: matched.name,
      intent: matched.intent,
      description: matched.description,
      props: matched.props,
      required: matched.required,
      readOnly: matched.readOnly,
      endpoint: hasUsableHref ? candidate.href : null,
      method: matched.method
    })
  }
  return actions
}
|
|
558
|
+
|
|
559
|
+
/**
 * Builds a page list (at most 25 entries) from the links of an already-parsed home page.
 *
 * Accepted hrefs are absolute http(s) URLs (restricted to `base`'s host when
 * `base` is set) and root-relative paths. When `base` is set, protocol-relative
 * hrefs (`//host/path`) are first resolved to absolute URLs and then treated
 * like any other absolute link. Fragment-only, `mailto:`, `tel:`, `javascript:`
 * and document-relative hrefs are skipped, as are asset URLs (`NON_PAGE`) and
 * duplicates.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} base - Site origin without a trailing slash, or `''` for a local file.
 * @returns {Array<{url: string, path: string, title: string, summary: string}>}
 *   `title` is the link text (falling back to the path); `summary` is always `''`.
 */
function pagesFromHome(root, base) {
  const pages = []
  const seen = new Set()
  for (const a of root.querySelectorAll('a[href]')) {
    let h = (a.getAttribute('href') || '').trim()
    if (!h || /^(#|mailto:|tel:|javascript:)/i.test(h)) continue
    if (base && h.startsWith('//')) {
      // Protocol-relative: resolve it so the same-host check below applies.
      // Appending it to base would yield `base + '//other.host/x'`, a bogus URL.
      try {
        h = new URL(h, base).href
      } catch {
        continue
      }
    }
    let url
    let path
    if (/^https?:\/\//i.test(h)) {
      if (base && !sameHost(h, base)) continue
      url = h
      path = pathOf(h, base)
    } else if (h.startsWith('/')) {
      url = base ? base + h : h
      path = h.split(/[?#]/)[0]
    } else {
      continue
    }
    if (NON_PAGE.test(url)) continue
    if (seen.has(url)) continue
    seen.add(url)
    pages.push({ url, path, title: clean(a.text) || path, summary: '' })
    if (pages.length >= 25) break
  }
  return pages
}
|
|
585
|
+
|
|
586
|
+
/**
 * Extracts the agent-readiness signals of a crawled site from its home page
 * HTML and the site-root files gathered by the crawl.
 *
 * @param {object} input - A crawl result.
 * @param {string} [input.base=''] - Site origin (`''` for a local file).
 * @param {string} [input.home=''] - Home page HTML.
 * @param {string|null} [input.llmsTxt=null] - Contents of /llms.txt, if found.
 * @param {string|null} [input.robots=null] - Contents of /robots.txt, if found.
 * @param {string|null} [input.sitemap=null] - Sitemap XML, if found.
 * @param {Array|null} [input.pages=null] - Crawled page summaries; when missing or
 *   empty, the page list is derived from the home page's links instead.
 * @returns {object} Flags and counts (title, description, language, canonical,
 *   JSON-LD, OpenGraph, WebMCP, landmarks, h1 count, image alt coverage, presence
 *   of llms.txt/sitemap/robots) plus the extracted `forms`, inferred `actions`
 *   and the `pages` list.
 */
export function analyze({ base = '', home = '', llmsTxt = null, robots = null, sitemap = null, pages = null }) {
  let root
  try {
    root = parse(home || '')
  } catch {
    root = parse('')
  }

  const title = clean(root.querySelector('title')?.text || '')

  // First non-empty-by-position description wins; any og:* property counts as OpenGraph.
  let description = ''
  let hasOg = false
  for (const meta of root.querySelectorAll('meta')) {
    const metaName = (meta.getAttribute('name') || '').toLowerCase()
    const metaProperty = (meta.getAttribute('property') || '').toLowerCase()
    if (!description && metaName === 'description') description = clean(meta.getAttribute('content') || '')
    if (!hasOg && metaProperty.startsWith('og:')) hasOg = true
  }

  const hasJsonLd = root
    .querySelectorAll('script')
    .some((script) => (script.getAttribute('type') || '').toLowerCase().includes('ld+json'))

  const canonicalLink = root.querySelectorAll('link').find((link) => {
    const relTokens = (link.getAttribute('rel') || '').toLowerCase().split(/\s+/)
    return relTokens.includes('canonical')
  })
  const hasCanonical = Boolean(canonicalLink)
  const canonicalUrl = canonicalLink ? canonicalLink.getAttribute('href') || '' : ''

  const htmlLang = clean(root.querySelector('html')?.getAttribute('lang') || '')

  // Require the imperative API (or our generated scaffold filename) — a bare "webmcp"
  // substring false-passed sites that merely had e.g. id="webmcp".
  const hasWebMcp = /(navigator|document)\.modelContext|modelContext\s*\.\s*registerTool|webmcp\.tools/i.test(home)

  const landmarks = ['main', 'nav', 'header', 'footer'].filter((tag) => root.querySelector(tag))
  const h1 = root.querySelectorAll('h1').length

  // Alt-text coverage: a page without images passes trivially; otherwise 80% is the bar.
  const images = root.querySelectorAll('img')
  const imgTotal = images.length
  const imgWithAlt = images.filter((img) => img.hasAttribute('alt')).length
  const altCoverage = imgTotal === 0 ? 1 : imgWithAlt / imgTotal
  const altPass = imgTotal === 0 || altCoverage >= 0.8

  const forms = extractForms(root)
  const actions = inferActions(root, forms)

  // Empty (not just null) means the crawl found/fetched nothing usable → still try home links.
  const hasCrawledPages = Array.isArray(pages) && pages.length
  const pageList = hasCrawledPages ? pages : pagesFromHome(root, base)

  return {
    base,
    title,
    description,
    htmlLang,
    hasCanonical,
    canonicalUrl,
    hasJsonLd,
    hasOg,
    hasWebMcp,
    landmarks,
    h1,
    imgTotal,
    imgWithAlt,
    altCoverage,
    altPass,
    hasLlmsTxt: !!(llmsTxt && llmsTxt.trim().length > 20),
    hasSitemap: !!sitemap,
    hasRobots: !!robots,
    forms,
    actions,
    pages: pageList
  }
}
|
|
666
|
+
|
|
667
|
+
// ---------------------------------------------------------------------------
|
|
668
|
+
// Scoring — 9 weighted checks, weights sum to 100
|
|
669
|
+
// ---------------------------------------------------------------------------
|
|
670
|
+
|
|
671
|
+
/**
 * Scores an analysis result against nine weighted checks (weights sum to 100).
 *
 * @param {object} a - Result of `analyze`.
 * @returns {{total: number, checks: Array<{id: string, label: string, weight: number, pass: *, fix: string}>}}
 *   `total` is the sum of the weights of the passing checks; each check carries
 *   the advice (`fix`) to show when it fails.
 */
export function score(a) {
  const altPct = Math.round((a.altCoverage ?? 1) * 100)
  const altDetail = a.imgTotal ? ` (${a.imgWithAlt}/${a.imgTotal}, ${altPct}%)` : ''

  // Keeps every check object in the same shape and key order.
  const check = (id, label, weight, pass, fix) => ({ id, label, weight, pass, fix })

  const checks = [
    check(
      'llms_txt',
      'llms.txt present',
      22,
      a.hasLlmsTxt,
      'Add /llms.txt so agents get a clean, token-cheap map of your site (generated below).'
    ),
    check(
      'webmcp',
      'WebMCP tools (document.modelContext)',
      22,
      a.hasWebMcp,
      'Expose your key actions as WebMCP tools so agents can ACT, not just scrape (scaffold generated below).'
    ),
    check(
      'structured',
      'Structured data (JSON-LD / OpenGraph)',
      13,
      a.hasJsonLd || a.hasOg,
      'Add schema.org JSON-LD and OpenGraph tags for machine-readable meaning (snippet generated below).'
    ),
    check(
      'semantics',
      'Semantic landmarks + an H1',
      13,
      a.landmarks.length >= 2 && a.h1 >= 1,
      'Use <main>/<nav>/<header> landmarks and at least one <h1> so agents can parse structure.'
    ),
    check(
      'metadata',
      'Title + meta description',
      10,
      !!(a.title && a.description),
      'Add a <title> and <meta name="description"> so an agent knows what the page is.'
    ),
    check(
      'discovery',
      'robots.txt + sitemap.xml',
      8,
      a.hasRobots && a.hasSitemap,
      'Add robots.txt and sitemap.xml for crawl + discovery.'
    ),
    check(
      'canonical',
      'Canonical URL (<link rel="canonical">)',
      4,
      a.hasCanonical,
      'Add <link rel="canonical" href="..."> so agents dedupe URLs to one authoritative address.'
    ),
    check(
      'lang',
      'Document language (<html lang>)',
      4,
      !!a.htmlLang,
      'Set <html lang="en"> (or your language) so agents know the content language.'
    ),
    check(
      'alt_text',
      `Image alt-text coverage${altDetail}`,
      4,
      a.altPass,
      'Add alt text to images (alt="..."), or alt="" for decorative ones, so agents understand visuals.'
    )
  ]

  let total = 0
  for (const item of checks) {
    if (item.pass) total += item.weight
  }
  return { total, checks }
}
|
|
741
|
+
|
|
742
|
+
// ---------------------------------------------------------------------------
|
|
743
|
+
// Generators
|
|
744
|
+
// ---------------------------------------------------------------------------
|
|
745
|
+
|
|
746
|
+
/**
 * Build the llms.txt index (Markdown) for an analysed site.
 *
 * Layout: an H1 with the site title, an optional blockquote description, a
 * "## Pages" list, an "## Actions" list built from the detected forms, and a
 * trailing generator comment. Text and links are passed through the file's
 * oneLine / mdText / safeHref helpers before being emitted.
 *
 * @param {object} a - Analysis result. Reads `title`, `base`, `description`,
 *   `pages` (entries with `url`/`path`/`title`/`summary`) and `forms`
 *   (entries with `action`, `method`, `inputs`). `pages` and `forms` may be absent.
 * @returns {string} The llms.txt content, terminated by a newline.
 */
export function buildLlmsTxt(a) {
  const lines = []
  lines.push(`# ${oneLine(a.title || a.base || 'Site', 120)}`)
  if (a.description) lines.push('', `> ${oneLine(a.description)}`)

  const pages = (a.pages || []).filter((p) => p && (p.url || p.path))
  if (pages.length) {
    // De-duplicate on the resolved link so the same page is listed only once.
    const seen = new Set()
    const rows = []
    for (const p of pages) {
      const link = safeHref(p.url || p.path, a.base)
      if (!link || seen.has(link)) continue
      seen.add(link)
      const label = mdText(p.title || p.path || link, 100)
      const summary = mdText(p.summary || '')
      rows.push(summary ? `- [${label}](${link}): ${summary}` : `- [${label}](${link})`)
    }
    if (rows.length) lines.push('', '## Pages', ...rows)
  }

  // Only real <form> endpoints go here — inferred (guessed) action endpoints would be
  // misleading in an index file; they live in the WebMCP scaffold (with TODO hedges).
  const acts = []
  // Tolerate a missing forms list the same way a missing pages list is tolerated above.
  for (const f of a.forms || []) {
    const url = safeHref(f.action, a.base)
    if (!url) continue
    const label = isSearchForm(f) ? 'Search' : `Submit ${lastSegment(f.action) || 'form'}`
    // Unnamed inputs are skipped (as propsFor does) so the list never shows blanks or "undefined".
    const fields = (f.inputs || [])
      .map((i) => i && i.name)
      .filter(Boolean)
      .join(', ')
    acts.push({ label, url, note: `${f.method} form${fields ? `, fields: ${fields}` : ''}` })
  }
  if (acts.length) {
    lines.push('', '## Actions')
    // De-duplicate on label + URL: the same form may have been collected more than once.
    const seen = new Set()
    for (const x of acts) {
      const key = `${x.label}|${x.url}`
      if (seen.has(key)) continue
      seen.add(key)
      lines.push(`- [${mdText(x.label, 80)}](${x.url}): ${mdText(x.note)}`)
    }
  }

  lines.push('', '<!-- Generated by agent-ready • https://github.com/VeldinS/agent-ready -->')
  return lines.join('\n') + '\n'
}
|
|
790
|
+
|
|
791
|
+
// Define `key` on `obj` as an OWN data property that is enumerable, writable and
// configurable. Going through defineProperty matters for keys such as "__proto__":
// a plain `obj[key] = value` would be routed to the prototype and the field would
// silently disappear from the object.
function setProp(obj, key, value) {
  const descriptor = { value, enumerable: true, writable: true, configurable: true }
  Object.defineProperty(obj, key, descriptor)
}
|
|
796
|
+
|
|
797
|
+
// Normalise the two shapes a tool's inputs can arrive in into { props, required }:
//  - an inputs[] array (from a form): each named input becomes a schema property
//    built from its schemaType, label/placeholder and options;
//  - anything else (inferred actions pass a ready-made props object): returned as-is
//    together with the caller-supplied `required` list.
function propsFor(inputsOrProps, required) {
  if (!Array.isArray(inputsOrProps)) {
    return { props: inputsOrProps || {}, required: required || [] }
  }

  const props = {}
  const requiredNames = []
  for (const input of inputsOrProps) {
    const { name, schemaType, label, placeholder, options } = input
    // Inputs without a name cannot be addressed as a property, so they are skipped.
    if (!name) continue

    const schema = { type: schemaType }
    const hint = label || placeholder
    if (hint) schema.description = oneLine(hint, 120)
    if (options && options.length) schema.enum = options

    // setProp keeps keys like "__proto__" as real own properties.
    setProp(props, name, schema)
    if (input.required) requiredNames.push(name)
  }
  return { props, required: requiredNames }
}
|
|
815
|
+
|
|
816
|
+
// Turn a detected <form> into a tool descriptor for the WebMCP scaffold.
//  f    - form record; reads `action`, `method` and `inputs`.
//  used - collection of already-taken tool names, handed to uniqName.
// Returns { name, description, props, required, readOnly, endpoint, method }.
// Search forms become a read-only GET "search-site" tool; every other form becomes
// a "submit-<last path segment>" tool using the form's own method (POST if unset).
function toolFromForm(f, used) {
  const search = isSearchForm(f)
  // Resolve the fallbacks once so the description always agrees with the endpoint
  // and method that are actually emitted (no "Submit the  form via undefined.").
  const endpoint = f.action || '/'
  const method = search ? 'GET' : f.method || 'POST'
  const baseName = search ? 'search-site' : `submit-${lastSegment(f.action) || 'form'}`
  const name = uniqName(kebab(baseName), used)
  const { props, required } = propsFor(f.inputs)
  return {
    name,
    description: search
      ? 'Search this site and return matching results.'
      : `Submit the ${endpoint} form via ${method}.`,
    props,
    required,
    readOnly: search,
    endpoint,
    method
  }
}
|
|
831
|
+
|
|
832
|
+
// Turn an inferred action into a tool descriptor with the same shape that
// toolFromForm produces: { name, description, props, required, readOnly, endpoint, method }.
//  ac   - action record; reads `name`, `description`, `props`, `required`,
//         `readOnly`, `endpoint` and `method`.
//  used - collection of already-taken tool names, handed to uniqName.
function toolFromAction(ac, used) {
  const toolName = uniqName(kebab(ac.name), used)
  const schema = propsFor(ac.props, ac.required)
  const tool = {
    name: toolName,
    description: ac.description,
    props: schema.props,
    required: schema.required,
    readOnly: ac.readOnly,
    endpoint: ac.endpoint
  }
  // Actions that do not state a method are treated as POST.
  tool.method = ac.method || 'POST'
  return tool
}
|
|
845
|
+
|
|
846
|
+
// Render one tool descriptor as the JavaScript source of an object literal for the
// generated WebMCP scaffold.
//  t - descriptor; reads `name`, `description`, `props`, `required`, `readOnly`,
//      `endpoint` and `method`.
// Returns the source text, lines joined with "\n".
// Values that end up inside code go through JSON.stringify; values that end up inside
// a generated `//` comment are flattened to a single line first.
function renderTool(t) {
  // A `//` comment ends at the first line terminator (LF, CR, U+2028, U+2029). Endpoint
  // and method can come from scraped markup, so a line break inside them would turn the
  // rest of the value into executable code in the generated file.
  const inComment = (value) => String(value).replace(/[\r\n\u2028\u2029]+/g, ' ')

  const L = []
  L.push(' {')
  L.push(` name: ${JSON.stringify(t.name)},`)
  L.push(` description: ${JSON.stringify(t.description)},`)
  // The input schema is only emitted when the tool has at least one property.
  if (t.props && Object.keys(t.props).length) {
    L.push(' inputSchema: {')
    L.push(' type: "object",')
    L.push(` properties: ${JSON.stringify(t.props)},`)
    L.push(` required: ${JSON.stringify(t.required || [])}`)
    L.push(' },')
  }
  if (t.readOnly) L.push(' annotations: { readOnlyHint: true },')
  L.push(' async execute(input) {')
  if (t.endpoint) {
    if (String(t.method).toUpperCase() === 'GET') {
      // GET: the input fields are sent as query-string parameters.
      L.push(` // TODO: confirm this maps to your real GET ${inComment(t.endpoint)} handler`)
      L.push(` const url = new URL(${JSON.stringify(t.endpoint)}, location.href);`)
      L.push(' for (const [k, v] of Object.entries(input || {})) url.searchParams.set(k, v);')
      L.push(' const res = await fetch(url, { method: "GET", headers: { accept: "application/json" } });')
    } else {
      // Any other method: the input is sent as a JSON body.
      L.push(` // TODO: confirm this maps to your real ${inComment(t.method)} ${inComment(t.endpoint)} handler`)
      L.push(` const res = await fetch(${JSON.stringify(t.endpoint)}, {`)
      L.push(` method: ${JSON.stringify(t.method)},`)
      L.push(' headers: { "content-type": "application/json" },')
      L.push(' body: JSON.stringify(input)')
      L.push(' });')
    }
    L.push(' return { content: [{ type: "text", text: await res.text() }] };')
  } else {
    // No endpoint known: emit a stub that reports the tool as not implemented.
    L.push(` // TODO: wire this tool to your on-page "${inComment(t.name)}" action.`)
    L.push(` return { content: [{ type: "text", text: ${JSON.stringify(`Not implemented: connect the ${t.name} tool to your handler.`)} }] };`)
  }
  L.push(' }')
  L.push(' }')
  return L.join('\n')
}
|
|
883
|
+
|
|
884
|
+
const EMPTY_TOOLS = ` // No <form> elements or obvious actions were detected on this page.
|
|
885
|
+
// Define your key actions as tools so agents can act on your site, e.g.:
|
|
886
|
+
// {
|
|
887
|
+
// name: "search-site",
|
|
888
|
+
// description: "Search this site and return matching results.",
|
|
889
|
+
// inputSchema: { type: "object", properties: { query: { type: "string" } }, required: ["query"] },
|
|
890
|
+
// annotations: { readOnlyHint: true },
|
|
891
|
+
// async execute(input) {
|
|
892
|
+
// const url = new URL("/search", location.href);
|
|
893
|
+
// url.searchParams.set("q", input.query);
|
|
894
|
+
// const res = await fetch(url);
|
|
895
|
+
// return { content: [{ type: "text", text: await res.text() }] };
|
|
896
|
+
// }
|
|
897
|
+
// }`
|
|
898
|
+
|
|
899
|
+
/**
 * Build the WebMCP tool scaffold (JavaScript source) for an analysed site.
 *
 * One tool is generated per detected form, followed by one per inferred action;
 * tool names are kept unique through a shared name set. When nothing was detected,
 * the array body is the commented-out example in EMPTY_TOOLS.
 *
 * @param {object} a - Analysis result. Reads `forms` and `actions`.
 * @returns {string} Source of the scaffold: the AGENT_READY_TOOLS array plus the
 *   registration loop that calls document.modelContext.registerTool for each tool.
 */
export function buildWebMcp(a) {
  const takenNames = new Set()
  // Forms first, then actions — the order decides which tool keeps a contested name.
  const descriptors = [
    ...a.forms.map((form) => toolFromForm(form, takenNames)),
    ...a.actions.map((action) => toolFromAction(action, takenNames))
  ]

  let body = EMPTY_TOOLS
  if (descriptors.length) {
    body = descriptors.map(renderTool).join(',\n')
  }

  return `// Generated by agent-ready — WebMCP tool scaffold.
// Spec: https://webmachinelearning.github.io/webmcp/ • https://github.com/webmachinelearning/webmcp
// Load this on your site OVER HTTPS so AI agents can ACT on it, not just scrape it.
// WebMCP exposes document.modelContext.registerTool() and requires a secure context.
const AGENT_READY_TOOLS = [
${body}
];

if (typeof document !== "undefined" && document.modelContext) {
const controller = new AbortController(); // abort controller.signal to unregister these tools
for (const tool of AGENT_READY_TOOLS) {
Promise.resolve(document.modelContext.registerTool(tool, { signal: controller.signal })).catch((err) =>
console.warn("agent-ready: could not register tool", tool.name, err)
);
}
}
`
}
|
|
924
|
+
|
|
925
|
+
/**
 * Build an HTML snippet with schema.org WebSite JSON-LD and Open Graph meta tags.
 *
 * Missing values fall back to placeholders: the name to `a.base` or "Your site",
 * the URL to `a.canonicalUrl` or "https://your-site.com", and the description to
 * a generic sentence.
 *
 * @param {object} a - Analysis result. Reads `title`, `base`, `canonicalUrl`, `description`.
 * @returns {string} The snippet, terminated by a newline.
 */
export function buildStructuredData(a) {
  const siteName = a.title || a.base || 'Your site'
  const siteUrl = a.base || a.canonicalUrl || 'https://your-site.com'
  const siteDescription = a.description || 'A short description of what this site is and does.'

  const jsonld = {
    '@context': 'https://schema.org',
    '@type': 'WebSite',
    name: siteName,
    url: siteUrl,
    description: siteDescription
  }
  // Escape "<" so a value containing "</script>" cannot break out of the inline JSON-LD block.
  const jsonldSafe = JSON.stringify(jsonld, null, 2).replaceAll('<', '\\u003c')

  const snippet = [
    '<!-- Generated by agent-ready — paste into <head>. Raises the "structured data" check. -->',
    '<script type="application/ld+json">',
    jsonldSafe,
    '</script>',
    `<meta property="og:title" content="${escHtml(oneLine(siteName, 120))}" />`,
    `<meta property="og:description" content="${escHtml(oneLine(siteDescription))}" />`,
    '<meta property="og:type" content="website" />',
    `<meta property="og:url" content="${escHtml(siteUrl)}" />`
  ]
  return snippet.join('\n') + '\n'
}
|
|
942
|
+
|
|
943
|
+
/**
 * Build the shields.io badge URL for a score.
 *
 * @param {number} total - Score shown as "<total>/100" on the badge.
 * @returns {string} Badge URL; brightgreen from 80, yellow from 50, red below that.
 */
export function badgeUrl(total) {
  let color = 'red'
  if (total >= 80) {
    color = 'brightgreen'
  } else if (total >= 50) {
    color = 'yellow'
  }
  // "--" is a literal dash in the label and "%2F" an encoded "/" in the message.
  return ['https://img.shields.io/badge/agent--ready', `${total}%2F100`, color].join('-')
}
|
|
947
|
+
|
|
948
|
+
/**
 * Write all generated artifacts into `outDir` (created recursively if needed):
 * llms.txt, webmcp.tools.js, structured-data.html and the agent-ready.json report.
 *
 * @param {string} outDir - Target directory.
 * @param {object} a - Analysis result; handed to the builders, and its `base`,
 *   `pages`, `forms`, `actions` and `imgTotal` are read for the report.
 * @param {object} sc - Score result; reads `total` and `checks`.
 * @returns {Promise<void>} Resolves once all four files have been written.
 */
export async function writeArtifacts(outDir, a, sc) {
  await mkdir(outDir, { recursive: true })

  // JSON report: score, badge, the outcome of each check (without fix text) and counts.
  const renderReport = () => {
    const report = {
      tool: 'agent-ready',
      version: '0.4.0',
      url: a.base,
      score: sc.total,
      badge: badgeUrl(sc.total),
      checks: sc.checks.map(({ id, label, weight, pass }) => ({ id, label, weight, pass })),
      counts: { pages: a.pages.length, forms: a.forms.length, actions: a.actions.length, images: a.imgTotal }
    }
    return JSON.stringify(report, null, 2) + '\n'
  }

  // Content is produced lazily, right before its write is started, one file after another.
  const artifacts = [
    ['llms.txt', () => buildLlmsTxt(a)],
    ['webmcp.tools.js', () => buildWebMcp(a)],
    ['structured-data.html', () => buildStructuredData(a)],
    ['agent-ready.json', renderReport]
  ]
  await Promise.all(artifacts.map(([file, render]) => writeFile(join(outDir, file), render())))
}
|