agent-readiness 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/core.mjs ADDED
@@ -0,0 +1,971 @@
1
+ import { writeFile, mkdir, readFile } from 'node:fs/promises'
2
+ import { join } from 'node:path'
3
+ import { parse } from 'node-html-parser'
4
+
5
// User-Agent string sent as the `user-agent` header on every request made by tryFetch.
const UA = 'agent-ready/0.4 (+https://github.com/VeldinS/agent-ready)'
6
+
7
+ // ---------------------------------------------------------------------------
8
+ // Fetching / crawling
9
+ // ---------------------------------------------------------------------------
10
+
11
/**
 * GET `url` and resolve to the response body as text, or `null` on any failure.
 *
 * Returns `null` when the response is not `ok`, when the declared
 * `content-length` exceeds `maxBytes`, or when anything throws (network error,
 * timeout, caller abort). A body that grows past `maxBytes` while streaming is
 * cut off and the head read so far is returned.
 *
 * @param {string} url - Address to fetch.
 * @param {object} [options]
 * @param {number} [options.timeout=8000] - Per-request timeout in milliseconds.
 * @param {number} [options.maxBytes=5242880] - Hard cap on bytes read from the body.
 * @param {AbortSignal} [options.signal] - Optional caller signal; aborting it aborts this request too.
 * @returns {Promise<string|null>}
 */
async function tryFetch(url, { timeout = 8000, maxBytes = 5 * 1024 * 1024, signal } = {}) {
  const controller = new AbortController()
  const abortRequest = () => controller.abort()
  const timer = setTimeout(abortRequest, timeout)

  // Chain the caller's signal (e.g. a wall-clock deadline for the whole crawl)
  // onto the per-request controller.
  if (signal) {
    if (signal.aborted) {
      controller.abort()
    } else {
      signal.addEventListener('abort', abortRequest, { once: true })
    }
  }

  try {
    const response = await fetch(url, { headers: { 'user-agent': UA }, redirect: 'follow', signal: controller.signal })
    if (!response.ok) return null

    // Refuse a body that announces itself as oversized before reading any of it.
    const declaredLength = Number(response.headers.get('content-length'))
    if (Number.isFinite(declaredLength) && declaredLength > maxBytes) return null

    const stream = response.body
    if (!stream || typeof stream.getReader !== 'function') return await response.text()

    // Stream with a hard byte cap so a huge or endless body cannot exhaust memory.
    const reader = stream.getReader()
    const decoder = new TextDecoder('utf-8')
    let bytesSeen = 0
    let text = ''
    let chunk = await reader.read()
    while (!chunk.done) {
      bytesSeen += chunk.value.byteLength
      if (bytesSeen > maxBytes) {
        try {
          await reader.cancel()
        } catch {
          /* ignore */
        }
        controller.abort()
        break // keep the truncated head already decoded
      }
      text += decoder.decode(chunk.value, { stream: true })
      chunk = await reader.read()
    }
    text += decoder.decode()
    return text
  } catch {
    return null
  } finally {
    clearTimeout(timer)
    if (signal) signal.removeEventListener('abort', abortRequest)
  }
}
56
+
57
/**
 * Report whether `s` looks like an HTML document.
 *
 * A SPA / catch-all that answers 200 for every path serves index.html for
 * /llms.txt etc.; such HTML must not be counted as a real file.
 *
 * A leading byte-order mark, leading whitespace and leading HTML comments
 * (e.g. a build banner placed before the doctype) are skipped before looking
 * for `<!doctype html` or an `<html` tag.
 *
 * @param {string|null|undefined} s - Response body to inspect.
 * @returns {boolean}
 */
function looksLikeHtml(s) {
  const head = (s || '')
    // Written as an escape so the BOM is visible in source and survives editors.
    .replace(/^\uFEFF/, '')
    // Always matches (possibly empty), so it cannot backtrack into a failure.
    .replace(/^(?:\s*<!--[\s\S]*?-->)*\s*/, '')
  return /^(<!doctype html|<html[\s>])/i.test(head)
}
62
+
63
/**
 * Map `fn` over `items` with at most `limit` calls in flight at once
 * (keeps multi-page crawls polite and fast).
 *
 * Results are stored by index, so the output order matches `items` regardless
 * of completion order. A rejection from `fn` rejects the returned promise.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit - Maximum concurrent calls (at least one worker runs when items exist).
 * @param {(item: *, index: number) => *} fn - Task; may return a promise.
 * @returns {Promise<Array>}
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length)
  const workerCount = items.length === 0 ? 0 : Math.max(1, Math.min(limit, items.length))
  let cursor = 0

  // Each worker claims the next unclaimed index until none are left.
  const runWorker = async () => {
    while (cursor < items.length) {
      const index = cursor
      cursor += 1
      results[index] = await fn(items[index], index)
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => runWorker()))
  return results
}
77
+
78
/**
 * Decode XML character entities in `s`.
 *
 * Handles the five predefined XML entities plus decimal (`&#38;`) and
 * hexadecimal (`&#x26;`) character references. Decoding is done in a single
 * pass so already-decoded output is never decoded again: `&amp;lt;` becomes
 * the literal text `&lt;`, not `<`.
 *
 * @param {string} s - XML-escaped text.
 * @returns {string} The decoded text; unknown or out-of-range references are left untouched.
 */
function decodeXml(s) {
  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }
  return s.replace(/&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g, (match, name, dec, hex) => {
    if (name) return named[name]
    const code = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16)
    // String.fromCodePoint throws outside the Unicode range; keep such input verbatim.
    return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : match
  })
}
87
+
88
/**
 * Extract the contents of every `<loc>` element from sitemap XML.
 *
 * Plain entries are entity-decoded with decodeXml. Entries wrapped in
 * `<![CDATA[ ... ]]>` are taken verbatim (CDATA content is not entity-escaped)
 * instead of being dropped.
 *
 * @param {string} xml - Raw sitemap or sitemap-index document.
 * @returns {string[]} Locations in document order.
 */
function parseSitemapLocs(xml) {
  const locs = []
  // Group 1: CDATA payload (never runs past the first "]]>"); group 2: plain text.
  const locPattern = /<loc>\s*(?:<!\[CDATA\[((?:(?!\]\]>)[\s\S])*)\]\]>|([^<\s]+))\s*<\/loc>/gi
  for (const m of xml.matchAll(locPattern)) {
    const loc = m[1] !== undefined ? m[1].trim() : decodeXml(m[2].trim())
    if (loc) locs.push(loc)
  }
  return locs
}
93
+
94
// URLs whose path ends in a static-asset / feed / archive extension (optionally
// followed by a query or fragment) — these are not treated as content pages.
const NON_PAGE = /\.(jpg|jpeg|png|gif|svg|webp|ico|css|js|mjs|json|xml|pdf|zip|gz|rss|atom|mp4|mp3|woff2?|ttf)(\?|#|$)/i
// Query-parameter names (whole-name match) that cleanUrl strips as tracking noise.
const TRACKING = /^(ref|ref_|utm_[a-z]+|fbclid|gclid|mc_cid|mc_eid|igshid|_hsenc|_hsmi)$/i
// Titles that signal an error/permission page we should not list as real content.
// Note: the bare word "error" and any 400-409 number match anywhere in the title.
const ERROR_TITLE = /\b(40[0-9]|not found|page not found|permission error|access denied|forbidden|do not have permission|error)\b/i
98
+
99
/**
 * Report whether two absolute URLs share the same host (hostname plus port).
 *
 * @param {string} a - First URL.
 * @param {string} b - Second URL.
 * @returns {boolean} `false` when either value cannot be parsed as a URL.
 */
function sameHost(a, b) {
  let first
  let second
  try {
    first = new URL(a)
    second = new URL(b)
  } catch {
    return false
  }
  return first.host === second.host
}
106
+
107
/**
 * Normalise a URL by dropping its fragment and any tracking query parameters,
 * so ref/utm-tagged duplicates collapse to one entry.
 *
 * @param {string} raw - Absolute URL.
 * @returns {string} The cleaned, serialised URL, or `raw` unchanged when it cannot be parsed.
 */
function cleanUrl(raw) {
  let parsed
  try {
    parsed = new URL(raw)
  } catch {
    return raw
  }
  parsed.hash = ''
  const params = parsed.searchParams
  // Snapshot the keys first: deleting while iterating the live view skips entries.
  Array.from(params.keys())
    .filter((key) => TRACKING.test(key))
    .forEach((key) => params.delete(key))
  return parsed.toString()
}
118
+
119
/**
 * Collect candidate page URLs from a sitemap document.
 *
 * Bounded breadth-first walk: nested sitemap-index files (and CDN-hosted
 * children) still yield pages, without runaway crawling. At most
 * MAX_SITEMAP_FETCH child sitemaps are requested in total, and collection
 * stops once `cap` pages have been gathered.
 *
 * @param {string} base - Site origin; when set, only same-host pages are kept.
 * @param {string|null} sitemapRaw - Raw XML of the root sitemap (falsy → no pages).
 * @param {number} [cap=25] - Maximum number of page URLs to return.
 * @param {AbortSignal} [signal] - Forwarded to tryFetch for cancellation.
 * @returns {Promise<string[]>} Cleaned, de-duplicated page URLs in discovery order.
 */
async function collectSitemapUrls(base, sitemapRaw, cap = 25, signal) {
  if (!sitemapRaw) return []
  const seen = new Set()
  const out = []
  // Accept only http(s), non-asset, same-host URLs; de-duplicate after cleaning.
  const addPage = (u) => {
    if (!/^https?:\/\//i.test(u) || NON_PAGE.test(u)) return
    if (base && !sameHost(u, base)) return
    const c = cleanUrl(u)
    if (seen.has(c)) return
    seen.add(c)
    out.push(c)
  }
  const MAX_SITEMAP_FETCH = 5
  let fetches = 0
  // Queue holds raw XML documents still to be scanned, not URLs.
  const queue = [sitemapRaw]
  while (queue.length && out.length < cap) {
    const doc = queue.shift()
    if (/<sitemapindex/i.test(doc)) {
      // children are other sitemaps — may legitimately live on another host (CDN).
      // Only as many children as the remaining fetch budget allows are requested.
      const children = parseSitemapLocs(doc).slice(0, MAX_SITEMAP_FETCH - fetches)
      const raws = await mapLimit(children, 3, (u) => {
        fetches++
        return tryFetch(u, { timeout: 7000, signal })
      })
      // Failed fetches come back as null and are dropped.
      for (const r of raws) if (r) queue.push(r)
    } else {
      for (const u of parseSitemapLocs(doc)) {
        addPage(u)
        if (out.length >= cap) break
      }
    }
  }
  return out
}
155
+
156
/**
 * Pull the title, meta description and first H1 out of an HTML document.
 *
 * @param {string} html - Page markup.
 * @returns {{title: string, description: string, h1: string}} Whitespace-normalised
 *   values; every field is `''` when missing or when parsing throws.
 */
function extractPageMeta(html) {
  try {
    const doc = parse(html)
    const textOf = (selector) => clean(doc.querySelector(selector)?.text || '')

    const title = textOf('title')
    // First <meta name="description"> that has non-empty content wins.
    const description =
      doc
        .querySelectorAll('meta')
        .filter((meta) => (meta.getAttribute('name') || '').toLowerCase() === 'description')
        .map((meta) => clean(meta.getAttribute('content') || ''))
        .find((content) => content !== '') || ''
    const h1 = textOf('h1')

    return { title, description, h1 }
  } catch {
    return { title: '', description: '', h1: '' }
  }
}
173
+
174
/**
 * Return the path plus query string of `url`, resolving it against `base`
 * when a base is given.
 *
 * @param {string} url - Absolute or base-relative URL.
 * @param {string} [base] - Origin to resolve against; falsy means "none".
 * @returns {string} `pathname + search`, or `url` unchanged when it cannot be parsed.
 */
function pathOf(url, base) {
  let parsed
  try {
    parsed = new URL(url, base || undefined)
  } catch {
    return url
  }
  const pathname = parsed.pathname || '/'
  const query = parsed.search || ''
  return pathname + query
}
182
+
183
/**
 * Collect same-host links from the home page — the fallback page source when
 * there is no sitemap.
 *
 * Only absolute http(s) links on the same host, protocol-relative links
 * (`//host/path`) on the same host, and root-relative links (`/path`) are
 * kept. Document-relative links, fragments and mailto/tel/javascript links
 * are skipped.
 *
 * @param {string} home - Home page HTML.
 * @param {string} base - Site origin (no trailing slash).
 * @param {number} [cap=20] - Maximum number of links to return.
 * @returns {string[]} Cleaned, de-duplicated page URLs in document order.
 */
function sameOriginLinks(home, base, cap = 20) {
  let root
  try {
    root = parse(home || '')
  } catch {
    return []
  }
  const out = []
  const seen = new Set()
  for (const a of root.querySelectorAll('a[href]')) {
    const h = (a.getAttribute('href') || '').trim()
    if (!h || /^(#|mailto:|tel:|javascript:)/i.test(h)) continue

    // "//host/path" names another authority; resolve it to an absolute URL
    // first so it is host-checked instead of being glued onto `base` as a path.
    let candidate = h
    if (h.startsWith('//')) {
      try {
        candidate = new URL(h, base).href
      } catch {
        continue
      }
    }

    let url
    if (/^https?:\/\//i.test(candidate)) {
      if (!sameHost(candidate, base)) continue
      url = candidate
    } else if (candidate.startsWith('/')) {
      url = base + candidate
    } else {
      continue
    }
    url = cleanUrl(url)
    if (NON_PAGE.test(url) || seen.has(url) || url === base || url === base + '/') continue
    seen.add(url)
    out.push(url)
    if (out.length >= cap) break
  }
  return out
}
213
+
214
/**
 * Fetch each discovered page and derive a one-line summary for llms.txt.
 *
 * Pages that cannot be fetched, or whose title looks like an error page, are
 * left out of the result.
 *
 * @param {string[]} urls - Candidate page URLs.
 * @param {string} base - Site origin, used to compute each page's path.
 * @param {number} [fetchCap=18] - Only the first `fetchCap` URLs are fetched.
 * @param {AbortSignal} [signal] - Forwarded to tryFetch for cancellation.
 * @returns {Promise<Array<{url: string, path: string, title: string, summary: string}>>}
 */
async function fetchPageSummaries(urls, base, fetchCap = 18, signal) {
  const summarise = async (url) => {
    const html = await tryFetch(url, { timeout: 7000, signal })
    if (!html) return null

    const { title: pageTitle, description, h1 } = extractPageMeta(html)
    const path = pathOf(url, base)
    const title = oneLine(pageTitle || h1 || path)
    if (ERROR_TITLE.test(title)) return null // skip 404/permission/error pages

    return { url, path, title, summary: oneLine(description || h1 || '') }
  }

  const results = await mapLimit(urls.slice(0, fetchCap), 8, summarise)
  return results.filter((entry) => !!entry)
}
232
+
233
// True when `s` is non-empty and contains an XML prolog or a sitemap root element.
const isXmlSitemap = (s) => {
  if (!s) return false
  return /(<urlset|<sitemapindex|<\?xml)/i.test(s)
}
234
+
235
/**
 * Crawl a live site: fetch the target page, the well-known root files
 * (llms.txt, robots.txt, sitemap.xml) and a bounded set of content pages.
 *
 * @param {string} target - URL or bare host; `https://` is prepended when no http(s) scheme is present.
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - Cancels every request made during the crawl.
 * @returns {Promise<{base: string, home: string, llmsTxt: string|null, robots: string|null, sitemap: string|null, pages: Array}>}
 * @throws {TypeError} When `target` cannot be parsed as a URL.
 * @throws {Error} When the target page itself cannot be fetched.
 */
export async function crawl(target, { signal } = {}) {
  const u = new URL(/^https?:\/\//i.test(target) ? target : 'https://' + target)
  const base = u.origin // site-root files (llms/robots/sitemap) live at the ORIGIN, not the scanned path
  const pageUrl = u.href
  const home = await tryFetch(pageUrl, { signal })
  if (home == null) throw new Error(`Could not fetch ${pageUrl}`)
  // The three root files are independent, so they are requested in parallel.
  const [llmsRaw, robotsRaw, sitemapRaw] = await Promise.all([
    tryFetch(base + '/llms.txt', { signal }),
    tryFetch(base + '/robots.txt', { signal }),
    tryFetch(base + '/sitemap.xml', { signal })
  ])
  // Reject SPA catch-all HTML masquerading as these files so the score stays honest.
  const llmsTxt = looksLikeHtml(llmsRaw) ? null : llmsRaw
  const robots = looksLikeHtml(robotsRaw) ? null : robotsRaw
  let sitemap = isXmlSitemap(sitemapRaw) ? sitemapRaw : null
  // Honor a "Sitemap:" directive in robots.txt when the conventional path is absent.
  if (!sitemap && robots) {
    const declared = [...robots.matchAll(/^\s*sitemap:\s*(\S+)/gim)].map((m) => m[1])
    // Try at most the first three declared sitemaps, in order; the first XML hit wins.
    for (const sm of declared.slice(0, 3)) {
      const raw = await tryFetch(sm, { timeout: 7000, signal })
      if (isXmlSitemap(raw)) {
        sitemap = raw
        break
      }
    }
  }
  let urls = await collectSitemapUrls(base, sitemap, 25, signal)
  if (urls.length === 0) urls = sameOriginLinks(home, base) // no sitemap → fall back to home links
  const pages = await fetchPageSummaries(urls, base, 18, signal)
  return { base, home, llmsTxt, robots, sitemap, pages }
}
266
+
267
/**
 * Build a crawl result from a local HTML file instead of a live site.
 *
 * Only `home` is populated; there is no origin, so `base` is empty and the
 * root files and page list are all `null`.
 *
 * @param {string} path - Path of the HTML file to read as UTF-8.
 * @returns {Promise<{base: string, home: string, llmsTxt: null, robots: null, sitemap: null, pages: null}>}
 */
export async function crawlLocal(path) {
  const html = await readFile(path, 'utf8')
  const snapshot = {
    base: '',
    home: html,
    llmsTxt: null,
    robots: null,
    sitemap: null,
    pages: null
  }
  return snapshot
}
271
+
272
+ // ---------------------------------------------------------------------------
273
+ // Small string helpers
274
+ // ---------------------------------------------------------------------------
275
+
276
/**
 * Stringify a value, collapse every whitespace run to one space and trim both ends.
 *
 * @param {*} s - Any value; `null` and `undefined` become `''`.
 * @returns {string}
 */
function clean(s) {
  const text = s == null ? '' : String(s)
  // Splitting on whitespace runs and dropping empty edge pieces is the same
  // as collapsing runs and trimming.
  return text
    .split(/\s+/)
    .filter((word) => word !== '')
    .join(' ')
}
281
+
282
/**
 * Flatten a value to a single whitespace-normalised line of at most `max`
 * characters, ending in an ellipsis when it had to be cut.
 *
 * @param {*} s - Value to flatten.
 * @param {number} [max=200] - Maximum length, ellipsis included.
 * @returns {string}
 */
function oneLine(s, max = 200) {
  const text = clean(s)
  if (text.length <= max) return text
  // Reserve one character for the ellipsis and avoid a dangling space before it.
  const head = text.slice(0, max - 1).trimEnd()
  return `${head}…`
}
286
+
287
/**
 * Convert arbitrary text to a lowercase kebab-case identifier of at most 64
 * characters, with no leading or trailing dash.
 *
 * Trailing dashes are removed after truncation, because cutting at 64
 * characters can land right after a separator.
 *
 * @param {*} s - Source text.
 * @returns {string} The slug, or `'tool'` when nothing usable remains.
 */
function kebab(s) {
  const slug = String(s || '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+/, '')
    .slice(0, 64)
    .replace(/-+$/, '')
  return slug || 'tool'
}
295
+
296
/**
 * Return the final non-empty path segment of a URL or path, ignoring any
 * query string or fragment.
 *
 * @param {*} p - URL or path.
 * @returns {string} The last segment, or `''` when there is none.
 */
function lastSegment(p) {
  const [pathOnly] = String(p || '').split(/[?#]/)
  const segments = pathOnly.split('/').filter((segment) => segment !== '')
  return segments.length > 0 ? segments[segments.length - 1] : ''
}
303
+
304
/**
 * Reserve a name that is not yet in `used`, appending `-2`, `-3`, … to `base`
 * until a free one is found. The chosen name is added to `used`.
 *
 * @param {string} base - Preferred name.
 * @param {Set<string>} used - Names already taken; mutated.
 * @returns {string} The reserved name.
 */
function uniqName(base, used) {
  let candidate = base
  for (let suffix = 2; used.has(candidate); suffix += 1) {
    candidate = `${base}-${suffix}`
  }
  used.add(candidate)
  return candidate
}
311
+
312
/**
 * Make a URL absolute against the site origin where possible.
 *
 * - http(s) URLs are returned unchanged.
 * - Protocol-relative URLs (`//host/path`) take their scheme from `base`;
 *   they name another authority, so they are never appended to `base` as a path.
 * - Root-relative URLs (`/path`) are prefixed with `base`.
 * - Anything else, or any input when `base` is empty, is returned unchanged.
 *
 * @param {string} url - URL to absolutise.
 * @param {string} base - Site origin without a trailing slash; may be empty.
 * @returns {string} The absolutised URL (falsy input is returned as-is).
 */
function abs(url, base) {
  if (!url) return url
  if (/^https?:\/\//i.test(url)) return url
  if (base && url.startsWith('//')) {
    try {
      return new URL(url, base).href
    } catch {
      return url
    }
  }
  if (base && url.startsWith('/')) return base + url
  return url
}
318
+
319
/**
 * Produce a link that is safe to emit into llms.txt as a Markdown link
 * destination: absolutised, with dangerous schemes rejected.
 *
 * Surrounding whitespace is trimmed before resolution. Characters that would
 * end or corrupt a Markdown `(destination)` — parentheses, angle brackets and
 * any whitespace — are percent-encoded, so a crawled URL cannot break out of
 * the link.
 *
 * @param {string} url - Crawled URL or form action.
 * @param {string} base - Site origin used to absolutise root-relative URLs.
 * @returns {string|null} The safe link, or `null` when empty or using a rejected scheme.
 */
function safeHref(url, base) {
  const a = abs(typeof url === 'string' ? url.trim() : url, base)
  if (!a) return null
  if (/^\s*(javascript|data|vbscript|file):/i.test(a)) return null
  // encodeURIComponent leaves "(" and ")" alone, so those are mapped by hand.
  const fixed = { '(': '%28', ')': '%29', '<': '%3C', '>': '%3E' }
  return a.replace(/[()<>\s]/g, (ch) => fixed[ch] ?? encodeURIComponent(ch))
}
326
+
327
/**
 * Flatten crawled text to one line and backslash-escape the characters that
 * are significant inside a Markdown link, so a hostile page title such as
 * "x](http://evil)" cannot hijack the generated link.
 *
 * @param {*} s - Crawled title or summary.
 * @param {number} [max=200] - Length limit applied before escaping.
 * @returns {string}
 */
function mdText(s, max = 200) {
  const escapeChar = (ch) => `\\${ch}`
  const flattened = oneLine(s, max)
  return flattened.replace(/[[\]()\\`]/g, escapeChar)
}
332
+
333
/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute.
 *
 * Escapes `&`, `<`, `>`, `"` and `'`. The single quote is included so the
 * result is also safe inside single-quoted attribute values.
 *
 * @param {*} s - Any value; `null` and `undefined` become `''`.
 * @returns {string}
 */
function escHtml(s) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }
  // Single pass, so the "&" introduced by one replacement is never escaped again.
  return String(s == null ? '' : s).replace(/[&<>"']/g, (ch) => entities[ch])
}
340
+
341
+ // ---------------------------------------------------------------------------
342
+ // Extraction (analyze) — node-html-parser backed
343
+ // ---------------------------------------------------------------------------
344
+
345
// <input> types that extractForms leaves out of a form's field list.
const SKIP_INPUT_TYPES = new Set(['hidden', 'submit', 'button', 'image', 'reset'])
346
+
347
/**
 * Map an HTML control type to the type name used in generated input schemas.
 *
 * @param {*} rawType - Control type (case-insensitive); may be empty.
 * @returns {'number'|'boolean'|'string'} `number` for number/range, `boolean` for checkbox, otherwise `string`.
 */
function schemaType(rawType) {
  switch (String(rawType || '').toLowerCase()) {
    case 'number':
    case 'range':
      return 'number'
    case 'checkbox':
      return 'boolean'
    default:
      return 'string'
  }
}
353
+
354
/**
 * Find the visible label text for a form control.
 *
 * Looks first for a `<label for="…">` whose `for` equals the control's id,
 * then for a `<label>` wrapping the control. The id is compared as a plain
 * string rather than interpolated into a CSS selector, so ids containing
 * quotes or brackets cannot break the lookup. The two lookups are guarded
 * separately, so a failure in one does not skip the other.
 *
 * @param {object} root - Parsed document root.
 * @param {object} el - The control element.
 * @returns {string} Whitespace-normalised label text, or `''` when none is found.
 */
function findLabel(root, el) {
  try {
    const id = el.getAttribute('id')
    if (id) {
      const lbl = root.querySelectorAll('label').find((candidate) => candidate.getAttribute('for') === id)
      if (lbl) return clean(lbl.text)
    }
  } catch {
    /* best effort — fall through to the wrapping-label lookup */
  }
  try {
    const wrap = el.closest && el.closest('label')
    if (wrap) return clean(wrap.text)
  } catch {
    /* best effort — an unlabelled control is not an error */
  }
  return ''
}
368
+
369
/**
 * Describe every `<form>` in the document along with its named, user-facing fields.
 *
 * Controls without a `name`, and inputs whose type is in SKIP_INPUT_TYPES,
 * are left out. `options` is set only for `<select>` controls.
 *
 * @param {object} root - Parsed document root.
 * @returns {Array<{action: string, method: string, inputs: Array<object>}>}
 *   `action` defaults to `'/'`; `method` is upper-cased and defaults to `'GET'`.
 */
function extractForms(root) {
  // Returns a field descriptor, or null when the control should be skipped.
  const describeField = (el) => {
    const name = el.getAttribute('name')
    if (!name) return null

    const tag = (el.rawTagName || '').toLowerCase()
    let rawType
    if (tag === 'textarea') {
      rawType = 'textarea'
    } else if (tag === 'select') {
      rawType = 'select'
    } else {
      rawType = (el.getAttribute('type') || 'text').toLowerCase()
    }
    if (SKIP_INPUT_TYPES.has(rawType)) return null

    let options
    if (tag === 'select') {
      // An option without a value attribute falls back to its visible text.
      options = el
        .querySelectorAll('option')
        .map((option) => option.getAttribute('value') ?? clean(option.text))
        .filter((value) => value !== '' && value != null)
    }

    return {
      name,
      type: rawType,
      schemaType: schemaType(rawType),
      required: el.hasAttribute('required'),
      placeholder: el.getAttribute('placeholder') || '',
      label: findLabel(root, el),
      options
    }
  }

  return root.querySelectorAll('form').map((form) => {
    const action = form.getAttribute('action') || '/'
    const method = (form.getAttribute('method') || 'get').toUpperCase()
    const inputs = form
      .querySelectorAll('input, select, textarea')
      .map((el) => describeField(el))
      .filter((field) => field !== null)
    return { action, method, inputs }
  })
}
402
+
403
/**
 * Decide whether an extracted form is a site-search form.
 *
 * Only GET forms qualify. A form counts as search when its action mentions
 * "search", or when any field has type `search` or is named q / query / s / search.
 *
 * @param {{method: string, action: string, inputs: Array<{type: string, name: string}>}} f
 * @returns {boolean}
 */
function isSearchForm(f) {
  if (f.method !== 'GET') return false
  if (/search/i.test(f.action || '')) return true

  const searchNames = ['q', 'query', 's', 'search']
  for (const input of f.inputs) {
    if (input.type === 'search') return true
    if (searchNames.includes((input.name || '').toLowerCase())) return true
  }
  return false
}
408
+
409
// Intent table for inferring WebMCP tools from JS-driven buttons/links (no <form>).
// Each entry supplies:
//   intent      – key checked against intents already covered by real forms
//   name        – tool name emitted for the inferred action
//   test        – predicate run on the lower-cased button/link label
//   description – human-readable tool description
//   props       – input-schema properties for the tool
//   required    – names of required properties
//   readOnly    – whether the tool is flagged as read-only
//   method      – HTTP method used by the generated handler
// inferActions tries entries in array order and takes the first match per candidate.
const INTENTS = [
  {
    intent: 'search',
    name: 'search-site',
    test: (h) => /\bsearch\b/.test(h), // bare "find" matched "Find a store" etc. — too noisy
    description: 'Search this site and return matching results.',
    props: { query: { type: 'string', description: 'Search query' } },
    required: ['query'],
    readOnly: true,
    method: 'GET'
  },
  {
    intent: 'login',
    name: 'log-in',
    test: (h) => /\blog[\s-]?in\b|\bsign[\s-]?in\b|\bsign on\b/.test(h),
    description: 'Log in to an existing account.',
    props: { email: { type: 'string', description: 'Account email' }, password: { type: 'string', description: 'Account password' } },
    required: ['email', 'password'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'signup',
    name: 'sign-up',
    test: (h) => /\bsign[\s-]?up\b|\bregister\b|\bcreate (an )?account\b|\bget started\b|\bjoin free\b|\bstart (your )?free\b/.test(h),
    description: 'Create a new account.',
    props: { email: { type: 'string', description: 'Email address' }, password: { type: 'string', description: 'Chosen password' } },
    required: ['email'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'add-to-cart',
    name: 'add-to-cart',
    test: (h) => /\badd to (cart|bag|basket)\b|\bbuy now\b/.test(h),
    description: 'Add a product to the shopping cart.',
    props: { productId: { type: 'string', description: 'ID of the product to add' }, quantity: { type: 'number', description: 'Quantity', default: 1 } },
    required: ['productId'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'checkout',
    name: 'checkout',
    test: (h) => /\bcheckout\b|\bplace order\b|\bcomplete (order|purchase)\b|\bpay now\b/.test(h),
    description: 'Proceed to checkout and place the order.',
    props: {},
    required: [],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'contact',
    name: 'contact',
    test: (h) => /\bcontact( us)?\b|\bget in touch\b|\btalk to (us|sales)\b|\bbook a (demo|call)\b|\brequest a demo\b/.test(h),
    description: 'Send a contact or sales enquiry.',
    props: { name: { type: 'string' }, email: { type: 'string' }, message: { type: 'string', description: 'Your message' } },
    required: ['email', 'message'],
    readOnly: false,
    method: 'POST'
  },
  {
    intent: 'subscribe',
    name: 'subscribe',
    test: (h) => /\bsubscribe\b|\bnewsletter\b|\bmailing list\b|\bnotify me\b|\bstay (in the loop|updated)\b/.test(h),
    description: 'Subscribe an email address to the newsletter.',
    props: { email: { type: 'string', description: 'Email address to subscribe' } },
    required: ['email'],
    readOnly: false,
    method: 'POST'
  }
]
482
+
483
/**
 * Work out which intents are already covered by real `<form>` elements, so
 * inferActions does not invent duplicate tools for them.
 *
 * @param {Array<{action: string, method: string, inputs: Array<object>}>} forms - Output of extractForms.
 * @returns {Set<string>} Any of 'signup', 'login', 'search', 'subscribe'.
 */
function formIntentSet(forms) {
  const intents = new Set()
  for (const form of forms) {
    const { inputs } = form
    const passwordCount = inputs.filter((input) => input.type === 'password').length

    if (passwordCount > 0) {
      // A password form covers ONE auth intent — record the one it actually is,
      // so the complementary action (e.g. a separate "Sign up" link) stays inferable.
      const fieldText = inputs.map((i) => `${i.name} ${i.label} ${i.placeholder}`).join(' ')
      const hay = `${form.action} ${fieldText}`.toLowerCase()
      // Two password fields means "password + confirm", which is a sign-up form.
      const looksSignup = passwordCount >= 2 || /register|sign[\s-]?up|create[\s-]?account|create your account|\bjoin\b/.test(hay)
      intents.add(looksSignup ? 'signup' : 'login')
    }

    if (isSearchForm(form)) intents.add('search')

    // A lone email field (plus at most one other field) reads as a newsletter form.
    const emailCount = inputs.filter((input) => input.type === 'email').length
    if (passwordCount === 0 && emailCount === 1 && inputs.length <= 2) intents.add('subscribe')
  }
  return intents
}
500
+
501
/**
 * Report whether an element sits inside a `<form>`.
 *
 * @param {object} el - Element to test; must expose `closest` to ever return true.
 * @returns {boolean} `false` when there is no enclosing form, no `closest` method, or the lookup throws.
 */
function insideForm(el) {
  try {
    if (!el.closest) return false
    return Boolean(el.closest('form'))
  } catch {
    return false
  }
}
508
+
509
/**
 * Infer WebMCP tool candidates from links and buttons that live outside any
 * `<form>`, using the INTENTS table.
 *
 * At most 8 actions are returned, at most one per intent, and intents already
 * covered by real forms are skipped. Links are considered before buttons, in
 * document order, so the first matching element decides the endpoint.
 *
 * @param {object} root - Parsed document root.
 * @param {Array<object>} forms - Output of extractForms.
 * @returns {Array<{name: string, intent: string, description: string, props: object, required: string[], readOnly: boolean, endpoint: string|null, method: string}>}
 */
function inferActions(root, forms) {
  const covered = formIntentSet(forms)
  const candidates = []
  // Elements inside a <form> are already represented by that form's tool — skip them.
  for (const el of root.querySelectorAll('a[href]')) {
    if (insideForm(el)) continue
    candidates.push({
      text: clean(el.text),
      href: el.getAttribute('href'),
      aria: el.getAttribute('aria-label'),
      id: el.getAttribute('id')
    })
  }
  for (const el of root.querySelectorAll('button, input[type=submit], input[type=button], [role=button]')) {
    if (insideForm(el)) continue
    const tag = (el.rawTagName || '').toLowerCase()
    // An <input> button carries its caption in the value attribute, not in child text.
    const text = tag === 'input' ? el.getAttribute('value') || '' : clean(el.text)
    candidates.push({ text: clean(text), href: null, aria: el.getAttribute('aria-label'), id: el.getAttribute('id') })
  }

  const actions = []
  const usedIntents = new Set()
  for (const c of candidates) {
    if (actions.length >= 8) break
    // Match only on SHORT, button-like text + aria-label. Matching href/id (or long
    // headline text) turned article links like ".../instant-checkout" into fake tools.
    const label = clean([c.text, c.aria].filter(Boolean).join(' '))
    const words = label.split(/\s+/).filter(Boolean)
    if (!label || words.length > 5) continue
    const hay = label.toLowerCase()
    for (const intent of INTENTS) {
      if (covered.has(intent.intent) || usedIntents.has(intent.intent)) continue
      if (!intent.test(hay)) continue
      usedIntents.add(intent.intent)
      actions.push({
        name: intent.name,
        intent: intent.intent,
        description: intent.description,
        props: intent.props,
        required: intent.required,
        readOnly: intent.readOnly,
        // Only absolute http(s) or root-relative hrefs are kept as an endpoint.
        endpoint: c.href && /^(https?:\/\/|\/)/i.test(c.href) ? c.href : null,
        method: intent.method
      })
      // First matching intent wins for this candidate.
      break
    }
  }
  return actions
}
558
+
559
/**
 * Build a page list from the home page's own links — used when the crawl
 * produced no pages.
 *
 * Keeps absolute http(s) links (same host only when `base` is set),
 * protocol-relative links (`//host/path`, treated as absolute) and
 * root-relative links (`/path`). Fragments, mailto/tel/javascript links,
 * document-relative links and static assets are skipped. At most 25 pages
 * are returned.
 *
 * @param {object} root - Parsed home page.
 * @param {string} base - Site origin without a trailing slash; empty for local files.
 * @returns {Array<{url: string, path: string, title: string, summary: string}>}
 */
function pagesFromHome(root, base) {
  const pages = []
  const seen = new Set()
  for (const a of root.querySelectorAll('a[href]')) {
    let h = (a.getAttribute('href') || '').trim()
    if (!h || /^(#|mailto:|tel:|javascript:)/i.test(h)) continue

    // "//host/path" names another authority. Give it a scheme (the base's,
    // or https without a base) so it takes the absolute-URL branch and is
    // host-checked instead of being glued onto `base` as a path.
    if (h.startsWith('//')) {
      let scheme = 'https:'
      try {
        if (base) scheme = new URL(base).protocol
      } catch {
        /* unparseable base — keep https */
      }
      h = scheme + h
    }

    let url
    let path
    if (/^https?:\/\//i.test(h)) {
      if (base && !sameHost(h, base)) continue
      url = h
      path = pathOf(h, base)
    } else if (h.startsWith('/')) {
      url = base ? base + h : h
      path = h.split(/[?#]/)[0]
    } else {
      continue
    }
    if (NON_PAGE.test(url)) continue
    if (seen.has(url)) continue
    seen.add(url)
    // Fall back to the path when the link has no visible text.
    pages.push({ url, path, title: clean(a.text) || path, summary: '' })
    if (pages.length >= 25) break
  }
  return pages
}
585
+
586
/**
 * Analyse a crawl result and extract every signal the scorer and generators need.
 *
 * @param {object} crawl - Result of crawl() or crawlLocal().
 * @param {string} [crawl.base=''] - Site origin; empty for local files.
 * @param {string} [crawl.home=''] - Home page HTML.
 * @param {string|null} [crawl.llmsTxt=null] - Contents of /llms.txt, if found.
 * @param {string|null} [crawl.robots=null] - Contents of /robots.txt, if found.
 * @param {string|null} [crawl.sitemap=null] - Sitemap XML, if found.
 * @param {Array|null} [crawl.pages=null] - Page summaries gathered by the crawl.
 * @returns {object} Flat analysis record: metadata flags, landmark and image
 *   statistics, extracted forms, inferred actions and the page list.
 */
export function analyze({ base = '', home = '', llmsTxt = null, robots = null, sitemap = null, pages = null }) {
  let root
  try {
    root = parse(home || '')
  } catch {
    root = parse('')
  }

  const title = clean(root.querySelector('title')?.text || '')
  let description = ''
  let hasOg = false
  // One pass over <meta>: first non-empty description, and any og:* property.
  for (const m of root.querySelectorAll('meta')) {
    const name = (m.getAttribute('name') || '').toLowerCase()
    const prop = (m.getAttribute('property') || '').toLowerCase()
    if (!description && name === 'description') description = clean(m.getAttribute('content') || '')
    if (!hasOg && prop.startsWith('og:')) hasOg = true
  }

  let hasJsonLd = false
  for (const s of root.querySelectorAll('script')) {
    if ((s.getAttribute('type') || '').toLowerCase().includes('ld+json')) {
      hasJsonLd = true
      break
    }
  }

  let hasCanonical = false
  let canonicalUrl = ''
  for (const l of root.querySelectorAll('link')) {
    const rel = (l.getAttribute('rel') || '').toLowerCase()
    // rel is a space-separated token list; match the token, not a substring.
    if (rel.split(/\s+/).includes('canonical')) {
      hasCanonical = true
      canonicalUrl = l.getAttribute('href') || ''
      break
    }
  }

  const htmlLang = clean(root.querySelector('html')?.getAttribute('lang') || '')
  // Require the imperative API (or our generated scaffold filename) — a bare "webmcp"
  // substring false-passed sites that merely had e.g. id="webmcp".
  const hasWebMcp = /(navigator|document)\.modelContext|modelContext\s*\.\s*registerTool|webmcp\.tools/i.test(home)

  const landmarks = ['main', 'nav', 'header', 'footer'].filter((t) => root.querySelector(t))
  const h1 = root.querySelectorAll('h1').length

  const imgs = root.querySelectorAll('img')
  const imgTotal = imgs.length
  // An empty alt="" still counts: only the presence of the attribute is checked.
  const imgWithAlt = imgs.filter((i) => i.hasAttribute('alt')).length
  const altCoverage = imgTotal === 0 ? 1 : imgWithAlt / imgTotal
  const altPass = imgTotal === 0 || altCoverage >= 0.8

  const forms = extractForms(root)
  const actions = inferActions(root, forms)
  // Empty (not just null) means the crawl found/fetched nothing usable → still try home links.
  const pageList = Array.isArray(pages) && pages.length ? pages : pagesFromHome(root, base)

  return {
    base,
    title,
    description,
    htmlLang,
    hasCanonical,
    canonicalUrl,
    hasJsonLd,
    hasOg,
    hasWebMcp,
    landmarks,
    h1,
    imgTotal,
    imgWithAlt,
    altCoverage,
    altPass,
    // Content of 20 characters or fewer after trimming is not counted as a real llms.txt.
    hasLlmsTxt: !!(llmsTxt && llmsTxt.trim().length > 20),
    hasSitemap: !!sitemap,
    hasRobots: !!robots,
    forms,
    actions,
    pages: pageList
  }
}
666
+
667
+ // ---------------------------------------------------------------------------
668
+ // Scoring — 9 weighted checks, weights sum to 100
669
+ // ---------------------------------------------------------------------------
670
+
671
/**
 * Score an analysis record against nine weighted readiness checks.
 *
 * The weights add up to 100, so `total` is the sum of the weights of the
 * checks that pass.
 *
 * @param {object} a - Record produced by analyze().
 * @returns {{total: number, checks: Array<{id: string, label: string, weight: number, pass: *, fix: string}>}}
 */
export function score(a) {
  const altPct = Math.round((a.altCoverage ?? 1) * 100)
  // The coverage detail is shown only when the page has images at all.
  const altDetail = a.imgTotal ? ` (${a.imgWithAlt}/${a.imgTotal}, ${altPct}%)` : ''
  const check = (id, label, weight, pass, fix) => ({ id, label, weight, pass, fix })

  const checks = [
    check(
      'llms_txt',
      'llms.txt present',
      22,
      a.hasLlmsTxt,
      'Add /llms.txt so agents get a clean, token-cheap map of your site (generated below).'
    ),
    check(
      'webmcp',
      'WebMCP tools (document.modelContext)',
      22,
      a.hasWebMcp,
      'Expose your key actions as WebMCP tools so agents can ACT, not just scrape (scaffold generated below).'
    ),
    check(
      'structured',
      'Structured data (JSON-LD / OpenGraph)',
      13,
      a.hasJsonLd || a.hasOg,
      'Add schema.org JSON-LD and OpenGraph tags for machine-readable meaning (snippet generated below).'
    ),
    check(
      'semantics',
      'Semantic landmarks + an H1',
      13,
      a.landmarks.length >= 2 && a.h1 >= 1,
      'Use <main>/<nav>/<header> landmarks and at least one <h1> so agents can parse structure.'
    ),
    check(
      'metadata',
      'Title + meta description',
      10,
      !!(a.title && a.description),
      'Add a <title> and <meta name="description"> so an agent knows what the page is.'
    ),
    check(
      'discovery',
      'robots.txt + sitemap.xml',
      8,
      a.hasRobots && a.hasSitemap,
      'Add robots.txt and sitemap.xml for crawl + discovery.'
    ),
    check(
      'canonical',
      'Canonical URL (<link rel="canonical">)',
      4,
      a.hasCanonical,
      'Add <link rel="canonical" href="..."> so agents dedupe URLs to one authoritative address.'
    ),
    check(
      'lang',
      'Document language (<html lang>)',
      4,
      !!a.htmlLang,
      'Set <html lang="en"> (or your language) so agents know the content language.'
    ),
    check(
      'alt_text',
      `Image alt-text coverage${altDetail}`,
      4,
      a.altPass,
      'Add alt text to images (alt="..."), or alt="" for decorative ones, so agents understand visuals.'
    )
  ]

  let total = 0
  for (const item of checks) {
    if (item.pass) total += item.weight
  }
  return { total, checks }
}
741
+
742
+ // ---------------------------------------------------------------------------
743
+ // Generators
744
+ // ---------------------------------------------------------------------------
745
+
746
/**
 * Generate the contents of an llms.txt file from an analysis record.
 *
 * Layout: `# title`, an optional `> description`, a `## Pages` list, a
 * `## Actions` list built from real forms, and a trailing generator comment.
 * Both `a.pages` and `a.forms` may be missing; each is treated as empty.
 *
 * @param {object} a - Record produced by analyze().
 * @returns {string} Markdown text ending in a newline.
 */
export function buildLlmsTxt(a) {
  const lines = []
  lines.push(`# ${oneLine(a.title || a.base || 'Site', 120)}`)
  if (a.description) lines.push('', `> ${oneLine(a.description)}`)

  const pages = (a.pages || []).filter((p) => p && (p.url || p.path))
  if (pages.length) {
    const seen = new Set()
    const rows = []
    for (const p of pages) {
      const link = safeHref(p.url || p.path, a.base)
      if (!link || seen.has(link)) continue
      seen.add(link)
      const label = mdText(p.title || p.path || link, 100)
      const summary = mdText(p.summary || '')
      rows.push(summary ? `- [${label}](${link}): ${summary}` : `- [${label}](${link})`)
    }
    if (rows.length) lines.push('', '## Pages', ...rows)
  }

  // Only real <form> endpoints go here — inferred (guessed) action endpoints would be
  // misleading in an index file; they live in the WebMCP scaffold (with TODO hedges).
  const acts = []
  // Guarded like `a.pages` above, so a record without forms does not throw.
  for (const f of a.forms || []) {
    const url = safeHref(f.action, a.base)
    if (!url) continue
    const label = isSearchForm(f) ? 'Search' : `Submit ${lastSegment(f.action) || 'form'}`
    const fields = f.inputs.map((i) => i.name).join(', ')
    acts.push({ label, url, note: `${f.method} form${fields ? `, fields: ${fields}` : ''}` })
  }
  if (acts.length) {
    lines.push('', '## Actions')
    // Several forms can share a label and endpoint; list each pair once.
    const seen = new Set()
    for (const x of acts) {
      const key = `${x.label}|${x.url}`
      if (seen.has(key)) continue
      seen.add(key)
      lines.push(`- [${mdText(x.label, 80)}](${x.url}): ${mdText(x.note)}`)
    }
  }

  lines.push('', '<!-- Generated by agent-ready • https://github.com/VeldinS/agent-ready -->')
  return lines.join('\n') + '\n'
}
790
+
791
/**
 * Define `key` on `obj` as an OWN enumerable, writable, configurable property.
 *
 * Needed for keys such as "__proto__": plain bracket assignment would route
 * those to the prototype and silently drop the field.
 *
 * @param {object} obj - Target object; mutated.
 * @param {string} key - Property name.
 * @param {*} value - Property value.
 */
function setProp(obj, key, value) {
  const descriptor = { value, enumerable: true, writable: true, configurable: true }
  Object.defineProperty(obj, key, descriptor)
}
796
+
797
/**
 * Normalise either kind of tool input description into `{ props, required }`.
 *
 * Forms pass an `inputs[]` array, which is converted to schema properties.
 * Inferred actions pass a ready-made props object together with its
 * `required` list, which are returned as given.
 *
 * @param {Array<object>|object|null} inputsOrProps - Form inputs or a props object.
 * @param {string[]} [required] - Required names; used only with a props object.
 * @returns {{props: object, required: string[]}}
 */
function propsFor(inputsOrProps, required) {
  if (!Array.isArray(inputsOrProps)) {
    return { props: inputsOrProps || {}, required: required || [] }
  }

  const props = {}
  const req = []
  inputsOrProps
    .filter((input) => input.name)
    .forEach((input) => {
      const schema = { type: input.schemaType }
      const desc = input.label || input.placeholder
      if (desc) schema.description = oneLine(desc, 120)
      if (input.options && input.options.length) schema.enum = input.options
      // Field names come from crawled HTML, so define them as own properties.
      setProp(props, input.name, schema)
      if (input.required) req.push(input.name)
    })
  return { props, required: req }
}
815
+
816
/**
 * Turn an extracted form into a tool description for the WebMCP scaffold.
 *
 * Search forms become a read-only `search-site` GET tool; every other form
 * becomes `submit-<last path segment of the action>` using the form's method.
 *
 * @param {{action: string, method: string, inputs: Array<object>}} f - Extracted form.
 * @param {Set<string>} used - Tool names already taken; mutated.
 * @returns {{name: string, description: string, props: object, required: string[], readOnly: boolean, endpoint: string, method: string}}
 */
function toolFromForm(f, used) {
  const isSearch = isSearchForm(f)
  const stem = isSearch ? 'search-site' : `submit-${lastSegment(f.action) || 'form'}`
  const name = uniqName(kebab(stem), used)
  const schema = propsFor(f.inputs)

  let description
  let method
  if (isSearch) {
    description = 'Search this site and return matching results.'
    method = 'GET'
  } else {
    description = `Submit the ${f.action} form via ${f.method}.`
    method = f.method || 'POST'
  }

  return {
    name,
    description,
    props: schema.props,
    required: schema.required,
    readOnly: isSearch,
    endpoint: f.action || '/',
    method
  }
}
831
+
832
/**
 * Turn an inferred action into a tool description for the WebMCP scaffold.
 *
 * @param {{name: string, description: string, props: object, required: string[], readOnly: boolean, endpoint: string|null, method: string}} ac - Inferred action.
 * @param {Set<string>} used - Tool names already taken; mutated.
 * @returns {{name: string, description: string, props: object, required: string[], readOnly: boolean, endpoint: string|null, method: string}}
 *   `method` defaults to `'POST'` when the action has none.
 */
function toolFromAction(ac, used) {
  const toolName = uniqName(kebab(ac.name), used)
  const schema = propsFor(ac.props, ac.required)
  const tool = {
    name: toolName,
    description: ac.description,
    props: schema.props,
    required: schema.required,
    readOnly: ac.readOnly,
    endpoint: ac.endpoint,
    method: ac.method || 'POST'
  }
  return tool
}
845
+
846
/**
 * Render one tool description as JavaScript source for the WebMCP scaffold.
 *
 * Values placed in code position go through JSON.stringify. Values placed in
 * generated `//` comments come from crawled markup (form `action` / `method`),
 * so their line terminators are flattened first: otherwise a newline in a form
 * action would end the comment and inject the rest of the value as code.
 *
 * @param {{name: string, description: string, props: object, required: string[], readOnly: boolean, endpoint: string|null, method: string}} t - Tool description.
 * @returns {string} Source text of a single object literal, without a trailing comma.
 */
function renderTool(t) {
  // Every character that terminates a JavaScript line comment.
  const commentSafe = (value) => String(value).replace(/[\r\n\u2028\u2029]+/g, ' ')
  const L = []
  L.push(' {')
  L.push(` name: ${JSON.stringify(t.name)},`)
  L.push(` description: ${JSON.stringify(t.description)},`)
  if (t.props && Object.keys(t.props).length) {
    L.push(' inputSchema: {')
    L.push(' type: "object",')
    L.push(` properties: ${JSON.stringify(t.props)},`)
    L.push(` required: ${JSON.stringify(t.required || [])}`)
    L.push(' },')
  }
  if (t.readOnly) L.push(' annotations: { readOnlyHint: true },')
  L.push(' async execute(input) {')
  if (t.endpoint) {
    if (String(t.method).toUpperCase() === 'GET') {
      L.push(` // TODO: confirm this maps to your real GET ${commentSafe(t.endpoint)} handler`)
      L.push(` const url = new URL(${JSON.stringify(t.endpoint)}, location.href);`)
      L.push(' for (const [k, v] of Object.entries(input || {})) url.searchParams.set(k, v);')
      L.push(' const res = await fetch(url, { method: "GET", headers: { accept: "application/json" } });')
    } else {
      L.push(` // TODO: confirm this maps to your real ${commentSafe(t.method)} ${commentSafe(t.endpoint)} handler`)
      L.push(` const res = await fetch(${JSON.stringify(t.endpoint)}, {`)
      L.push(` method: ${JSON.stringify(t.method)},`)
      L.push(' headers: { "content-type": "application/json" },')
      L.push(' body: JSON.stringify(input)')
      L.push(' });')
    }
    L.push(' return { content: [{ type: "text", text: await res.text() }] };')
  } else {
    // No known endpoint: emit a stub the site owner has to wire up.
    L.push(` // TODO: wire this tool to your on-page "${commentSafe(t.name)}" action.`)
    L.push(` return { content: [{ type: "text", text: ${JSON.stringify(`Not implemented: connect the ${t.name} tool to your handler.`)} }] };`)
  }
  L.push(' }')
  L.push(' }')
  return L.join('\n')
}
883
+
884
// Placeholder body for the generated tools array, used by buildWebMcp when no
// tools were produced: a commented-out example tool for the site owner to adapt.
const EMPTY_TOOLS = [
  ' // No <form> elements or obvious actions were detected on this page.',
  '// Define your key actions as tools so agents can act on your site, e.g.:',
  '// {',
  '// name: "search-site",',
  '// description: "Search this site and return matching results.",',
  '// inputSchema: { type: "object", properties: { query: { type: "string" } }, required: ["query"] },',
  '// annotations: { readOnlyHint: true },',
  '// async execute(input) {',
  '// const url = new URL("/search", location.href);',
  '// url.searchParams.set("q", input.query);',
  '// const res = await fetch(url);',
  '// return { content: [{ type: "text", text: await res.text() }] };',
  '// }',
  '// }'
].join('\n')
898
+
899
/**
 * Build the text of the generated WebMCP scaffold script.
 *
 * Forms are converted first, then actions, sharing one set of used names.
 * The rendered tools are joined with `,\n`; when there are none, the
 * `EMPTY_TOOLS` placeholder is used instead.
 *
 * @param {object} a - Analysis result; `forms` and `actions` are iterated.
 * @returns {string} Source text of the scaffold script.
 */
export function buildWebMcp(a) {
  const takenNames = new Set()
  const descriptors = []
  const sources = [
    [a.forms, toolFromForm],
    [a.actions, toolFromAction]
  ]
  for (const [items, toTool] of sources) {
    for (const item of items) descriptors.push(toTool(item, takenNames))
  }

  let toolsSource = EMPTY_TOOLS
  if (descriptors.length) toolsSource = descriptors.map((d) => renderTool(d)).join(',\n')

  return `// Generated by agent-ready — WebMCP tool scaffold.
// Spec: https://webmachinelearning.github.io/webmcp/ • https://github.com/webmachinelearning/webmcp
// Load this on your site OVER HTTPS so AI agents can ACT on it, not just scrape it.
// WebMCP exposes document.modelContext.registerTool() and requires a secure context.
const AGENT_READY_TOOLS = [
${toolsSource}
];

if (typeof document !== "undefined" && document.modelContext) {
const controller = new AbortController(); // abort controller.signal to unregister these tools
for (const tool of AGENT_READY_TOOLS) {
Promise.resolve(document.modelContext.registerTool(tool, { signal: controller.signal })).catch((err) =>
console.warn("agent-ready: could not register tool", tool.name, err)
);
}
}
`
}
924
+
925
/**
 * Build an HTML snippet with a schema.org `WebSite` JSON-LD block followed by
 * four Open Graph meta tags.
 *
 * Missing values fall back to placeholders: the name to `a.base` and then
 * "Your site", the URL to `a.canonicalUrl` and then a sample address, and the
 * description to a sample sentence.
 *
 * @param {object} a - Analysis result; `title`, `base`, `canonicalUrl` and
 *   `description` are read.
 * @returns {string} HTML text ending with a newline.
 */
export function buildStructuredData(a) {
  const siteName = a.title || a.base || 'Your site'
  const siteUrl = a.base || a.canonicalUrl || 'https://your-site.com'
  const summary = a.description || 'A short description of what this site is and does.'

  const jsonld = {
    '@context': 'https://schema.org',
    '@type': 'WebSite',
    name: siteName,
    url: siteUrl,
    description: summary
  }
  // Every "<" is written as an escape sequence so a value containing "</script>"
  // cannot close the inline JSON-LD block.
  const jsonldSafe = JSON.stringify(jsonld, null, 2).split('<').join('\\u003c')

  const lines = [
    '<!-- Generated by agent-ready — paste into <head>. Raises the "structured data" check. -->',
    '<script type="application/ld+json">',
    jsonldSafe,
    '</script>',
    `<meta property="og:title" content="${escHtml(oneLine(siteName, 120))}" />`,
    `<meta property="og:description" content="${escHtml(oneLine(summary))}" />`,
    '<meta property="og:type" content="website" />',
    `<meta property="og:url" content="${escHtml(siteUrl)}" />`,
    ''
  ]
  return lines.join('\n')
}
942
+
943
/**
 * Build the shields.io badge URL for a score.
 *
 * The colour is `brightgreen` from 80 up, `yellow` from 50 up, and `red`
 * for anything else.
 *
 * @param {number} total - Score shown as `<total>/100` in the badge message.
 * @returns {string} Badge image URL.
 */
export function badgeUrl(total) {
  let color = 'red'
  if (total >= 80) {
    color = 'brightgreen'
  } else if (total >= 50) {
    color = 'yellow'
  }
  return `https://img.shields.io/badge/agent--ready-${total}%2F100-${color}`
}
947
+
948
/**
 * Write the four generated artifacts into `outDir`: `llms.txt`,
 * `webmcp.tools.js`, `structured-data.html` and `agent-ready.json`.
 *
 * All file contents are rendered before the directory is created or any write
 * starts. If a builder throws, the returned promise rejects with nothing
 * written and no write left running unawaited.
 *
 * @param {string} outDir - Target directory; created recursively if missing.
 * @param {object} a - Analysis result passed to the builders; `base`, `pages`,
 *   `forms`, `actions` and `imgTotal` are also read for the JSON summary.
 * @param {object} sc - Score result; `total` and `checks` are read.
 * @returns {Promise<void>} Resolves once every file has been written.
 */
export async function writeArtifacts(outDir, a, sc) {
  const files = [
    ['llms.txt', buildLlmsTxt(a)],
    ['webmcp.tools.js', buildWebMcp(a)],
    ['structured-data.html', buildStructuredData(a)],
    [
      'agent-ready.json',
      `${JSON.stringify(
        {
          tool: 'agent-ready',
          version: '0.4.0',
          url: a.base,
          score: sc.total,
          badge: badgeUrl(sc.total),
          // Only the stable, serialisable fields of each check are kept in the summary.
          checks: sc.checks.map((c) => ({ id: c.id, label: c.label, weight: c.weight, pass: c.pass })),
          counts: { pages: a.pages.length, forms: a.forms.length, actions: a.actions.length, images: a.imgTotal }
        },
        null,
        2
      )}\n`
    ]
  ]

  await mkdir(outDir, { recursive: true })
  await Promise.all(files.map(([fileName, contents]) => writeFile(join(outDir, fileName), contents)))
}