glippy-mcp 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,6 +12,7 @@ It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`)
12
12
  - Full 16-category GEO analysis with weighted scoring
13
13
  - robots.txt AI crawler access detection
14
14
  - llms.txt file discovery and parsing
15
+ - **Agent-readiness discovery** - detects emerging agent standards (Content-Signal, llms-full.txt, MCP/A2A/Agent-Skills cards, schemamap, NLWeb, feed discovery)
15
16
  - Sitemap crawling and multi-page analysis
16
17
  - Domain comparison and competitive analysis
17
18
  - Export to styled Markdown or HTML reports
@@ -40,6 +41,7 @@ It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`)
40
41
  - [export_report](#export_report)
41
42
  - [export_bulk_report](#export_bulk_report)
42
43
  - [GEO Scoring Categories](#geo-scoring-categories)
44
+ - [Agent-Readiness Discovery](#agent-readiness-discovery)
43
45
  - [Rate Limiting](#rate-limiting)
44
46
  - [Output Formats](#output-formats)
45
47
  - [Chrome Rendering Fallback](#chrome-rendering-fallback)
@@ -234,6 +236,7 @@ Check which AI crawlers are blocked on example.com
234
236
  - AmazonBot
235
237
  - cohere-ai
236
238
  - Sitemap references found in robots.txt
239
+ - Content-Signal directive (`search` / `ai-input` / `ai-train` preferences), when present
237
240
 
238
241
  ---
239
242
 
@@ -470,11 +473,11 @@ The analysis evaluates 16 categories, each with a weight reflecting its importan
470
473
  | 3 | **Accessibility for Agents** | 1.0x | Lang attribute, alt text on images, ARIA labels, descriptive link text |
471
474
  | 4 | **Internal Linking** | 1.0x | Link density, navigation structure, breadcrumb markup |
472
475
  | 5 | **Meta & Discoverability** | 1.0x | Title, meta description, canonical URL, Open Graph tags, hreflang |
473
- | 6 | **Machine Readability** | 1.5x | SSR detection, bot blocking checks, robots.txt rules, llms.txt presence* |
476
+ | 6 | **Machine Readability** | 1.5x | SSR detection, bot blocking checks, robots.txt rules, llms.txt presence*, robots.txt Content-Signal directive, llms-full.txt, HTTP Link discovery headers, Markdown source endpoints, RSS/Atom/JSON feed discovery |
474
477
  | 7 | **Entity & Authority** | 1.0x | Author info, publication dates, organization schema, E-E-A-T signals, credentials, editorial policy, contact completeness |
475
478
  | 8 | **Citability & Answer-Readiness** | 1.3x | FAQ content, data tables, lists, lead paragraph quality |
476
479
  | 9 | **Performance & Crawlability** | 0.3x | Image dimensions, lazy loading, resource hints |
477
- | 10 | **Agent Interactivity** | 0.2x | WebMCP tools, form annotations, agent-callable actions |
480
+ | 10 | **Agent Interactivity** | 0.2x | WebMCP tools, form annotations, agent-callable actions, MCP server card (`/.well-known/mcp/server-card.json`), A2A agent card, Agent-Skills index, NLWeb endpoint, schemamap |
478
481
  | 11 | **Content Positioning** | 1.2x | Brand differentiation, proof points, social proof |
479
482
  | 12 | **Content Freshness** | 0.8x | Date signals, content age, temporal language |
480
483
  | 13 | **Information Density** | 1.0x | Substantive-to-filler ratio, section depth, claim-evidence pairing |
@@ -492,6 +495,29 @@ The analysis evaluates 16 categories, each with a weight reflecting its importan
492
495
 
493
496
  ---
494
497
 
498
+ ## Agent-Readiness Discovery
499
+
500
+ Alongside the established checks, the server probes a set of **emerging agent-readiness standards** (largely from [specification.website](https://specification.website)). These surfaces let agents discover and consume a site without scraping HTML.
501
+
502
+ These checks are **bonus-scored**: a site gets credit when a surface is present, but absence is reported as informational guidance rather than a penalty. This keeps the long tail of sites that have not adopted these new standards from being unfairly marked down, while still rewarding early adopters.
503
+
504
+ | Surface | Where it's checked | What it signals |
505
+ |---------|--------------------|-----------------|
506
+ | **Content-Signal** | robots.txt directive | Machine-readable AI usage preferences (`search` / `ai-input` / `ai-train`). Only `ai-input=no` affects AI answer visibility; `ai-train=no` is treated as a training-only preference with no citation impact. |
507
+ | **llms-full.txt** | `/llms-full.txt` | Concatenated Markdown corpus of the pages listed in llms.txt, for full-context ingestion. Very large files (>5 MB) are flagged. |
508
+ | **HTTP Link discovery** | response `Link` header | Resource discovery via headers (`rel="describedby"`, `api-catalog`, `sitemap`, `mcp`, `service-desc`, `nlweb`) without parsing HTML. |
509
+ | **Markdown source endpoint** | `<link rel="alternate" type="text/markdown">` or content negotiation | A clean `.md` version of each page for agent ingestion. |
510
+ | **Feed discovery** | `<link rel="alternate">` | RSS / Atom / JSON feeds as a machine-readable content stream. |
511
+ | **MCP server card** | `/.well-known/mcp/server-card.json` | Discoverable MCP server (name, version, transport, endpoint, tools). |
512
+ | **A2A agent card** | `/.well-known/agent-card.json` | Agent-to-agent discovery with declared skills. |
513
+ | **Agent-Skills index** | `/.well-known/agent-skills/index.json` | Reusable agent skills exposed with digests. |
514
+ | **NLWeb endpoint** | `<link rel="nlweb">` or `Link` header | Natural-language query endpoint (conventionally `/ask`). |
515
+ | **Schemamap** | `/schemamap.xml` or `<link rel="schemamap">` | Per-resource JSON-LD (`.jsonld`) endpoints for agent-friendly structured data. |
516
+
517
+ Content-Signal, HTTP Link discovery, Markdown source endpoints, llms-full.txt, and feed discovery contribute to the **Machine Readability** category; the MCP/A2A/Agent-Skills cards, NLWeb, and schemamap contribute to **Agent Interactivity**. The raw findings are also returned under an `agentReadiness` object in `output_format="json"` results.
518
+
519
+ ---
520
+
495
521
  ## Rate Limiting
496
522
 
497
523
  To prevent overwhelming target servers during batch operations, the MCP server enforces per-domain rate limiting:
@@ -689,6 +715,7 @@ research-mcp/
689
715
  - Homepage HTML (static fetch first, Chrome fallback if bot-blocked)
690
716
  - sitemap.xml
691
717
  - UCP profile (/.well-known/ucp)
718
+ - Agent-readiness discovery surfaces: /llms-full.txt, /.well-known/mcp/server-card.json, /.well-known/agent-card.json, /.well-known/agent-skills/index.json, /schemamap.xml
692
719
 
693
720
  2. **Parse HTML with cheerio** (server-side DOM)
694
721
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "glippy-mcp",
3
- "version": "0.3.3",
3
+ "version": "0.4.0",
4
4
  "description": "MCP server for GEO (Generative Engine Optimization) analysis — check any domain's AI-readiness",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -430,18 +430,29 @@ function analyseRobotsTxt(text) {
430
430
  blocksCrawlers: {},
431
431
  hasWildcardDisallow: false,
432
432
  sitemapUrls: [],
433
+ // Content-Signal directive (specification.website / Cloudflare content
434
+ // signals). Parsed into a map of lowercased key -> lowercased value, typically { search, ai-input, ai-train } -> 'yes'|'no'.
435
+ contentSignals: null,
433
436
  };
434
437
 
435
438
  if (!text) return result;
436
439
 
437
440
  const lines = text.split(/\r?\n/);
438
441
 
439
- // Collect sitemap references.
442
+ // Collect sitemap references and Content-Signal directives.
440
443
  for (const line of lines) {
441
444
  const sitemapMatch = line.match(/^\s*Sitemap\s*:\s*(.+)/i);
442
445
  if (sitemapMatch) {
443
446
  result.sitemapUrls.push(sitemapMatch[1].trim());
444
447
  }
448
+ const signalMatch = line.replace(/#.*$/, '').match(/^\s*Content-Signal\s*:\s*(.+)/i);
449
+ if (signalMatch) {
450
+ if (!result.contentSignals) result.contentSignals = {};
451
+ for (const pair of signalMatch[1].split(',')) {
452
+ const [k, v] = pair.split('=').map((s) => (s || '').trim().toLowerCase());
453
+ if (k && v) result.contentSignals[k] = v;
454
+ }
455
+ }
445
456
  }
446
457
 
447
458
  // Build a minimal per-user-agent rule map.
@@ -1922,11 +1933,21 @@ function checkMeta($, currentUrl) {
1922
1933
  // CHECK CATEGORY 6: Machine Readability
1923
1934
  // ---------------------------------------------------------------------------
1924
1935
 
1925
- function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders) {
1936
+ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders, agentReadiness = null) {
1926
1937
  const checks = [];
1927
1938
  let score = 0;
1928
1939
  let maxScore = 0;
1929
1940
 
1941
+ // Case-insensitive response header lookup (responseHeaders may use any casing).
1942
+ const getHeader = (name) => {
1943
+ if (!responseHeaders) return '';
1944
+ const lower = name.toLowerCase();
1945
+ for (const k of Object.keys(responseHeaders)) {
1946
+ if (k.toLowerCase() === lower) return String(responseHeaders[k] || '');
1947
+ }
1948
+ return '';
1949
+ };
1950
+
1930
1951
  // Content in initial HTML (SSR check)
1931
1952
  const mainEl = $('main, [role="main"], article');
1932
1953
  const textContent = (mainEl.length > 0 ? mainEl.first().text() : $('body').text() || '').trim();
@@ -2058,7 +2079,7 @@ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders)
2058
2079
 
2059
2080
  // X-Robots-Tag header
2060
2081
  if (responseHeaders) {
2061
- const xRobotsTag = responseHeaders['x-robots-tag'] || '';
2082
+ const xRobotsTag = getHeader('x-robots-tag');
2062
2083
  if (xRobotsTag) {
2063
2084
  if (xRobotsTag.includes('noindex')) {
2064
2085
  checks.push({ status: 'fail', label: 'X-Robots-Tag header: NOINDEX', detail: `"${xRobotsTag}" - page won't be indexed via header!` });
@@ -2070,6 +2091,92 @@ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders)
2070
2091
  }
2071
2092
  }
2072
2093
 
2094
+ // ── Agent-readiness discovery (emerging standards: bonus scoring, no penalty ──
2095
+ // for absence so the long tail of sites that haven't adopted them isn't punished).
2096
+
2097
+ // Content-Signal directive in robots.txt (search / ai-input / ai-train).
2098
+ if (robotsTxtData && robotsTxtData.contentSignals) {
2099
+ const cs = robotsTxtData.contentSignals;
2100
+ maxScore += 4;
2101
+ score += 4;
2102
+ const summary = Object.entries(cs).map(([k, v]) => `${k}=${v}`).join(', ');
2103
+ // Only ai-input=no hurts AI citation/answer visibility; ai-train=no is a
2104
+ // training-only preference (no citation impact), consistent with how Glippy
2105
+ // treats training vs citation crawlers elsewhere.
2106
+ if (cs['ai-input'] === 'no') {
2107
+ checks.push({ status: 'info', label: 'robots.txt Content-Signal present (restrictive)', detail: `Declares AI usage preferences: ${summary}. ai-input=no asks AI engines not to use the page for live answers.`, found: [summary] });
2108
+ } else {
2109
+ checks.push({ status: 'pass', label: 'robots.txt Content-Signal present', detail: `Machine-readable AI usage preferences: ${summary}`, found: [summary] });
2110
+ }
2111
+ } else if (robotsTxtData && robotsTxtData.exists) {
2112
+ checks.push({ status: 'info', label: 'No Content-Signal directive in robots.txt', detail: 'Add a Content-Signal line (e.g. "Content-Signal: search=yes, ai-input=yes, ai-train=no") to declare AI usage preferences (specification.website)' });
2113
+ }
2114
+
2115
+ // HTTP Link header discovery (rel=describedby / api-catalog / sitemap / mcp / service-desc / nlweb).
2116
+ {
2117
+ const linkHeader = getHeader('link');
2118
+ const agentRels = ['describedby', 'api-catalog', 'sitemap', 'mcp', 'service-desc', 'nlweb'];
2119
+ const foundRels = agentRels.filter((rel) => new RegExp(`rel\\s*=\\s*"?${rel}\\b`, 'i').test(linkHeader));
2120
+ if (foundRels.length > 0) {
2121
+ maxScore += 4;
2122
+ score += 4;
2123
+ checks.push({ status: 'pass', label: `HTTP Link header discovery: ${foundRels.join(', ')}`, detail: 'Agents can discover resources from response headers without parsing HTML', found: foundRels });
2124
+ } else {
2125
+ checks.push({ status: 'info', label: 'No agent-discovery HTTP Link headers', detail: 'Expose discovery via Link headers, e.g. Link: </llms.txt>; rel="describedby"; type="text/markdown" (specification.website)' });
2126
+ }
2127
+ }
2128
+
2129
+ // Per-page Markdown source endpoint (link rel=alternate type=text/markdown, or content negotiation).
2130
+ {
2131
+ const mdLink = $('link[rel="alternate"][type="text/markdown"]').attr('href');
2132
+ const vary = getHeader('vary').toLowerCase();
2133
+ const contentLocation = getHeader('content-location');
2134
+ const negotiated = vary.includes('accept') && /\.md(\?|$)/i.test(contentLocation);
2135
+ if (mdLink || negotiated) {
2136
+ maxScore += 4;
2137
+ score += 4;
2138
+ checks.push({ status: 'pass', label: 'Markdown source endpoint advertised', detail: mdLink ? `<link rel="alternate" type="text/markdown" href="${mdLink}">` : 'Served via content negotiation (Vary: Accept + Content-Location .md)', found: mdLink ? [mdLink] : undefined });
2139
+ } else {
2140
+ checks.push({ status: 'info', label: 'No Markdown source endpoint', detail: 'Serve a .md version of each page and advertise it with <link rel="alternate" type="text/markdown"> for clean agent ingestion (specification.website)' });
2141
+ }
2142
+ }
2143
+
2144
+ // llms-full.txt (concatenated markdown of the llms.txt pages).
2145
+ if (agentReadiness && agentReadiness.llmsFullTxt) {
2146
+ const lf = agentReadiness.llmsFullTxt;
2147
+ if (lf.exists && !lf.isHtml) {
2148
+ maxScore += 4;
2149
+ const tooBig = lf.sizeBytes > 5 * 1024 * 1024;
2150
+ if (tooBig) {
2151
+ score += 2;
2152
+ checks.push({ status: 'warn', label: 'llms-full.txt found but very large', detail: `${Math.round(lf.sizeBytes / 1024)} KB - over a couple of MB is suspect and may exceed agent context windows` });
2153
+ } else {
2154
+ score += 4;
2155
+ checks.push({ status: 'pass', label: 'llms-full.txt found', detail: `Concatenated markdown corpus for LLMs (${Math.round(lf.sizeBytes / 1024)} KB)` });
2156
+ }
2157
+ } else if (lf.exists && lf.isHtml) {
2158
+ maxScore += 4;
2159
+ checks.push({ status: 'warn', label: 'llms-full.txt served as HTML', detail: 'Serve llms-full.txt as text/markdown or text/plain, not HTML' });
2160
+ } else {
2161
+ checks.push({ status: 'info', label: 'No llms-full.txt found', detail: 'Add /llms-full.txt with the concatenated markdown of pages in llms.txt for full-context AI ingestion (specification.website)' });
2162
+ }
2163
+ }
2164
+
2165
+ // Feed discovery (RSS / Atom / JSON Feed) - machine-readable formats.
2166
+ {
2167
+ const feedSelectors = 'link[rel="alternate"][type="application/rss+xml"], link[rel="alternate"][type="application/atom+xml"], link[rel="alternate"][type="application/feed+json"], link[rel="alternate"][type="application/json"]';
2168
+ const feeds = $(feedSelectors);
2169
+ if (feeds.length > 0) {
2170
+ maxScore += 3;
2171
+ score += 3;
2172
+ const hrefs = [];
2173
+ feeds.each((_, el) => { const h = $(el).attr('href'); if (h) hrefs.push(h); });
2174
+ checks.push({ status: 'pass', label: `Feed discovery: ${feeds.length} feed(s)`, detail: 'RSS/Atom/JSON feeds give agents a machine-readable content stream', found: hrefs.slice(0, 5) });
2175
+ } else {
2176
+ checks.push({ status: 'info', label: 'No discoverable feed', detail: 'Advertise an RSS/Atom/JSON feed via <link rel="alternate"> for machine-readable content updates (specification.website)' });
2177
+ }
2178
+ }
2179
+
2073
2180
  return { checks, score: maxScore > 0 ? Math.round((score / maxScore) * 100) : 0, category: 'Machine Readability' };
2074
2181
  }
2075
2182
 
@@ -3217,11 +3324,21 @@ function checkPerformance($) {
3217
3324
  // (signing_keys, order webhook_url, etc. become required at this version).
3218
3325
  const LATEST_UCP_VERSION = '2026-04-08';
3219
3326
 
3220
- function checkWebMCP($, pageType, ucpData) {
3327
+ function checkWebMCP($, pageType, ucpData, responseHeaders = null, agentReadiness = null) {
3221
3328
  const checks = [];
3222
3329
  let score = 0;
3223
3330
  let maxScore = 0;
3224
3331
 
3332
+ // Case-insensitive response header lookup.
3333
+ const getHeaderWebMCP = (name) => {
3334
+ if (!responseHeaders) return '';
3335
+ const lower = name.toLowerCase();
3336
+ for (const k of Object.keys(responseHeaders)) {
3337
+ if (k.toLowerCase() === lower) return String(responseHeaders[k] || '');
3338
+ }
3339
+ return '';
3340
+ };
3341
+
3225
3342
  // ── CHECK 1: Declarative WebMCP Tool Detection (DOM-based) ──
3226
3343
  const webmcpForms = $('form[toolname]');
3227
3344
  const toolCount = webmcpForms.length;
@@ -3869,9 +3986,92 @@ function checkWebMCP($, pageType, ucpData) {
3869
3986
  checks.push({ status: 'info', label: 'Shopify-hosted: dual UCP surface expected', detail: 'Per-shop endpoint at /api/ucp/mcp; global catalog at https://discover.shopifyapps.com/global/mcp' });
3870
3987
  }
3871
3988
 
3989
+ // ══════════════════════════════════════════════════════
3990
+ // AGENT DISCOVERY SURFACES (specification.website Agent Readiness)
3991
+ // Emerging standards: bonus scoring (credit when present, info when absent).
3992
+ // ══════════════════════════════════════════════════════
3993
+ let hasDiscoverySurface = false;
3994
+ const ar = agentReadiness || {};
3995
+ const linkHeaderWebMCP = getHeaderWebMCP('link');
3996
+
3997
+ // MCP server discovery: /.well-known/mcp/server-card.json (+ Link rel="mcp").
3998
+ {
3999
+ const card = ar.mcpServerCard;
4000
+ const linkAdvertised = /rel\s*=\s*"?mcp\b/i.test(linkHeaderWebMCP);
4001
+ if (card && card.exists && card.valid) {
4002
+ hasDiscoverySurface = true;
4003
+ maxScore += 5; score += 5;
4004
+ checks.push({ status: 'pass', label: 'MCP server card found', detail: `/.well-known/mcp/server-card.json is published${linkAdvertised ? ' and advertised via Link header' : ''}` });
4005
+ } else if (card && card.exists) {
4006
+ maxScore += 5; score += 2;
4007
+ checks.push({ status: 'warn', label: 'MCP server card invalid JSON', detail: '/.well-known/mcp/server-card.json was reachable but did not parse as JSON' });
4008
+ } else {
4009
+ checks.push({ status: 'info', label: 'No MCP server card', detail: 'Publish /.well-known/mcp/server-card.json (name, version, transport, endpoint, tools) so agents can discover your MCP server (specification.website)' });
4010
+ }
4011
+ }
4012
+
4013
+ // A2A agent card: /.well-known/agent-card.json.
4014
+ {
4015
+ const card = ar.a2aAgentCard;
4016
+ if (card && card.exists && card.valid) {
4017
+ hasDiscoverySurface = true;
4018
+ maxScore += 4; score += 4;
4019
+ checks.push({ status: 'pass', label: 'A2A agent card found', detail: `/.well-known/agent-card.json is valid${card.skillsCount ? ` with ${card.skillsCount} skill(s)` : ''}` });
4020
+ } else if (card && card.exists) {
4021
+ maxScore += 4; score += 2;
4022
+ checks.push({ status: 'warn', label: 'A2A agent card incomplete', detail: 'agent-card.json is missing required fields (name, description, version) or skills' });
4023
+ } else {
4024
+ checks.push({ status: 'info', label: 'No A2A agent card', detail: 'Publish /.well-known/agent-card.json to let other agents discover and call your services (specification.website)' });
4025
+ }
4026
+ }
4027
+
4028
+ // Agent Skills discovery: /.well-known/agent-skills/index.json.
4029
+ {
4030
+ const sk = ar.agentSkills;
4031
+ if (sk && sk.exists && sk.valid && sk.schemaOk) {
4032
+ hasDiscoverySurface = true;
4033
+ maxScore += 4; score += 4;
4034
+ checks.push({ status: 'pass', label: 'Agent Skills index found', detail: `/.well-known/agent-skills/index.json published with ${sk.skillsCount} skill(s)` });
4035
+ } else if (sk && sk.exists) {
4036
+ maxScore += 4; score += 2;
4037
+ checks.push({ status: 'warn', label: 'Agent Skills index incomplete', detail: 'index.json should set $schema to the agentskills discovery schema and list skills with digests' });
4038
+ } else {
4039
+ checks.push({ status: 'info', label: 'No Agent Skills discovery', detail: 'Publish /.well-known/agent-skills/index.json to expose reusable agent skills (specification.website)' });
4040
+ }
4041
+ }
4042
+
4043
+ // NLWeb conversational endpoint (link rel="nlweb" or Link header).
4044
+ {
4045
+ const nlwebLink = $('link[rel="nlweb"]').attr('href');
4046
+ const nlwebHeader = /rel\s*=\s*"?nlweb\b/i.test(linkHeaderWebMCP);
4047
+ if (nlwebLink || nlwebHeader) {
4048
+ hasDiscoverySurface = true;
4049
+ maxScore += 3; score += 3;
4050
+ checks.push({ status: 'pass', label: 'NLWeb endpoint advertised', detail: nlwebLink ? `<link rel="nlweb" href="${nlwebLink}">` : 'Advertised via Link: rel="nlweb"' });
4051
+ } else {
4052
+ checks.push({ status: 'info', label: 'No NLWeb endpoint', detail: 'Expose a natural-language query endpoint (by convention /ask) and advertise it with <link rel="nlweb"> (specification.website)' });
4053
+ }
4054
+ }
4055
+
4056
+ // Schemamap: /schemamap.xml + per-resource JSON-LD endpoints (link rel="schemamap").
4057
+ {
4058
+ const schemamapLink = $('link[rel="schemamap"]').attr('href');
4059
+ const sm = ar.schemamap;
4060
+ if ((sm && sm.exists && sm.valid) || schemamapLink) {
4061
+ hasDiscoverySurface = true;
4062
+ maxScore += 3; score += 3;
4063
+ const detail = sm && sm.exists
4064
+ ? `/schemamap.xml published${sm.resourceCount ? ` with ${sm.resourceCount} resource(s)` : ''}`
4065
+ : `Advertised via <link rel="schemamap" href="${schemamapLink}">`;
4066
+ checks.push({ status: 'pass', label: 'Schemamap found', detail });
4067
+ } else {
4068
+ checks.push({ status: 'info', label: 'No schemamap', detail: 'Publish /schemamap.xml listing per-resource JSON-LD (.jsonld) endpoints for agent-friendly structured data (specification.website)' });
4069
+ }
4070
+ }
4071
+
3872
4072
  // Baseline credit for purely informational pages.
3873
- // If the page has no forms, no WebMCP signals, no UCP profile, and no Shopify
3874
- // surface, there's nothing for it to expose to agents - WebMCP/UCP are N/A here.
4073
+ // If the page has no forms, no WebMCP signals, no UCP profile, no discovery
4074
+ // surface, and no Shopify surface, there's nothing for it to expose to agents.
3875
4075
  // Without this, content-only pages are capped well below 100 even when there's
3876
4076
  // nothing to fix, dragging the overall score unfairly.
3877
4077
  const totalForms = $('form').length;
@@ -3884,7 +4084,8 @@ function checkWebMCP($, pageType, ucpData) {
3884
4084
  !webmcpSDKFound &&
3885
4085
  !hasSchemaActions &&
3886
4086
  !hasUcp &&
3887
- !hasShopify;
4087
+ !hasShopify &&
4088
+ !hasDiscoverySurface;
3888
4089
 
3889
4090
  if (hasNoInteractiveSurface) {
3890
4091
  checks.push({
@@ -4993,7 +5194,7 @@ function checkMultimodal($, jsonLdData) {
4993
5194
  * hasStructuredData: boolean
4994
5195
  * }}
4995
5196
  */
4996
- function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders, pathname = '/', ucpData = null) {
5197
+ function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders, pathname = '/', ucpData = null, agentReadiness = null) {
4997
5198
  const result = {
4998
5199
  pageType: 'generic',
4999
5200
  categories: [],
@@ -5085,11 +5286,11 @@ function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders,
5085
5286
  checkAccessibility($),
5086
5287
  checkInternalLinking($, domain),
5087
5288
  checkMeta($, currentUrl),
5088
- checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders),
5289
+ checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders, agentReadiness),
5089
5290
  checkEntity($, jsonLdData),
5090
5291
  checkCitability($),
5091
5292
  checkPerformance($),
5092
- checkWebMCP($, pageType, ucpData),
5293
+ checkWebMCP($, pageType, ucpData, responseHeaders, agentReadiness),
5093
5294
  checkContentPositioning($),
5094
5295
  checkContentFreshness($, jsonLdData),
5095
5296
  checkInformationDensity($),
@@ -5222,6 +5423,14 @@ async function checkGEO(domain, options = {}) {
5222
5423
  content: null,
5223
5424
  error: null,
5224
5425
  },
5426
+ // Agent-readiness discovery surfaces (specification.website Agent Readiness).
5427
+ agentReadiness: {
5428
+ llmsFullTxt: { exists: false, url: null, sizeBytes: 0, isHtml: false },
5429
+ mcpServerCard: { exists: false, url: null, valid: false },
5430
+ a2aAgentCard: { exists: false, url: null, valid: false, skillsCount: 0 },
5431
+ agentSkills: { exists: false, url: null, valid: false, schemaOk: false, skillsCount: 0 },
5432
+ schemamap: { exists: false, url: null, valid: false, resourceCount: 0 },
5433
+ },
5225
5434
  securityHeaders: {},
5226
5435
  // Multi-page crawl results
5227
5436
  multiPageCrawl: {
@@ -5266,6 +5475,12 @@ async function checkGEO(domain, options = {}) {
5266
5475
  const homepageUrl = `${baseUrl}/`;
5267
5476
  const sitemapUrl = `${baseUrl}/sitemap.xml`;
5268
5477
  const ucpUrl = `${baseUrl}/.well-known/ucp`;
5478
+ // Agent-readiness discovery resources (specification.website / Agent Readiness).
5479
+ const llmsFullUrl = `${baseUrl}/llms-full.txt`;
5480
+ const mcpCardUrl = `${baseUrl}/.well-known/mcp/server-card.json`;
5481
+ const agentCardUrl = `${baseUrl}/.well-known/agent-card.json`;
5482
+ const agentSkillsUrl = `${baseUrl}/.well-known/agent-skills/index.json`;
5483
+ const schemamapUrl = `${baseUrl}/schemamap.xml`;
5269
5484
 
5270
5485
  output.robotsTxt.url = robotsUrl;
5271
5486
  output.llmsTxt.url = llmsUrl;
@@ -5274,9 +5489,11 @@ async function checkGEO(domain, options = {}) {
5274
5489
  output.ucpProfile.url = ucpUrl;
5275
5490
 
5276
5491
  let robotsRes, llmsRes, homepageRes, sitemapRes, ucpRes;
5492
+ let llmsFullRes, mcpCardRes, agentCardRes, agentSkillsRes, schemamapRes;
5277
5493
 
5278
5494
  try {
5279
- [robotsRes, llmsRes, homepageRes, sitemapRes, ucpRes] = await Promise.all([
5495
+ [robotsRes, llmsRes, homepageRes, sitemapRes, ucpRes,
5496
+ llmsFullRes, mcpCardRes, agentCardRes, agentSkillsRes, schemamapRes] = await Promise.all([
5280
5497
  throttledFetchUrl(robotsUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5281
5498
  throttledFetchUrl(llmsUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5282
5499
  renderMode === 'chrome'
@@ -5284,6 +5501,11 @@ async function checkGEO(domain, options = {}) {
5284
5501
  : throttledFetchUrl(homepageUrl).catch(() => ({ body: null, statusCode: null, headers: {} })),
5285
5502
  throttledFetchUrl(sitemapUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5286
5503
  throttledFetchUrl(ucpUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5504
+ throttledFetchUrl(llmsFullUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5505
+ throttledFetchUrl(mcpCardUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5506
+ throttledFetchUrl(agentCardUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5507
+ throttledFetchUrl(agentSkillsUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5508
+ throttledFetchUrl(schemamapUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
5287
5509
  ]);
5288
5510
  } catch (err) {
5289
5511
  output.error = `Failed to fetch resources: ${err.message}`;
@@ -5324,6 +5546,7 @@ async function checkGEO(domain, options = {}) {
5324
5546
  output.robotsTxt.blocksCrawlers = analysis.blocksCrawlers;
5325
5547
  output.robotsTxt.hasWildcardDisallow = analysis.hasWildcardDisallow;
5326
5548
  output.robotsTxt.sitemapReferences = analysis.sitemapUrls;
5549
+ output.robotsTxt.contentSignals = analysis.contentSignals;
5327
5550
  }
5328
5551
  } catch (err) {
5329
5552
  output.robotsTxt.error = err.message;
@@ -5339,6 +5562,66 @@ async function checkGEO(domain, options = {}) {
5339
5562
  output.llmsTxt.error = err.message;
5340
5563
  }
5341
5564
 
5565
+ // --- Agent-readiness discovery surfaces ---
5566
+ try {
5567
+ const ar = output.agentReadiness;
5568
+
5569
+ // /llms-full.txt: concatenated markdown of the pages in llms.txt.
5570
+ ar.llmsFullTxt.url = llmsFullUrl;
5571
+ if (llmsFullRes.statusCode === 200 && llmsFullRes.body) {
5572
+ const body = llmsFullRes.body;
5573
+ const trimmed = body.trimStart().toLowerCase();
5574
+ ar.llmsFullTxt.exists = true;
5575
+ ar.llmsFullTxt.sizeBytes = Buffer.byteLength(body);
5576
+ ar.llmsFullTxt.isHtml = trimmed.startsWith('<!') || trimmed.startsWith('<html');
5577
+ }
5578
+
5579
+ // /.well-known/mcp/server-card.json: MCP server discovery.
5580
+ ar.mcpServerCard.url = mcpCardUrl;
5581
+ if (mcpCardRes.statusCode === 200 && mcpCardRes.body) {
5582
+ ar.mcpServerCard.exists = true;
5583
+ try { JSON.parse(mcpCardRes.body); ar.mcpServerCard.valid = true; } catch { /* invalid json */ }
5584
+ }
5585
+
5586
+ // /.well-known/agent-card.json: A2A agent card.
5587
+ ar.a2aAgentCard.url = agentCardUrl;
5588
+ if (agentCardRes.statusCode === 200 && agentCardRes.body) {
5589
+ ar.a2aAgentCard.exists = true;
5590
+ try {
5591
+ const card = JSON.parse(agentCardRes.body);
5592
+ const requiredOk = !!(card && card.name && card.description && (card.version || card.protocolVersion));
5593
+ ar.a2aAgentCard.valid = requiredOk;
5594
+ ar.a2aAgentCard.skillsCount = Array.isArray(card && card.skills) ? card.skills.length : 0;
5595
+ } catch { /* invalid json */ }
5596
+ }
5597
+
5598
+ // /.well-known/agent-skills/index.json: Agent Skills discovery.
5599
+ ar.agentSkills.url = agentSkillsUrl;
5600
+ if (agentSkillsRes.statusCode === 200 && agentSkillsRes.body) {
5601
+ ar.agentSkills.exists = true;
5602
+ try {
5603
+ const idx = JSON.parse(agentSkillsRes.body);
5604
+ ar.agentSkills.valid = true;
5605
+ ar.agentSkills.schemaOk = typeof idx.$schema === 'string' && idx.$schema.includes('agentskills');
5606
+ ar.agentSkills.skillsCount = Array.isArray(idx.skills) ? idx.skills.length : 0;
5607
+ } catch { /* invalid json */ }
5608
+ }
5609
+
5610
+ // /schemamap.xml: discoverable JSON-LD endpoints per resource.
5611
+ ar.schemamap.url = schemamapUrl;
5612
+ if (schemamapRes.statusCode === 200 && schemamapRes.body) {
5613
+ ar.schemamap.exists = true;
5614
+ const body = schemamapRes.body;
5615
+ if (body.includes('<schemamap') || body.includes('<resource')) {
5616
+ ar.schemamap.valid = true;
5617
+ const matches = body.match(/<resource[\s>]/g);
5618
+ ar.schemamap.resourceCount = matches ? matches.length : 0;
5619
+ }
5620
+ }
5621
+ } catch (err) {
5622
+ // Non-critical: leave defaults.
5623
+ }
5624
+
5342
5625
  // --- /.well-known/ucp ---
5343
5626
  try {
5344
5627
  if (ucpRes.statusCode === 200 && ucpRes.body) {
@@ -5410,6 +5693,7 @@ async function checkGEO(domain, options = {}) {
5410
5693
  homepageRes.headers || {},
5411
5694
  '/',
5412
5695
  output.ucpProfile,
5696
+ output.agentReadiness,
5413
5697
  );
5414
5698
  } else {
5415
5699
  output.homepage.error =
@@ -5511,6 +5795,7 @@ async function checkGEO(domain, options = {}) {
5511
5795
  res.headers || {},
5512
5796
  pathname,
5513
5797
  output.ucpProfile,
5798
+ output.agentReadiness,
5514
5799
  );
5515
5800
  return { url: pageUrl, analysis, error: null };
5516
5801
  }
package/src/index.js CHANGED
@@ -1609,7 +1609,7 @@ function bulkHTMLScript() {
1609
1609
 
1610
1610
  const server = new McpServer({
1611
1611
  name: "glippy-geo",
1612
- version: "0.3.3",
1612
+ version: "0.4.0",
1613
1613
  });
1614
1614
 
1615
1615
  // ---------------------------------------------------------------------------