glippy-mcp 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.js CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env node
2
2
 
3
3
  /**
4
4
  * Glippy MCP Server
@@ -36,6 +36,27 @@ import {
36
36
  parseSitemapUrls,
37
37
  aggregatePageScores,
38
38
  } from "./geo-checker.js";
39
+ import { chromeFetch } from "./chrome-fetcher.js";
40
+
41
+ // Render-mode: how to fetch HTML for scoring.
42
+ // 'static' (default for tools that don't specify) - raw Node fetch, fastest
43
+ // 'chrome' - always render via headless Chrome
44
+ // 'auto' - static first, Chrome fallback on bot-block
45
+ //
46
+ // Chrome modes require a local Chrome/Chromium binary. Auto-resolves from
47
+ // CHROME_PATH / PUPPETEER_EXECUTABLE_PATH / common install locations, or
48
+ // attaches to an already-running Chrome when CHROME_REMOTE_URL is set
49
+ // (e.g. "http://localhost:9222" after launching Chrome with
50
+ // --remote-debugging-port=9222).
51
+ const RENDER_MODES = ["static", "chrome", "auto"];
52
+
53
+ function looksBotBlockedResponse(res) { // Heuristic: returns true when a fetch result looks missing or bot-blocked, false otherwise.
54
+ if (!res) return true; // no response object at all counts as blocked
55
+ if (res.statusCode == null) return true; // loose == on purpose: matches both null and undefined status codes
56
+ if ([401, 403, 407, 429, 503].includes(res.statusCode)) return true; // statuses this heuristic treats as a block signal
57
+ if (res.statusCode >= 200 && res.statusCode < 300 && !res.body) return true; // a 2xx with a falsy (e.g. empty) body is also treated as blocked
58
+ return false; // anything else is taken at face value, including other non-2xx statuses
59
+ }
39
60
 
40
61
  // ---------------------------------------------------------------------------
41
62
  // License validation
@@ -1597,10 +1618,15 @@ const server = new McpServer({
1597
1618
  server.tool(
1598
1619
  "analyze_domain",
1599
1620
  "Run a comprehensive GEO (Generative Engine Optimization) readiness analysis on a domain. " +
1600
- "Checks robots.txt, llms.txt (note: llms.txt is not currently supported by major AI models having one cannot hurt but is not a meaningful optimization), " +
1601
- "homepage HTML (10 scoring categories), sitemap.xml, and security headers. " +
1621
+ "Checks robots.txt, llms.txt (note: llms.txt is not currently supported by major AI models - having one cannot hurt but is not a meaningful optimization), " +
1622
+ "homepage HTML (16 scoring categories), sitemap.xml, and security headers. " +
1602
1623
  "Returns an overall weighted score (0-100) with per-category breakdowns and actionable recommendations. " +
1603
- "Use output_format='json' to get raw results that can be passed to export_report.",
1624
+ "The response includes a 'Trust Signal Evidence' section (htmlLang, hreflangs, nav/footer link anchor text + href). " +
1625
+ "IMPORTANT: the Entity & Authority heuristic for about/contact/legal/imprint/cookies uses a pattern fallback. " +
1626
+ "When the site is not in English (htmlLang != 'en' or hreflangs indicate otherwise), you (the calling LLM) should " +
1627
+ "inspect navLinks/footerLinks semantically and override the heuristic classification in your final explanation - " +
1628
+ "the anchor text tells you what each link is regardless of URL patterns. " +
1629
+ "Use output_format='json' to get the full raw result (includes homepage.analysis.evidence) for programmatic use.",
1604
1630
  {
1605
1631
  domain: z
1606
1632
  .string()
@@ -1616,6 +1642,14 @@ server.tool(
1616
1642
  .describe(
1617
1643
  "Maximum pages to crawl (1 = homepage only, up to 10 for multi-page analysis). Defaults to 10."
1618
1644
  ),
1645
+ render_mode: z
1646
+ .enum(RENDER_MODES)
1647
+ .optional()
1648
+ .describe(
1649
+ 'How to fetch HTML. "static" (fast, plain Node fetch) is default for this tool. ' +
1650
+ '"auto" tries static first and falls back to a local headless Chrome for bot-blocked responses (401/403/407/429/503 or empty 2xx). ' +
1651
+ '"chrome" always renders via Chrome. Chrome modes need a local Chrome binary (CHROME_PATH) or an attached instance (CHROME_REMOTE_URL).'
1652
+ ),
1619
1653
  output_format: z
1620
1654
  .enum(["text", "json"])
1621
1655
  .optional()
@@ -1624,10 +1658,11 @@ server.tool(
1624
1658
  '"json" returns the raw analysis result object that can be passed to export_report\'s analysis_result parameter.'
1625
1659
  ),
1626
1660
  },
1627
- withLicense(async ({ domain, max_pages, output_format }) => {
1661
+ withLicense(async ({ domain, max_pages, render_mode, output_format }) => {
1628
1662
  try {
1629
1663
  const result = await checkGEO(domain, {
1630
1664
  maxPages: max_pages ?? 10,
1665
+ renderMode: render_mode ?? "static",
1631
1666
  });
1632
1667
 
1633
1668
  if (result.error) {
@@ -1678,6 +1713,36 @@ server.tool(
1678
1713
  }
1679
1714
  }
1680
1715
  lines.push("");
1716
+
1717
+ // Trust Signal Evidence - language-agnostic raw data for LLM reclassification
1718
+ const ev = analysis.evidence;
1719
+ if (ev) {
1720
+ lines.push("## Trust Signal Evidence");
1721
+ lines.push(`htmlLang: ${ev.htmlLang || "(none)"}`);
1722
+ if (ev.hreflangs && ev.hreflangs.length > 0) {
1723
+ lines.push(`hreflangs: ${ev.hreflangs.slice(0, 20).join(", ")}${ev.hreflangs.length > 20 ? ` (+${ev.hreflangs.length - 20} more)` : ""}`);
1724
+ }
1725
+ const isNonEnglish = ev.htmlLang && !/^en(-|$)/i.test(ev.htmlLang);
1726
+ if (isNonEnglish) {
1727
+ lines.push(
1728
+ `NOTE: Site is in '${ev.htmlLang}'. If the Entity & Authority category flagged missing about/contact/legal/imprint/cookies links, ` +
1729
+ `verify against the footerLinks below - anchor text like 'Über uns', '会社概要', 'Impressum', 'Politique de confidentialité' count as trust signals ` +
1730
+ `regardless of URL patterns. Override the heuristic if links are clearly present.`
1731
+ );
1732
+ }
1733
+ const formatLink = (l) => ` - ${l.text || "(no text)"} -> ${l.href}`;
1734
+ if (ev.footerLinks && ev.footerLinks.length > 0) {
1735
+ lines.push(`footerLinks (${ev.footerLinks.length}):`);
1736
+ ev.footerLinks.slice(0, 40).forEach((l) => lines.push(formatLink(l)));
1737
+ if (ev.footerLinks.length > 40) lines.push(` ... (+${ev.footerLinks.length - 40} more)`);
1738
+ }
1739
+ if (ev.navLinks && ev.navLinks.length > 0) {
1740
+ lines.push(`navLinks (${ev.navLinks.length}):`);
1741
+ ev.navLinks.slice(0, 30).forEach((l) => lines.push(formatLink(l)));
1742
+ if (ev.navLinks.length > 30) lines.push(` ... (+${ev.navLinks.length - 30} more)`);
1743
+ }
1744
+ lines.push("");
1745
+ }
1681
1746
  }
1682
1747
 
1683
1748
  // robots.txt
@@ -1941,17 +2006,28 @@ server.tool(
1941
2006
  server.tool(
1942
2007
  "get_geo_summary",
1943
2008
  "Get a concise GEO readiness summary for a domain: overall score, grade, top 3 strengths, and top 3 issues to fix. " +
1944
- "Use this for a quick overview; use analyze_domain for full details.",
2009
+ "Use this for a quick overview; use analyze_domain for full details including the Trust Signal Evidence payload " +
2010
+ "(raw nav/footer links for LLM-driven semantic classification on non-English sites).",
1945
2011
  {
1946
2012
  domain: z
1947
2013
  .string()
1948
2014
  .describe(
1949
2015
  'The domain to check, e.g. "example.com". Do not include https:// prefix.'
1950
2016
  ),
2017
+ render_mode: z
2018
+ .enum(RENDER_MODES)
2019
+ .optional()
2020
+ .describe(
2021
+ 'How to fetch the homepage. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
2022
+ 'or "chrome" (always render via local headless Chrome).'
2023
+ ),
1951
2024
  },
1952
- withLicense(async ({ domain }) => {
2025
+ withLicense(async ({ domain, render_mode }) => {
1953
2026
  try {
1954
- const result = await checkGEO(domain, { maxPages: 1 });
2027
+ const result = await checkGEO(domain, {
2028
+ maxPages: 1,
2029
+ renderMode: render_mode ?? "static",
2030
+ });
1955
2031
 
1956
2032
  if (result.error) {
1957
2033
  return {
@@ -1987,6 +2063,15 @@ server.tool(
1987
2063
  lines.push(`# GEO Summary: ${result.domain}`);
1988
2064
  lines.push(`Overall Score: ${analysis.overallScore}% (${grade})`);
1989
2065
  lines.push(`Page Type: ${analysis.pageType}`);
2066
+ const evLang = analysis.evidence?.htmlLang;
2067
+ if (evLang) {
2068
+ lines.push(`Site Language: ${evLang}`);
2069
+ if (!/^en(-|$)/i.test(evLang)) {
2070
+ lines.push(
2071
+ `(Non-English site - use analyze_domain for the footerLinks evidence payload to reclassify trust signals semantically.)`
2072
+ );
2073
+ }
2074
+ }
1990
2075
  lines.push("");
1991
2076
 
1992
2077
  // Sort categories by score
@@ -2071,6 +2156,13 @@ server.tool(
2071
2156
  .describe(
2072
2157
  "Maximum pages to crawl per domain (1 = homepage only). Defaults to 10."
2073
2158
  ),
2159
+ render_mode: z
2160
+ .enum(RENDER_MODES)
2161
+ .optional()
2162
+ .describe(
2163
+ 'How to fetch HTML for each domain. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
2164
+ 'or "chrome" (always render via local headless Chrome).'
2165
+ ),
2074
2166
  output_format: z
2075
2167
  .enum(["text", "json"])
2076
2168
  .optional()
@@ -2082,13 +2174,14 @@ server.tool(
2082
2174
  withTierFeature(
2083
2175
  "compareDomains",
2084
2176
  "Domain comparison requires a Pro or Agency license.",
2085
- async ({ domains, max_pages, output_format }) => {
2177
+ async ({ domains, max_pages, render_mode, output_format }) => {
2086
2178
  const maxPages = max_pages ?? 10;
2179
+ const renderMode = render_mode ?? "static";
2087
2180
 
2088
2181
  // Run all analyses in parallel
2089
2182
  const results = await Promise.allSettled(
2090
2183
  domains.map((domain) =>
2091
- checkGEO(domain, { maxPages }).then((result) => ({
2184
+ checkGEO(domain, { maxPages, renderMode }).then((result) => ({
2092
2185
  domain,
2093
2186
  result,
2094
2187
  }))
@@ -2240,7 +2333,7 @@ const DEFAULT_RATE_LIMIT = parseInt(process.env.GLIPPY_RATE_LIMIT, 10) || 5;
2240
2333
  * @param {number} domainRateLimit - Max requests/second per domain (0 = unlimited)
2241
2334
  * @returns {Promise<{pageResults: object[], domainMeta: Map}>}
2242
2335
  */
2243
- async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE_LIMIT) {
2336
+ async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE_LIMIT, renderMode = "static") {
2244
2337
  // Group URLs by domain
2245
2338
  const domainMap = new Map(); // domain → [urls]
2246
2339
  for (const url of urls) {
@@ -2318,13 +2411,34 @@ async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE
2318
2411
  try {
2319
2412
  const pathname = new URL(url).pathname;
2320
2413
  const meta = domainMeta.get(domain);
2321
- const res = await throttledFetchUrl(url, 15000);
2414
+ let res;
2415
+ let rendered = "static";
2416
+ if (renderMode === "chrome") {
2417
+ res = await chromeFetch(url, 30000);
2418
+ rendered = "chrome";
2419
+ } else {
2420
+ res = await throttledFetchUrl(url, 15000);
2421
+ if (renderMode === "auto" && looksBotBlockedResponse(res)) {
2422
+ const chromeRes = await chromeFetch(url, 30000).catch(() => null);
2423
+ if (
2424
+ chromeRes &&
2425
+ typeof chromeRes.statusCode === "number" &&
2426
+ chromeRes.statusCode >= 200 &&
2427
+ chromeRes.statusCode < 300 &&
2428
+ chromeRes.body
2429
+ ) {
2430
+ res = chromeRes;
2431
+ rendered = "chrome-fallback";
2432
+ }
2433
+ }
2434
+ }
2322
2435
 
2323
- if (res.statusCode !== 200 || !res.body) {
2436
+ if (!res || res.statusCode == null || res.statusCode < 200 || res.statusCode >= 300 || !res.body) {
2324
2437
  return {
2325
2438
  url,
2326
2439
  analysis: null,
2327
- error: res.statusCode ? `HTTP ${res.statusCode}` : "Failed to fetch",
2440
+ error: res && res.statusCode ? `HTTP ${res.statusCode}` : "Failed to fetch",
2441
+ renderMode: rendered,
2328
2442
  };
2329
2443
  }
2330
2444
 
@@ -2337,7 +2451,7 @@ async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE
2337
2451
  pathname
2338
2452
  );
2339
2453
 
2340
- return { url, analysis, error: null };
2454
+ return { url, analysis, error: null, renderMode: rendered };
2341
2455
  } catch (err) {
2342
2456
  return { url, analysis: null, error: err.message };
2343
2457
  }
@@ -2453,6 +2567,13 @@ server.tool(
2453
2567
  "Defaults to 5 req/s (or GLIPPY_RATE_LIMIT env var). Set lower for polite crawling, higher if you control the target server. " +
2454
2568
  "Use 0.5 for 1 request every 2 seconds, 10 for aggressive crawling."
2455
2569
  ),
2570
+ render_mode: z
2571
+ .enum(RENDER_MODES)
2572
+ .optional()
2573
+ .describe(
2574
+ 'How to fetch each URL. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
2575
+ 'or "chrome" (always render via local headless Chrome).'
2576
+ ),
2456
2577
  output_format: z
2457
2578
  .enum(["text", "json", "summary"])
2458
2579
  .optional()
@@ -2480,7 +2601,7 @@ server.tool(
2480
2601
  "Recommended: 10-20 for detailed results to stay within output limits."
2481
2602
  ),
2482
2603
  },
2483
- withLicense(async ({ sitemap_url, max_urls, rate_limit, output_format, offset, limit }) => {
2604
+ withLicense(async ({ sitemap_url, max_urls, rate_limit, render_mode, output_format, offset, limit }) => {
2484
2605
  const features = getFeatures();
2485
2606
 
2486
2607
  // Check if sitemap analysis is available for this tier
@@ -2555,7 +2676,7 @@ server.tool(
2555
2676
 
2556
2677
  // Analyse all URLs with rate limiting
2557
2678
  const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
2558
- const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit);
2679
+ const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit, render_mode ?? "static");
2559
2680
  const aggregated = aggregatePageScores(pageResults);
2560
2681
 
2561
2682
  // Summary output mode - compact JSON with minimal page info (ideal for large sitemaps)
@@ -2665,6 +2786,13 @@ server.tool(
2665
2786
  "Defaults to 5 req/s (or GLIPPY_RATE_LIMIT env var). Set lower for polite crawling, higher if you control the target server. " +
2666
2787
  "Use 0.5 for 1 request every 2 seconds, 10 for aggressive crawling."
2667
2788
  ),
2789
+ render_mode: z
2790
+ .enum(RENDER_MODES)
2791
+ .optional()
2792
+ .describe(
2793
+ 'How to fetch each URL. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
2794
+ 'or "chrome" (always render via local headless Chrome).'
2795
+ ),
2668
2796
  output_format: z
2669
2797
  .enum(["text", "json", "summary"])
2670
2798
  .optional()
@@ -2692,7 +2820,7 @@ server.tool(
2692
2820
  "Recommended: 10-20 for detailed results to stay within output limits."
2693
2821
  ),
2694
2822
  },
2695
- withLicense(async ({ urls, rate_limit, output_format, offset, limit }) => {
2823
+ withLicense(async ({ urls, rate_limit, render_mode, output_format, offset, limit }) => {
2696
2824
  const features = getFeatures();
2697
2825
 
2698
2826
  // Check if batch analysis is available for this tier
@@ -2721,7 +2849,7 @@ server.tool(
2721
2849
 
2722
2850
  try {
2723
2851
  const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
2724
- const { pageResults } = await analyseUrls(urls, 3, rateLimit);
2852
+ const { pageResults } = await analyseUrls(urls, 3, rateLimit, render_mode ?? "static");
2725
2853
  const aggregated = aggregatePageScores(pageResults);
2726
2854
 
2727
2855
  // Summary output mode - compact JSON with minimal page info (ideal for large batches)
@@ -2834,6 +2962,13 @@ server.tool(
2834
2962
  "Maximum pages to crawl (1 = homepage only, up to 10 for multi-page analysis). Defaults to 10. " +
2835
2963
  "Ignored if analysis_result is provided."
2836
2964
  ),
2965
+ render_mode: z
2966
+ .enum(RENDER_MODES)
2967
+ .optional()
2968
+ .describe(
2969
+ 'How to fetch HTML. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
2970
+ 'or "chrome" (always render via local headless Chrome). Ignored if analysis_result is provided.'
2971
+ ),
2837
2972
  analysis_result: z
2838
2973
  .object({})
2839
2974
  .passthrough()
@@ -2844,7 +2979,7 @@ server.tool(
2844
2979
  "and export in multiple formats without redundant crawling."
2845
2980
  ),
2846
2981
  },
2847
- withLicense(async ({ domain, format, max_pages, analysis_result }) => {
2982
+ withLicense(async ({ domain, format, max_pages, render_mode, analysis_result }) => {
2848
2983
  try {
2849
2984
  let result;
2850
2985
 
@@ -2866,6 +3001,7 @@ server.tool(
2866
3001
  // Run fresh analysis (may use cache automatically)
2867
3002
  result = await checkGEO(domain, {
2868
3003
  maxPages: max_pages ?? 10,
3004
+ renderMode: render_mode ?? "static",
2869
3005
  });
2870
3006
  } else {
2871
3007
  return {
@@ -3003,11 +3139,19 @@ server.tool(
3003
3139
  .describe(
3004
3140
  "Max requests/second per domain for URL/sitemap modes. Defaults to 5. Ignored if analysis_results provided."
3005
3141
  ),
3142
+ render_mode: z
3143
+ .enum(RENDER_MODES)
3144
+ .optional()
3145
+ .describe(
3146
+ 'How to fetch HTML. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
3147
+ 'or "chrome" (always render via local headless Chrome). Ignored if analysis_results provided.'
3148
+ ),
3006
3149
  },
3007
3150
  withTierFeature(
3008
3151
  "bulkExport",
3009
3152
  "Bulk report exports require a Pro or Agency license.",
3010
- async ({ format, domains, urls, sitemap_url, analysis_results, max_pages, max_urls, rate_limit }) => {
3153
+ async ({ format, domains, urls, sitemap_url, analysis_results, max_pages, max_urls, rate_limit, render_mode }) => {
3154
+ const renderMode = render_mode ?? "static";
3011
3155
  // Validate: exactly one input mode
3012
3156
  const modes = [domains, urls, sitemap_url, analysis_results].filter(Boolean).length;
3013
3157
  if (modes !== 1) {
@@ -3116,7 +3260,7 @@ server.tool(
3116
3260
  const maxPages = max_pages ?? 10;
3117
3261
  const results = await Promise.allSettled(
3118
3262
  domains.map((domain) =>
3119
- checkGEO(domain, { maxPages }).then((result) => ({
3263
+ checkGEO(domain, { maxPages, renderMode }).then((result) => ({
3120
3264
  domain,
3121
3265
  result,
3122
3266
  }))
@@ -3173,7 +3317,7 @@ server.tool(
3173
3317
  // ------------------------------------------------------------------
3174
3318
  if (urls) {
3175
3319
  const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
3176
- const { pageResults } = await analyseUrls(urls, 3, rateLimit);
3320
+ const { pageResults } = await analyseUrls(urls, 3, rateLimit, renderMode);
3177
3321
  const aggregated = aggregatePageScores(pageResults);
3178
3322
  const title = `${urls.length} URLs`;
3179
3323
 
@@ -3239,7 +3383,7 @@ server.tool(
3239
3383
 
3240
3384
  const urlsToAnalyse = allUrls.slice(0, max_urls ?? 50000);
3241
3385
  const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
3242
- const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit);
3386
+ const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit, renderMode);
3243
3387
  const aggregated = aggregatePageScores(pageResults);
3244
3388
  const title = `Sitemap: ${sitemap_url} (${urlsToAnalyse.length} of ${allUrls.length} URLs)`;
3245
3389