npm - glippy-mcp - Versions diffs - 0.3.0 → 0.3.2 - Mend

glippy-mcp 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +1 -1
package/src/geo-checker.js +479 -65

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "glippy-mcp",
-  "version": "0.3.0",
+  "version": "0.3.2",
   "description": "MCP server for GEO (Generative Engine Optimization) analysis — check any domain's AI-readiness",
   "main": "src/index.js",
   "type": "module",

package/src/geo-checker.js CHANGED Viewed

@@ -31,22 +31,105 @@ function looksBotBlocked(res) {
 const FETCH_TIMEOUT_MS = 15_000;
 /**
- * Known AI crawler user-agent tokens.  These are checked against robots.txt
- * Disallow / Allow rules.
+ * Training-only crawlers. Blocking these is informational: it keeps content
+ * out of LLM training corpora but does not affect AI citation surfaces.
  */
-const AI_CRAWLERS = Object.freeze([
+const TRAINING_CRAWLERS = Object.freeze([
   'GPTBot',
-  'Google-Extended',
-  'CCBot',
-  'anthropic-ai',
   'ClaudeBot',
+  'anthropic-ai',
+  'CCBot',
+  'Google-Extended',
+  'Applebot-Extended',
   'Bytespider',
-  'PerplexityBot',
-  'ChatGPT-User',
-  'AmazonBot',
+  'FacebookBot',
+  'Meta-ExternalAgent',
   'cohere-ai',
+  'Diffbot',
+  'Omgili',
+  'Amazonbot',
+  'Timpibot',
+  'ImageSiftBot',
+  // Broadened: SEO/search/training crawlers commonly named in robots.txt.
+  'PetalBot',
+  'MJ12bot',
+  'AwarioBot',
+  'AhrefsBot',
+  'SemrushBot',
+  'DotBot',
+  'SeznamBot',
+  'magpie-crawler',
+  'DataForSeoBot',
+  'iaskbot',
+  'Pangu_Bot',
+  'claude-web',
+  'cohere-training-data-crawler',
+  'meta-externalfetcher',
 ]);
+/**
+ * Citation/retrieval crawlers. Blocking these directly hurts AI visibility
+ * because answer engines cannot fetch content for inline citation.
+ */
+const CITATION_CRAWLERS = Object.freeze([
+  'OAI-SearchBot',
+  'ChatGPT-User',
+  'PerplexityBot',
+  'Perplexity-User',
+  'Applebot',
+  'Bingbot',
+  'Googlebot',
+  'DuckDuckBot',
+  'YouBot',
+  // Broadened: alternative answer engines and search crawlers.
+  'MistralAI-User',
+  'PhindBot',
+  'Komo',
+  'AndiBot',
+  'BraveBot',
+  'KagiBot',
+  'Yep',
+  'NeevaBot',
+  'Exabot',
+  'Qwantify',
+  'Seznam',
+  'GoogleOther',
+  'Google-CloudVertexBot',
+  'BingPreview',
+]);
+/**
+ * Match a User-Agent token against a list of known crawlers, ignoring case.
+ *
+ * A crawler name matches when the lowercased token contains the lowercased
+ * name anywhere (the equality and prefix tests below are special cases of the
+ * substring test, so this is a substring match rather than a strict prefix
+ * match). Among all matching names the longest one wins; on a length tie the
+ * name that appears first in `crawlers` is kept. Preferring the longest name
+ * means that, when both "applebot" and "applebot-extended" are present in
+ * `crawlers`, the token "applebot-extended" resolves to "applebot-extended"
+ * rather than to the shorter "applebot".
+ *
+ * @param {string} ua - User-Agent token from robots.txt or meta tag.
+ * @param {readonly string[]} crawlers - Crawler list to match against.
+ * @returns {string|null} - The matched crawler name (original casing), or
+ *   null when `ua` is falsy or no crawler name matches.
+ */
+function matchCrawler(ua, crawlers) {
+  if (!ua) return null;
+  const lower = ua.toLowerCase();
+  let bestMatch = null;
+  let bestLen = 0;
+  for (const c of crawlers) {
+    const cl = c.toLowerCase();
+    if (lower === cl || lower.startsWith(cl) || lower.includes(cl)) {
+      if (cl.length > bestLen) {
+        bestLen = cl.length;
+        bestMatch = c;
+      }
+    }
+  }
+  return bestMatch;
+}
+/**
+ * Combined AI crawler list, kept for back-compat with downstream callers
+ * that iterate the union (e.g. robots.txt block detection per crawler).
+ */
+const AI_CRAWLERS = Object.freeze([...TRAINING_CRAWLERS, ...CITATION_CRAWLERS]);
 /** Maximum number of redirects to follow when fetching a resource. */
 const MAX_REDIRECTS = 5;
@@ -757,10 +840,68 @@ function aggregatePageScores(pageResults) {
  * @returns {string} - One of: 'faq', 'product', 'article', 'local-business', 'homepage', 'ecommerce', 'saas', 'generic'.
  */
 function detectPageType($, schemaTypes, pathname) {
-  // Check JSON-LD schema types first (most reliable signal)
-  if (['FAQPage'].some((t) => schemaTypes.has(t))) return 'faq';
-  if (['Product', 'Offer'].some((t) => schemaTypes.has(t))) return 'product';
+  // Check JSON-LD schema types first (most reliable signal).
+  // A page can carry FAQPage schema for a small FAQ section while being a long-form
+  // guide. Only classify as "faq" when FAQPage is the dominant structure, otherwise
+  // a 6,400-word guide with a FAQ at the bottom gets penalized as exceeding FAQ length.
+  const allH2s = $('h2');
+  const h2Count = allH2s.length;
+  let questionH2Count = 0;
+  allH2s.each((_, el) => {
+    const t = ($(el).text() || '').trim();
+    if (t.includes('?') || /^(how|what|why|when|where|who|which|can|do|does|is|are|should)\b/i.test(t)) {
+      questionH2Count++;
+    }
+  });
+  const isDominantlyFaq = h2Count > 0 && questionH2Count >= h2Count * 0.7;
+  // Word count for length-based reclassification of FAQ-tagged guides.
+  const mainElForCount = $('main, article, [role="main"]');
+  const mainTextForCount = (mainElForCount.length > 0 ? mainElForCount.text() : $('body').text() || '').trim();
+  const wordCountForType = mainTextForCount.split(/\s+/).filter(w => w.length > 0).length;
+  // Non-FAQ schema types that, when coexisting with FAQPage, signal a hybrid
+  // guide rather than a pure FAQ page.
+  const NON_FAQ_GUIDE_TYPES = [
+    'Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'HowTo', 'Product',
+    'Dataset', 'Report', 'WebPage',
+    // Broadened: more schema types that imply guide/long-form rather than pure FAQ.
+    'Recipe', 'ScholarlyArticle', 'Guide', 'Course', 'Service',
+    'MedicalEntity', 'MedicalGuideline', 'Book', 'Chapter',
+    'LearningResource', 'Review', 'CollectionPage', 'ItemPage',
+  ];
+  const hasNonFaqGuideType = NON_FAQ_GUIDE_TYPES.some((t) => schemaTypes.has(t));
+  // Heuristic guide-title overrides: title or H1 phrasing strongly implies a guide.
+  const titleText = ($('title').text() || '').trim();
+  const h1Text = ($('h1').first().text() || '').trim();
+  const titleAndH1 = `${titleText} ${h1Text}`;
+  const GUIDE_TITLE_RE = /\b(?:complete|ultimate|definitive|comprehensive)?\s*guide\b/i;
+  const EVERYTHING_RE = /everything you need/i;
+  const HOW_TO_TITLE_RE = /how to/i;
+  const STEP_BY_STEP_RE = /step[- ]by[- ]step/i;
+  const matchesGuideTitle = GUIDE_TITLE_RE.test(titleAndH1)
+    || EVERYTHING_RE.test(titleAndH1)
+    || HOW_TO_TITLE_RE.test(titleAndH1)
+    || STEP_BY_STEP_RE.test(titleAndH1);
+  // Definition-list + multiple H2 sections is a strong guide signal.
+  const hasDefinitionListGuide = $('dl').length > 0 && h2Count >= 2;
+  // Long-form / heading-rich pages should never classify as pure FAQ.
+  const tooLongForFaq = wordCountForType > 2000;
+  const tooManyH2sForFaq = h2Count > 8;
+  if (matchesGuideTitle || hasDefinitionListGuide) return 'article';
+  if (schemaTypes.has('FAQPage') && isDominantlyFaq && !hasNonFaqGuideType
+      && wordCountForType <= 1500 && !tooManyH2sForFaq && !tooLongForFaq) return 'faq';
   if (['Article', 'NewsArticle', 'BlogPosting', 'TechArticle'].some((t) => schemaTypes.has(t))) return 'article';
+  // FAQPage schema present but page is also long-form or carries another guide-type schema:
+  // treat as article so guide-style word/heading expectations apply.
+  if (schemaTypes.has('FAQPage') && (hasNonFaqGuideType || wordCountForType > 1500 || h2Count >= 6 || tooManyH2sForFaq || tooLongForFaq)) return 'article';
+  if (schemaTypes.has('FAQPage') && !tooManyH2sForFaq && !tooLongForFaq) return 'faq';
+  if (schemaTypes.has('FAQPage')) return 'article';
+  if (['Product', 'Offer'].some((t) => schemaTypes.has(t))) return 'product';
   if (['LocalBusiness', 'Restaurant', 'Store'].some((t) => schemaTypes.has(t))) return 'local-business';
   // Heuristic: homepage detection (including language/locale-prefixed homepages like /en/, /de-DE/, /nl/)
@@ -769,9 +910,10 @@ function detectPageType($, schemaTypes, pathname) {
   const normalizedPath = pathname.replace(/^\/[a-z]{2}(?:[-_][a-z]{2,3})?\/?$/i, '/');
   if (normalizedPath === '/' || normalizedPath === '/index.html' || normalizedPath === '/index.php' || normalizedPath === '') return 'homepage';
-  // Heuristic: FAQ page via DOM
+  // Heuristic: FAQ page via DOM. Only treat as FAQ when FAQ-like elements dominate the
+  // structure - if the page has many topic H2s it's a guide that happens to include a FAQ.
   const faqIndicators = $('[class*="faq"], [id*="faq"], details, [class*="accordion"]');
-  if (faqIndicators.length >= 3) return 'faq';
+  if (faqIndicators.length >= 3 && (h2Count < 6 || isDominantlyFaq)) return 'faq';
   // Heuristic: article via DOM
   const hasArticle = $('article').length > 0;
@@ -890,7 +1032,9 @@ function checkStructuredData($, pageType, jsonLdData, jsonLdValid, jsonLdInvalid
     checks.push({ status: 'pass', label: `GEO-critical schema types present (${foundImportant.length})`, detail: foundImportant.join(', ') });
   } else if (foundImportant.length > 0) {
     score += 5;
-    checks.push({ status: 'warn', label: `Only ${foundImportant.length} GEO-critical schema type(s)`, detail: `Found: ${foundImportant.join(', ')}. Consider adding: FAQPage, HowTo, Article, BreadcrumbList` });
+    const suggestions = ['FAQPage', 'HowTo', 'Article', 'BreadcrumbList'].filter((t) => !schemaTypes.has(t));
+    const consider = suggestions.length > 0 ? `. Consider adding: ${suggestions.join(', ')}` : '';
+    checks.push({ status: 'warn', label: `Only ${foundImportant.length} GEO-critical schema type(s)`, detail: `Found: ${foundImportant.join(', ')}${consider}` });
   } else {
     checks.push({ status: 'fail', label: 'No GEO-critical schema types', detail: 'Add FAQPage, Article, Organization, BreadcrumbList, etc.' });
   }
@@ -1835,38 +1979,66 @@ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders)
     checks.push({ status: 'pass', label: 'No restrictive robots meta', detail: 'Page is open for indexing' });
   }
-  // Check for specific AI bot meta tags
-  const aiBotMeta = ['googlebot', 'bingbot', 'gptbot', 'chatgpt-user', 'anthropic-ai', 'claude-web', 'ccbot', 'google-extended', 'perplexitybot', 'claudebot'];
-  const blockedBots = [];
+  // Check for specific AI bot meta tags. Split blocked bots into training-only
+  // (informational) vs citation crawlers (real penalty) so a noindex on GPTBot
+  // is not weighted the same as a noindex on Googlebot.
+  const trainingBotMeta = TRAINING_CRAWLERS.map(c => c.toLowerCase());
+  const citationBotMeta = CITATION_CRAWLERS.map(c => c.toLowerCase()).concat(['claude-web']);
+  const aiBotMeta = [...new Set([...trainingBotMeta, ...citationBotMeta])];
+  const blockedTrainingBots = [];
+  const blockedCitationBots = [];
   aiBotMeta.forEach((bot) => {
     const content = $(`meta[name="${bot}"]`).attr('content') || '';
     if (content.includes('noindex')) {
-      blockedBots.push(bot);
+      if (citationBotMeta.includes(bot)) {
+        blockedCitationBots.push(bot);
+      } else {
+        blockedTrainingBots.push(bot);
+      }
     }
   });
   maxScore += 15;
-  if (blockedBots.length === 0) {
+  if (blockedCitationBots.length === 0 && blockedTrainingBots.length === 0) {
     score += 15;
     checks.push({ status: 'pass', label: 'No AI bot restrictions in meta', detail: 'No specific bot blocking detected in page HTML' });
+  } else if (blockedCitationBots.length === 0) {
+    score += 15;
+    checks.push({ status: 'info', label: `Training crawler meta blocks: ${blockedTrainingBots.join(', ')}`, detail: 'Training-only blocks do not affect AI citation visibility', found: blockedTrainingBots });
   } else {
-    checks.push({ status: 'warn', label: `AI bot blocking detected: ${blockedBots.join(', ')}`, detail: 'These bots are blocked via meta tags', found: blockedBots });
+    score += Math.max(0, 15 - blockedCitationBots.length * 3);
+    checks.push({ status: 'warn', label: `Citation crawler meta blocks: ${blockedCitationBots.join(', ')}`, detail: 'These citation crawlers are blocked via meta tags', found: blockedCitationBots });
+    if (blockedTrainingBots.length > 0) {
+      checks.push({ status: 'info', label: `Training crawler meta blocks: ${blockedTrainingBots.join(', ')}`, detail: 'Training-only blocks are informational', found: blockedTrainingBots });
+    }
   }
   // robots.txt integration (from server-side fetch)
   if (robotsTxtData) {
     maxScore += 10;
     if (robotsTxtData.exists) {
-      const crawlerBlocked = Object.entries(robotsTxtData.blocksCrawlers || {}).filter(([, v]) => v);
-      if (crawlerBlocked.length === 0) {
+      const blocks = robotsTxtData.blocksCrawlers || {};
+      const trainingLowercase = new Set(TRAINING_CRAWLERS.map(c => c.toLowerCase()));
+      const citationLowercase = new Set(CITATION_CRAWLERS.map(c => c.toLowerCase()));
+      const blockedAll = Object.entries(blocks).filter(([, v]) => v).map(([k]) => k);
+      const blockedTraining = blockedAll.filter(k => trainingLowercase.has(k.toLowerCase()));
+      const blockedCitation = blockedAll.filter(k => citationLowercase.has(k.toLowerCase()));
+      if (blockedCitation.length === 0 && blockedTraining.length === 0) {
         score += 10;
-        checks.push({ status: 'pass', label: 'robots.txt: no AI crawlers blocked', detail: 'All known AI crawlers are allowed' });
+        checks.push({ status: 'pass', label: 'robots.txt: no AI crawlers blocked', detail: 'All known training and citation crawlers are allowed' });
+      } else if (blockedCitation.length === 0) {
+        score += 10;
+        checks.push({ status: 'info', label: `robots.txt: ${blockedTraining.length} training crawler(s) blocked, citation crawlers allowed`, detail: 'Training-only blocks do not affect AI citation visibility', found: blockedTraining });
       } else {
-        score += Math.max(0, 10 - crawlerBlocked.length * 2);
-        checks.push({ status: 'warn', label: `robots.txt: ${crawlerBlocked.length} AI crawler(s) blocked`, detail: 'Blocked crawlers cannot index your content', found: crawlerBlocked.map(([k]) => k) });
+        score += Math.max(0, 10 - blockedCitation.length * 2);
+        checks.push({ status: 'warn', label: `robots.txt: ${blockedCitation.length} citation crawler(s) blocked`, detail: 'Blocking citation crawlers prevents inline AI citations', found: blockedCitation });
+        if (blockedTraining.length > 0) {
+          checks.push({ status: 'info', label: `robots.txt: ${blockedTraining.length} training crawler(s) blocked`, detail: 'Training-only blocks are informational and do not affect AI citation visibility', found: blockedTraining });
+        }
       }
       if (robotsTxtData.hasWildcardDisallow) {
-        checks.push({ status: 'warn', label: 'robots.txt: wildcard Disallow: /', detail: 'All crawlers are blocked by default - only overridden by specific Allow rules' });
+        checks.push({ status: 'warn', label: 'robots.txt: wildcard Disallow: /', detail: 'All crawlers are blocked by default, only overridden by specific Allow rules' });
       }
     } else {
       checks.push({ status: 'warn', label: 'No robots.txt found', detail: 'robots.txt helps control crawler access' });
@@ -2014,7 +2186,29 @@ function checkEntity($, jsonLdData) {
     });
   }
-  // 6. JSON-LD schema author with quality check
+  // 6. JSON-LD schema author with quality check.
+  // Only treat `author` as the page author when it's attached to a content type
+  // (Article, WebPage, Book, etc.) - NOT inside Review/Comment, where `author` is
+  // the reviewer/commenter and shouldn't be credited to the page.
+  const PAGE_AUTHOR_TYPES = new Set([
+    'Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle', 'Report', 'OpinionNewsArticle',
+    'WebPage', 'AboutPage', 'CollectionPage', 'ItemPage', 'ProfilePage', 'QAPage', 'FAQPage',
+    'Book', 'Chapter', 'CreativeWork', 'CreativeWorkSeries', 'HowTo', 'Recipe', 'Course', 'LearningResource',
+    'VideoObject', 'AudioObject', 'PodcastEpisode', 'Podcast',
+    'DiscussionForumPosting', 'SocialMediaPosting',
+  ]);
+  const SKIP_AUTHOR_TYPES = new Set(['Review', 'Comment', 'UserComments', 'Rating']);
+  const isContentType = (t) => {
+    if (!t) return false;
+    const types = Array.isArray(t) ? t : [t];
+    return types.some((x) => PAGE_AUTHOR_TYPES.has(x));
+  };
+  const isSkipType = (t) => {
+    if (!t) return false;
+    const types = Array.isArray(t) ? t : [t];
+    return types.some((x) => SKIP_AUTHOR_TYPES.has(x));
+  };
   let hasAuthorSchema = false;
   let hasAuthorSameAs = false;
   let hasPersonSchema = false;
@@ -2022,12 +2216,14 @@ function checkEntity($, jsonLdData) {
     try {
       const processSchema = (schema) => {
         if (!schema) return;
-        if (schema.author) {
+        // Skip Review/Comment subtrees - their author is not the page author.
+        if (isSkipType(schema['@type'])) return;
+        if (schema.author && isContentType(schema['@type'])) {
           hasAuthorSchema = true;
           const authors = Array.isArray(schema.author) ? schema.author : [schema.author];
           authors.forEach((a) => {
             if (typeof a === 'string') authorNames.add(a);
-            else if (a.name) {
+            else if (a && a.name) {
               authorNames.add(a.name);
               if (a.sameAs) hasAuthorSameAs = true;
               if (a['@type'] === 'Person') hasPersonSchema = true;
@@ -2038,6 +2234,13 @@ function checkEntity($, jsonLdData) {
           hasPersonSchema = true;
           if (schema.sameAs) hasAuthorSameAs = true;
         }
+        // Recurse into common content-bearing fields, but skip review arrays.
+        ['mainEntity', 'mainEntityOfPage', 'about', 'isPartOf', 'hasPart', 'workExample', 'exampleOfWork'].forEach((key) => {
+          const val = schema[key];
+          if (!val) return;
+          if (Array.isArray(val)) val.forEach(processSchema);
+          else if (typeof val === 'object') processSchema(val);
+        });
       };
       if (Array.isArray(d)) d.forEach(processSchema);
       else if (d['@graph']) d['@graph'].forEach(processSchema);
@@ -2047,14 +2250,17 @@ function checkEntity($, jsonLdData) {
   if (hasAuthorSchema) authorSources.schema.push('JSON-LD author');
   if (hasPersonSchema) authorSources.schema.push('Person schema');
-  // 7. HTML byline elements - extended selectors
+  // 7. HTML byline elements - extended selectors.
+  // Exclude bylines inside review/comment/testimonial containers - they identify the
+  // reviewer, not the page author.
   const bylineSelectors = [
     '[class*="author"]', '[rel="author"]', '[itemprop="author"]',
     '.byline', '.post-author', '.article-author', '.entry-author',
     '[data-author]', '[data-byline]',
     'address.author', '.writer', '.contributor',
   ].join(', ');
-  const authorByline = $(bylineSelectors).first();
+  const reviewContextSel = '[itemtype*="Review"], [itemtype*="Comment"], .review, .reviews, .comment, .comments, .testimonial, .testimonials, [class*="review-"], [class*="reviews-"]';
+  const authorByline = $(bylineSelectors).filter((_, el) => $(el).closest(reviewContextSel).length === 0).first();
   if (authorByline.length > 0) {
     const bylineText = (authorByline.text() || '').trim();
     if (bylineText && bylineText.length < 100) {
@@ -2070,8 +2276,9 @@ function checkEntity($, jsonLdData) {
     authorSources.html.push('address element');
   }
-  // 9. Author profile links
-  const authorLinks = $('a[href*="/author/"], a[href*="/writers/"], a[href*="/contributors/"], a[href*="/team/"], a[rel="author"]');
+  // 9. Author profile links - skip review-context links (reviewer profile links).
+  const authorLinks = $('a[href*="/author/"], a[href*="/writers/"], a[href*="/contributors/"], a[href*="/team/"], a[rel="author"]')
+    .filter((_, el) => $(el).closest(reviewContextSel).length === 0);
   if (authorLinks.length > 0) {
     authorSources.links.push(`${authorLinks.length} author link(s)`);
     authorLinks.each((_, el) => {
@@ -3662,6 +3869,32 @@ function checkWebMCP($, pageType, ucpData) {
     checks.push({ status: 'info', label: 'Shopify-hosted: dual UCP surface expected', detail: 'Per-shop endpoint at /api/ucp/mcp; global catalog at https://discover.shopifyapps.com/global/mcp' });
   }
+  // Baseline credit for purely informational pages.
+  // If the page has no forms, no WebMCP signals, no UCP profile, and no Shopify
+  // surface, there's nothing for it to expose to agents - WebMCP/UCP are N/A here.
+  // Without this, content-only pages are capped well below 100 even when there's
+  // nothing to fix, dragging the overall score unfairly.
+  const totalForms = $('form').length;
+  const hasUcp = !!(ucpData && ucpData.exists && ucpData.content);
+  const hasShopify = !!(ucpData && ucpData.shopifyHosted);
+  const hasNoInteractiveSurface =
+    totalForms === 0 &&
+    toolCount === 0 &&
+    !hasImperativeSignals &&
+    !webmcpSDKFound &&
+    !hasSchemaActions &&
+    !hasUcp &&
+    !hasShopify;
+  if (hasNoInteractiveSurface) {
+    checks.push({
+      status: 'info',
+      label: 'Informational page — Agent Interactivity not applicable',
+      detail: 'No forms or WebMCP/UCP signals detected. Pure-content pages can\'t expose tools to agents, so this category is scored as a baseline rather than penalized.',
+    });
+    return { checks, score: 80, category: 'Agent Interactivity', notApplicable: true };
+  }
   return { checks, score: maxScore > 0 ? Math.round((score / maxScore) * 100) : 0, category: 'Agent Interactivity' };
 }
@@ -3929,8 +4162,18 @@ function checkContentFreshness($, jsonLdData) {
     new RegExp('\\bin ' + currentYear + '\\b', 'i'),
     new RegExp('\\b(as of|updated)\\s+(january|february|march|april|may|june|july|august|september|october|november|december)\\s+' + currentYear + '\\b', 'i'),
   ];
+  // Historical/founding-context phrases - "records from 1841 to present", "since 1990",
+  // "established 1936" - are accurate facts, not stale temporal references.
+  const HISTORICAL_CONTEXT_PATTERNS = [
+    /\b(since|from|established|founded|operating since|serving since|in business since)\s+(in\s+)?\d{4}\b/i,
+    /\b\d{4}\s*(?:[‐-―−\-–—~]|to)\s*(present|current|today|now|\d{4})\b/i,
+    /\b(records?|archives?|documents?|history|heritage|founded|established|originated|dating back)\b[^.]{0,80}\b(from|since|in)\s+\d{4}\b/i,
+    /\b(historical|historic|vintage|legacy)\b/i,
+  ];
+  const hasHistoricalContext = HISTORICAL_CONTEXT_PATTERNS.some(p => p.test(visibleText));
   const hasCurrentRefs = CURRENT_YEAR_PATTERNS.some(p => p.test(visibleText));
-  const hasOutdatedRefs = OUTDATED_TEMPORAL_PATTERNS.some(p => p.test(visibleText));
+  const rawOutdatedHits = OUTDATED_TEMPORAL_PATTERNS.some(p => p.test(visibleText));
+  const hasOutdatedRefs = rawOutdatedHits && !hasHistoricalContext;
   maxScore += 20;
   if (hasCurrentRefs && !hasOutdatedRefs) {
     score += 20;
@@ -3947,21 +4190,68 @@ function checkContentFreshness($, jsonLdData) {
   }
   // 12d. Copyright Year & Footer Freshness (10 pts)
+  // Year ranges ("(c) 1997 - 2026") pair a founding year with the current year; take
+  // the END year as the freshness signal, not the founding year.
+  // Also handles enumerated lists like "(c) 2010, 2015, 2026" by taking the max
+  // of all years in the same line as a copyright marker.
   const footerEl = $('footer');
   maxScore += 10;
   if (footerEl.length > 0) {
-    const footerText = footerEl.text();
-    const copyrightMatch = footerText.match(/©\s*(\d{4})/);
-    if (copyrightMatch) {
-      const copyrightYear = parseInt(copyrightMatch[1], 10);
-      if (copyrightYear === currentYear) {
+    // Strip "All Rights Reserved" boilerplate (en/fr/de) before parsing.
+    const rawFooterText = footerEl.text();
+    const footerText = rawFooterText
+      .replace(/all\s+rights\s+reserved/gi, '')
+      .replace(/tous\s+droits\s+r[ée]serv[ée]s/gi, '')
+      .replace(/alle\s+rechte\s+vorbehalten/gi, '');
+    // Broader prefix list: includes bracket variants and "Copyright ©" double prefix.
+    const COPYRIGHT_PREFIX = /(?:©|\(c\)|\(C\)|\[c\]|\[C\]|&copy;|copyright(?:\s*©)?)/i;
+    // Exclude founding year markers so "Est. 1998" / "Since 2001" do not get
+    // mistaken for a copyright year when no actual copyright marker is present.
+    const FOUNDING_PREFIX = /\b(?:est(?:ablished|\.)?|since|founded(?:\s+in)?)\s+\d{4}\b/i;
+    let copyrightYear = null;
+    // Sweep each line for a copyright marker; take the max year found on that line.
+    const lines = footerText.split(/\r?\n|<br\s*\/?>/i);
+    for (const rawLine of lines) {
+      const line = rawLine.trim();
+      if (!line) continue;
+      if (!COPYRIGHT_PREFIX.test(line)) continue;
+      // Skip lines that look like founding-year statements without a real © marker.
+      const hasRealMarker = /(?:©|\(c\)|\(C\)|\[c\]|\[C\]|&copy;|copyright)/i.test(line);
+      if (!hasRealMarker && FOUNDING_PREFIX.test(line)) continue;
+      const yearMatches = line.match(/\b(19|20)\d{2}\b/g);
+      if (yearMatches && yearMatches.length > 0) {
+        const maxYear = Math.max(...yearMatches.map(y => parseInt(y, 10)));
+        if (copyrightYear === null || maxYear > copyrightYear) copyrightYear = maxYear;
+      }
+    }
+    // Fallback: if the footer is a single blob without line breaks, sweep the
+    // whole text but only when a copyright marker exists.
+    if (copyrightYear === null && COPYRIGHT_PREFIX.test(footerText)) {
+      const yearMatches = footerText.match(/\b(19|20)\d{2}\b/g);
+      if (yearMatches && yearMatches.length > 0) {
+        copyrightYear = Math.max(...yearMatches.map(y => parseInt(y, 10)));
+      }
+    }
+    // Supplemental freshness signal: <time datetime="YYYY"> inside <footer>.
+    if (copyrightYear === null) {
+      footerEl.find('time[datetime]').each((_i, tEl) => {
+        const dt = ($(tEl).attr('datetime') || '').trim();
+        const ym = dt.match(/^(\d{4})/);
+        if (ym) {
+          const ty = parseInt(ym[1], 10);
+          if (copyrightYear === null || ty > copyrightYear) copyrightYear = ty;
+        }
+      });
+    }
+    if (copyrightYear !== null) {
+      if (copyrightYear >= currentYear - 1) {
         score += 10;
         checks.push({ status: 'pass', label: `Copyright year current (${copyrightYear})`, detail: `Footer copyright is ${copyrightYear}` });
-      } else if (copyrightYear === currentYear - 1) {
+      } else if (copyrightYear === currentYear - 2) {
         score += 5;
-        checks.push({ status: 'warn', label: `Copyright year slightly old (${copyrightYear})`, detail: `Footer shows ${copyrightYear} — update to ${currentYear}` });
+        checks.push({ status: 'warn', label: `Copyright year slightly old (${copyrightYear})`, detail: `Footer shows ${copyrightYear}, update to ${currentYear}` });
       } else {
-        checks.push({ status: 'fail', label: `Copyright year outdated (${copyrightYear})`, detail: `Footer shows ${copyrightYear} — update to ${currentYear}` });
+        checks.push({ status: 'fail', label: `Copyright year outdated (${copyrightYear})`, detail: `Footer shows ${copyrightYear}, update to ${currentYear}` });
       }
     } else {
       checks.push({ status: 'info', label: 'No copyright year in footer', detail: 'Add a copyright year to signal maintenance' });
@@ -4041,22 +4331,36 @@ function checkInformationDensity($) {
   }
   // 13b. Self-Contained Section Scoring (25 pts)
+  // Sections with structured content (tables w/ headers, lists, definition lists) are
+  // self-contained even at lower word counts - the structure carries the meaning.
   const h2s = $('main h2, article h2, [role="main"] h2');
   maxScore += 25;
   if (h2s.length > 0) {
     let selfContainedCount = 0;
     h2s.each((_i, h2El) => {
       let sectionText = '';
+      let hasStructuredContent = false;
+      let hasLabeledTable = false;
       let sibling = $(h2El).next();
       while (sibling.length > 0 && !sibling.is('h2')) {
         sectionText += (sibling.text() || '') + ' ';
+        if (sibling.is('table, ul, ol, dl') || sibling.find('table, ul, ol, dl').length > 0) {
+          hasStructuredContent = true;
+        }
+        const tablesHere = sibling.is('table') ? sibling : sibling.find('table');
+        tablesHere.each((__, t) => {
+          if ($(t).find('th').length > 0) hasLabeledTable = true;
+        });
         sibling = sibling.next();
       }
       const wordCount = sectionText.trim().split(/\s+/).length;
       const hasData = /\d/.test(sectionText);
       const firstSentence = sectionText.split(/[.!?]/)[0] || '';
       const hasTopicSentence = firstSentence.trim().length > 30;
-      if (wordCount >= 150 && wordCount <= 500 && hasData && hasTopicSentence) {
+      const isStandardComplete = wordCount >= 150 && wordCount <= 500 && hasData && hasTopicSentence;
+      const isStructurallyComplete = hasStructuredContent && wordCount >= 40 && (hasData || hasLabeledTable);
+      const isLabeledTableSection = hasLabeledTable && wordCount >= 10;
+      if (isStandardComplete || isStructurallyComplete || isLabeledTableSection) {
         selfContainedCount++;
       }
     });
@@ -4076,6 +4380,8 @@ function checkInformationDensity($) {
   }
   // 13c. Claim-Evidence Pairing (20 pts)
+  // Tables with header cells provide column-level context for every numeric value,
+  // so data points inside labeled tables are considered already-paired by design.
   const DATA_SENTENCE = /\d+(\.\d+)?(%|x|\$|€|£)/;
   let dataSentences = 0;
   let pairedData = 0;
@@ -4090,14 +4396,26 @@ function checkInformationDensity($) {
       }
     }
   });
+  // Count data cells inside labeled tables - they're context-paired via column headers.
+  let labeledTableDataCells = 0;
+  const pairingTables = mainEl.length > 0 ? mainEl.find('table') : $('table');
+  pairingTables.each((_i, t) => {
+    const $t = $(t);
+    if ($t.find('th').length === 0) return;
+    $t.find('tbody td, td').each((__, td) => {
+      if (DATA_SENTENCE.test($(td).text() || '')) labeledTableDataCells++;
+    });
+  });
   maxScore += 20;
-  if (dataSentences === 0) {
+  const totalData = dataSentences + labeledTableDataCells;
+  const totalPaired = pairedData + labeledTableDataCells;
+  if (totalData === 0) {
     checks.push({ status: 'info', label: 'No data claims detected', detail: 'Add quantitative data points with context' });
   } else {
-    const pairedPct = Math.round((pairedData / dataSentences) * 100);
+    const pairedPct = Math.round((totalPaired / totalData) * 100);
     if (pairedPct > 80) {
       score += 20;
-      checks.push({ status: 'pass', label: `Claims well-paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have contextual explanations` });
+      checks.push({ status: 'pass', label: `Claims well-paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have contextual explanations${labeledTableDataCells > 0 ? ` (incl. ${labeledTableDataCells} table cells)` : ''}` });
     } else if (pairedPct >= 50) {
       score += 10;
       checks.push({ status: 'warn', label: `Claims partially paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have context — add more explanations` });
@@ -4172,6 +4490,18 @@ function checkVerifiability($, domain) {
   const contentText = (mainEl.length > 0 ? mainEl.text() : $('body').text() || '').trim();
   const sentences = contentText.split(/[.!?]+/).filter(s => s.trim().length > 10);
+  // Visible body text (paragraphs, list items, blockquotes) for attribution
+  // patterns that often span sentence boundaries or live in elements that
+  // are tricky to split on punctuation alone.
+  const bodyTextEls = mainEl.length > 0
+    ? mainEl.find('p, li, blockquote, td, dd')
+    : $('p, li, blockquote, td, dd');
+  const bodyTextChunks = [];
+  bodyTextEls.each((_i, el) => {
+    const t = ($(el).text() || '').trim();
+    if (t.length > 0) bodyTextChunks.push(t);
+  });
   // 14a. External Citation Links (30 pts)
   const AUTHORITY_DOMAINS = ['.gov', '.edu', '.org', 'scholar.google', 'pubmed', 'arxiv.org', 'doi.org'];
   const externalLinks = mainEl.length > 0 ? mainEl.find('a[href^="http"]') : $('a[href^="http"]');
@@ -4192,7 +4522,7 @@ function checkVerifiability($, domain) {
     checks.push({ status: 'pass', label: `Strong citations (${totalExternalLinks} external, ${authorityLinks} authority)`, detail: `${totalExternalLinks} external links including ${authorityLinks} authority sources` });
   } else if (totalExternalLinks >= 1) {
     score += 15;
-    checks.push({ status: 'warn', label: `Some citations (${totalExternalLinks} external)`, detail: `${totalExternalLinks} external links — add authority sources (.gov, .edu)` });
+    checks.push({ status: 'warn', label: `Some citations (${totalExternalLinks} external)`, detail: `${totalExternalLinks} external links, add authority sources (.gov, .edu)` });
   } else {
     score += 5;
     checks.push({ status: 'fail', label: 'No external citations', detail: 'Add external links to authoritative sources' });
@@ -4200,25 +4530,58 @@ function checkVerifiability($, domain) {
   // 14b. Source Attribution in Text (25 pts)
   const SOURCE_ATTRIBUTION_PATTERNS = [
-    /\baccording to\s+[A-Z]/,
-    /\ba\s+(study|report|survey|analysis)\s+by\b/i,
-    /\b(published in|cited in|reported by)\b/i,
-    /\b(source|data from|based on)\s*:/i,
-    /\b(research\s+from|findings\s+of)\b/i,
+    /\baccording to\s+(?:the\s+|a\s+|an\s+)?[A-Z][\w'.-]*(?:\s+(?:of|for|on|and|the|de|van)\s+)?[A-Z\w'.-]*/,
+    /\b(?:a|an|the|new|recent|latest|major|landmark)?\s*(?:study|report|survey|analysis|paper|whitepaper|brief)\s+(?:by|from|published by)\b/i,
+    /\b(?:research|data|figures|statistics|findings)\s+(?:by|from|of|published by)\b/i,
+    /\b(?:published in|cited in|reported by|noted by|observed by)\b/i,
+    /\b(?:source|data from|based on)\s*:/i,
+    /\b(?:report|study|analysis)\s+(?:by|from)\b/i,
+    /\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,4}\s+(?:says|states|reports|found|concluded|notes|observed|estimates)\b/,
     /\[\d+\]/,
-    /\b(et al\.?|ibid\.?)\b/,
+    /\b(?:et al\.?|ibid\.?)\b/,
+    // Broadened patterns: "as reported by", "as documented in", etc.
+    /\bas\s+(?:reported|noted|stated|cited|documented|shown|described|outlined)\s+(?:by|in|on)\b/i,
+    // "per the WHO", "per CDC"
+    /\bper\s+(?:the\s+)?[A-Z]/,
+    // Possessive: "WHO's data", "CDC's findings"
+    /\b[A-Z][A-Za-z.&'-]+(?:'s|’s)\s+(?:data|report|study|analysis|findings|guidance|recommendations|guidelines)\b/,
+    // Parenthetical citation: "(source: ...)", "(via: ...)"
+    /\((?:source|src|via|cf|see)\s*:\s*[^)]+\)/i,
+    // DOI references
+    /\bdoi:\s*10\.\d+/i,
+    // Numeric brackets variants: "[1, 2]", "[1-3]"
+    /\[\d+(?:[,-]\s*\d+)*\]/,
+    // Author-year: "(Smith, 2023)", "(Smith et al., 2023)", "(Smith and Jones, 2023)"
+    /\([A-Z][a-zA-Z]+(?:\s+(?:et\s+al\.?|and\s+[A-Z][a-zA-Z]+))?,\s*\d{4}[a-z]?\)/,
+    // "<Org> data shows/reveals/indicates/suggests/confirms"
+    /\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,3}\s+data\s+(?:shows|reveals|indicates|suggests|confirms)\b/,
+    // "<Org> figures/findings show/reveal/indicate"
+    /\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,3}\s+(?:figures|findings)\s+(?:show|reveal|indicate)\b/,
+    // "in a recent study", "in a landmark report"
+    /\bin\s+(?:a|an)\s+(?:recent|new|landmark|seminal)\s+(?:study|report|survey|paper|analysis)\b/i,
+    // "verified by", "confirmed by", "documented in/by"
+    /\b(?:verified|confirmed)\s+by\b/i,
+    /\bdocumented\s+(?:in|by)\b/i,
+    // Government/regulatory bodies: "Department of Health", "Centers for Disease Control"
+    /\b(?:U\.?S\.?\s+)?(?:Department\s+of|Ministry\s+of|Office\s+of|Bureau\s+of|Centers\s+for|Federal|National|Royal)\s+[A-Z]/,
   ];
   let attrCount = 0;
   sentences.forEach(s => {
     if (SOURCE_ATTRIBUTION_PATTERNS.some(p => p.test(s))) attrCount++;
   });
+  bodyTextChunks.forEach(t => {
+    if (SOURCE_ATTRIBUTION_PATTERNS.some(p => p.test(t))) attrCount++;
+  });
   maxScore += 25;
   if (attrCount >= 3) {
     score += 25;
     checks.push({ status: 'pass', label: `Strong source attribution (${attrCount})`, detail: `${attrCount} source attribution patterns detected` });
+  } else if (attrCount >= 2) {
+    score += 18;
+    checks.push({ status: 'pass', label: `Source attribution found (${attrCount})`, detail: `${attrCount} attribution patterns detected` });
   } else if (attrCount >= 1) {
-    score += 12;
-    checks.push({ status: 'warn', label: `Some source attribution (${attrCount})`, detail: `${attrCount} attribution(s) — add more source references` });
+    score += 10;
+    checks.push({ status: 'warn', label: `Some source attribution (${attrCount})`, detail: `${attrCount} attribution(s), add more source references` });
   } else {
     score += 5;
     checks.push({ status: 'info', label: 'No source attribution detected', detail: 'Add "according to", "study by", or citation markers' });
@@ -4457,6 +4820,48 @@ function checkMultimodal($, jsonLdData) {
   }
   // 16b. Figure/Figcaption Usage (25 pts)
+  // Only evaluate coverage against content images. Decorative images (empty
+  // alt, presentation role, callouts, headshots, seals, logos, icons, small
+  // images, content nested in <aside>) are excluded from the denominator.
+  const DECORATIVE_CLASS_HINTS = /(callout|note|highlight|decorative|icon|headshot|avatar|seal|logo|badge|sidebar|bullet|arrow|divider|separator|spacer|pixel|tracking|analytics|placeholder|flag|star|rating)/i;
+  // Filename-style alt text like "img-23.jpg" / "photo.png" indicates a non-descriptive alt.
+  const FILENAME_ALT_RE = /^(?:img|image|photo|picture)?[-_ ]?\d*\.(?:jpg|jpeg|png|gif|svg|webp)$/i;
+  // Tracking pixel hints in src.
+  const TRACKING_SRC_RE = /(?:pixel|beacon|track|analytics)/i;
+  function isDecorativeImage(imgEl) {
+    const $img = $(imgEl);
+    const role = ($img.attr('role') || '').toLowerCase();
+    if (role === 'presentation' || role === 'none') return true;
+    // Explicit decorative attributes.
+    const ariaHidden = ($img.attr('aria-hidden') || '').toLowerCase();
+    if (ariaHidden === 'true') return true;
+    const dataDecorative = ($img.attr('data-decorative') || '').toLowerCase();
+    if (dataDecorative === 'true') return true;
+    const alt = $img.attr('alt');
+    if (alt !== undefined && alt.trim() === '') return true;
+    // Filename-style alt text is non-descriptive and treated as decorative.
+    if (alt !== undefined && FILENAME_ALT_RE.test(alt.trim())) return true;
+    if ($img.closest('aside').length > 0) return true;
+    // Broader ancestor selectors: chrome regions and ad/banner containers.
+    if ($img.closest('header, nav, footer, button, [role="banner"], [role="navigation"], [role="contentinfo"], .ad, .advertisement, .banner').length > 0) return true;
+    const cls = $img.attr('class') || '';
+    if (DECORATIVE_CLASS_HINTS.test(cls)) return true;
+    if ($img.closest(`[class*="callout"], [class*="note"], [class*="highlight"], [class*="decorative"], [class*="seal"], [class*="logo"], [class*="headshot"], [class*="avatar"], [class*="icon"]`).length > 0) return true;
+    const w = parseInt($img.attr('width'), 10);
+    const h = parseInt($img.attr('height'), 10);
+    // Tracking pixel: 1x1 (or 1xN/Nx1) images.
+    if ((Number.isFinite(w) && w === 1) || (Number.isFinite(h) && h === 1)) return true;
+    if (Number.isFinite(w) && w > 0 && w <= 100) return true;
+    if (Number.isFinite(h) && h > 0 && h <= 100) return true;
+    const src = $img.attr('src') || '';
+    if (src && TRACKING_SRC_RE.test(src)) return true;
+    return false;
+  }
+  let contentImageCount = 0;
+  fallbackImages.each((_i, imgEl) => {
+    if (!isDecorativeImage(imgEl)) contentImageCount++;
+  });
   const mainFigures = $('main figure, article figure, [role="main"] figure');
   const fallbackFigures = mainFigures.length > 0 ? mainFigures : $('figure');
   let figuresWithCaption = 0;
@@ -4467,16 +4872,19 @@ function checkMultimodal($, jsonLdData) {
   if (fallbackImages.length === 0) {
     score += 25;
     checks.push({ status: 'info', label: 'No images for figure evaluation', detail: 'No images found on page' });
+  } else if (contentImageCount === 0) {
+    score += 25;
+    checks.push({ status: 'info', label: 'Only decorative images detected', detail: 'No content images require figure/figcaption markup' });
   } else {
-    const figPct = fallbackImages.length > 0 ? Math.round((figuresWithCaption / fallbackImages.length) * 100) : 0;
-    if (figPct > 50) {
+    const figPct = Math.round((figuresWithCaption / contentImageCount) * 100);
+    if (figPct >= 50) {
       score += 25;
-      checks.push({ status: 'pass', label: `Good figure/caption usage (${figPct}%)`, detail: `${figPct}% of images wrapped in <figure> with <figcaption>` });
+      checks.push({ status: 'pass', label: `Good figure/caption usage (${figPct}%)`, detail: `${figuresWithCaption} of ${contentImageCount} content images wrapped in <figure> with <figcaption>` });
     } else if (figuresWithCaption > 0) {
       score += 12;
-      checks.push({ status: 'warn', label: 'Some figure/caption usage', detail: 'Some images use <figure>/<figcaption> — apply to more images' });
+      checks.push({ status: 'warn', label: 'Some figure/caption usage', detail: `${figuresWithCaption} of ${contentImageCount} content images wrapped, extend to remaining content images` });
     } else {
-      checks.push({ status: 'info', label: 'No figure/caption usage', detail: 'Wrap images in <figure> with <figcaption> for better context' });
+      checks.push({ status: 'info', label: 'No figure/caption usage', detail: 'Wrap content images in <figure> with <figcaption> for better context' });
     }
   }
@@ -5179,11 +5587,13 @@ function calculateGeoScore(data) {
   total += robotsScore;
   maxPossible += 5;
-  // 2. AI crawlers NOT blocked (1 pt per crawler, 7 max)
+  // 2. AI crawlers NOT blocked. Only citation crawlers (real impact on AI
+  // visibility) contribute to the score. Training-crawler blocks are reported
+  // in the detail string for transparency but do not deduct points.
   let crawlerScore = 0;
   const blocked = data.robotsTxt.blocksCrawlers || {};
   const crawlerDetails = [];
-  for (const crawler of AI_CRAWLERS) {
+  for (const crawler of CITATION_CRAWLERS) {
     if (blocked[crawler] === false || blocked[crawler] === undefined) {
       crawlerScore += 1;
       crawlerDetails.push(`${crawler}: allowed`);
@@ -5191,9 +5601,13 @@ function calculateGeoScore(data) {
       crawlerDetails.push(`${crawler}: BLOCKED`);
     }
   }
-  breakdown.aiCrawlerAccess = { score: crawlerScore, max: AI_CRAWLERS.length, detail: crawlerDetails.join('; ') };
+  for (const crawler of TRAINING_CRAWLERS) {
+    const status = (blocked[crawler] === false || blocked[crawler] === undefined) ? 'allowed' : 'blocked (training-only, informational)';
+    crawlerDetails.push(`${crawler}: ${status}`);
+  }
+  breakdown.aiCrawlerAccess = { score: crawlerScore, max: CITATION_CRAWLERS.length, detail: crawlerDetails.join('; ') };
   total += crawlerScore;
-  maxPossible += AI_CRAWLERS.length;
+  maxPossible += CITATION_CRAWLERS.length;
   // 3. llms.txt exists (10 pts)
   const llmsScore = data.llmsTxt.exists ? 10 : 0;