glippy-mcp 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/geo-checker.js +479 -65
package/package.json
CHANGED
package/src/geo-checker.js
CHANGED
|
@@ -31,22 +31,105 @@ function looksBotBlocked(res) {
|
|
|
31
31
|
const FETCH_TIMEOUT_MS = 15_000;
|
|
32
32
|
|
|
33
33
|
/**
|
|
34
|
-
*
|
|
35
|
-
*
|
|
34
|
+
* Training-only crawlers. Blocking these is informational: it keeps content
|
|
35
|
+
* out of LLM training corpora but does not affect AI citation surfaces.
|
|
36
36
|
*/
|
|
37
|
-
const
|
|
37
|
+
const TRAINING_CRAWLERS = Object.freeze([
|
|
38
38
|
'GPTBot',
|
|
39
|
-
'Google-Extended',
|
|
40
|
-
'CCBot',
|
|
41
|
-
'anthropic-ai',
|
|
42
39
|
'ClaudeBot',
|
|
40
|
+
'anthropic-ai',
|
|
41
|
+
'CCBot',
|
|
42
|
+
'Google-Extended',
|
|
43
|
+
'Applebot-Extended',
|
|
43
44
|
'Bytespider',
|
|
44
|
-
'
|
|
45
|
-
'
|
|
46
|
-
'AmazonBot',
|
|
45
|
+
'FacebookBot',
|
|
46
|
+
'Meta-ExternalAgent',
|
|
47
47
|
'cohere-ai',
|
|
48
|
+
'Diffbot',
|
|
49
|
+
'Omgili',
|
|
50
|
+
'Amazonbot',
|
|
51
|
+
'Timpibot',
|
|
52
|
+
'ImageSiftBot',
|
|
53
|
+
// Broadened: SEO/search/training crawlers commonly named in robots.txt.
|
|
54
|
+
'PetalBot',
|
|
55
|
+
'MJ12bot',
|
|
56
|
+
'AwarioBot',
|
|
57
|
+
'AhrefsBot',
|
|
58
|
+
'SemrushBot',
|
|
59
|
+
'DotBot',
|
|
60
|
+
'SeznamBot',
|
|
61
|
+
'magpie-crawler',
|
|
62
|
+
'DataForSeoBot',
|
|
63
|
+
'iaskbot',
|
|
64
|
+
'Pangu_Bot',
|
|
65
|
+
'claude-web',
|
|
66
|
+
'cohere-training-data-crawler',
|
|
67
|
+
'meta-externalfetcher',
|
|
48
68
|
]);
|
|
49
69
|
|
|
70
|
+
/**
 * User-Agent names of crawlers that fetch pages on behalf of answer engines
 * and search products (retrieval / inline citation). A site that blocks one
 * of these cannot be fetched, and therefore cannot be cited, by that engine.
 *
 * The array is frozen; entry order is preserved from the original list.
 */
const CITATION_CRAWLERS = Object.freeze([
  // Major assistants and search engines.
  'OAI-SearchBot', 'ChatGPT-User',
  'PerplexityBot', 'Perplexity-User',
  'Applebot', 'Bingbot', 'Googlebot', 'DuckDuckBot', 'YouBot',
  // Broadened: alternative answer engines and search crawlers.
  'MistralAI-User', 'PhindBot', 'Komo', 'AndiBot', 'BraveBot', 'KagiBot',
  'Yep', 'NeevaBot', 'Exabot', 'Qwantify', 'Seznam',
  'GoogleOther', 'Google-CloudVertexBot', 'BingPreview',
]);
|
|
100
|
+
|
|
101
|
+
/**
 * Match a User-Agent string against a list of known crawler names.
 *
 * Matching is a case-insensitive *substring* test: a crawler matches when its
 * name occurs anywhere in `ua` (which also covers exact and prefix matches).
 * When several names occur in `ua`, the longest one wins and the first listed
 * wins a tie. This keeps a short name such as "Applebot" from absorbing a
 * "Applebot-Extended" token, provided both names are present in `crawlers`.
 *
 * Inputs that cannot match (non-string or empty `ua`, missing `crawlers`,
 * non-string list entries) yield `null` instead of throwing.
 *
 * @param {string} ua - User-Agent token from robots.txt or meta tag.
 * @param {readonly string[]} crawlers - Crawler list to match against.
 * @returns {string|null} - The matched crawler name (original casing) or null.
 */
function matchCrawler(ua, crawlers) {
  if (typeof ua !== 'string' || ua === '' || crawlers == null) return null;

  const haystack = ua.toLowerCase();
  let bestMatch = null;
  let bestLen = 0;

  for (const candidate of crawlers) {
    // Skip malformed entries rather than crashing on `.toLowerCase()`.
    if (typeof candidate !== 'string') continue;
    const needle = candidate.toLowerCase();
    // Strictly-greater comparison: empty names never match and the first of
    // two equally long names is kept.
    if (needle.length > bestLen && haystack.includes(needle)) {
      bestLen = needle.length;
      bestMatch = candidate;
    }
  }

  return bestMatch;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Every known AI crawler name: the training-only list followed by the
 * citation/retrieval list, frozen. Retained so existing callers that walk a
 * single combined list (for instance per-crawler robots.txt block detection)
 * keep working.
 */
const AI_CRAWLERS = Object.freeze(TRAINING_CRAWLERS.concat(CITATION_CRAWLERS));
|
|
132
|
+
|
|
50
133
|
/** Maximum number of redirects to follow when fetching a resource. */
|
|
51
134
|
const MAX_REDIRECTS = 5;
|
|
52
135
|
|
|
@@ -757,10 +840,68 @@ function aggregatePageScores(pageResults) {
|
|
|
757
840
|
* @returns {string} - One of: 'faq', 'product', 'article', 'local-business', 'homepage', 'ecommerce', 'saas', 'generic'.
|
|
758
841
|
*/
|
|
759
842
|
function detectPageType($, schemaTypes, pathname) {
|
|
760
|
-
// Check JSON-LD schema types first (most reliable signal)
|
|
761
|
-
|
|
762
|
-
|
|
843
|
+
// Check JSON-LD schema types first (most reliable signal).
|
|
844
|
+
// A page can carry FAQPage schema for a small FAQ section while being a long-form
|
|
845
|
+
// guide. Only classify as "faq" when FAQPage is the dominant structure, otherwise
|
|
846
|
+
// a 6,400-word guide with a FAQ at the bottom gets penalized as exceeding FAQ length.
|
|
847
|
+
const allH2s = $('h2');
|
|
848
|
+
const h2Count = allH2s.length;
|
|
849
|
+
let questionH2Count = 0;
|
|
850
|
+
allH2s.each((_, el) => {
|
|
851
|
+
const t = ($(el).text() || '').trim();
|
|
852
|
+
if (t.includes('?') || /^(how|what|why|when|where|who|which|can|do|does|is|are|should)\b/i.test(t)) {
|
|
853
|
+
questionH2Count++;
|
|
854
|
+
}
|
|
855
|
+
});
|
|
856
|
+
const isDominantlyFaq = h2Count > 0 && questionH2Count >= h2Count * 0.7;
|
|
857
|
+
|
|
858
|
+
// Word count for length-based reclassification of FAQ-tagged guides.
|
|
859
|
+
const mainElForCount = $('main, article, [role="main"]');
|
|
860
|
+
const mainTextForCount = (mainElForCount.length > 0 ? mainElForCount.text() : $('body').text() || '').trim();
|
|
861
|
+
const wordCountForType = mainTextForCount.split(/\s+/).filter(w => w.length > 0).length;
|
|
862
|
+
|
|
863
|
+
// Non-FAQ schema types that, when coexisting with FAQPage, signal a hybrid
|
|
864
|
+
// guide rather than a pure FAQ page.
|
|
865
|
+
const NON_FAQ_GUIDE_TYPES = [
|
|
866
|
+
'Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'HowTo', 'Product',
|
|
867
|
+
'Dataset', 'Report', 'WebPage',
|
|
868
|
+
// Broadened: more schema types that imply guide/long-form rather than pure FAQ.
|
|
869
|
+
'Recipe', 'ScholarlyArticle', 'Guide', 'Course', 'Service',
|
|
870
|
+
'MedicalEntity', 'MedicalGuideline', 'Book', 'Chapter',
|
|
871
|
+
'LearningResource', 'Review', 'CollectionPage', 'ItemPage',
|
|
872
|
+
];
|
|
873
|
+
const hasNonFaqGuideType = NON_FAQ_GUIDE_TYPES.some((t) => schemaTypes.has(t));
|
|
874
|
+
|
|
875
|
+
// Heuristic guide-title overrides: title or H1 phrasing strongly implies a guide.
|
|
876
|
+
const titleText = ($('title').text() || '').trim();
|
|
877
|
+
const h1Text = ($('h1').first().text() || '').trim();
|
|
878
|
+
const titleAndH1 = `${titleText} ${h1Text}`;
|
|
879
|
+
const GUIDE_TITLE_RE = /\b(?:complete|ultimate|definitive|comprehensive)?\s*guide\b/i;
|
|
880
|
+
const EVERYTHING_RE = /everything you need/i;
|
|
881
|
+
const HOW_TO_TITLE_RE = /how to/i;
|
|
882
|
+
const STEP_BY_STEP_RE = /step[- ]by[- ]step/i;
|
|
883
|
+
const matchesGuideTitle = GUIDE_TITLE_RE.test(titleAndH1)
|
|
884
|
+
|| EVERYTHING_RE.test(titleAndH1)
|
|
885
|
+
|| HOW_TO_TITLE_RE.test(titleAndH1)
|
|
886
|
+
|| STEP_BY_STEP_RE.test(titleAndH1);
|
|
887
|
+
|
|
888
|
+
// Definition-list + multiple H2 sections is a strong guide signal.
|
|
889
|
+
const hasDefinitionListGuide = $('dl').length > 0 && h2Count >= 2;
|
|
890
|
+
|
|
891
|
+
// Long-form / heading-rich pages should never classify as pure FAQ.
|
|
892
|
+
const tooLongForFaq = wordCountForType > 2000;
|
|
893
|
+
const tooManyH2sForFaq = h2Count > 8;
|
|
894
|
+
|
|
895
|
+
if (matchesGuideTitle || hasDefinitionListGuide) return 'article';
|
|
896
|
+
if (schemaTypes.has('FAQPage') && isDominantlyFaq && !hasNonFaqGuideType
|
|
897
|
+
&& wordCountForType <= 1500 && !tooManyH2sForFaq && !tooLongForFaq) return 'faq';
|
|
763
898
|
if (['Article', 'NewsArticle', 'BlogPosting', 'TechArticle'].some((t) => schemaTypes.has(t))) return 'article';
|
|
899
|
+
// FAQPage schema present but page is also long-form or carries another guide-type schema:
|
|
900
|
+
// treat as article so guide-style word/heading expectations apply.
|
|
901
|
+
if (schemaTypes.has('FAQPage') && (hasNonFaqGuideType || wordCountForType > 1500 || h2Count >= 6 || tooManyH2sForFaq || tooLongForFaq)) return 'article';
|
|
902
|
+
if (schemaTypes.has('FAQPage') && !tooManyH2sForFaq && !tooLongForFaq) return 'faq';
|
|
903
|
+
if (schemaTypes.has('FAQPage')) return 'article';
|
|
904
|
+
if (['Product', 'Offer'].some((t) => schemaTypes.has(t))) return 'product';
|
|
764
905
|
if (['LocalBusiness', 'Restaurant', 'Store'].some((t) => schemaTypes.has(t))) return 'local-business';
|
|
765
906
|
|
|
766
907
|
// Heuristic: homepage detection (including language/locale-prefixed homepages like /en/, /de-DE/, /nl/)
|
|
@@ -769,9 +910,10 @@ function detectPageType($, schemaTypes, pathname) {
|
|
|
769
910
|
const normalizedPath = pathname.replace(/^\/[a-z]{2}(?:[-_][a-z]{2,3})?\/?$/i, '/');
|
|
770
911
|
if (normalizedPath === '/' || normalizedPath === '/index.html' || normalizedPath === '/index.php' || normalizedPath === '') return 'homepage';
|
|
771
912
|
|
|
772
|
-
// Heuristic: FAQ page via DOM
|
|
913
|
+
// Heuristic: FAQ page via DOM. Only treat as FAQ when FAQ-like elements dominate the
|
|
914
|
+
// structure - if the page has many topic H2s it's a guide that happens to include a FAQ.
|
|
773
915
|
const faqIndicators = $('[class*="faq"], [id*="faq"], details, [class*="accordion"]');
|
|
774
|
-
if (faqIndicators.length >= 3) return 'faq';
|
|
916
|
+
if (faqIndicators.length >= 3 && (h2Count < 6 || isDominantlyFaq)) return 'faq';
|
|
775
917
|
|
|
776
918
|
// Heuristic: article via DOM
|
|
777
919
|
const hasArticle = $('article').length > 0;
|
|
@@ -890,7 +1032,9 @@ function checkStructuredData($, pageType, jsonLdData, jsonLdValid, jsonLdInvalid
|
|
|
890
1032
|
checks.push({ status: 'pass', label: `GEO-critical schema types present (${foundImportant.length})`, detail: foundImportant.join(', ') });
|
|
891
1033
|
} else if (foundImportant.length > 0) {
|
|
892
1034
|
score += 5;
|
|
893
|
-
|
|
1035
|
+
const suggestions = ['FAQPage', 'HowTo', 'Article', 'BreadcrumbList'].filter((t) => !schemaTypes.has(t));
|
|
1036
|
+
const consider = suggestions.length > 0 ? `. Consider adding: ${suggestions.join(', ')}` : '';
|
|
1037
|
+
checks.push({ status: 'warn', label: `Only ${foundImportant.length} GEO-critical schema type(s)`, detail: `Found: ${foundImportant.join(', ')}${consider}` });
|
|
894
1038
|
} else {
|
|
895
1039
|
checks.push({ status: 'fail', label: 'No GEO-critical schema types', detail: 'Add FAQPage, Article, Organization, BreadcrumbList, etc.' });
|
|
896
1040
|
}
|
|
@@ -1835,38 +1979,66 @@ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders)
|
|
|
1835
1979
|
checks.push({ status: 'pass', label: 'No restrictive robots meta', detail: 'Page is open for indexing' });
|
|
1836
1980
|
}
|
|
1837
1981
|
|
|
1838
|
-
// Check for specific AI bot meta tags
|
|
1839
|
-
|
|
1840
|
-
|
|
1982
|
+
// Check for specific AI bot meta tags. Split blocked bots into training-only
|
|
1983
|
+
// (informational) vs citation crawlers (real penalty) so a noindex on GPTBot
|
|
1984
|
+
// is not weighted the same as a noindex on Googlebot.
|
|
1985
|
+
const trainingBotMeta = TRAINING_CRAWLERS.map(c => c.toLowerCase());
|
|
1986
|
+
const citationBotMeta = CITATION_CRAWLERS.map(c => c.toLowerCase()).concat(['claude-web']);
|
|
1987
|
+
const aiBotMeta = [...new Set([...trainingBotMeta, ...citationBotMeta])];
|
|
1988
|
+
const blockedTrainingBots = [];
|
|
1989
|
+
const blockedCitationBots = [];
|
|
1841
1990
|
aiBotMeta.forEach((bot) => {
|
|
1842
1991
|
const content = $(`meta[name="${bot}"]`).attr('content') || '';
|
|
1843
1992
|
if (content.includes('noindex')) {
|
|
1844
|
-
|
|
1993
|
+
if (citationBotMeta.includes(bot)) {
|
|
1994
|
+
blockedCitationBots.push(bot);
|
|
1995
|
+
} else {
|
|
1996
|
+
blockedTrainingBots.push(bot);
|
|
1997
|
+
}
|
|
1845
1998
|
}
|
|
1846
1999
|
});
|
|
1847
2000
|
|
|
1848
2001
|
maxScore += 15;
|
|
1849
|
-
if (
|
|
2002
|
+
if (blockedCitationBots.length === 0 && blockedTrainingBots.length === 0) {
|
|
1850
2003
|
score += 15;
|
|
1851
2004
|
checks.push({ status: 'pass', label: 'No AI bot restrictions in meta', detail: 'No specific bot blocking detected in page HTML' });
|
|
2005
|
+
} else if (blockedCitationBots.length === 0) {
|
|
2006
|
+
score += 15;
|
|
2007
|
+
checks.push({ status: 'info', label: `Training crawler meta blocks: ${blockedTrainingBots.join(', ')}`, detail: 'Training-only blocks do not affect AI citation visibility', found: blockedTrainingBots });
|
|
1852
2008
|
} else {
|
|
1853
|
-
|
|
2009
|
+
score += Math.max(0, 15 - blockedCitationBots.length * 3);
|
|
2010
|
+
checks.push({ status: 'warn', label: `Citation crawler meta blocks: ${blockedCitationBots.join(', ')}`, detail: 'These citation crawlers are blocked via meta tags', found: blockedCitationBots });
|
|
2011
|
+
if (blockedTrainingBots.length > 0) {
|
|
2012
|
+
checks.push({ status: 'info', label: `Training crawler meta blocks: ${blockedTrainingBots.join(', ')}`, detail: 'Training-only blocks are informational', found: blockedTrainingBots });
|
|
2013
|
+
}
|
|
1854
2014
|
}
|
|
1855
2015
|
|
|
1856
2016
|
// robots.txt integration (from server-side fetch)
|
|
1857
2017
|
if (robotsTxtData) {
|
|
1858
2018
|
maxScore += 10;
|
|
1859
2019
|
if (robotsTxtData.exists) {
|
|
1860
|
-
const
|
|
1861
|
-
|
|
2020
|
+
const blocks = robotsTxtData.blocksCrawlers || {};
|
|
2021
|
+
const trainingLowercase = new Set(TRAINING_CRAWLERS.map(c => c.toLowerCase()));
|
|
2022
|
+
const citationLowercase = new Set(CITATION_CRAWLERS.map(c => c.toLowerCase()));
|
|
2023
|
+
const blockedAll = Object.entries(blocks).filter(([, v]) => v).map(([k]) => k);
|
|
2024
|
+
const blockedTraining = blockedAll.filter(k => trainingLowercase.has(k.toLowerCase()));
|
|
2025
|
+
const blockedCitation = blockedAll.filter(k => citationLowercase.has(k.toLowerCase()));
|
|
2026
|
+
|
|
2027
|
+
if (blockedCitation.length === 0 && blockedTraining.length === 0) {
|
|
1862
2028
|
score += 10;
|
|
1863
|
-
checks.push({ status: 'pass', label: 'robots.txt: no AI crawlers blocked', detail: 'All known
|
|
2029
|
+
checks.push({ status: 'pass', label: 'robots.txt: no AI crawlers blocked', detail: 'All known training and citation crawlers are allowed' });
|
|
2030
|
+
} else if (blockedCitation.length === 0) {
|
|
2031
|
+
score += 10;
|
|
2032
|
+
checks.push({ status: 'info', label: `robots.txt: ${blockedTraining.length} training crawler(s) blocked, citation crawlers allowed`, detail: 'Training-only blocks do not affect AI citation visibility', found: blockedTraining });
|
|
1864
2033
|
} else {
|
|
1865
|
-
score += Math.max(0, 10 -
|
|
1866
|
-
checks.push({ status: 'warn', label: `robots.txt: ${
|
|
2034
|
+
score += Math.max(0, 10 - blockedCitation.length * 2);
|
|
2035
|
+
checks.push({ status: 'warn', label: `robots.txt: ${blockedCitation.length} citation crawler(s) blocked`, detail: 'Blocking citation crawlers prevents inline AI citations', found: blockedCitation });
|
|
2036
|
+
if (blockedTraining.length > 0) {
|
|
2037
|
+
checks.push({ status: 'info', label: `robots.txt: ${blockedTraining.length} training crawler(s) blocked`, detail: 'Training-only blocks are informational and do not affect AI citation visibility', found: blockedTraining });
|
|
2038
|
+
}
|
|
1867
2039
|
}
|
|
1868
2040
|
if (robotsTxtData.hasWildcardDisallow) {
|
|
1869
|
-
checks.push({ status: 'warn', label: 'robots.txt: wildcard Disallow: /', detail: 'All crawlers are blocked by default
|
|
2041
|
+
checks.push({ status: 'warn', label: 'robots.txt: wildcard Disallow: /', detail: 'All crawlers are blocked by default, only overridden by specific Allow rules' });
|
|
1870
2042
|
}
|
|
1871
2043
|
} else {
|
|
1872
2044
|
checks.push({ status: 'warn', label: 'No robots.txt found', detail: 'robots.txt helps control crawler access' });
|
|
@@ -2014,7 +2186,29 @@ function checkEntity($, jsonLdData) {
|
|
|
2014
2186
|
});
|
|
2015
2187
|
}
|
|
2016
2188
|
|
|
2017
|
-
// 6. JSON-LD schema author with quality check
|
|
2189
|
+
// 6. JSON-LD schema author with quality check.
|
|
2190
|
+
// Only treat `author` as the page author when it's attached to a content type
|
|
2191
|
+
// (Article, WebPage, Book, etc.) - NOT inside Review/Comment, where `author` is
|
|
2192
|
+
// the reviewer/commenter and shouldn't be credited to the page.
|
|
2193
|
+
const PAGE_AUTHOR_TYPES = new Set([
|
|
2194
|
+
'Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle', 'Report', 'OpinionNewsArticle',
|
|
2195
|
+
'WebPage', 'AboutPage', 'CollectionPage', 'ItemPage', 'ProfilePage', 'QAPage', 'FAQPage',
|
|
2196
|
+
'Book', 'Chapter', 'CreativeWork', 'CreativeWorkSeries', 'HowTo', 'Recipe', 'Course', 'LearningResource',
|
|
2197
|
+
'VideoObject', 'AudioObject', 'PodcastEpisode', 'Podcast',
|
|
2198
|
+
'DiscussionForumPosting', 'SocialMediaPosting',
|
|
2199
|
+
]);
|
|
2200
|
+
const SKIP_AUTHOR_TYPES = new Set(['Review', 'Comment', 'UserComments', 'Rating']);
|
|
2201
|
+
const isContentType = (t) => {
|
|
2202
|
+
if (!t) return false;
|
|
2203
|
+
const types = Array.isArray(t) ? t : [t];
|
|
2204
|
+
return types.some((x) => PAGE_AUTHOR_TYPES.has(x));
|
|
2205
|
+
};
|
|
2206
|
+
const isSkipType = (t) => {
|
|
2207
|
+
if (!t) return false;
|
|
2208
|
+
const types = Array.isArray(t) ? t : [t];
|
|
2209
|
+
return types.some((x) => SKIP_AUTHOR_TYPES.has(x));
|
|
2210
|
+
};
|
|
2211
|
+
|
|
2018
2212
|
let hasAuthorSchema = false;
|
|
2019
2213
|
let hasAuthorSameAs = false;
|
|
2020
2214
|
let hasPersonSchema = false;
|
|
@@ -2022,12 +2216,14 @@ function checkEntity($, jsonLdData) {
|
|
|
2022
2216
|
try {
|
|
2023
2217
|
const processSchema = (schema) => {
|
|
2024
2218
|
if (!schema) return;
|
|
2025
|
-
|
|
2219
|
+
// Skip Review/Comment subtrees - their author is not the page author.
|
|
2220
|
+
if (isSkipType(schema['@type'])) return;
|
|
2221
|
+
if (schema.author && isContentType(schema['@type'])) {
|
|
2026
2222
|
hasAuthorSchema = true;
|
|
2027
2223
|
const authors = Array.isArray(schema.author) ? schema.author : [schema.author];
|
|
2028
2224
|
authors.forEach((a) => {
|
|
2029
2225
|
if (typeof a === 'string') authorNames.add(a);
|
|
2030
|
-
else if (a.name) {
|
|
2226
|
+
else if (a && a.name) {
|
|
2031
2227
|
authorNames.add(a.name);
|
|
2032
2228
|
if (a.sameAs) hasAuthorSameAs = true;
|
|
2033
2229
|
if (a['@type'] === 'Person') hasPersonSchema = true;
|
|
@@ -2038,6 +2234,13 @@ function checkEntity($, jsonLdData) {
|
|
|
2038
2234
|
hasPersonSchema = true;
|
|
2039
2235
|
if (schema.sameAs) hasAuthorSameAs = true;
|
|
2040
2236
|
}
|
|
2237
|
+
// Recurse into common content-bearing fields, but skip review arrays.
|
|
2238
|
+
['mainEntity', 'mainEntityOfPage', 'about', 'isPartOf', 'hasPart', 'workExample', 'exampleOfWork'].forEach((key) => {
|
|
2239
|
+
const val = schema[key];
|
|
2240
|
+
if (!val) return;
|
|
2241
|
+
if (Array.isArray(val)) val.forEach(processSchema);
|
|
2242
|
+
else if (typeof val === 'object') processSchema(val);
|
|
2243
|
+
});
|
|
2041
2244
|
};
|
|
2042
2245
|
if (Array.isArray(d)) d.forEach(processSchema);
|
|
2043
2246
|
else if (d['@graph']) d['@graph'].forEach(processSchema);
|
|
@@ -2047,14 +2250,17 @@ function checkEntity($, jsonLdData) {
|
|
|
2047
2250
|
if (hasAuthorSchema) authorSources.schema.push('JSON-LD author');
|
|
2048
2251
|
if (hasPersonSchema) authorSources.schema.push('Person schema');
|
|
2049
2252
|
|
|
2050
|
-
// 7. HTML byline elements - extended selectors
|
|
2253
|
+
// 7. HTML byline elements - extended selectors.
|
|
2254
|
+
// Exclude bylines inside review/comment/testimonial containers - they identify the
|
|
2255
|
+
// reviewer, not the page author.
|
|
2051
2256
|
const bylineSelectors = [
|
|
2052
2257
|
'[class*="author"]', '[rel="author"]', '[itemprop="author"]',
|
|
2053
2258
|
'.byline', '.post-author', '.article-author', '.entry-author',
|
|
2054
2259
|
'[data-author]', '[data-byline]',
|
|
2055
2260
|
'address.author', '.writer', '.contributor',
|
|
2056
2261
|
].join(', ');
|
|
2057
|
-
const
|
|
2262
|
+
const reviewContextSel = '[itemtype*="Review"], [itemtype*="Comment"], .review, .reviews, .comment, .comments, .testimonial, .testimonials, [class*="review-"], [class*="reviews-"]';
|
|
2263
|
+
const authorByline = $(bylineSelectors).filter((_, el) => $(el).closest(reviewContextSel).length === 0).first();
|
|
2058
2264
|
if (authorByline.length > 0) {
|
|
2059
2265
|
const bylineText = (authorByline.text() || '').trim();
|
|
2060
2266
|
if (bylineText && bylineText.length < 100) {
|
|
@@ -2070,8 +2276,9 @@ function checkEntity($, jsonLdData) {
|
|
|
2070
2276
|
authorSources.html.push('address element');
|
|
2071
2277
|
}
|
|
2072
2278
|
|
|
2073
|
-
// 9. Author profile links
|
|
2074
|
-
const authorLinks = $('a[href*="/author/"], a[href*="/writers/"], a[href*="/contributors/"], a[href*="/team/"], a[rel="author"]')
|
|
2279
|
+
// 9. Author profile links - skip review-context links (reviewer profile links).
|
|
2280
|
+
const authorLinks = $('a[href*="/author/"], a[href*="/writers/"], a[href*="/contributors/"], a[href*="/team/"], a[rel="author"]')
|
|
2281
|
+
.filter((_, el) => $(el).closest(reviewContextSel).length === 0);
|
|
2075
2282
|
if (authorLinks.length > 0) {
|
|
2076
2283
|
authorSources.links.push(`${authorLinks.length} author link(s)`);
|
|
2077
2284
|
authorLinks.each((_, el) => {
|
|
@@ -3662,6 +3869,32 @@ function checkWebMCP($, pageType, ucpData) {
|
|
|
3662
3869
|
checks.push({ status: 'info', label: 'Shopify-hosted: dual UCP surface expected', detail: 'Per-shop endpoint at /api/ucp/mcp; global catalog at https://discover.shopifyapps.com/global/mcp' });
|
|
3663
3870
|
}
|
|
3664
3871
|
|
|
3872
|
+
// Baseline credit for purely informational pages.
|
|
3873
|
+
// If the page has no forms, no WebMCP signals, no UCP profile, and no Shopify
|
|
3874
|
+
// surface, there's nothing for it to expose to agents - WebMCP/UCP are N/A here.
|
|
3875
|
+
// Without this, content-only pages are capped well below 100 even when there's
|
|
3876
|
+
// nothing to fix, dragging the overall score unfairly.
|
|
3877
|
+
const totalForms = $('form').length;
|
|
3878
|
+
const hasUcp = !!(ucpData && ucpData.exists && ucpData.content);
|
|
3879
|
+
const hasShopify = !!(ucpData && ucpData.shopifyHosted);
|
|
3880
|
+
const hasNoInteractiveSurface =
|
|
3881
|
+
totalForms === 0 &&
|
|
3882
|
+
toolCount === 0 &&
|
|
3883
|
+
!hasImperativeSignals &&
|
|
3884
|
+
!webmcpSDKFound &&
|
|
3885
|
+
!hasSchemaActions &&
|
|
3886
|
+
!hasUcp &&
|
|
3887
|
+
!hasShopify;
|
|
3888
|
+
|
|
3889
|
+
if (hasNoInteractiveSurface) {
|
|
3890
|
+
checks.push({
|
|
3891
|
+
status: 'info',
|
|
3892
|
+
label: 'Informational page — Agent Interactivity not applicable',
|
|
3893
|
+
detail: 'No forms or WebMCP/UCP signals detected. Pure-content pages can\'t expose tools to agents, so this category is scored as a baseline rather than penalized.',
|
|
3894
|
+
});
|
|
3895
|
+
return { checks, score: 80, category: 'Agent Interactivity', notApplicable: true };
|
|
3896
|
+
}
|
|
3897
|
+
|
|
3665
3898
|
return { checks, score: maxScore > 0 ? Math.round((score / maxScore) * 100) : 0, category: 'Agent Interactivity' };
|
|
3666
3899
|
}
|
|
3667
3900
|
|
|
@@ -3929,8 +4162,18 @@ function checkContentFreshness($, jsonLdData) {
|
|
|
3929
4162
|
new RegExp('\\bin ' + currentYear + '\\b', 'i'),
|
|
3930
4163
|
new RegExp('\\b(as of|updated)\\s+(january|february|march|april|may|june|july|august|september|october|november|december)\\s+' + currentYear + '\\b', 'i'),
|
|
3931
4164
|
];
|
|
4165
|
+
// Historical/founding-context phrases - "records from 1841 to present", "since 1990",
|
|
4166
|
+
// "established 1936" - are accurate facts, not stale temporal references.
|
|
4167
|
+
const HISTORICAL_CONTEXT_PATTERNS = [
|
|
4168
|
+
/\b(since|from|established|founded|operating since|serving since|in business since)\s+(in\s+)?\d{4}\b/i,
|
|
4169
|
+
/\b\d{4}\s*(?:[‐-―−\-–—~]|to)\s*(present|current|today|now|\d{4})\b/i,
|
|
4170
|
+
/\b(records?|archives?|documents?|history|heritage|founded|established|originated|dating back)\b[^.]{0,80}\b(from|since|in)\s+\d{4}\b/i,
|
|
4171
|
+
/\b(historical|historic|vintage|legacy)\b/i,
|
|
4172
|
+
];
|
|
4173
|
+
const hasHistoricalContext = HISTORICAL_CONTEXT_PATTERNS.some(p => p.test(visibleText));
|
|
3932
4174
|
const hasCurrentRefs = CURRENT_YEAR_PATTERNS.some(p => p.test(visibleText));
|
|
3933
|
-
const
|
|
4175
|
+
const rawOutdatedHits = OUTDATED_TEMPORAL_PATTERNS.some(p => p.test(visibleText));
|
|
4176
|
+
const hasOutdatedRefs = rawOutdatedHits && !hasHistoricalContext;
|
|
3934
4177
|
maxScore += 20;
|
|
3935
4178
|
if (hasCurrentRefs && !hasOutdatedRefs) {
|
|
3936
4179
|
score += 20;
|
|
@@ -3947,21 +4190,68 @@ function checkContentFreshness($, jsonLdData) {
|
|
|
3947
4190
|
}
|
|
3948
4191
|
|
|
3949
4192
|
// 12d. Copyright Year & Footer Freshness (10 pts)
|
|
4193
|
+
// Year ranges ("(c) 1997 - 2026") signal a founding year + current year, take
|
|
4194
|
+
// the END year as the freshness signal, not the founding year.
|
|
4195
|
+
// Also handles enumerated lists like "(c) 2010, 2015, 2026" by taking the max
|
|
4196
|
+
// of all years in the same line as a copyright marker.
|
|
3950
4197
|
const footerEl = $('footer');
|
|
3951
4198
|
maxScore += 10;
|
|
3952
4199
|
if (footerEl.length > 0) {
|
|
3953
|
-
|
|
3954
|
-
const
|
|
3955
|
-
|
|
3956
|
-
|
|
3957
|
-
|
|
4200
|
+
// Strip "All Rights Reserved" boilerplate (en/fr/de) before parsing.
|
|
4201
|
+
const rawFooterText = footerEl.text();
|
|
4202
|
+
const footerText = rawFooterText
|
|
4203
|
+
.replace(/all\s+rights\s+reserved/gi, '')
|
|
4204
|
+
.replace(/tous\s+droits\s+r[ée]serv[ée]s/gi, '')
|
|
4205
|
+
.replace(/alle\s+rechte\s+vorbehalten/gi, '');
|
|
4206
|
+
// Broader prefix list: includes bracket variants and "Copyright ©" double prefix.
|
|
4207
|
+
const COPYRIGHT_PREFIX = /(?:©|\(c\)|\(C\)|\[c\]|\[C\]|©|copyright(?:\s*©)?)/i;
|
|
4208
|
+
// Exclude founding year markers so "Est. 1998" / "Since 2001" do not get
|
|
4209
|
+
// mistaken for a copyright year when no actual copyright marker is present.
|
|
4210
|
+
const FOUNDING_PREFIX = /\b(?:est(?:ablished|\.)?|since|founded(?:\s+in)?)\s+\d{4}\b/i;
|
|
4211
|
+
let copyrightYear = null;
|
|
4212
|
+
// Sweep each line for a copyright marker; take the max year found on that line.
|
|
4213
|
+
const lines = footerText.split(/\r?\n|<br\s*\/?>/i);
|
|
4214
|
+
for (const rawLine of lines) {
|
|
4215
|
+
const line = rawLine.trim();
|
|
4216
|
+
if (!line) continue;
|
|
4217
|
+
if (!COPYRIGHT_PREFIX.test(line)) continue;
|
|
4218
|
+
// Skip lines that look like founding-year statements without a real © marker.
|
|
4219
|
+
const hasRealMarker = /(?:©|\(c\)|\(C\)|\[c\]|\[C\]|©|copyright)/i.test(line);
|
|
4220
|
+
if (!hasRealMarker && FOUNDING_PREFIX.test(line)) continue;
|
|
4221
|
+
const yearMatches = line.match(/\b(19|20)\d{2}\b/g);
|
|
4222
|
+
if (yearMatches && yearMatches.length > 0) {
|
|
4223
|
+
const maxYear = Math.max(...yearMatches.map(y => parseInt(y, 10)));
|
|
4224
|
+
if (copyrightYear === null || maxYear > copyrightYear) copyrightYear = maxYear;
|
|
4225
|
+
}
|
|
4226
|
+
}
|
|
4227
|
+
// Fallback: if the footer is a single blob without line breaks, sweep the
|
|
4228
|
+
// whole text but only when a copyright marker exists.
|
|
4229
|
+
if (copyrightYear === null && COPYRIGHT_PREFIX.test(footerText)) {
|
|
4230
|
+
const yearMatches = footerText.match(/\b(19|20)\d{2}\b/g);
|
|
4231
|
+
if (yearMatches && yearMatches.length > 0) {
|
|
4232
|
+
copyrightYear = Math.max(...yearMatches.map(y => parseInt(y, 10)));
|
|
4233
|
+
}
|
|
4234
|
+
}
|
|
4235
|
+
// Supplemental freshness signal: <time datetime="YYYY"> inside <footer>.
|
|
4236
|
+
if (copyrightYear === null) {
|
|
4237
|
+
footerEl.find('time[datetime]').each((_i, tEl) => {
|
|
4238
|
+
const dt = ($(tEl).attr('datetime') || '').trim();
|
|
4239
|
+
const ym = dt.match(/^(\d{4})/);
|
|
4240
|
+
if (ym) {
|
|
4241
|
+
const ty = parseInt(ym[1], 10);
|
|
4242
|
+
if (copyrightYear === null || ty > copyrightYear) copyrightYear = ty;
|
|
4243
|
+
}
|
|
4244
|
+
});
|
|
4245
|
+
}
|
|
4246
|
+
if (copyrightYear !== null) {
|
|
4247
|
+
if (copyrightYear >= currentYear - 1) {
|
|
3958
4248
|
score += 10;
|
|
3959
4249
|
checks.push({ status: 'pass', label: `Copyright year current (${copyrightYear})`, detail: `Footer copyright is ${copyrightYear}` });
|
|
3960
|
-
} else if (copyrightYear === currentYear -
|
|
4250
|
+
} else if (copyrightYear === currentYear - 2) {
|
|
3961
4251
|
score += 5;
|
|
3962
|
-
checks.push({ status: 'warn', label: `Copyright year slightly old (${copyrightYear})`, detail: `Footer shows ${copyrightYear}
|
|
4252
|
+
checks.push({ status: 'warn', label: `Copyright year slightly old (${copyrightYear})`, detail: `Footer shows ${copyrightYear}, update to ${currentYear}` });
|
|
3963
4253
|
} else {
|
|
3964
|
-
checks.push({ status: 'fail', label: `Copyright year outdated (${copyrightYear})`, detail: `Footer shows ${copyrightYear}
|
|
4254
|
+
checks.push({ status: 'fail', label: `Copyright year outdated (${copyrightYear})`, detail: `Footer shows ${copyrightYear}, update to ${currentYear}` });
|
|
3965
4255
|
}
|
|
3966
4256
|
} else {
|
|
3967
4257
|
checks.push({ status: 'info', label: 'No copyright year in footer', detail: 'Add a copyright year to signal maintenance' });
|
|
@@ -4041,22 +4331,36 @@ function checkInformationDensity($) {
|
|
|
4041
4331
|
}
|
|
4042
4332
|
|
|
4043
4333
|
// 13b. Self-Contained Section Scoring (25 pts)
|
|
4334
|
+
// Sections with structured content (tables w/ headers, lists, definition lists) are
|
|
4335
|
+
// self-contained even at lower word counts - the structure carries the meaning.
|
|
4044
4336
|
const h2s = $('main h2, article h2, [role="main"] h2');
|
|
4045
4337
|
maxScore += 25;
|
|
4046
4338
|
if (h2s.length > 0) {
|
|
4047
4339
|
let selfContainedCount = 0;
|
|
4048
4340
|
h2s.each((_i, h2El) => {
|
|
4049
4341
|
let sectionText = '';
|
|
4342
|
+
let hasStructuredContent = false;
|
|
4343
|
+
let hasLabeledTable = false;
|
|
4050
4344
|
let sibling = $(h2El).next();
|
|
4051
4345
|
while (sibling.length > 0 && !sibling.is('h2')) {
|
|
4052
4346
|
sectionText += (sibling.text() || '') + ' ';
|
|
4347
|
+
if (sibling.is('table, ul, ol, dl') || sibling.find('table, ul, ol, dl').length > 0) {
|
|
4348
|
+
hasStructuredContent = true;
|
|
4349
|
+
}
|
|
4350
|
+
const tablesHere = sibling.is('table') ? sibling : sibling.find('table');
|
|
4351
|
+
tablesHere.each((__, t) => {
|
|
4352
|
+
if ($(t).find('th').length > 0) hasLabeledTable = true;
|
|
4353
|
+
});
|
|
4053
4354
|
sibling = sibling.next();
|
|
4054
4355
|
}
|
|
4055
4356
|
const wordCount = sectionText.trim().split(/\s+/).length;
|
|
4056
4357
|
const hasData = /\d/.test(sectionText);
|
|
4057
4358
|
const firstSentence = sectionText.split(/[.!?]/)[0] || '';
|
|
4058
4359
|
const hasTopicSentence = firstSentence.trim().length > 30;
|
|
4059
|
-
|
|
4360
|
+
const isStandardComplete = wordCount >= 150 && wordCount <= 500 && hasData && hasTopicSentence;
|
|
4361
|
+
const isStructurallyComplete = hasStructuredContent && wordCount >= 40 && (hasData || hasLabeledTable);
|
|
4362
|
+
const isLabeledTableSection = hasLabeledTable && wordCount >= 10;
|
|
4363
|
+
if (isStandardComplete || isStructurallyComplete || isLabeledTableSection) {
|
|
4060
4364
|
selfContainedCount++;
|
|
4061
4365
|
}
|
|
4062
4366
|
});
|
|
@@ -4076,6 +4380,8 @@ function checkInformationDensity($) {
|
|
|
4076
4380
|
}
|
|
4077
4381
|
|
|
4078
4382
|
// 13c. Claim-Evidence Pairing (20 pts)
|
|
4383
|
+
// Tables with header cells provide column-level context for every numeric value,
|
|
4384
|
+
// so data points inside labeled tables are considered already-paired by design.
|
|
4079
4385
|
const DATA_SENTENCE = /\d+(\.\d+)?(%|x|\$|€|£)/;
|
|
4080
4386
|
let dataSentences = 0;
|
|
4081
4387
|
let pairedData = 0;
|
|
@@ -4090,14 +4396,26 @@ function checkInformationDensity($) {
|
|
|
4090
4396
|
}
|
|
4091
4397
|
}
|
|
4092
4398
|
});
|
|
4399
|
+
// Count data cells inside labeled tables - they're context-paired via column headers.
|
|
4400
|
+
let labeledTableDataCells = 0;
|
|
4401
|
+
const pairingTables = mainEl.length > 0 ? mainEl.find('table') : $('table');
|
|
4402
|
+
pairingTables.each((_i, t) => {
|
|
4403
|
+
const $t = $(t);
|
|
4404
|
+
if ($t.find('th').length === 0) return;
|
|
4405
|
+
$t.find('tbody td, td').each((__, td) => {
|
|
4406
|
+
if (DATA_SENTENCE.test($(td).text() || '')) labeledTableDataCells++;
|
|
4407
|
+
});
|
|
4408
|
+
});
|
|
4093
4409
|
maxScore += 20;
|
|
4094
|
-
|
|
4410
|
+
const totalData = dataSentences + labeledTableDataCells;
|
|
4411
|
+
const totalPaired = pairedData + labeledTableDataCells;
|
|
4412
|
+
if (totalData === 0) {
|
|
4095
4413
|
checks.push({ status: 'info', label: 'No data claims detected', detail: 'Add quantitative data points with context' });
|
|
4096
4414
|
} else {
|
|
4097
|
-
const pairedPct = Math.round((
|
|
4415
|
+
const pairedPct = Math.round((totalPaired / totalData) * 100);
|
|
4098
4416
|
if (pairedPct > 80) {
|
|
4099
4417
|
score += 20;
|
|
4100
|
-
checks.push({ status: 'pass', label: `Claims well-paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have contextual explanations` });
|
|
4418
|
+
checks.push({ status: 'pass', label: `Claims well-paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have contextual explanations${labeledTableDataCells > 0 ? ` (incl. ${labeledTableDataCells} table cells)` : ''}` });
|
|
4101
4419
|
} else if (pairedPct >= 50) {
|
|
4102
4420
|
score += 10;
|
|
4103
4421
|
checks.push({ status: 'warn', label: `Claims partially paired (${pairedPct}%)`, detail: `${pairedPct}% of data claims have context — add more explanations` });
|
|
@@ -4172,6 +4490,18 @@ function checkVerifiability($, domain) {
|
|
|
4172
4490
|
const contentText = (mainEl.length > 0 ? mainEl.text() : $('body').text() || '').trim();
|
|
4173
4491
|
const sentences = contentText.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
|
4174
4492
|
|
|
4493
|
+
// Visible body text (paragraphs, list items, blockquotes) for attribution
|
|
4494
|
+
// patterns that often span sentence boundaries or live in elements that
|
|
4495
|
+
// are tricky to split on punctuation alone.
|
|
4496
|
+
const bodyTextEls = mainEl.length > 0
|
|
4497
|
+
? mainEl.find('p, li, blockquote, td, dd')
|
|
4498
|
+
: $('p, li, blockquote, td, dd');
|
|
4499
|
+
const bodyTextChunks = [];
|
|
4500
|
+
bodyTextEls.each((_i, el) => {
|
|
4501
|
+
const t = ($(el).text() || '').trim();
|
|
4502
|
+
if (t.length > 0) bodyTextChunks.push(t);
|
|
4503
|
+
});
|
|
4504
|
+
|
|
4175
4505
|
// 14a. External Citation Links (30 pts)
|
|
4176
4506
|
const AUTHORITY_DOMAINS = ['.gov', '.edu', '.org', 'scholar.google', 'pubmed', 'arxiv.org', 'doi.org'];
|
|
4177
4507
|
const externalLinks = mainEl.length > 0 ? mainEl.find('a[href^="http"]') : $('a[href^="http"]');
|
|
@@ -4192,7 +4522,7 @@ function checkVerifiability($, domain) {
|
|
|
4192
4522
|
checks.push({ status: 'pass', label: `Strong citations (${totalExternalLinks} external, ${authorityLinks} authority)`, detail: `${totalExternalLinks} external links including ${authorityLinks} authority sources` });
|
|
4193
4523
|
} else if (totalExternalLinks >= 1) {
|
|
4194
4524
|
score += 15;
|
|
4195
|
-
checks.push({ status: 'warn', label: `Some citations (${totalExternalLinks} external)`, detail: `${totalExternalLinks} external links
|
|
4525
|
+
checks.push({ status: 'warn', label: `Some citations (${totalExternalLinks} external)`, detail: `${totalExternalLinks} external links, add authority sources (.gov, .edu)` });
|
|
4196
4526
|
} else {
|
|
4197
4527
|
score += 5;
|
|
4198
4528
|
checks.push({ status: 'fail', label: 'No external citations', detail: 'Add external links to authoritative sources' });
|
|
@@ -4200,25 +4530,58 @@ function checkVerifiability($, domain) {
|
|
|
4200
4530
|
|
|
4201
4531
|
// 14b. Source Attribution in Text (25 pts)
|
|
4202
4532
|
const SOURCE_ATTRIBUTION_PATTERNS = [
|
|
4203
|
-
/\baccording to\s+[A-Z]
|
|
4204
|
-
/\
|
|
4205
|
-
/\b(
|
|
4206
|
-
/\b(
|
|
4207
|
-
/\b(
|
|
4533
|
+
/\baccording to\s+(?:the\s+|a\s+|an\s+)?[A-Z][\w'.-]*(?:\s+(?:of|for|on|and|the|de|van)\s+)?[A-Z\w'.-]*/,
|
|
4534
|
+
/\b(?:a|an|the|new|recent|latest|major|landmark)?\s*(?:study|report|survey|analysis|paper|whitepaper|brief)\s+(?:by|from|published by)\b/i,
|
|
4535
|
+
/\b(?:research|data|figures|statistics|findings)\s+(?:by|from|of|published by)\b/i,
|
|
4536
|
+
/\b(?:published in|cited in|reported by|noted by|observed by)\b/i,
|
|
4537
|
+
/\b(?:source|data from|based on)\s*:/i,
|
|
4538
|
+
/\b(?:report|study|analysis)\s+(?:by|from)\b/i,
|
|
4539
|
+
/\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,4}\s+(?:says|states|reports|found|concluded|notes|observed|estimates)\b/,
|
|
4208
4540
|
/\[\d+\]/,
|
|
4209
|
-
/\b(et al\.?|ibid\.?)\b/,
|
|
4541
|
+
/\b(?:et al\.?|ibid\.?)\b/,
|
|
4542
|
+
// Broadened patterns: "as reported by", "as documented in", etc.
|
|
4543
|
+
/\bas\s+(?:reported|noted|stated|cited|documented|shown|described|outlined)\s+(?:by|in|on)\b/i,
|
|
4544
|
+
// "per the WHO", "per CDC"
|
|
4545
|
+
/\bper\s+(?:the\s+)?[A-Z]/,
|
|
4546
|
+
// Possessive: "WHO's data", "CDC's findings"
|
|
4547
|
+
/\b[A-Z][A-Za-z.&'-]+(?:'s|’s)\s+(?:data|report|study|analysis|findings|guidance|recommendations|guidelines)\b/,
|
|
4548
|
+
// Parenthetical citation: "(source: ...)", "(via: ...)"
|
|
4549
|
+
/\((?:source|src|via|cf|see)\s*:\s*[^)]+\)/i,
|
|
4550
|
+
// DOI references
|
|
4551
|
+
/\bdoi:\s*10\.\d+/i,
|
|
4552
|
+
// Numeric brackets variants: "[1, 2]", "[1-3]"
|
|
4553
|
+
/\[\d+(?:[,-]\s*\d+)*\]/,
|
|
4554
|
+
// Author-year: "(Smith, 2023)", "(Smith et al., 2023)", "(Smith and Jones, 2023)"
|
|
4555
|
+
/\([A-Z][a-zA-Z]+(?:\s+(?:et\s+al\.?|and\s+[A-Z][a-zA-Z]+))?,\s*\d{4}[a-z]?\)/,
|
|
4556
|
+
// "<Org> data shows/reveals/indicates/suggests/confirms"
|
|
4557
|
+
/\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,3}\s+data\s+(?:shows|reveals|indicates|suggests|confirms)\b/,
|
|
4558
|
+
// "<Org> figures/findings show/reveal/indicate"
|
|
4559
|
+
/\b[A-Z][\w'.-]+(?:\s+[A-Z][\w'.-]+){0,3}\s+(?:figures|findings)\s+(?:show|reveal|indicate)\b/,
|
|
4560
|
+
// "in a recent study", "in a landmark report"
|
|
4561
|
+
/\bin\s+(?:a|an)\s+(?:recent|new|landmark|seminal)\s+(?:study|report|survey|paper|analysis)\b/i,
|
|
4562
|
+
// "verified by", "confirmed by", "documented in/by"
|
|
4563
|
+
/\b(?:verified|confirmed)\s+by\b/i,
|
|
4564
|
+
/\bdocumented\s+(?:in|by)\b/i,
|
|
4565
|
+
// Government/regulatory bodies: "Department of Health", "Centers for Disease Control"
|
|
4566
|
+
/\b(?:U\.?S\.?\s+)?(?:Department\s+of|Ministry\s+of|Office\s+of|Bureau\s+of|Centers\s+for|Federal|National|Royal)\s+[A-Z]/,
|
|
4210
4567
|
];
|
|
4211
4568
|
let attrCount = 0;
|
|
4212
4569
|
sentences.forEach(s => {
|
|
4213
4570
|
if (SOURCE_ATTRIBUTION_PATTERNS.some(p => p.test(s))) attrCount++;
|
|
4214
4571
|
});
|
|
4572
|
+
bodyTextChunks.forEach(t => {
|
|
4573
|
+
if (SOURCE_ATTRIBUTION_PATTERNS.some(p => p.test(t))) attrCount++;
|
|
4574
|
+
});
|
|
4215
4575
|
maxScore += 25;
|
|
4216
4576
|
if (attrCount >= 3) {
|
|
4217
4577
|
score += 25;
|
|
4218
4578
|
checks.push({ status: 'pass', label: `Strong source attribution (${attrCount})`, detail: `${attrCount} source attribution patterns detected` });
|
|
4579
|
+
} else if (attrCount >= 2) {
|
|
4580
|
+
score += 18;
|
|
4581
|
+
checks.push({ status: 'pass', label: `Source attribution found (${attrCount})`, detail: `${attrCount} attribution patterns detected` });
|
|
4219
4582
|
} else if (attrCount >= 1) {
|
|
4220
|
-
score +=
|
|
4221
|
-
checks.push({ status: 'warn', label: `Some source attribution (${attrCount})`, detail: `${attrCount} attribution(s)
|
|
4583
|
+
score += 10;
|
|
4584
|
+
checks.push({ status: 'warn', label: `Some source attribution (${attrCount})`, detail: `${attrCount} attribution(s), add more source references` });
|
|
4222
4585
|
} else {
|
|
4223
4586
|
score += 5;
|
|
4224
4587
|
checks.push({ status: 'info', label: 'No source attribution detected', detail: 'Add "according to", "study by", or citation markers' });
|
|
@@ -4457,6 +4820,48 @@ function checkMultimodal($, jsonLdData) {
|
|
|
4457
4820
|
}
|
|
4458
4821
|
|
|
4459
4822
|
// 16b. Figure/Figcaption Usage (25 pts)
|
|
4823
|
+
// Only evaluate coverage against content images. Decorative images (empty
|
|
4824
|
+
// alt, presentation role, callouts, headshots, seals, logos, icons, small
|
|
4825
|
+
// images, content nested in <aside>) are excluded from the denominator.
|
|
4826
|
+
const DECORATIVE_CLASS_HINTS = /(callout|note|highlight|decorative|icon|headshot|avatar|seal|logo|badge|sidebar|bullet|arrow|divider|separator|spacer|pixel|tracking|analytics|placeholder|flag|star|rating)/i;
|
|
4827
|
+
// Filename-style alt text like "img-23.jpg" / "photo.png" indicates a non-descriptive alt.
|
|
4828
|
+
const FILENAME_ALT_RE = /^(?:img|image|photo|picture)?[-_ ]?\d*\.(?:jpg|jpeg|png|gif|svg|webp)$/i;
|
|
4829
|
+
// Tracking pixel hints in src.
|
|
4830
|
+
const TRACKING_SRC_RE = /(?:pixel|beacon|track|analytics)/i;
|
|
4831
|
+
/**
 * Decide whether an image should be left out of the figure/figcaption
 * coverage denominator because it is decorative rather than content.
 *
 * An image counts as decorative when ANY of these heuristics matches:
 *   - role="presentation" / role="none", aria-hidden="true" or
 *     data-decorative="true";
 *   - an alt attribute that is present but empty, or that looks like a bare
 *     filename (FILENAME_ALT_RE);
 *   - it sits inside <aside>, page chrome (header/nav/footer/button and the
 *     matching ARIA landmark roles) or an ad/banner container;
 *   - its own class matches DECORATIVE_CLASS_HINTS, or an ancestor's class
 *     contains one of the callout/logo/icon-style substrings;
 *   - a pixel-valued width or height attribute between 1 and 100 inclusive
 *     (this also covers 1px tracking pixels);
 *   - its src matches TRACKING_SRC_RE.
 *
 * Relies on `$`, DECORATIVE_CLASS_HINTS, FILENAME_ALT_RE and TRACKING_SRC_RE
 * from the enclosing scope.
 *
 * @param {*} imgEl - Image node; it is wrapped with `$()` before inspection.
 * @returns {boolean} `true` when a decorative heuristic matches, else `false`.
 */
function isDecorativeImage(imgEl) {
  const $img = $(imgEl);
  const attrLower = (name) => ($img.attr(name) || '').toLowerCase();

  // Explicit author-declared decorative markers.
  const role = attrLower('role');
  if (role === 'presentation' || role === 'none') return true;
  if (attrLower('aria-hidden') === 'true') return true;
  if (attrLower('data-decorative') === 'true') return true;

  // A missing alt is NOT treated as decorative; only an empty or
  // filename-style alt is.
  const alt = $img.attr('alt');
  if (alt !== undefined) {
    const trimmedAlt = alt.trim();
    if (trimmedAlt === '') return true;
    if (FILENAME_ALT_RE.test(trimmedAlt)) return true;
  }

  // Ancestor context: asides, chrome regions and ad/banner containers.
  if ($img.closest('aside').length > 0) return true;
  if ($img.closest('header, nav, footer, button, [role="banner"], [role="navigation"], [role="contentinfo"], .ad, .advertisement, .banner').length > 0) return true;

  // Class hints on the image itself, then on any ancestor.
  const cls = $img.attr('class') || '';
  if (DECORATIVE_CLASS_HINTS.test(cls)) return true;
  if ($img.closest('[class*="callout"], [class*="note"], [class*="highlight"], [class*="decorative"], [class*="seal"], [class*="logo"], [class*="headshot"], [class*="avatar"], [class*="icon"]').length > 0) return true;

  // Only pixel-valued dimensions are comparable to the size threshold. A bare
  // parseInt would read width="100%" as 100 and wrongly classify a full-width
  // content image as a small decorative one, so anything that is not a plain
  // number (optionally suffixed with "px") is ignored.
  const pixelSize = (name) => {
    const raw = String($img.attr(name) || '').trim();
    return /^\d+(?:\.\d+)?(?:px)?$/i.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
  };
  // 1..100px on either axis: icons, thumbnails and 1x1 / 1xN tracking pixels.
  const isSmall = (px) => Number.isFinite(px) && px > 0 && px <= 100;
  if (isSmall(pixelSize('width')) || isSmall(pixelSize('height'))) return true;

  // Tracking pixel hints in src.
  const src = $img.attr('src') || '';
  if (src && TRACKING_SRC_RE.test(src)) return true;

  return false;
}
|
|
4860
|
+
let contentImageCount = 0;
|
|
4861
|
+
fallbackImages.each((_i, imgEl) => {
|
|
4862
|
+
if (!isDecorativeImage(imgEl)) contentImageCount++;
|
|
4863
|
+
});
|
|
4864
|
+
|
|
4460
4865
|
const mainFigures = $('main figure, article figure, [role="main"] figure');
|
|
4461
4866
|
const fallbackFigures = mainFigures.length > 0 ? mainFigures : $('figure');
|
|
4462
4867
|
let figuresWithCaption = 0;
|
|
@@ -4467,16 +4872,19 @@ function checkMultimodal($, jsonLdData) {
|
|
|
4467
4872
|
if (fallbackImages.length === 0) {
|
|
4468
4873
|
score += 25;
|
|
4469
4874
|
checks.push({ status: 'info', label: 'No images for figure evaluation', detail: 'No images found on page' });
|
|
4875
|
+
} else if (contentImageCount === 0) {
|
|
4876
|
+
score += 25;
|
|
4877
|
+
checks.push({ status: 'info', label: 'Only decorative images detected', detail: 'No content images require figure/figcaption markup' });
|
|
4470
4878
|
} else {
|
|
4471
|
-
const figPct =
|
|
4472
|
-
if (figPct
|
|
4879
|
+
const figPct = Math.round((figuresWithCaption / contentImageCount) * 100);
|
|
4880
|
+
if (figPct >= 50) {
|
|
4473
4881
|
score += 25;
|
|
4474
|
-
checks.push({ status: 'pass', label: `Good figure/caption usage (${figPct}%)`, detail: `${
|
|
4882
|
+
checks.push({ status: 'pass', label: `Good figure/caption usage (${figPct}%)`, detail: `${figuresWithCaption} of ${contentImageCount} content images wrapped in <figure> with <figcaption>` });
|
|
4475
4883
|
} else if (figuresWithCaption > 0) {
|
|
4476
4884
|
score += 12;
|
|
4477
|
-
checks.push({ status: 'warn', label: 'Some figure/caption usage', detail:
|
|
4885
|
+
checks.push({ status: 'warn', label: 'Some figure/caption usage', detail: `${figuresWithCaption} of ${contentImageCount} content images wrapped, extend to remaining content images` });
|
|
4478
4886
|
} else {
|
|
4479
|
-
checks.push({ status: 'info', label: 'No figure/caption usage', detail: 'Wrap images in <figure> with <figcaption> for better context' });
|
|
4887
|
+
checks.push({ status: 'info', label: 'No figure/caption usage', detail: 'Wrap content images in <figure> with <figcaption> for better context' });
|
|
4480
4888
|
}
|
|
4481
4889
|
}
|
|
4482
4890
|
|
|
@@ -5179,11 +5587,13 @@ function calculateGeoScore(data) {
|
|
|
5179
5587
|
total += robotsScore;
|
|
5180
5588
|
maxPossible += 5;
|
|
5181
5589
|
|
|
5182
|
-
// 2. AI crawlers NOT blocked
|
|
5590
|
+
// 2. AI crawlers NOT blocked. Only citation crawlers (real impact on AI
|
|
5591
|
+
// visibility) contribute to the score. Training-crawler blocks are reported
|
|
5592
|
+
// in the detail string for transparency but do not deduct points.
|
|
5183
5593
|
let crawlerScore = 0;
|
|
5184
5594
|
const blocked = data.robotsTxt.blocksCrawlers || {};
|
|
5185
5595
|
const crawlerDetails = [];
|
|
5186
|
-
for (const crawler of
|
|
5596
|
+
for (const crawler of CITATION_CRAWLERS) {
|
|
5187
5597
|
if (blocked[crawler] === false || blocked[crawler] === undefined) {
|
|
5188
5598
|
crawlerScore += 1;
|
|
5189
5599
|
crawlerDetails.push(`${crawler}: allowed`);
|
|
@@ -5191,9 +5601,13 @@ function calculateGeoScore(data) {
|
|
|
5191
5601
|
crawlerDetails.push(`${crawler}: BLOCKED`);
|
|
5192
5602
|
}
|
|
5193
5603
|
}
|
|
5194
|
-
|
|
5604
|
+
for (const crawler of TRAINING_CRAWLERS) {
|
|
5605
|
+
const status = (blocked[crawler] === false || blocked[crawler] === undefined) ? 'allowed' : 'blocked (training-only, informational)';
|
|
5606
|
+
crawlerDetails.push(`${crawler}: ${status}`);
|
|
5607
|
+
}
|
|
5608
|
+
breakdown.aiCrawlerAccess = { score: crawlerScore, max: CITATION_CRAWLERS.length, detail: crawlerDetails.join('; ') };
|
|
5195
5609
|
total += crawlerScore;
|
|
5196
|
-
maxPossible +=
|
|
5610
|
+
maxPossible += CITATION_CRAWLERS.length;
|
|
5197
5611
|
|
|
5198
5612
|
// 3. llms.txt exists (10 pts)
|
|
5199
5613
|
const llmsScore = data.llmsTxt.exists ? 10 : 0;
|