@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
@@ -0,0 +1,150 @@
1
+ import * as cheerio from 'cheerio';
2
+ import { httpGet } from '../../utils/http.js';
3
+ import type { AuditIssue } from '../types.js';
4
+ import { ISSUE_DEFINITIONS } from '../types.js';
5
+
6
/**
 * Facts gathered about a page's `<link rel="canonical">` tag.
 */
export interface CanonicalData {
  /** Raw `href` value of the canonical link as it appears in the HTML, if present. */
  canonical?: string;
  /** True when the resolved canonical normalizes to the same URL as the page. */
  isSelfReferencing: boolean;
  /** True when the canonical's hostname differs from the page's hostname. */
  isCrossDomain: boolean;
  /** URLs visited while following canonicals; only set when a chain was reported. */
  canonicalChain?: string[];
  /** Last entry of `canonicalChain`; only set when a chain was reported. */
  finalCanonical?: string;
}
13
+
14
/**
 * Inspect a page's canonical link and report canonical-related issues.
 *
 * Three checks are performed once a canonical link is found and resolves:
 *  - cross-domain: the canonical's hostname differs from the page's hostname;
 *  - chain (only when `options.checkChain` is set and the canonical is not
 *    self-referencing): the canonical target itself points somewhere else;
 *  - sitemap mismatch (only when `options.sitemapUrls` is non-empty): the page
 *    is listed in the sitemap but its canonical is not.
 *
 * @param html - HTML of the page.
 * @param url - Absolute URL of the page; used as the base for relative canonicals.
 * @param options - `checkChain` enables the network-backed chain check;
 *   `sitemapUrls` enables the sitemap comparison.
 * @returns The issues found plus the collected canonical data. When no
 *   canonical link exists, or it cannot be resolved, no issues are reported.
 */
export async function analyzeCanonicalAdvanced(
  html: string,
  url: string,
  options: { checkChain?: boolean; sitemapUrls?: string[] } = {}
): Promise<{ issues: AuditIssue[]; data: CanonicalData }> {
  const issues: AuditIssue[] = [];
  const $ = cheerio.load(html);

  const canonicalLink = $('link[rel="canonical"]').attr('href');
  const pageUrl = new URL(url);

  const data: CanonicalData = {
    canonical: canonicalLink,
    isSelfReferencing: false,
    isCrossDomain: false,
  };
  const result = { issues, data };

  // Nothing to analyze without a canonical link.
  if (!canonicalLink) return result;

  // Resolve the canonical against the page URL; give up quietly if it is malformed.
  let canonicalUrl: URL;
  try {
    canonicalUrl = new URL(canonicalLink, url);
  } catch {
    return result;
  }

  // Self-reference is decided on normalized forms of both URLs.
  const pageKey = normalizeUrl(url);
  const canonicalKey = normalizeUrl(canonicalUrl.href);
  const selfReferencing = pageKey === canonicalKey;
  data.isSelfReferencing = selfReferencing;

  // Cross-domain canonical.
  if (canonicalUrl.hostname !== pageUrl.hostname) {
    data.isCrossDomain = true;
    issues.push({
      ...ISSUE_DEFINITIONS.CANONICAL_CROSS_DOMAIN,
      affectedUrls: [url],
      details: {
        canonical: canonicalUrl.href,
        sourceDomain: pageUrl.hostname,
        targetDomain: canonicalUrl.hostname,
      },
    });
  }

  // Canonical chain: the chain starts with the page itself, so more than two
  // entries means the canonical target points on to yet another URL.
  if (options.checkChain && !selfReferencing) {
    const chain = await followCanonicalChain(canonicalUrl.href, [url]);
    if (chain.length > 2) {
      data.canonicalChain = chain;
      data.finalCanonical = chain[chain.length - 1];
      issues.push({
        ...ISSUE_DEFINITIONS.CANONICAL_CHAIN,
        affectedUrls: [url],
        details: {
          chain,
          depth: chain.length,
        },
      });
    }
  }

  // Sitemap consistency: page listed, canonical not listed.
  const sitemapUrls = options.sitemapUrls ?? [];
  if (sitemapUrls.length > 0) {
    const sitemapKeys = new Set(sitemapUrls.map((entry) => normalizeUrl(entry)));
    const pageListed = sitemapKeys.has(pageKey);
    const canonicalListed = sitemapKeys.has(canonicalKey);

    if (pageListed && !canonicalListed && !selfReferencing) {
      issues.push({
        ...ISSUE_DEFINITIONS.CANONICAL_MISMATCH_SITEMAP,
        affectedUrls: [url],
        details: {
          pageUrl: url,
          canonical: canonicalUrl.href,
          message: 'Page is in sitemap but its canonical URL is not',
        },
      });
    }
  }

  return result;
}
100
+
101
/**
 * Follow `<link rel="canonical">` tags from page to page and return the URLs visited.
 *
 * Starting at `url`, each page is fetched and its canonical link resolved. The
 * walk stops when:
 *  - the accumulated chain already holds `maxDepth` entries (the chain is
 *    returned as-is, without the URL that was about to be fetched);
 *  - the page has no canonical link, or its canonical points at itself;
 *  - the canonical points at a URL already in the chain, in which case the
 *    target is appended with a `' (circular)'` suffix;
 *  - fetching or parsing fails (the current URL is still appended).
 *
 * @param url - URL to fetch first.
 * @param chain - URLs already visited before `url`; not mutated.
 * @param maxDepth - Upper bound on the chain length before the walk stops.
 * @returns The visited URLs in order.
 */
async function followCanonicalChain(
  url: string,
  chain: string[] = [],
  maxDepth: number = 5
): Promise<string[]> {
  let currentUrl = url;
  let visited = chain;

  while (visited.length < maxDepth) {
    try {
      const response = await httpGet<string>(currentUrl, {
        timeout: 5000,
        maxRedirects: 5,
        validateStatus: (status) => status < 400,
      });

      const $ = cheerio.load(response.data);
      const nextCanonical = $('link[rel="canonical"]').attr('href');

      // No canonical on this page: it is the end of the chain.
      if (!nextCanonical) {
        return [...visited, currentUrl];
      }

      const resolvedCanonical = new URL(nextCanonical, currentUrl).href;
      const normalizedNext = normalizeUrl(resolvedCanonical);

      // Self-referencing canonical: end of the chain.
      if (normalizeUrl(currentUrl) === normalizedNext) {
        return [...visited, currentUrl];
      }

      // The canonical loops back to a URL we have already seen.
      const seenBefore = visited.some((seen) => normalizeUrl(seen) === normalizedNext);
      if (seenBefore) {
        return [...visited, currentUrl, resolvedCanonical + ' (circular)'];
      }

      // Advance one hop along the chain.
      visited = [...visited, currentUrl];
      currentUrl = resolvedCanonical;
    } catch {
      // Fetch/parse failure: report what we have, including the URL that failed.
      return [...visited, currentUrl];
    }
  }

  return visited;
}
141
+
142
/**
 * Reduce a URL to a comparable `protocol//host/path` key.
 *
 * The query string, fragment and credentials are dropped, the host is
 * lower-cased and one trailing slash is removed, so
 * `https://Example.com/a/` and `https://example.com/a` compare equal.
 * A non-default port is kept: `http://example.com:8080/a` is a different
 * origin from `http://example.com/a` and must not normalize to the same key.
 *
 * @param urlStr - Absolute URL to normalize.
 * @returns The normalized key. If `urlStr` cannot be parsed as a URL, the
 *   lower-cased input with one trailing slash removed.
 */
function normalizeUrl(urlStr: string): string {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch {
    return urlStr.toLowerCase().replace(/\/$/, '');
  }

  // `host` (unlike `hostname`) includes the port when it is not the scheme default.
  const path = parsed.pathname.replace(/\/$/, '');
  return `${parsed.protocol}//${parsed.host.toLowerCase()}${path}`;
}
@@ -0,0 +1,196 @@
1
+ /**
2
+ * Canonical Domain Check
3
+ *
4
+ * Verifies that the site properly handles www vs non-www versions
5
+ * to avoid duplicate content issues.
6
+ *
7
+ * Best practice: One version should redirect to the other (301 redirect).
8
+ * Both versions serving content = duplicate content problem.
9
+ */
10
+
11
+ import { httpGet } from '../../utils/http.js';
12
+ import type { AuditIssue } from '../types.js';
13
+
14
/**
 * Outcome of probing the www / non-www and HTTP / HTTPS variants of a site.
 */
export interface CanonicalDomainData {
  /** Hostname of the URL that was analyzed. */
  primaryDomain: string;
  /** Probe result for the `https://www.<domain>` variant. */
  wwwVersion: {
    /** URL that was requested (empty when the check was skipped). */
    url: string;
    /** HTTP status of the response, or `null` when the request failed or was skipped. */
    status: number | null;
    /** Value of the `Location` response header, if any. */
    redirectsTo?: string;
    /** Whether the request produced a response with a status below 500. */
    accessible: boolean;
  };
  /** Probe result for the `https://<domain>` variant without `www.`. */
  nonWwwVersion: {
    /** URL that was requested (empty when the check was skipped). */
    url: string;
    /** HTTP status of the response, or `null` when the request failed or was skipped. */
    status: number | null;
    /** Value of the `Location` response header, if any. */
    redirectsTo?: string;
    /** Whether the request produced a response with a status below 500. */
    accessible: boolean;
  };
  /** True when one variant redirects to the other (also set for skipped subdomains). */
  hasProperRedirect: boolean;
  /** Variant that the other one redirects to, or `'unknown'` when undetermined. */
  preferredVersion: 'www' | 'non-www' | 'unknown';
  /** Result of the HTTP-to-HTTPS redirect check. */
  httpsRedirect: {
    /** True when the plain-HTTP URL redirects to an HTTPS URL. */
    httpToHttps: boolean;
    /**
     * NOTE(review): despite the name, `analyzeCanonicalDomain` stores the status
     * of the plain-HTTP request here (or `null`), not that of an HTTPS request.
     */
    httpsStatus: number | null;
  };
}
35
+
36
/**
 * Check that a site serves a single canonical host and scheme.
 *
 * Probes `https://www.<domain><path>`, `https://<domain><path>` and
 * `http://<host><path>` (without following redirects) and reports:
 *  - `CANONICAL_WWW_DUPLICATE` when both the www and non-www variants answer
 *    200 and neither redirects to the other;
 *  - `CANONICAL_NO_HTTPS_REDIRECT` when the plain-HTTP URL answers 200 instead
 *    of redirecting to HTTPS.
 *
 * Hostnames with more than two labels (other than `*.co.xx`-style domains) are
 * treated as subdomains and skipped without any network request.
 *
 * Redirect targets are resolved against the requested URL and compared by exact
 * hostname / protocol. A substring test is not sufficient here because the www
 * hostname always contains the non-www hostname, so a www-to-www redirect (for
 * example a trailing-slash redirect) would be mistaken for a www-to-non-www one.
 *
 * @param url - Absolute URL of the site or page to check; only its hostname and
 *   path are used.
 * @returns The issues found and the raw probe data.
 */
export async function analyzeCanonicalDomain(
  url: string
): Promise<{ issues: AuditIssue[]; data: CanonicalDomainData }> {
  const issues: AuditIssue[] = [];
  const parsedUrl = new URL(url);
  const hostname = parsedUrl.hostname;

  // Determine www and non-www versions
  const isWww = hostname.startsWith('www.');
  const wwwHostname = isWww ? hostname : `www.${hostname}`;
  const nonWwwHostname = isWww ? hostname.substring(4) : hostname;

  // Skip if hostname has multiple subdomains (e.g., api.staging.example.com)
  const parts = nonWwwHostname.split('.');
  if (parts.length > 2 && !nonWwwHostname.match(/\.(co|com|org|net|gov)\.[a-z]{2}$/i)) {
    // This is a subdomain, not a main domain
    return {
      issues: [],
      data: {
        primaryDomain: hostname,
        wwwVersion: { url: '', status: null, accessible: false },
        nonWwwVersion: { url: '', status: null, accessible: false },
        hasProperRedirect: true, // N/A for subdomains
        preferredVersion: 'unknown',
        httpsRedirect: { httpToHttps: false, httpsStatus: null },
      },
    };
  }

  const wwwUrl = `https://${wwwHostname}${parsedUrl.pathname}`;
  const nonWwwUrl = `https://${nonWwwHostname}${parsedUrl.pathname}`;
  const httpUrl = `http://${hostname}${parsedUrl.pathname}`;

  // The three probes are independent, so run them concurrently.
  const [wwwResult, nonWwwResult, httpResult] = await Promise.all([
    testUrl(wwwUrl),
    testUrl(nonWwwUrl),
    testUrl(httpUrl),
  ]);

  // Determine if proper redirect exists
  let hasProperRedirect = false;
  let preferredVersion: 'www' | 'non-www' | 'unknown' = 'unknown';

  // www -> non-www
  const wwwTarget = resolveRedirectTarget(wwwResult, wwwUrl);
  if (wwwTarget?.hostname === nonWwwHostname) {
    hasProperRedirect = true;
    preferredVersion = 'non-www';
  }

  // non-www -> www
  const nonWwwTarget = resolveRedirectTarget(nonWwwResult, nonWwwUrl);
  if (nonWwwTarget?.hostname === wwwHostname) {
    hasProperRedirect = true;
    preferredVersion = 'www';
  }

  // Both accessible without redirect = problem
  const bothAccessible =
    wwwResult.accessible &&
    nonWwwResult.accessible &&
    wwwResult.status === 200 &&
    nonWwwResult.status === 200;

  if (bothAccessible && !hasProperRedirect) {
    issues.push({
      code: 'CANONICAL_WWW_DUPLICATE',
      severity: 'warning',
      category: 'crawlability',
      title: 'Both www and non-www versions accessible',
      description: `Both ${wwwUrl} and ${nonWwwUrl} return 200 OK. This creates duplicate content.`,
      impact: 'Search engines may index both versions, diluting page authority and causing ranking issues.',
      howToFix: 'Set up a 301 redirect from one version to the other. Most sites prefer non-www, but either works. Configure at server/CDN level.',
      affectedUrls: [wwwUrl, nonWwwUrl],
      details: {
        wwwStatus: wwwResult.status,
        nonWwwStatus: nonWwwResult.status,
        recommendation: 'Choose one version as canonical and 301 redirect the other.',
      },
    });
  }

  // HTTP -> HTTPS: the resolved redirect target must use the https scheme.
  const httpTarget = resolveRedirectTarget(httpResult, httpUrl);
  const httpToHttps = httpTarget?.protocol === 'https:';

  if (!httpToHttps && httpResult.accessible && httpResult.status === 200) {
    issues.push({
      code: 'CANONICAL_NO_HTTPS_REDIRECT',
      severity: 'warning',
      category: 'security',
      title: 'HTTP does not redirect to HTTPS',
      description: 'The HTTP version of the site is accessible without redirecting to HTTPS.',
      impact: 'Duplicate content issues and security concerns. Users may access insecure version.',
      howToFix: 'Configure a 301 redirect from HTTP to HTTPS at the server or CDN level.',
      affectedUrls: [httpUrl],
    });
  }

  return {
    issues,
    data: {
      primaryDomain: hostname,
      wwwVersion: {
        url: wwwUrl,
        status: wwwResult.status,
        redirectsTo: wwwResult.redirectsTo,
        accessible: wwwResult.accessible,
      },
      nonWwwVersion: {
        url: nonWwwUrl,
        status: nonWwwResult.status,
        redirectsTo: nonWwwResult.redirectsTo,
        accessible: nonWwwResult.accessible,
      },
      hasProperRedirect,
      preferredVersion,
      httpsRedirect: {
        httpToHttps,
        // Status of the plain-HTTP probe (field name kept for compatibility).
        httpsStatus: httpResult.status,
      },
    },
  };
}

/**
 * Resolve where a probe response redirects to.
 *
 * @param result - Probe result carrying the status and raw `Location` value.
 * @param requestUrl - URL that was requested; base for relative `Location` values.
 * @returns The absolute redirect target, or `null` when the response is not a
 *   3xx, has no `Location`, or the `Location` cannot be parsed.
 */
function resolveRedirectTarget(
  result: { status: number | null; redirectsTo?: string },
  requestUrl: string
): URL | null {
  const { status, redirectsTo } = result;
  if (status === null || status < 300 || status >= 400 || !redirectsTo) {
    return null;
  }
  try {
    return new URL(redirectsTo, requestUrl);
  } catch {
    return null;
  }
}
168
+
169
/**
 * Request a URL once, without following redirects, and summarize the response.
 *
 * Every HTTP status is accepted as a response (`validateStatus` always returns
 * true), so only transport-level failures end up in the `catch` branch.
 *
 * @param url - URL to request.
 * @returns `status` of the response, the `Location` header as `redirectsTo`
 *   (if any), and `accessible` set when the status is below 500. When the
 *   request throws, `status` is `null` and `accessible` is `false`.
 */
async function testUrl(url: string): Promise<{
  status: number | null;
  redirectsTo?: string;
  accessible: boolean;
}> {
  try {
    const { status, headers } = await httpGet<string>(url, {
      timeout: 10000,
      maxRedirects: 0, // Don't follow redirects
      validateStatus: () => true,
    });

    // Header lookup tolerates either casing of the key.
    const redirectsTo = headers.location || headers.Location;
    const accessible = status !== null && status < 500;

    return { status, redirectsTo, accessible };
  } catch {
    // DNS resolution failure, connection refused, etc.
    return { status: null, accessible: false };
  }
}
@@ -0,0 +1,358 @@
1
+ /**
2
+ * Citation Quality Checks
3
+ *
4
+ * AI prioritizes content that cites reputable sources. Well-cited content
5
+ * demonstrates expertise and trustworthiness (E-E-A-T signals).
6
+ * These checks verify proper citation practices.
7
+ */
8
+
9
+ import * as cheerio from 'cheerio';
10
+ import type { AuditIssue } from '../types.js';
11
+
12
/**
 * Domains treated as reputable citation targets, grouped by category.
 *
 * Entries are matched against link hostnames by `analyzeCitationQuality`.
 * Most entries are full domains; `'gov'` and `'.edu'` are TLD-style entries.
 */
const REPUTABLE_SOURCES = {
  academic: [
    'scholar.google.com',
    'pubmed.ncbi.nlm.nih.gov',
    'ncbi.nlm.nih.gov',
    'arxiv.org',
    'jstor.org',
    'researchgate.net',
    'sciencedirect.com',
    'nature.com',
    'science.org',
    'springer.com',
    'wiley.com',
  ],
  government: [
    'gov',
    'gov.uk',
    'europa.eu',
    'un.org',
    'who.int',
    'cdc.gov',
    'nih.gov',
    'fda.gov',
    'epa.gov',
    'sec.gov',
    'ftc.gov',
  ],
  educational: [
    '.edu',
    'mit.edu',
    'stanford.edu',
    'harvard.edu',
    'oxford.ac.uk',
    'cambridge.org',
    'coursera.org',
    'khanacademy.org',
  ],
  authoritative: [
    'wikipedia.org',
    'britannica.com',
    'statista.com',
    'pewresearch.org',
    'gallup.com',
    'mckinsey.com',
    'hbr.org',
    'forbes.com',
    'reuters.com',
    'apnews.com',
    'bbc.com',
    'nytimes.com',
    'wsj.com',
    'economist.com',
  ],
  technical: [
    'github.com',
    'stackoverflow.com',
    'developer.mozilla.org',
    'w3.org',
    'ietf.org',
    'rfc-editor.org',
    'docs.google.com',
    'developers.google.com',
  ],
};
37
+
38
/**
 * Metrics produced by `analyzeCitationQuality` for a single page.
 */
export interface CitationQualityData {
  /** Number of `<a href>` links found in the page content. */
  totalLinks: number;
  /** Number of those links counted as pointing to other sites (social links are skipped). */
  externalLinks: number;
  /** Number of external links whose host matched a reputable-source entry. */
  citationLinks: number;
  /** Hostnames of matched links per category; one entry per link, so hosts may repeat. */
  reputableSources: {
    academic: string[];
    government: string[];
    educational: string[];
    authoritative: string[];
    technical: string[];
  };
  /** Textual citation signals detected in the content. */
  citationPatterns: {
    /** Content text contains an inline-citation marker or phrase. */
    hasInlineCitations: boolean;
    /** Page has a references-style heading or a references/sources/citations element. */
    hasReferencesSection: boolean;
    /** At least one paragraph contains a statistic. */
    hasStatistics: boolean;
    /** Paragraphs with a statistic that also carry a link or citation wording. */
    statisticsWithSources: number;
    /** Paragraphs with a statistic and neither a link nor citation wording. */
    statisticsWithoutSources: number;
  };
  /** Overall score, clamped to the range 0-100. */
  citationScore: number;
}
58
+
59
/**
 * Score how well a page cites its sources and report citation-related issues.
 *
 * The analysis works on a copy of `<body>` with `nav`, `footer`, `aside`,
 * `script`, `style` and `noscript` removed. It counts links, classifies
 * external hosts against `REPUTABLE_SOURCES`, looks for inline-citation
 * wording and a references section, and checks whether paragraphs containing
 * statistics also carry a link or citation wording.
 *
 * Host comparisons are done on whole domain labels (exact match or subdomain),
 * never on substrings, so `signature.com` is not mistaken for `nature.com` and
 * `www.governor.com` is not mistaken for a `.gov` site. Links that are not
 * `http:`/`https:` (e.g. `mailto:`, `tel:`) count toward `totalLinks` but are
 * never treated as external citations.
 *
 * @param html - HTML of the page.
 * @param url - Absolute URL of the page; base for relative links and source of
 *   the "internal" hostname.
 * @returns The issues found and the collected citation metrics. The score
 *   starts at 30, gains points for reputable sources and citation patterns,
 *   loses points for uncited statistics or no external links, and is clamped
 *   to 0-100.
 */
export function analyzeCitationQuality(
  html: string,
  url: string
): { issues: AuditIssue[]; data: CitationQualityData } {
  const issues: AuditIssue[] = [];
  const $ = cheerio.load(html);
  const currentDomain = new URL(url).hostname;

  // Remove nav, footer, aside for content analysis
  const $content = $('body').clone();
  $content.find('nav, footer, aside, script, style, noscript').remove();

  let totalLinks = 0;
  let externalLinks = 0;
  let citationLinks = 0;

  const reputableSources: CitationQualityData['reputableSources'] = {
    academic: [],
    government: [],
    educational: [],
    authoritative: [],
    technical: [],
  };

  $content.find('a[href]').each((_, link) => {
    const href = $(link).attr('href');
    if (!href) return;

    totalLinks++;

    let linkUrl: URL;
    try {
      linkUrl = new URL(href, url);
    } catch {
      // Invalid URL
      return;
    }

    // mailto:, tel:, javascript: etc. have no host and are not citations.
    if (linkUrl.protocol !== 'http:' && linkUrl.protocol !== 'https:') return;

    const hostname = linkUrl.hostname.toLowerCase();

    // Skip internal links
    if (hostname === currentDomain || hostname.endsWith(`.${currentDomain}`)) return;

    // Skip common non-citation links
    if (isSocialShareLink(linkUrl)) return;

    externalLinks++;

    const category = classifyReputableHost(hostname);
    if (category) {
      reputableSources[category].push(hostname);
      citationLinks++;
    }
  });

  // Check for citation patterns
  const bodyText = $content.text();

  // Inline citations like [1], (Source: X), "according to", etc.
  const inlineCitationPatterns = [
    /\[\d+\]/,
    /\(Source:/i,
    /\(Citation:/i,
    /according to/i,
    /study (by|from|shows|found)/i,
  ];
  const hasInlineCitations = inlineCitationPatterns.some((pattern) => pattern.test(bodyText));

  // References section: test each heading on its own with word boundaries so
  // that "Resources" does not match "sources" and adjacent headings do not run
  // together.
  const referencesHeading = /\b(references|sources|citations|bibliography|works cited)\b/i;
  const hasReferencesHeading = $('h2, h3, h4, h5, h6')
    .toArray()
    .some((heading) => referencesHeading.test($(heading).text()));
  const hasReferencesSection =
    hasReferencesHeading ||
    $('#references, .references, #sources, .sources, #citations').length > 0;

  // Statistics detection. The patterns are deliberately non-global so `.test`
  // carries no `lastIndex` state from one paragraph to the next.
  const statisticPatterns = [
    /(\d+(?:\.\d+)?)\s*%/, // Percentages
    /(\d+(?:,\d{3})*(?:\.\d+)?)\s*(million|billion|trillion)/i, // Large numbers
    /\$(\d+(?:,\d{3})*(?:\.\d+)?)/, // Dollar amounts
    /(\d+)\s*out of\s*(\d+)/i, // Ratios
    /(\d+)x\s/i, // Multipliers
  ];

  let statisticsWithSources = 0;
  let statisticsWithoutSources = 0;

  // Find paragraphs with statistics
  $content.find('p').each((_, p) => {
    const pText = $(p).text();
    if (!statisticPatterns.some((pattern) => pattern.test(pText))) return;

    // A statistic counts as sourced when its paragraph has a link or citation wording.
    const hasLink = $(p).find('a[href]').length > 0;
    const hasCitationText = /\(.*\d{4}\)|according to|source:|study/i.test(pText);

    if (hasLink || hasCitationText) {
      statisticsWithSources++;
    } else {
      statisticsWithoutSources++;
    }
  });

  const hasStatistics = statisticsWithSources + statisticsWithoutSources > 0;

  // Calculate citation score (0-100)
  let citationScore = 30; // Base score

  // Reputable source bonuses
  if (reputableSources.academic.length > 0) citationScore += 15;
  if (reputableSources.government.length > 0) citationScore += 15;
  if (reputableSources.educational.length > 0) citationScore += 10;
  if (reputableSources.authoritative.length > 0) citationScore += 10;
  if (reputableSources.technical.length > 0) citationScore += 5;

  // Citation pattern bonuses
  if (hasInlineCitations) citationScore += 10;
  if (hasReferencesSection) citationScore += 10;

  // Statistics citation bonus/penalty
  if (hasStatistics) {
    const citedRatio = statisticsWithSources / (statisticsWithSources + statisticsWithoutSources);
    if (citedRatio >= 0.8) citationScore += 10;
    else if (citedRatio >= 0.5) citationScore += 5;
    else if (citedRatio < 0.3) citationScore -= 10;
  }

  // Penalty for no external links at all
  if (externalLinks === 0) citationScore -= 20;

  citationScore = Math.max(0, Math.min(100, citationScore));

  // Generate issues

  // No reputable sources cited
  const totalReputable = Object.values(reputableSources).flat().length;
  if (totalReputable === 0 && externalLinks > 0) {
    issues.push({
      code: 'AI_NO_REPUTABLE_CITATIONS',
      severity: 'warning',
      category: 'ai-readiness',
      title: 'No reputable sources cited',
      description: 'Content has external links but none to recognized authoritative sources. AI prioritizes content that cites reputable sources.',
      impact: 'Lower E-E-A-T signals and reduced likelihood of AI citing your content.',
      howToFix: 'Add citations to reputable sources: academic papers, government sites (.gov), educational institutions (.edu), or established publications.',
      affectedUrls: [url],
      details: {
        externalLinks,
        reputableLinks: 0,
        suggestedSources: [
          'Academic: scholar.google.com, pubmed.ncbi.nlm.nih.gov',
          'Government: .gov, who.int, cdc.gov',
          'Technical: developer.mozilla.org, w3.org',
        ],
      },
    });
  }

  // No external links at all
  if (externalLinks === 0 && totalLinks > 0) {
    issues.push({
      code: 'AI_NO_EXTERNAL_CITATIONS',
      severity: 'warning',
      category: 'ai-readiness',
      title: 'No external sources linked',
      description: 'Content has no external links. AI values content that references and links to authoritative sources.',
      impact: 'Appears self-contained without external validation, reducing trust signals for AI.',
      howToFix: 'Add relevant external links to support your claims. Link to studies, official documentation, or authoritative sources.',
      affectedUrls: [url],
    });
  }

  // Statistics without sources
  if (statisticsWithoutSources > 0) {
    issues.push({
      code: 'AI_UNCITED_STATISTICS',
      severity: 'warning',
      category: 'ai-readiness',
      title: 'Statistics without source citations',
      description: `Found ${statisticsWithoutSources} statistic(s) without linked sources. AI is trained to prefer claims backed by citations.`,
      impact: 'Uncited statistics reduce content credibility and E-E-A-T signals.',
      howToFix: 'Add source links or citations for all statistics. Include the source name and year when possible (e.g., "according to [Source, 2024]").',
      affectedUrls: [url],
      details: {
        statisticsWithSources,
        statisticsWithoutSources,
        citedRatio: statisticsWithSources / (statisticsWithSources + statisticsWithoutSources),
      },
    });
  }

  // No references section for content-heavy pages. Trim and drop empty tokens
  // so surrounding whitespace does not inflate the word count.
  const wordCount = bodyText.trim().split(/\s+/).filter(Boolean).length;
  if (wordCount > 1000 && !hasReferencesSection && citationLinks > 0) {
    issues.push({
      code: 'AI_NO_REFERENCES_SECTION',
      severity: 'notice',
      category: 'ai-readiness',
      title: 'No dedicated references section',
      description: 'Long-form content with citations but no dedicated references section. A references section signals academic rigor.',
      impact: 'Missing opportunity to demonstrate comprehensive research and boost E-E-A-T.',
      howToFix: 'Add a "References" or "Sources" section at the end listing all cited sources with full attribution.',
      affectedUrls: [url],
    });
  }

  // Low citation score
  if (citationScore < 40) {
    issues.push({
      code: 'AI_LOW_CITATION_QUALITY',
      severity: 'warning',
      category: 'ai-readiness',
      title: 'Content citation quality needs improvement',
      description: `Citation quality score: ${citationScore}/100. Well-cited content ranks higher in AI search results.`,
      impact: 'Low citation quality signals reduce E-E-A-T and AI citation likelihood.',
      howToFix: 'Improve citations: 1) Link to reputable sources, 2) Cite statistics with sources, 3) Add a references section, 4) Use inline citations for claims.',
      affectedUrls: [url],
      details: {
        citationScore,
        externalLinks,
        reputableSources: totalReputable,
        statisticsWithSources,
        statisticsWithoutSources,
      },
    });
  }

  return {
    issues,
    data: {
      totalLinks,
      externalLinks,
      citationLinks,
      reputableSources,
      citationPatterns: {
        hasInlineCitations,
        hasReferencesSection,
        hasStatistics,
        statisticsWithSources,
        statisticsWithoutSources,
      },
      citationScore,
    },
  };
}

/**
 * Whether `hostname` is `source` itself or one of its subdomains.
 *
 * A leading dot on `source` is ignored. Single-label entries such as `gov` or
 * `.edu` additionally match when that label sits directly under a two-letter
 * country code (e.g. `health.gov.au`, `sydney.edu.au`).
 */
function hostMatchesDomain(hostname: string, source: string): boolean {
  const domain = source.replace(/^\./, '').toLowerCase();
  if (hostname === domain || hostname.endsWith(`.${domain}`)) return true;

  if (!domain.includes('.')) {
    const labels = hostname.split('.');
    return (
      labels.length >= 2 &&
      labels[labels.length - 2] === domain &&
      labels[labels.length - 1].length === 2
    );
  }
  return false;
}

/** Whether a link targets a social network or a LinkedIn share endpoint rather than a source. */
function isSocialShareLink(linkUrl: URL): boolean {
  const hostname = linkUrl.hostname.toLowerCase();
  const socialDomains = ['facebook.com', 'twitter.com', 'instagram.com', 'pinterest.com'];
  if (socialDomains.some((domain) => hostMatchesDomain(hostname, domain))) return true;

  // LinkedIn is skipped only for share URLs; the path lives in `pathname`,
  // never in `hostname`.
  return hostMatchesDomain(hostname, 'linkedin.com') && /^\/shar(e|ing)/i.test(linkUrl.pathname);
}

/** First reputable-source category whose list matches `hostname`, or `null` when none does. */
function classifyReputableHost(hostname: string): keyof typeof REPUTABLE_SOURCES | null {
  const categoryOrder = ['academic', 'government', 'educational', 'authoritative', 'technical'] as const;
  for (const category of categoryOrder) {
    if (REPUTABLE_SOURCES[category].some((source) => hostMatchesDomain(hostname, source))) {
      return category;
    }
  }
  return null;
}