@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
package/src/audit/checks/content-science.ts
@@ -0,0 +1,589 @@
+ // Content Science Module - Novel SEO Checks Based on Academic Research
+ // Implements: Zipf's Law, Shannon Entropy, BM25 Scoring, Cosine Similarity
+ // These techniques differentiate RankCLI from competitors like Ahrefs/SEMrush
+
+ import * as cheerio from 'cheerio';
+ import type { AuditIssue } from '../types.js';
+
+ // ==================== ZIPF'S LAW ANALYSIS ====================
+ // Natural language follows Zipf's law: frequency ∝ 1/rank^α (α ≈ 1)
+ // Deviation from this indicates unnatural text (keyword stuffing)
+
+ export interface ZipfAnalysis {
+   alpha: number; // Exponent - natural text ≈ 0.9-1.1
+   isNatural: boolean;
+   rSquared: number; // Goodness of fit
+   stuffedKeywords: Array<{ word: string; deviation: number }>;
+   missingMidTail: string[];
+ }
+
+ /**
+  * Analyze keyword distribution against Zipf's law
+  * Detects keyword stuffing through statistical distribution analysis
+  */
+ export function analyzeZipfDistribution(html: string): ZipfAnalysis {
+   const $ = cheerio.load(html);
+   $('script, style, nav, footer, header').remove();
+   const text = $('body').text().toLowerCase();
+
+   // Extract words (min 3 chars to filter stopwords)
+   const words = text.match(/\b[a-z]{3,}\b/g) || [];
+   if (words.length < 100) {
+     return {
+       alpha: 1,
+       isNatural: true,
+       rSquared: 0,
+       stuffedKeywords: [],
+       missingMidTail: [],
+     };
+   }
+
+   // Count word frequencies
+   const freq = new Map<string, number>();
+   for (const word of words) {
+     freq.set(word, (freq.get(word) || 0) + 1);
+   }
+
+   // Sort by frequency (descending)
+   const sorted = [...freq.entries()].sort((a, b) => b[1] - a[1]);
+
+   // Take top 100 words for analysis
+   const topWords = sorted.slice(0, Math.min(100, sorted.length));
+
+   // Fit Zipf's law using log-log linear regression
+   // log(freq) = log(C) - α * log(rank)
+   const logRanks = topWords.map((_, i) => Math.log(i + 1));
+   const logFreqs = topWords.map(([_, f]) => Math.log(f));
+
+   const { slope, intercept, rSquared } = linearRegression(logRanks, logFreqs);
+   const alpha = -slope; // α is negative of slope
+
+   // Find deviations (potential keyword stuffing)
+   const stuffedKeywords: Array<{ word: string; deviation: number }> = [];
+   const expectedC = Math.exp(intercept);
+
+   for (let i = 0; i < topWords.length; i++) {
+     const [word, actualFreq] = topWords[i];
+     const expectedFreq = expectedC * Math.pow(i + 1, -alpha);
+     const deviation = (actualFreq - expectedFreq) / expectedFreq;
+
+     // Flag words with >50% higher frequency than expected (excluding very common words)
+     if (deviation > 0.5 && word.length > 4 && i > 5) {
+       stuffedKeywords.push({ word, deviation: Math.round(deviation * 100) });
+     }
+   }
+
+   // Find missing mid-tail terms (ranks 20-50 with lower than expected frequency)
+   const missingMidTail: string[] = [];
+   for (let i = 20; i < Math.min(50, topWords.length); i++) {
+     const [word, actualFreq] = topWords[i];
+     const expectedFreq = expectedC * Math.pow(i + 1, -alpha);
+     const deviation = (actualFreq - expectedFreq) / expectedFreq;
+
+     if (deviation < -0.3) {
+       missingMidTail.push(word);
+     }
+   }
+
+   return {
+     alpha,
+     isNatural: alpha >= 0.8 && alpha <= 1.2 && rSquared > 0.85,
+     rSquared,
+     stuffedKeywords: stuffedKeywords.slice(0, 10),
+     missingMidTail: missingMidTail.slice(0, 5),
+   };
+ }
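A minimal usage sketch of the Zipf check above; the import path, the page HTML, and the logging are illustrative placeholders, not part of the package:

  import { analyzeZipfDistribution } from './content-science.js'; // hypothetical consumer inside the package

  const pageHtml = '<html><body><article>…long article text…</article></body></html>';
  const zipf = analyzeZipfDistribution(pageHtml);
  if (!zipf.isNatural) {
    // An exponent far from 1, or a poor log-log fit, means the word-frequency curve looks unnatural
    console.warn(`Zipf α=${zipf.alpha.toFixed(2)}, R²=${zipf.rSquared.toFixed(2)}`);
    for (const { word, deviation } of zipf.stuffedKeywords) {
      console.warn(`"${word}" appears ${deviation}% more often than the fitted curve predicts`);
    }
  }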
+
+ // ==================== SHANNON ENTROPY ====================
+ // Information entropy measures vocabulary diversity
+ // Low entropy = repetitive content, High entropy = diverse vocabulary
+
+ export interface EntropyAnalysis {
+   wordEntropy: number;
+   normalizedEntropy: number; // 0-1 scale
+   vocabularyRichness: number; // Type-token ratio
+   repetitionScore: number; // 0-1, higher = more repetitive
+   bigramEntropy: number;
+   qualityIndicator: 'excellent' | 'good' | 'average' | 'poor';
+ }
+
+ /**
+  * Calculate Shannon entropy for content quality assessment
+  */
+ export function analyzeEntropy(html: string): EntropyAnalysis {
+   const $ = cheerio.load(html);
+   $('script, style, nav, footer, header').remove();
+   const text = $('body').text().toLowerCase();
+
+   const words = text.match(/\b[a-z]{2,}\b/g) || [];
+   if (words.length < 50) {
+     return {
+       wordEntropy: 0,
+       normalizedEntropy: 0,
+       vocabularyRichness: 0,
+       repetitionScore: 1,
+       bigramEntropy: 0,
+       qualityIndicator: 'poor',
+     };
+   }
+
+   // Word frequency
+   const wordFreq = new Map<string, number>();
+   for (const word of words) {
+     wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
+   }
+
+   // Bigram frequency
+   const bigramFreq = new Map<string, number>();
+   for (let i = 0; i < words.length - 1; i++) {
+     const bigram = `${words[i]} ${words[i + 1]}`;
+     bigramFreq.set(bigram, (bigramFreq.get(bigram) || 0) + 1);
+   }
+
+   // Shannon entropy: H = -Σ p(x) * log2(p(x))
+   const wordEntropy = calculateShannonEntropy(wordFreq, words.length);
+   const bigramEntropy = calculateShannonEntropy(bigramFreq, words.length - 1);
+
+   // Maximum possible entropy (uniform distribution)
+   const maxEntropy = Math.log2(wordFreq.size);
+   const normalizedEntropy = maxEntropy > 0 ? wordEntropy / maxEntropy : 0;
+
+   // Type-token ratio (unique words / total words)
+   const vocabularyRichness = wordFreq.size / words.length;
+
+   // Repetition score (inverse of normalized entropy)
+   const repetitionScore = 1 - normalizedEntropy;
+
+   // Quality indicator
+   let qualityIndicator: EntropyAnalysis['qualityIndicator'];
+   if (normalizedEntropy > 0.85 && vocabularyRichness > 0.4) {
+     qualityIndicator = 'excellent';
+   } else if (normalizedEntropy > 0.75 && vocabularyRichness > 0.3) {
+     qualityIndicator = 'good';
+   } else if (normalizedEntropy > 0.6 && vocabularyRichness > 0.2) {
+     qualityIndicator = 'average';
+   } else {
+     qualityIndicator = 'poor';
+   }
+
+   return {
+     wordEntropy,
+     normalizedEntropy,
+     vocabularyRichness,
+     repetitionScore,
+     bigramEntropy,
+     qualityIndicator,
+   };
+ }
+
+ function calculateShannonEntropy(freq: Map<string, number>, total: number): number {
+   let entropy = 0;
+   for (const count of freq.values()) {
+     const p = count / total;
+     if (p > 0) {
+       entropy -= p * Math.log2(p);
+     }
+   }
+   return entropy;
+ }
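To make the entropy helper above concrete, a worked example as it could be exercised from within this module (calculateShannonEntropy is not exported); the four-word sample is invented:

  // Frequencies for the 4-word sample "seo seo audit tool"
  const sampleFreq = new Map<string, number>([
    ['seo', 2],
    ['audit', 1],
    ['tool', 1],
  ]);
  // H = -(0.5·log2 0.5 + 0.25·log2 0.25 + 0.25·log2 0.25) = 0.5 + 0.5 + 0.5 = 1.5 bits
  const h = calculateShannonEntropy(sampleFreq, 4); // 1.5
  // Maximum entropy for 3 distinct words is log2(3) ≈ 1.585, so normalizedEntropy ≈ 0.95,
  // comfortably above the 0.85 threshold used above (a real page would also need ≥50 words
  // and sufficient vocabulary richness to score 'excellent').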
+
+ // ==================== BM25 SCORING ====================
+ // Okapi BM25 is the ranking function used by Elasticsearch/Lucene
+ // More accurate than keyword density for relevance scoring
+
+ export interface BM25Analysis {
+   keyword: string;
+   score: number;
+   termFrequency: number;
+   saturationPoint: boolean; // TF above optimal
+   lengthPenalty: number;
+   recommendation: string;
+ }
+
+ /**
+  * Calculate BM25 relevance score for target keywords
+  * Parameters tuned for web content (k1=1.5, b=0.75)
+  */
+ export function calculateBM25(
+   html: string,
+   keywords: string[],
+   avgDocLength: number = 1500, // Average web page word count
+   k1: number = 1.5,
+   b: number = 0.75
+ ): BM25Analysis[] {
+   const $ = cheerio.load(html);
+   $('script, style').remove();
+   const text = $('body').text().toLowerCase();
+   const words = text.match(/\b[a-z]+\b/g) || [];
+   const docLength = words.length;
+
+   const results: BM25Analysis[] = [];
+
+   for (const keyword of keywords) {
+     const keywordLower = keyword.toLowerCase();
+     const keywordWords = keywordLower.split(/\s+/);
+
+     // Count exact phrase or individual word matches
+     let tf: number;
+     if (keywordWords.length > 1) {
+       // Phrase matching
+       const regex = new RegExp(keywordLower.replace(/\s+/g, '\\s+'), 'gi');
+       tf = (text.match(regex) || []).length;
+     } else {
+       // Single word
+       tf = words.filter((w) => w === keywordLower).length;
+     }
+
+     // BM25 formula (IDF = 1 for single document analysis)
+     const numerator = tf * (k1 + 1);
+     const denominator = tf + k1 * (1 - b + b * (docLength / avgDocLength));
+     const score = numerator / denominator;
+
+     // Check if additional occurrences would help (saturation check)
+     const nextTF = tf + 1;
+     const nextScore = (nextTF * (k1 + 1)) / (nextTF + k1 * (1 - b + b * (docLength / avgDocLength)));
+     const marginalGain = (nextScore - score) / (score || 1);
+     const saturationPoint = marginalGain < 0.1; // Less than 10% improvement
+
+     // Length penalty
+     const lengthPenalty = b * (docLength / avgDocLength - 1);
+
+     // Generate recommendation
+     let recommendation: string;
+     if (tf === 0) {
+       recommendation = `Add "${keyword}" to your content`;
+     } else if (saturationPoint) {
+       recommendation = `Keyword "${keyword}" is saturated (${tf} occurrences). Focus on semantic variations.`;
+     } else if (lengthPenalty > 0.3) {
+       recommendation = `Content is ${Math.round(lengthPenalty * 100)}% longer than average. Consider tightening.`;
+     } else if (tf < 3) {
+       recommendation = `Consider adding 1-2 more natural mentions of "${keyword}"`;
+     } else {
+       recommendation = `Good keyword presence for "${keyword}"`;
+     }
+
+     results.push({
+       keyword,
+       score: Math.round(score * 100) / 100,
+       termFrequency: tf,
+       saturationPoint,
+       lengthPenalty: Math.round(lengthPenalty * 100) / 100,
+       recommendation,
+     });
+   }
+
+   return results;
+ }
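To make the saturation behaviour concrete, a short sketch with the arithmetic worked out for the default parameters; the sample page and keyword are invented:

  // With k1 = 1.5, b = 0.75 and docLength equal to avgDocLength, the length term
  // (1 - b + b * docLength / avgDocLength) is 1, so score = tf * (k1 + 1) / (tf + k1):
  //   tf = 1  -> 2.5 / 2.5   = 1.00
  //   tf = 3  -> 7.5 / 4.5  ≈ 1.67
  //   tf = 10 -> 25 / 11.5  ≈ 2.17  (each extra mention now adds very little)
  const [seo] = calculateBM25('<body>seo audit checklist for growing seo teams</body>', ['seo']);
  console.log(seo.termFrequency, seo.score, seo.recommendation);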
+
+ // ==================== COSINE SIMILARITY ====================
+ // Better duplicate/cannibalization detection than exact matching
+
+ export interface SimilarityAnalysis {
+   similarity: number; // 0-1
+   sharedTerms: string[];
+   uniqueToA: string[];
+   uniqueToB: string[];
+   recommendation: 'duplicate' | 'cannibalization' | 'related' | 'distinct';
+ }
+
+ /**
+  * Build TF-IDF vector for document
+  */
+ export function buildTFIDFVector(text: string, vocabulary: string[]): number[] {
+   const words = text.toLowerCase().match(/\b[a-z]+\b/g) || [];
+   const wordCount = new Map<string, number>();
+
+   for (const word of words) {
+     wordCount.set(word, (wordCount.get(word) || 0) + 1);
+   }
+
+   const docLength = words.length || 1;
+   const vector: number[] = [];
+
+   for (const term of vocabulary) {
+     const tf = (wordCount.get(term) || 0) / docLength;
+     // IDF approximation: log(2) for present terms, 0 for absent
+     const idf = wordCount.has(term) ? Math.log(2) : 0;
+     vector.push(tf * idf);
+   }
+
+   return vector;
+ }
+
+ /**
+  * Calculate cosine similarity between two content pieces
+  */
+ export function calculateCosineSimilarity(textA: string, textB: string): SimilarityAnalysis {
+   const wordsA = new Set((textA.toLowerCase().match(/\b[a-z]{3,}\b/g) || []));
+   const wordsB = new Set((textB.toLowerCase().match(/\b[a-z]{3,}\b/g) || []));
+
+   // Build combined vocabulary
+   const vocabulary = [...new Set([...wordsA, ...wordsB])];
+
+   // Build TF-IDF vectors
+   const vectorA = buildTFIDFVector(textA, vocabulary);
+   const vectorB = buildTFIDFVector(textB, vocabulary);
+
+   // Cosine similarity
+   let dotProduct = 0;
+   let normA = 0;
+   let normB = 0;
+
+   for (let i = 0; i < vocabulary.length; i++) {
+     dotProduct += vectorA[i] * vectorB[i];
+     normA += vectorA[i] * vectorA[i];
+     normB += vectorB[i] * vectorB[i];
+   }
+
+   const similarity = normA > 0 && normB > 0 ? dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) : 0;
+
+   // Find shared and unique terms
+   const sharedTerms = [...wordsA].filter((w) => wordsB.has(w)).slice(0, 20);
+   const uniqueToA = [...wordsA].filter((w) => !wordsB.has(w)).slice(0, 10);
+   const uniqueToB = [...wordsB].filter((w) => !wordsA.has(w)).slice(0, 10);
+
+   // Recommendation based on similarity
+   let recommendation: SimilarityAnalysis['recommendation'];
+   if (similarity > 0.95) {
+     recommendation = 'duplicate';
+   } else if (similarity > 0.75) {
+     recommendation = 'cannibalization';
+   } else if (similarity > 0.5) {
+     recommendation = 'related';
+   } else {
+     recommendation = 'distinct';
+   }
+
+   return {
+     similarity: Math.round(similarity * 100) / 100,
+     sharedTerms,
+     uniqueToA,
+     uniqueToB,
+     recommendation,
+   };
+ }
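A small sketch of comparing two page texts with the helper above, for instance when hunting for cannibalization between blog posts; the strings are invented:

  const pageA = 'best running shoes for beginners, cushioned trainers reviewed and ranked';
  const pageB = 'best running shoes for beginners: our favourite cushioned trainers this year';
  const { similarity, recommendation, sharedTerms } = calculateCosineSimilarity(pageA, pageB);
  // similarity climbs toward 1 as the two vocabularies overlap; above 0.75 the module
  // labels the pair 'cannibalization', above 0.95 'duplicate'.
  console.log(similarity, recommendation, sharedTerms);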
+
+ // ==================== INFORMATION GAIN ====================
+ // Identifies keywords that differentiate top-ranking content
+
+ export interface InformationGainResult {
+   keyword: string;
+   informationGain: number;
+   presentInTopContent: boolean;
+   priority: 'critical' | 'high' | 'medium' | 'low';
+ }
+
+ /**
+  * Calculate information gain for keywords (simplified for single-document analysis)
+  * Compares content against expected term distribution
+  */
+ export function calculateInformationGain(
+   html: string,
+   topicKeywords: string[]
+ ): InformationGainResult[] {
+   const $ = cheerio.load(html);
+   $('script, style').remove();
+   const text = $('body').text().toLowerCase();
+
+   const results: InformationGainResult[] = [];
+
+   for (const keyword of topicKeywords) {
+     const keywordLower = keyword.toLowerCase();
+     const isPresent = text.includes(keywordLower);
+
+     // Simple information gain approximation
+     // Higher IG for important topic keywords that are missing
+     const wordLength = keywordLower.split(/\s+/).length;
+     const baseIG = wordLength > 1 ? 0.5 : 0.3; // Phrases are more specific
+
+     const informationGain = isPresent ? baseIG * 0.8 : baseIG;
+
+     let priority: InformationGainResult['priority'];
+     if (!isPresent && informationGain > 0.4) {
+       priority = 'critical';
+     } else if (!isPresent && informationGain > 0.3) {
+       priority = 'high';
+     } else if (!isPresent) {
+       priority = 'medium';
+     } else {
+       priority = 'low';
+     }
+
+     results.push({
+       keyword,
+       informationGain: Math.round(informationGain * 100) / 100,
+       presentInTopContent: isPresent,
+       priority,
+     });
+   }
+
+   return results.sort((a, b) => b.informationGain - a.informationGain);
+ }
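A short sketch of how the priority mapping above plays out; the page snippet and keyword list are made up:

  const gaps = calculateInformationGain(
    '<body>Our guide covers technical seo basics for small sites.</body>',
    ['technical seo', 'crawl budget', 'log file analysis']
  );
  // 'crawl budget' and 'log file analysis' are absent multi-word phrases (baseIG 0.5),
  // so they sort to the top with priority 'critical'; 'technical seo' is present
  // (0.5 * 0.8 = 0.4) and comes back with priority 'low'.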
+
+ // ==================== HELPER FUNCTIONS ====================
+
+ function linearRegression(x: number[], y: number[]): { slope: number; intercept: number; rSquared: number } {
+   const n = x.length;
+   let sumX = 0,
+     sumY = 0,
+     sumXY = 0,
+     sumXX = 0,
+     sumYY = 0;
+
+   for (let i = 0; i < n; i++) {
+     sumX += x[i];
+     sumY += y[i];
+     sumXY += x[i] * y[i];
+     sumXX += x[i] * x[i];
+     sumYY += y[i] * y[i];
+   }
+
+   const slope = (n * sumXY - sumX * sumY) / (n * sumXX - sumX * sumX);
+   const intercept = (sumY - slope * sumX) / n;
+
+   // R-squared
+   const yMean = sumY / n;
+   let ssRes = 0,
+     ssTot = 0;
+   for (let i = 0; i < n; i++) {
+     const predicted = slope * x[i] + intercept;
+     ssRes += (y[i] - predicted) ** 2;
+     ssTot += (y[i] - yMean) ** 2;
+   }
+   const rSquared = ssTot > 0 ? 1 - ssRes / ssTot : 0;
+
+   return { slope, intercept, rSquared };
+ }
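A tiny worked check of the regression helper above, as it could be exercised from within this module (linearRegression is not exported); the data points are invented:

  // Perfectly linear data: y = 2x + 1
  const fit = linearRegression([1, 2, 3, 4], [3, 5, 7, 9]);
  // fit.slope = 2, fit.intercept = 1, fit.rSquared = 1.
  // In the Zipf check above, -slope becomes α and rSquared gates the fit
  // (isNatural requires rSquared > 0.85).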
+
+ // ==================== MAIN ANALYSIS FUNCTION ====================
+
+ export interface ContentScienceData {
+   zipf: ZipfAnalysis;
+   entropy: EntropyAnalysis;
+   bm25: BM25Analysis[];
+   overallQuality: 'excellent' | 'good' | 'needs-work' | 'poor';
+ }
+
+ /**
+  * Run comprehensive content science analysis
+  */
+ export function analyzeContentScience(
+   html: string,
+   url: string,
+   targetKeywords: string[] = []
+ ): { issues: AuditIssue[]; data: ContentScienceData } {
+   const issues: AuditIssue[] = [];
+
+   // Zipf's Law Analysis
+   const zipf = analyzeZipfDistribution(html);
+
+   if (!zipf.isNatural && zipf.stuffedKeywords.length > 0) {
+     issues.push({
+       code: 'ZIPF_KEYWORD_STUFFING',
+       severity: 'warning',
+       category: 'content',
+       title: 'Unnatural keyword distribution detected',
+       description: `Content deviates from natural language patterns (Zipf α=${zipf.alpha.toFixed(2)}, expected ~1.0). Possible keyword stuffing detected.`,
+       impact: 'Search engines can detect unnatural text patterns, potentially triggering spam filters.',
+       howToFix: `Reduce repetition of: ${zipf.stuffedKeywords.map((k) => `"${k.word}" (+${k.deviation}%)`).join(', ')}`,
+       affectedUrls: [url],
+       details: {
+         alpha: zipf.alpha,
+         rSquared: zipf.rSquared,
+         stuffedKeywords: zipf.stuffedKeywords,
+       },
+     });
+   }
+
+   // Entropy Analysis
+   const entropy = analyzeEntropy(html);
+
+   if (entropy.qualityIndicator === 'poor') {
+     issues.push({
+       code: 'LOW_CONTENT_ENTROPY',
+       severity: 'warning',
+       category: 'content',
+       title: 'Low vocabulary diversity (thin content signal)',
+       description: `Content entropy is low (${entropy.normalizedEntropy.toFixed(2)}). This indicates repetitive or thin content.`,
+       impact: 'Low diversity content may be seen as low-quality by search engines.',
+       howToFix: 'Expand vocabulary, add more unique insights, and reduce repetitive phrases.',
+       affectedUrls: [url],
+       details: {
+         normalizedEntropy: entropy.normalizedEntropy,
+         vocabularyRichness: entropy.vocabularyRichness,
+         repetitionScore: entropy.repetitionScore,
+       },
+     });
+   }
+
+   if (entropy.repetitionScore > 0.7) {
+     issues.push({
+       code: 'HIGH_CONTENT_REPETITION',
+       severity: 'notice',
+       category: 'content',
+       title: 'High content repetition detected',
+       description: `Content shows ${Math.round(entropy.repetitionScore * 100)}% repetition pattern.`,
+       impact: 'Highly repetitive content provides less value and may hurt engagement.',
+       howToFix: 'Vary your vocabulary and sentence structures. Avoid repeating the same phrases.',
+       affectedUrls: [url],
+     });
+   }
+
+   // BM25 Analysis (if keywords provided)
+   const bm25 = targetKeywords.length > 0 ? calculateBM25(html, targetKeywords) : [];
+
+   for (const result of bm25) {
+     if (result.termFrequency === 0) {
+       issues.push({
+         code: 'BM25_KEYWORD_MISSING',
+         severity: 'warning',
+         category: 'on-page',
+         title: `Target keyword "${result.keyword}" not found`,
+         description: 'This keyword is not present in your content at all.',
+         impact: 'Page unlikely to rank for this keyword without any mention of it.',
+         howToFix: `Add natural mentions of "${result.keyword}" in your content.`,
+         affectedUrls: [url],
+         details: { keyword: result.keyword, bm25Score: 0 },
+       });
+     } else if (result.saturationPoint && result.termFrequency > 10) {
+       issues.push({
+         code: 'BM25_KEYWORD_SATURATED',
+         severity: 'notice',
+         category: 'on-page',
+         title: `Keyword "${result.keyword}" over-optimized`,
+         description: `Found ${result.termFrequency} times. Additional mentions provide diminishing returns.`,
+         impact: 'Over-optimization can trigger spam signals.',
+         howToFix: 'Use semantic variations and related terms instead of exact repetition.',
+         affectedUrls: [url],
+         details: { keyword: result.keyword, termFrequency: result.termFrequency },
+       });
+     }
+   }
+
+   // Overall quality assessment
+   let overallQuality: ContentScienceData['overallQuality'];
+   const qualityScore =
+     (zipf.isNatural ? 30 : 0) +
+     (entropy.qualityIndicator === 'excellent' ? 40 : entropy.qualityIndicator === 'good' ? 30 : entropy.qualityIndicator === 'average' ? 20 : 0) +
+     (bm25.filter((b) => b.termFrequency > 0 && !b.saturationPoint).length / Math.max(bm25.length, 1)) * 30;
+
+   if (qualityScore >= 80) {
+     overallQuality = 'excellent';
+   } else if (qualityScore >= 60) {
+     overallQuality = 'good';
+   } else if (qualityScore >= 40) {
+     overallQuality = 'needs-work';
+   } else {
+     overallQuality = 'poor';
+   }
+
+   return {
+     issues,
+     data: {
+       zipf,
+       entropy,
+       bm25,
+       overallQuality,
+     },
+   };
+ }
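Finally, a minimal end-to-end sketch of calling the exported entry point above from a consumer; the import path, URL, keyword list, and fetch wiring are placeholders rather than part of this file:

  import { analyzeContentScience } from '@rankcli/agent-runtime'; // assumes the package re-exports this module

  const url = 'https://example.com/guide';
  const html = await fetch(url).then((r) => r.text());
  const { issues, data } = analyzeContentScience(html, url, ['technical seo audit', 'crawl budget']);

  console.log(`Overall content quality: ${data.overallQuality}`);
  for (const issue of issues) {
    console.log(`[${issue.severity}] ${issue.code}: ${issue.title}`);
  }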